diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml
index 46c2477efb2..229e8b90ec9 100644
--- a/.github/workflows/apple.yml
+++ b/.github/workflows/apple.yml
@@ -151,7 +151,8 @@ jobs:
HOMEBREW_NO_AUTO_UPDATE=1 brew install dylibbundler
lipo -create ./macosx-x86_64/supertuxkart.app/Contents/MacOS/supertuxkart ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart -output ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart
chmod 755 ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart
- dylibbundler -od -b -x ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart -d ./macosx-arm64/supertuxkart.app/Contents/libs/ -p @executable_path/../libs/ -s dependencies-macosx/lib -ns
+ install_name_tool -change libcurl.4.dylib @rpath/libcurl.4.dylib ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart
+ dylibbundler -od -b -x ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart -d ./macosx-arm64/supertuxkart.app/Contents/libs/ -p @executable_path/../libs/ -s ./dependencies-macosx/lib -ns
# We use SDL_Vulkan_LoadLibrary for 10.9 compatibility, so otool -L supertuxkart has no libMoltenVK.dylib
cp ./dependencies-macosx/lib/libMoltenVK.dylib ./macosx-arm64/supertuxkart.app/Contents/libs/
cd ./macosx-arm64/supertuxkart.app/Contents/Resources/data
diff --git a/android/build.gradle b/android/build.gradle
index f846541156e..8241548db24 100644
--- a/android/build.gradle
+++ b/android/build.gradle
@@ -11,7 +11,7 @@ buildscript
// 4.1.2 is the minimum version to support native debug symbols file
// https://developer.android.com/studio/build/shrink-code#android_gradle_plugin_version_41_or_later
// 7.0.0 to fix https://stackoverflow.com/questions/68387270/android-studio-error-installed-build-tools-revision-31-0-0-is-corrupted
- classpath 'com.android.tools.build:gradle:8.2.1'
+ classpath 'com.android.tools.build:gradle:8.5.1'
}
}
@@ -48,6 +48,7 @@ android
versionCode project.getProperty('version_code').toInteger()
versionName project.getProperty('version_name')
minSdkVersion min_sdk_version.toInteger()
+ compileSdkVersion compile_sdk_version.toInteger()
targetSdkVersion target_sdk_version.toInteger()
externalNativeBuild
{
diff --git a/android/gradle/wrapper/gradle-wrapper.jar b/android/gradle/wrapper/gradle-wrapper.jar
index 249e5832f09..e6441136f3d 100644
Binary files a/android/gradle/wrapper/gradle-wrapper.jar and b/android/gradle/wrapper/gradle-wrapper.jar differ
diff --git a/android/gradle/wrapper/gradle-wrapper.properties b/android/gradle/wrapper/gradle-wrapper.properties
index c65b8841c00..09523c0e549 100644
--- a/android/gradle/wrapper/gradle-wrapper.properties
+++ b/android/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +1,7 @@
-#Sun Dec 03 18:24:53 EET 2023
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.9-bin.zip
+networkTimeout=10000
+validateDistributionUrl=true
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
diff --git a/android/gradlew b/android/gradlew
index a69d9cb6c20..b740cf13397 100755
--- a/android/gradlew
+++ b/android/gradlew
@@ -55,7 +55,7 @@
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
-# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
+# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
@@ -80,13 +80,11 @@ do
esac
done
-APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
-
-APP_NAME="Gradle"
+# This is normally unused
+# shellcheck disable=SC2034
APP_BASE_NAME=${0##*/}
-
-# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036)
+APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
@@ -133,22 +131,29 @@ location of your Java installation."
fi
else
JAVACMD=java
- which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+ if ! command -v java >/dev/null 2>&1
+ then
+ die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
+ fi
fi
# Increase the maximum file descriptors if we can.
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
+ # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked.
+ # shellcheck disable=SC2039,SC3045
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
+ # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked.
+ # shellcheck disable=SC2039,SC3045
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
@@ -193,11 +198,15 @@ if "$cygwin" || "$msys" ; then
done
fi
-# Collect all arguments for the java command;
-# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
-# shell script including quotes and variable substitutions, so put them in
-# double quotes to make sure that they get re-expanded; and
-# * put everything else in single quotes, so that it's not re-expanded.
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
+# Collect all arguments for the java command:
+# * DEFAULT_JVM_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments,
+# and any embedded shellness will be escaped.
+# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be
+# treated as '${Hostname}' itself on the command line.
set -- \
"-Dorg.gradle.appname=$APP_BASE_NAME" \
diff --git a/android/make.sh b/android/make.sh
index 31c62ae79ed..f5a8f267056 100755
--- a/android/make.sh
+++ b/android/make.sh
@@ -435,6 +435,6 @@ convert -scale 432x432 "$APP_ICON_ADAPTIVE_FG" "$DIRNAME/res/drawable-xxxhdpi/ic
export ANDROID_HOME="$SDK_PATH"
./gradlew -Pcompile_sdk_version="$COMPILE_SDK_VERSION" \
-Pmin_sdk_version="$STK_MIN_ANDROID_SDK" \
-Ptarget_sdk_version="$STK_TARGET_ANDROID_SDK" \
-Pstorepass="$STK_STOREPASS" \
-Pkeystore="$STK_KEYSTORE" \
@@ -450,6 +450,6 @@ export ANDROID_HOME="$SDK_PATH"
if [ "$GRADLE_BUILD_TYPE" = "assembleRelease" ]; then
./gradlew -Pcompile_sdk_version="$COMPILE_SDK_VERSION" \
-Pmin_sdk_version="$STK_MIN_ANDROID_SDK" \
-Ptarget_sdk_version="$STK_TARGET_ANDROID_SDK" \
-Pstorepass="$STK_STOREPASS" \
-Pkeystore="$STK_KEYSTORE" \
diff --git a/data/supertuxkart.appdata.xml b/data/supertuxkart.appdata.xml
index a57e6e1e4e7..c3393a97262 100644
--- a/data/supertuxkart.appdata.xml
+++ b/data/supertuxkart.appdata.xml
@@ -3,6 +3,7 @@
   <id>supertuxkart.desktop</id>
   <metadata_license>CC0-1.0</metadata_license>
   <project_license>GPL-3.0+</project_license>
+  <launchable type="desktop-id">supertuxkart.desktop</launchable>
   <name>SuperTuxKart</name>
   <summary>A 3D open-source kart racing game</summary>
   <summary xml:lang="zh_TW">3D 開源卡丁車賽車遊戲</summary>
diff --git a/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp b/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp
index 955b1694dd3..6f8b7178306 100644
--- a/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp
+++ b/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp
@@ -92,6 +92,10 @@ CIrrDeviceSDL::CIrrDeviceSDL(const SIrrlichtCreationParameters& param)
// Switch SDL disables this hint by default: https://github.com/devkitPro/SDL/pull/55#issuecomment-633775255
SDL_SetHint(SDL_HINT_TOUCH_MOUSE_EVENTS, "1");
+#ifdef ANDROID
+ SDL_SetHint(SDL_HINT_ORIENTATIONS, "LandscapeLeft LandscapeRight");
+#endif
+
#ifndef MOBILE_STK
// Prevent fullscreen minimizes when losing focus
if (CreationParams.Fullscreen)
diff --git a/lib/simd_wrapper/simde/README.md b/lib/simd_wrapper/simde/README.md
index 52278c866a1..303893a01bc 100644
--- a/lib/simd_wrapper/simde/README.md
+++ b/lib/simd_wrapper/simde/README.md
@@ -1,10 +1,513 @@
-# SIMDe Without Test Cases
+# SIMD Everywhere
+
+[![All Contributors](https://img.shields.io/badge/all_contributors-73-orange.svg?style=flat-square)](#contributors-)
+
+[![Chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://matrix.to/#/#simd-everywhere_community:gitter.im)
+[![codecov](https://codecov.io/gh/simd-everywhere/simde/branch/master/graph/badge.svg?token=jcMBoRk0ui)](https://codecov.io/gh/simd-everywhere/simde)
-This repository contains only the core of
-[SIMDe](https://github.com/simd-everywhere/simde).
-It is generated automatically for every commit to master, and is
-intended to be used as a submodule in projects which don't want to
-include the (rather large) test cases.
+The SIMDe header-only library provides fast, portable implementations of
+[SIMD intrinsics](https://en.wikipedia.org/wiki/SIMD) on hardware which
+doesn't natively support them, such as calling [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions)
+functions on ARM. There is no performance penalty if the hardware
+supports the native implementation (*e.g.*, SSE/[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions)
+runs at full speed on [x86](https://en.wikipedia.org/wiki/X86),
+[NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)) on [ARM](https://en.wikipedia.org/wiki/ARM_architecture),
+*etc.*).
-All development work happens in the main repository, please do not
-file issues or create pull requests against this repository.
+This makes porting code to other architectures much easier in a few
+key ways:
+
+First, instead of forcing you to rewrite everything for each
+architecture, SIMDe lets you get a port up and running almost
+effortlessly. You can then start working on switching the most
+performance-critical sections to native intrinsics, improving
+performance gradually. SIMDe lets (for example) SSE/AVX and NEON code
+exist side-by-side, in the same implementation.
+
+Second, SIMDe makes it easier to write code targeting [ISA](https://en.wikipedia.org/wiki/Instruction_set_architecture)
+extensions you don't have convenient access to. You can run NEON code on your
+x86 machine *without an emulator*. Obviously you'll eventually want
+to test on the actual hardware you're targeting, but for most
+development, SIMDe can provide a much easier path.
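+
+For example, here is a minimal sketch of NEON code that also builds on
+x86 via SIMDe (the include path assumes SIMDe is vendored so that
+`simde/arm/neon.h` resolves):
+
+```c
+#include <stdint.h>
+#include <stdio.h>
+#include "simde/arm/neon.h"
+
+int main(void) {
+  int32_t a[4] = { 1,  2,  3,  4};
+  int32_t b[4] = {10, 20, 30, 40};
+  int32_t r[4];
+  simde_int32x4_t va = simde_vld1q_s32(a);
+  simde_int32x4_t vb = simde_vld1q_s32(b);
+  simde_vst1q_s32(r, simde_vaddq_s32(va, vb)); /* lane-wise add */
+  for (int i = 0; i < 4; i++)
+    printf("%d ", r[i]);                       /* 11 22 33 44 */
+  printf("\n");
+  return 0;
+}
+```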
+
+SIMDe takes a very different approach from most other SIMD abstraction
+layers in that it aims to expose the entire functionality of the
+underlying instruction set. Instead of limiting functionality to the
+lowest common denominator, SIMDe tries to minimize the amount of
+effort required to port while still allowing you the space to optimize
+as needed.
+
+The current focus is on writing complete portable implementations,
+though a large number of functions already have accelerated
+implementations using one (or more) of the following:
+
+ * SIMD intrinsics from other ISA extensions (e.g., using NEON to
+ implement SSE).
+ * Compiler-specific vector extensions and built-ins such as
+ [`__builtin_shufflevector`](http://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-shufflevector)
+ and
+ [`__builtin_convertvector`](http://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-convertvector)
+ * Compiler auto-vectorization hints, using:
+ * [OpenMP 4 SIMD](http://www.openmp.org/)
+ * [Cilk Plus](https://www.cilkplus.org/)
+ * [GCC loop-specific pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html)
+ * [clang pragma loop hint directives](http://llvm.org/docs/Vectorizers.html#pragma-loop-hint-directives)
+
+You can [try SIMDe online](https://simde.netlify.app/godbolt/demo)
+using Compiler Explorer and an amalgamated SIMDe header.
+
+If you have any questions, please feel free to use the
+[issue tracker](https://github.com/simd-everywhere/simde/issues) or the
+[mailing list](https://groups.google.com/forum/#!forum/simde).
+
+## Current Status
+
+There are currently complete implementations of the following instruction
+set extensions:
+
+* ARM
+ * [NEON](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)) [List](https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon])
+* x86 / x86_64
+ * [MMX](https://en.wikipedia.org/wiki/MMX_(instruction_set)) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX)
+ * [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE)
+ * [SSE2](https://en.wikipedia.org/wiki/SSE2) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE2)
+ * [SSE3](https://en.wikipedia.org/wiki/SSE3) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE3)
+ * [SSSE3](https://en.wikipedia.org/wiki/SSSE3) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSSE3)
+ * [SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE4_1)
+ * [CRC32](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&othertechs=CRC32)
+ * [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX)
+ * [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX2)
+ * [F16C](https://en.wikipedia.org/wiki/F16C) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C)
+ * [FMA](https://en.wikipedia.org/wiki/FMA_instruction_set) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=FMA)
+ * [GFNI](https://en.wikipedia.org/wiki/AVX-512#GFNI) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=GFNI)
+ * [XOP](https://en.wikipedia.org/wiki/XOP_instruction_set)
+ * [SVML](https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions-512-intel-avx-512-instructions/intrinsics-for-arithmetic-operations-1/intrinsics-for-short-vector-math-library-svml-operations.html) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=770&techs=SVML)
+ * [AVX512VPOPCNTDQ](https://en.wikipedia.org/wiki/AVX-512#VPOPCNTDQ_and_BITALG) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512VPOPCNTDQ)
+ * [AVX512_BITALG](https://en.wikipedia.org/wiki/AVX-512#VPOPCNTDQ_and_BITALG) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_BITALG)
+ * [AVX512_VBMI](https://en.wikipedia.org/wiki/AVX-512#Permute) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_VBMI)
+ * [AVX512_VNNI](https://en.wikipedia.org/wiki/AVX-512#VNNI) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_VNNI)
+ * [AVX512_VP2INTERSECT](https://en.wikipedia.org/wiki/AVX-512#VP2INTERSECT) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_VP2INTERSECT)
+ * [VPCLMULQDQ](https://en.wikipedia.org/wiki/AVX-512#VPCLMULQDQ) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=VPCLMULQDQ)
+* WebAssembly
+ * [SIMD128](https://github.com/WebAssembly/simd)
+
+As well as partial support for many others, including AES-ni, [CLMUL](https://en.wikipedia.org/wiki/CLMUL_instruction_set), SSE4.2, SVE, [MSA](https://en.wikipedia.org/wiki/MIPS_architecture#Application-specific_extensions) in
+addition to several AVX-512 extensions. See the
+[instruction-set-support](https://github.com/simd-everywhere/simde/issues?q=is%3Aissue+is%3Aopen+label%3Ainstruction-set-support+sort%3Aupdated-desc)
+label in the issue tracker for details on progress. If you'd like to
+be notified when an instruction set is available you may subscribe to
+the relevant issue.
+
+If you have a project you're interested in using with SIMDe but we
+don't yet support all the functions you need, please file an issue
+with a list of what's missing so we know what to prioritize.
+
+The default branch is protected so commits never reach it unless
+they have passed extensive CI checks. Status badges don't really
+make sense since they will always be green, but here are the links:
+
+* [GitHub Actions](https://github.com/simd-everywhere/simde/actions)
+* [Cirrus CI](https://cirrus-ci.com/github/simd-everywhere/simde)
+* [Semaphore CI](https://nemequ.semaphoreci.com/projects/simde)
+* [Circle CI](https://app.circleci.com/pipelines/github/simd-everywhere/simde)
+* [AppVeyor](https://ci.appveyor.com/project/nemequ/simde)
+* [Azure Pipelines](https://dev.azure.com/simd-everywhere/SIMDe/_build)
+* [Drone CI](https://cloud.drone.io/simd-everywhere/simde/)
+* [Travis CI](https://app.travis-ci.com/github/simd-everywhere/simde/)
+* [Packit CI](https://dashboard.packit.dev/projects/github.com/simd-everywhere/simde)
+
+If you're adding a new build I suggest Cirrus CI, which is where we
+currently have the most room given the number of builds on
+the platform and the quotas for free/open-source usage. Alternately,
+feel free to set up another provider (such as
+[Codefresh](https://codefresh.io/),
+[Shippable](https://www.shippable.com/),
+[Bitrise](https://www.bitrise.io/),
+[Wercker](https://app.wercker.com/), etc.).
+
+*Notice*: we plan on changing the name of the default branch from
+"master" to something else soon; we are just trying to wait to see what
+name git settles on so we can be consistent.
+
+## Contributing
+
+First off, if you're reading this: thank you! Even considering
+contributing to SIMDe is very much appreciated!
+
+SIMDe is a fairly large undertaking; there are a *lot* of functions to
+get through and a lot of opportunities for optimization on different
+platforms, so we're very happy for any help you can provide.
+
+Programmers of all skill levels are welcome; there are lots of tasks
+which are pretty straightforward and don't require any special
+expertise.
+
+If you're not sure how you'd like to contribute, please consider taking
+a look at [the issue tracker](https://github.com/simd-everywhere/simde/issues).
+There is a [good first issue](https://github.com/simd-everywhere/simde/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
+tag if you want to ease into your first contributions, but if you're
+interested in something else please get in touch via the issue tracker;
+we're happy to help you get a handle on whatever you are interested in.
+
+If you're interested in implementing currently unimplemented functions,
+there is [a
+guide](https://github.com/simd-everywhere/simde/wiki/Implementing-a-New-Function)
+explaining how to add new functions and how to quickly and easily get
+a test case in place. It's a bit rough right now, but if anything is
+unclear please feel free to use the issue tracker to ask about
+anything you're not clear on.
+
+## Usage
+
+First, it is important to note that *you do not need two separate
+versions* (one using SIMDe, the other native). If the native functions
+are available SIMDe will use them, and compilers easily optimize away
+any overhead from SIMDe; all they have to do is some basic inlining.
+`-O2` should be enough, but we strongly recommend `-O3` (or whatever
+flag instructs your compiler to aggressively optimize) since many of
+the portable fallbacks are substantially faster with aggressive
+auto-vectorization that isn't enabled at lower optimization levels.
+
+Each instruction set has a separate file; `x86/mmx.h` for MMX,
+`x86/sse.h` for SSE, `x86/sse2.h` for SSE2, and so on. Just include
+the header for whichever instruction set(s) you want *instead of the
+native version* (if you include the native version after SIMDe it will
+result in compile-time errors if native aliases are enabled). SIMDe
+will provide the fastest implementation it can given which extensions
+you've enabled in your compiler (i.e., if you want to use NEON to
+implement SSE, you may need to pass something like `-mfpu=neon`
+or `-march=armv8-a+simd`. See
+[GCC ARM-Options](https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html)
+for more information).
+
+If you define `SIMDE_ENABLE_NATIVE_ALIASES` before including SIMDe
+you can use the same names as the native functions. Unfortunately,
+this is somewhat error-prone due to portability issues in the APIs, so
+it's recommended to only do this for testing. When
+`SIMDE_ENABLE_NATIVE_ALIASES` is undefined only the versions prefixed
+with `simde_` will be available; for example, the MMX `_mm_add_pi8`
+intrinsic becomes `simde_mm_add_pi8`, and `__m64` becomes `simde__m64`.
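+
+For example, a minimal sketch of both spellings (assuming SIMDe is
+vendored so that `simde/x86/mmx.h` resolves):
+
+```c
+#define SIMDE_ENABLE_NATIVE_ALIASES /* must come before the include */
+#include <stdint.h>
+#include <string.h>
+#include "simde/x86/mmx.h"
+
+void add8(const int8_t a[8], const int8_t b[8], int8_t out[8]) {
+  simde__m64 va, vb, r;
+  memcpy(&va, a, sizeof(va));
+  memcpy(&vb, b, sizeof(vb));
+  r = _mm_add_pi8(va, vb);  /* the Intel name, thanks to the macro */
+  /* always available, no macro needed: r = simde_mm_add_pi8(va, vb); */
+  memcpy(out, &r, sizeof(r));
+}
+```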
+
+Since SIMDe is meant to be portable, many functions which assume types
+are of a specific size have been altered to use fixed-width types
+instead. For example, Intel's APIs use `char` for signed 8-bit
+integers, but `char` on ARM is generally unsigned. SIMDe uses `int8_t`
+to make the API portable, but that means your code may require some
+minor changes (such as using `int8_t` instead of `char`) to work on
+other platforms.
+
+That said, the changes are usually quite minor. It's often enough to
+just use search and replace; manual changes are required pretty
+infrequently.
+
+### OpenMP 4 SIMD
+
+SIMDe makes extensive use of annotations to help the compiler vectorize
+code. By far the best annotations use the SIMD support built in to
+OpenMP 4, so if your compiler supports these annotations we strongly
+recommend you enable them.
+
+If you are already using OpenMP, SIMDe will automatically detect it
+using the `_OPENMP` macro and no further action is required.
+
+Some compilers allow you to enable OpenMP SIMD *without* enabling the
+full OpenMP. In such cases there is no runtime dependency on OpenMP
+and no runtime overhead; SIMDe will just be faster. Unfortunately,
+SIMDe has no way to detect such situations (the `_OPENMP` macro is not
+defined), so after enabling it in your compiler you'll need to define
+`SIMDE_ENABLE_OPENMP` (e.g., by passing `-DSIMDE_ENABLE_OPENMP`) to get
+SIMDe to output the relevant pragmas.
+
+Enabling OpenMP SIMD support varies by compiler:
+
+ * GCC 4.9+ and clang 6+ support a `-fopenmp-simd` command line flag.
+ * ICC supports a `-qopenmp-simd` command line flag.
+ * MCST's LCC enables OpenMP SIMD by default, so no flags are needed
+ (technically you don't even need to pass `-DSIMDE_ENABLE_OPENMP`).
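+
+For example, a sketch of the opt-in (build lines are illustrative):
+
+```c
+/* Build with OpenMP SIMD only, no OpenMP runtime dependency, e.g.:
+ *   gcc   -O3 -fopenmp-simd -DSIMDE_ENABLE_OPENMP demo.c
+ *   clang -O3 -fopenmp-simd -DSIMDE_ENABLE_OPENMP demo.c
+ * With full OpenMP (-fopenmp) no define is needed: SIMDe detects the
+ * _OPENMP macro automatically. */
+#include "simde/x86/sse2.h"
+```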
+
+We are not currently aware of any other compilers which allow you to
+enable OpenMP SIMD support without enabling full OpenMP (if you are
+please file an issue to let us know). You should determine whether you
+wish to enable full OpenMP support on a case-by-case basis, but it is
+likely that the overhead of linking to (but not using) the OpenMP
+runtime library will be dwarfed by the performance improvements from
+using the OpenMP SIMD annotations in SIMDe.
+
+If you choose not to use OpenMP SIMD, SIMDe also supports
+using [Cilk Plus](https://www.cilkplus.org/), [GCC loop-specific
+pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html),
+or [clang pragma loop hint
+directives](http://llvm.org/docs/Vectorizers.html#pragma-loop-hint-directives),
+though these are not nearly as effective as OpenMP SIMD and depending
+on them will likely result in less efficient code. All of these are
+detected automatically by SIMDe, so if they are enabled in your
+compiler nothing more is required.
+
+If for some reason you do not wish to enable OpenMP 4 SIMD support even
+though SIMDe detects it, you should define `SIMDE_DISABLE_OPENMP` prior
+to including SIMDe.
+
+## Portability
+
+### Compilers
+
+SIMDe does depend on some C99 features, though the subset supported by
+MSVC also works. While we do our best to make sure we provide optimized
+implementations where they are supported, SIMDe does contain portable
+fallbacks which are designed to work on any C99 compiler.
+
+Every commit is tested in CI on multiple compilers, platforms, and
+configurations, and our test coverage is extensive.
+Currently tested compilers include:
+
+ * GCC versions back to 4.8
+ * Clang versions back to 3.8
+ * Microsoft Visual Studio back to 12 (2013)
+ * IBM XL C/C++
+ * Intel C/C++ Compiler (ICC)
+
+I'm generally willing to accept patches to add support for other
+compilers, as long as they're not too disruptive, *especially* if we
+can get CI support going. If using one of our existing CI providers
+isn't an option then other CI platforms can be added.
+
+### Hardware
+
+The following architectures are tested in CI for every commit:
+
+ * x86_64/amd64
+ * x86
+ * AArch64
+ * ARMv8
+ * ARMv7 with VFPv3-D16 floating point
+ * ARMv5 EABI
+ * PPC64
+ * z/Architecture (with "-mzvector")
+ * MIPS Loongson 64
+ * RISC-V 64
+ * emscripten 32- & 64-bit; regular and relaxed
+
+We would love to add more, so patches are extremely welcome!
+
+## Related Projects
+
+ * The "builtins" module in
+ [portable-snippets](https://github.com/nemequ/portable-snippets)
+ does much the same thing, but for compiler-specific intrinsics
+ (think `__builtin_clz` and `_BitScanForward`), **not** SIMD
+ intrinsics.
+ * Intel offers an emulator, the [Intel® Software Development
+ Emulator](https://software.intel.com/en-us/articles/intel-software-development-emulator/)
+ which can be used to develop software which uses Intel intrinsics
+ without having to own hardware which supports them, though it
+ doesn't help for deployment.
+ * [Iris](https://github.com/AlexYaruki/iris) is the only other project
+ I'm aware of which is attempting to create portable implementations
+ like SIMDe. SIMDe is much further along on the Intel side, but Iris
+ looks to be in better shape on ARM. C++-only, Apache 2.0 license.
+ AFAICT there are no accelerated fallbacks, nor is there a good way to
+ add them since it relies extensively on templates.
+ * There are a few projects trying to implement one set with another:
+ * [ARM_NEON_2_x86_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE)
+ — implementing NEON using SSE. Quite extensive, Apache 2.0
+ license.
+ * [sse2neon](https://github.com/jratcliff63367/sse2neon) —
+ implementing SSE using NEON. This code has already been merged
+ into SIMDe.
+ * [veclib](https://github.com/IvantheDugtrio/veclib) — implementing
+ SSE2 using AltiVec/VMX, using a non-free IBM library called
+ [powerveclib](https://www.ibm.com/developerworks/community/groups/community/powerveclib/)
+ * [SSE-to-NEON](https://github.com/otim/SSE-to-NEON) — implementing
+ SSE with NEON. Non-free, C++.
+ * [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon) — Popular
+ AVX+ intrinsics implemented in NEON. C, Apache 2.0 license.
+ * [neon2rvv](https://github.com/howjmay/neon2rvv) - A C/C++ header file that converts Arm/Aarch64 NEON intrinsics to RISC-V Vector (RVV) Extension, MIT license
+ * [sse2rvv](https://github.com/pattonkan/sse2rvv) - A C/C++ header file that converts Intel SSE intrinsics to RISC-V Vector (RVV) Extension intrinsics, MIT license.
+ * [arm-neon-tests](https://github.com/christophe-lyon/arm-neon-tests)
+ contains tests to verify NEON implementations.
+
+If you know of any other related projects, please [let us
+know](https://github.com/simd-everywhere/simde/issues/new)!
+
+## Caveats
+
+Sometimes features can't be emulated. If SIMDe is operating in native
+mode the functions will work as expected, but if there is no native
+support some caveats apply:
+
+ * Many functions require `<math.h>` and/or `<fenv.h>`. SIMDe will still
+ work without those headers, but the results of those functions are
+ undefined.
+ * x86 / x86_64
+ * SSE
+ * `SIMDE_MM_SET_ROUNDING_MODE()` will use `fesetround()`, altering
+ the global rounding mode.
+ * `simde_mm_getcsr` and `simde_mm_setcsr` only implement bits 13
+ and 14 (rounding mode).
+ * AVX
+ * `simde_mm256_test*` do not set the CF/ZF registers as there is
+ no portable way to implement that functionality.
+ * `simde_mm256_zeroall` and `simde_mm256_zeroupper` are not
+ implemented as there is no portable way to implement that
+ functionality.
+
+Additionally, there are some known limitations which apply when using
+native aliases (`SIMDE_ENABLE_NATIVE_ALIASES`):
+
+* On Windows x86 (but not x86_64), some MMX functions and SSE/SSE2
+ functions which use MMX types (__m64) other than for pointers may
+ return incorrect results.
+
+Also, as mentioned earlier, while some APIs make assumptions about
+basic types (*e.g.*, `int` is 32 bits), SIMDe does not, so many types
+have been altered to use portable fixed-width versions such as
+`int32_t`.
+
+If you find any other differences, please file an issue so we can either fix
+it or add it to the list above.
+
+## Benefactors
+
+SIMDe uses resources provided for free by a number of organizations.
+While this shouldn't be taken to imply endorsement of SIMDe, we're
+tremendously grateful for their support:
+
+ * [IntegriCloud](https://integricloud.com/) — provides access to a very
+ fast POWER9 server for developing AltiVec/VMX support.
+ * [GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm) — provides
+ access to a wide range of machines with different architectures for
+ developing support for various ISA extensions.
+ * [CodeCov.io](https://codecov.io/) — provides code coverage analysis
+ for our test cases.
+ * [Google](https://www.google.com/) — financing
+ [Summer of Code](https://summerofcode.withgoogle.com/), substantial
+ amounts of code (Sean Maher's contributions), and an [Open Source Peer
+ Bonus](https://opensource.google/docs/growing/peer-bonus/).
+
+Without such organizations donating resources, SIMDe wouldn't be nearly
+as useful or usable as it is today.
+
+We would also like to thank anyone who has helped develop the myriad
+of software on which SIMDe relies, including compilers and analysis
+tools.
+
+Finally, a special thank you to
+[anyone who has contributed](https://github.com/simd-everywhere/simde/graphs/contributors)
+to SIMDe, filed bugs, provided suggestions, or helped with SIMDe
+development in any way.
+
+## License
+
+SIMDe is distributed under an MIT-style license; see COPYING for
+details.
+
+## Contributors ✨
+
+Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
+
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind are welcome!
diff --git a/lib/simd_wrapper/simde/arm/neon.h b/lib/simd_wrapper/simde/arm/neon.h
index df91b0d9334..5835db69642 100644
--- a/lib/simd_wrapper/simde/arm/neon.h
+++ b/lib/simd_wrapper/simde/arm/neon.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_H)
@@ -30,23 +31,32 @@
#include "neon/types.h"
#include "neon/aba.h"
+#include "neon/abal.h"
+#include "neon/abal_high.h"
#include "neon/abd.h"
#include "neon/abdl.h"
+#include "neon/abdl_high.h"
#include "neon/abs.h"
#include "neon/add.h"
#include "neon/addhn.h"
+#include "neon/addhn_high.h"
#include "neon/addl.h"
#include "neon/addlv.h"
#include "neon/addl_high.h"
#include "neon/addv.h"
#include "neon/addw.h"
#include "neon/addw_high.h"
+#include "neon/aes.h"
#include "neon/and.h"
#include "neon/bcax.h"
#include "neon/bic.h"
#include "neon/bsl.h"
+#include "neon/cadd_rot270.h"
+#include "neon/cadd_rot90.h"
#include "neon/cage.h"
#include "neon/cagt.h"
+#include "neon/cale.h"
+#include "neon/calt.h"
#include "neon/ceq.h"
#include "neon/ceqz.h"
#include "neon/cge.h"
@@ -60,13 +70,24 @@
#include "neon/cltz.h"
#include "neon/clz.h"
#include "neon/cmla.h"
-#include "neon/cmla_rot90.h"
+#include "neon/cmla_lane.h"
#include "neon/cmla_rot180.h"
+#include "neon/cmla_rot180_lane.h"
#include "neon/cmla_rot270.h"
+#include "neon/cmla_rot270_lane.h"
+#include "neon/cmla_rot90.h"
+#include "neon/cmla_rot90_lane.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
+#include "neon/cvt_n.h"
+#include "neon/cvtm.h"
+#include "neon/cvtn.h"
+#include "neon/cvtp.h"
#include "neon/combine.h"
+#include "neon/copy_lane.h"
+#include "neon/crc32.h"
#include "neon/create.h"
+#include "neon/div.h"
#include "neon/dot.h"
#include "neon/dot_lane.h"
#include "neon/dup_lane.h"
@@ -76,6 +97,11 @@
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
+#include "neon/fmlal.h"
+#include "neon/fmlsl.h"
+#include "neon/fms.h"
+#include "neon/fms_lane.h"
+#include "neon/fms_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
@@ -84,30 +110,48 @@
#include "neon/ld1.h"
#include "neon/ld1_dup.h"
#include "neon/ld1_lane.h"
+#include "neon/ld1_x2.h"
+#include "neon/ld1_x3.h"
+#include "neon/ld1_x4.h"
+#include "neon/ld1q_x2.h"
+#include "neon/ld1q_x3.h"
+#include "neon/ld1q_x4.h"
#include "neon/ld2.h"
+#include "neon/ld2_dup.h"
+#include "neon/ld2_lane.h"
#include "neon/ld3.h"
+#include "neon/ld3_dup.h"
+#include "neon/ld3_lane.h"
#include "neon/ld4.h"
+#include "neon/ld4_dup.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
+#include "neon/maxnmv.h"
#include "neon/maxv.h"
#include "neon/min.h"
#include "neon/minnm.h"
+#include "neon/minnmv.h"
#include "neon/minv.h"
#include "neon/mla.h"
+#include "neon/mla_lane.h"
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
+#include "neon/mlal_high_lane.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
+#include "neon/mls_lane.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
+#include "neon/mlsl_high_lane.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
+#include "neon/mmlaq.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
#include "neon/movn.h"
@@ -117,8 +161,13 @@
#include "neon/mul_n.h"
#include "neon/mull.h"
#include "neon/mull_high.h"
+#include "neon/mull_high_lane.h"
+#include "neon/mull_high_n.h"
#include "neon/mull_lane.h"
#include "neon/mull_n.h"
+#include "neon/mulx.h"
+#include "neon/mulx_lane.h"
+#include "neon/mulx_n.h"
#include "neon/mvn.h"
#include "neon/neg.h"
#include "neon/orn.h"
@@ -127,59 +176,117 @@
#include "neon/padd.h"
#include "neon/paddl.h"
#include "neon/pmax.h"
+#include "neon/pmaxnm.h"
#include "neon/pmin.h"
+#include "neon/pminnm.h"
#include "neon/qabs.h"
#include "neon/qadd.h"
+#include "neon/qdmlal.h"
+#include "neon/qdmlal_high.h"
+#include "neon/qdmlal_high_lane.h"
+#include "neon/qdmlal_high_n.h"
+#include "neon/qdmlal_lane.h"
+#include "neon/qdmlal_n.h"
+#include "neon/qdmlsl.h"
+#include "neon/qdmlsl_high.h"
+#include "neon/qdmlsl_high_lane.h"
+#include "neon/qdmlsl_high_n.h"
+#include "neon/qdmlsl_lane.h"
+#include "neon/qdmlsl_n.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
+#include "neon/qdmull_high.h"
+#include "neon/qdmull_high_lane.h"
+#include "neon/qdmull_high_n.h"
+#include "neon/qdmull_lane.h"
+#include "neon/qdmull_n.h"
+#include "neon/qrdmlah.h"
+#include "neon/qrdmlah_lane.h"
+#include "neon/qrdmlsh.h"
+#include "neon/qrdmlsh_lane.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
+#include "neon/qrshl.h"
+#include "neon/qrshrn_high_n.h"
#include "neon/qrshrn_n.h"
+#include "neon/qrshrun_high_n.h"
#include "neon/qrshrun_n.h"
#include "neon/qmovn.h"
-#include "neon/qmovun.h"
#include "neon/qmovn_high.h"
+#include "neon/qmovun.h"
+#include "neon/qmovun_high.h"
#include "neon/qneg.h"
#include "neon/qsub.h"
#include "neon/qshl.h"
+#include "neon/qshl_n.h"
#include "neon/qshlu_n.h"
+#include "neon/qshrn_high_n.h"
#include "neon/qshrn_n.h"
+#include "neon/qshrun_high_n.h"
#include "neon/qshrun_n.h"
#include "neon/qtbl.h"
#include "neon/qtbx.h"
+#include "neon/raddhn.h"
+#include "neon/raddhn_high.h"
+#include "neon/rax.h"
#include "neon/rbit.h"
#include "neon/recpe.h"
#include "neon/recps.h"
+#include "neon/recpx.h"
#include "neon/reinterpret.h"
#include "neon/rev16.h"
#include "neon/rev32.h"
#include "neon/rev64.h"
#include "neon/rhadd.h"
#include "neon/rnd.h"
+#include "neon/rnd32x.h"
+#include "neon/rnd32z.h"
+#include "neon/rnd64x.h"
+#include "neon/rnd64z.h"
+#include "neon/rnda.h"
#include "neon/rndm.h"
#include "neon/rndi.h"
#include "neon/rndn.h"
#include "neon/rndp.h"
+#include "neon/rndx.h"
#include "neon/rshl.h"
#include "neon/rshr_n.h"
+#include "neon/rshrn_high_n.h"
#include "neon/rshrn_n.h"
#include "neon/rsqrte.h"
#include "neon/rsqrts.h"
#include "neon/rsra_n.h"
+#include "neon/rsubhn.h"
+#include "neon/rsubhn_high.h"
#include "neon/set_lane.h"
+#include "neon/sha1.h"
+#include "neon/sha256.h"
+#include "neon/sha512.h"
#include "neon/shl.h"
#include "neon/shl_n.h"
+#include "neon/shll_high_n.h"
#include "neon/shll_n.h"
#include "neon/shr_n.h"
+#include "neon/shrn_high_n.h"
#include "neon/shrn_n.h"
+#include "neon/sli_n.h"
+#include "neon/sm3.h"
+#include "neon/sm4.h"
#include "neon/sqadd.h"
+#include "neon/sqrt.h"
#include "neon/sra_n.h"
#include "neon/sri_n.h"
#include "neon/st1.h"
#include "neon/st1_lane.h"
+#include "neon/st1_x2.h"
+#include "neon/st1_x3.h"
+#include "neon/st1_x4.h"
+#include "neon/st1q_x2.h"
+#include "neon/st1q_x3.h"
+#include "neon/st1q_x4.h"
#include "neon/st2.h"
#include "neon/st2_lane.h"
#include "neon/st3.h"
@@ -188,10 +295,12 @@
#include "neon/st4_lane.h"
#include "neon/sub.h"
#include "neon/subhn.h"
+#include "neon/subhn_high.h"
#include "neon/subl.h"
#include "neon/subl_high.h"
#include "neon/subw.h"
#include "neon/subw_high.h"
+#include "neon/sudot_lane.h"
#include "neon/tbl.h"
#include "neon/tbx.h"
#include "neon/trn.h"
@@ -199,6 +308,8 @@
#include "neon/trn2.h"
#include "neon/tst.h"
#include "neon/uqadd.h"
+#include "neon/usdot.h"
+#include "neon/usdot_lane.h"
#include "neon/uzp.h"
#include "neon/uzp1.h"
#include "neon/uzp2.h"
diff --git a/lib/simd_wrapper/simde/arm/neon/abal.h b/lib/simd_wrapper/simde/arm/neon/abal.h
new file mode 100644
index 00000000000..7e5093d37ec
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/abal.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_ABAL_H)
+#define SIMDE_ARM_NEON_ABAL_H
+
+#include "abdl.h"
+#include "add.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vabal_s8(a, b, c);
+ #else
+ return simde_vaddq_s16(simde_vabdl_s8(b, c), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vabal_s8
+ #define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vabal_s16(a, b, c);
+ #else
+ return simde_vaddq_s32(simde_vabdl_s16(b, c), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vabal_s16
+ #define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vabal_s32(a, b, c);
+ #else
+ return simde_vaddq_s64(simde_vabdl_s32(b, c), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vabal_s32
+ #define vabal_s32(a, b, c) simde_vabal_s32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vabal_u8(a, b, c);
+ #else
+ return simde_vaddq_u16(simde_vabdl_u8(b, c), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vabal_u8
+ #define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vabal_u16(a, b, c);
+ #else
+ return simde_vaddq_u32(simde_vabdl_u16(b, c), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vabal_u16
+ #define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vabal_u32(a, b, c);
+ #else
+ return simde_vaddq_u64(simde_vabdl_u32(b, c), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vabal_u32
+ #define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c))
+#endif
+
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_ABAL_H) */
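
For reference, `vabal` is a widening absolute-difference-accumulate:
`r[i] = a[i] + |b[i] - c[i]|`, with the 8-bit differences widened to 16
bits before the add. A minimal sketch exercising the function above
(the include path assumes a vendored SIMDe):

```c
#include <stdint.h>
#include <stdio.h>
#include "simde/arm/neon.h"

int main(void) {
  int8_t b[8] = {0, 10, -10, 100, -100, 127, -128, 5};
  int8_t c[8] = {0, -10, 10, -100, 100, -128, 127, 5};
  int16_t out[8];
  simde_int16x8_t acc = simde_vdupq_n_s16(1); /* all accumulator lanes = 1 */
  simde_int16x8_t r =
      simde_vabal_s8(acc, simde_vld1_s8(b), simde_vld1_s8(c));
  simde_vst1q_s16(out, r);
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]); /* 1 21 21 201 201 256 256 1 */
  printf("\n");
  return 0;
}
```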
diff --git a/lib/simd_wrapper/simde/arm/neon/abal_high.h b/lib/simd_wrapper/simde/arm/neon/abal_high.h
new file mode 100644
index 00000000000..78f538dc410
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/abal_high.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_ABAL_HIGH_H)
+#define SIMDE_ARM_NEON_ABAL_HIGH_H
+
+#include "abdl.h"
+#include "add.h"
+#include "movl_high.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vabal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabal_high_s8(a, b, c);
+ #else
+ return simde_vaddq_s16(simde_vabdl_s8(simde_vget_high_s8(b), simde_vget_high_s8(c)), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabal_high_s8
+ #define vabal_high_s8(a, b, c) simde_vabal_high_s8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vabal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabal_high_s16(a, b, c);
+ #else
+ return simde_vaddq_s32(simde_vabdl_s16(simde_vget_high_s16(b), simde_vget_high_s16(c)), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabal_high_s16
+ #define vabal_high_s16(a, b, c) simde_vabal_high_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vabal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabal_high_s32(a, b, c);
+ #else
+ return simde_vaddq_s64(simde_vabdl_s32(simde_vget_high_s32(b), simde_vget_high_s32(c)), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabal_high_s32
+ #define vabal_high_s32(a, b, c) simde_vabal_high_s32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vabal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabal_high_u8(a, b, c);
+ #else
+ return simde_vaddq_u16(simde_vabdl_u8(simde_vget_high_u8(b), simde_vget_high_u8(c)), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabal_high_u8
+ #define vabal_high_u8(a, b, c) simde_vabal_high_u8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vabal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabal_high_u16(a, b, c);
+ #else
+ return simde_vaddq_u32(simde_vabdl_u16(simde_vget_high_u16(b), simde_vget_high_u16(c)), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabal_high_u16
+ #define vabal_high_u16(a, b, c) simde_vabal_high_u16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vabal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabal_high_u32(a, b, c);
+ #else
+ return simde_vaddq_u64(simde_vabdl_u32(simde_vget_high_u32(b), simde_vget_high_u32(c)), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabal_high_u32
+ #define vabal_high_u32(a, b, c) simde_vabal_high_u32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_ABAL_HIGH_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/abd.h b/lib/simd_wrapper/simde/arm/neon/abd.h
index 0a814e8d97f..fdb1131adc9 100644
--- a/lib/simd_wrapper/simde/arm/neon/abd.h
+++ b/lib/simd_wrapper/simde/arm/neon/abd.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_ABD_H)
@@ -37,6 +38,23 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vabdh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vabdh_f16(a, b);
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+ simde_float32_t r_ = a_ - b_;
+ return r_ < 0 ? simde_float16_from_float32(-r_) : simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdh_f16
+ #define vabdh_f16(a, b) simde_vabdh_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vabds_f32(simde_float32_t a, simde_float32_t b) {
@@ -67,6 +85,20 @@ simde_vabdd_f64(simde_float64_t a, simde_float64_t b) {
#define vabdd_f64(a, b) simde_vabdd_f64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vabd_f16(simde_float16x4_t a, simde_float16x4_t b) {
+  #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vabd_f16(a, b);
+ #else
+ return simde_vabs_f16(simde_vsub_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vabd_f16
+ #define vabd_f16(a, b) simde_vabd_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vabd_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -116,6 +148,15 @@ simde_vabd_s8(simde_int8x8_t a, simde_int8x8_t b) {
m
);
+ return simde_int8x8_from_private(r_);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int8x8_private r_, max_, min_;
+ simde_int8x8_private a_ = simde_int8x8_to_private(a);
+ simde_int8x8_private b_ = simde_int8x8_to_private(b);
+
+ max_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64, b_.sv64, 8);
+ min_.sv64 = __riscv_vmin_vv_i8m1(a_.sv64, b_.sv64, 8);
+ r_.sv64 = __riscv_vsub_vv_i8m1(max_.sv64, min_.sv64, 8);
return simde_int8x8_from_private(r_);
#else
return simde_vmovn_s16(simde_vabsq_s16(simde_vsubl_s8(a, b)));
@@ -139,6 +180,15 @@ simde_vabd_s16(simde_int16x4_t a, simde_int16x4_t b) {
r_.m64 = _mm_sub_pi16(_mm_max_pi16(a_.m64, b_.m64), _mm_min_pi16(a_.m64, b_.m64));
+ return simde_int16x4_from_private(r_);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int16x4_private r_, max_, min_;
+ simde_int16x4_private a_ = simde_int16x4_to_private(a);
+ simde_int16x4_private b_ = simde_int16x4_to_private(b);
+
+ max_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64, b_.sv64, 4);
+ min_.sv64 = __riscv_vmin_vv_i16m1(a_.sv64, b_.sv64, 4);
+ r_.sv64 = __riscv_vsub_vv_i16m1(max_.sv64, min_.sv64, 4);
return simde_int16x4_from_private(r_);
#else
return simde_vmovn_s32(simde_vabsq_s32(simde_vsubl_s16(a, b)));
@@ -154,6 +204,15 @@ simde_int32x2_t
simde_vabd_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s32(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int32x2_private r_, max_, min_;
+ simde_int32x2_private a_ = simde_int32x2_to_private(a);
+ simde_int32x2_private b_ = simde_int32x2_to_private(b);
+
+ max_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64, b_.sv64, 2);
+ min_.sv64 = __riscv_vmin_vv_i32m1(a_.sv64, b_.sv64, 2);
+ r_.sv64 = __riscv_vsub_vv_i32m1(max_.sv64, min_.sv64, 2);
+ return simde_int32x2_from_private(r_);
#else
return simde_vmovn_s64(simde_vabsq_s64(simde_vsubl_s32(a, b)));
#endif
@@ -168,6 +227,15 @@ simde_uint8x8_t
simde_vabd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_u8(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x8_private r_, max_, min_;
+ simde_uint8x8_private a_ = simde_uint8x8_to_private(a);
+ simde_uint8x8_private b_ = simde_uint8x8_to_private(b);
+
+ max_.sv64 = __riscv_vmaxu_vv_u8m1(a_.sv64, b_.sv64, 8);
+ min_.sv64 = __riscv_vminu_vv_u8m1(a_.sv64, b_.sv64, 8);
+ r_.sv64 = __riscv_vsub_vv_u8m1(max_.sv64, min_.sv64, 8);
+ return simde_uint8x8_from_private(r_);
#else
return simde_vmovn_u16(
simde_vreinterpretq_u16_s16(
@@ -187,6 +255,15 @@ simde_uint16x4_t
simde_vabd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_u16(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x4_private r_, max_, min_;
+ simde_uint16x4_private a_ = simde_uint16x4_to_private(a);
+ simde_uint16x4_private b_ = simde_uint16x4_to_private(b);
+
+ max_.sv64 = __riscv_vmaxu_vv_u16m1(a_.sv64, b_.sv64, 4);
+ min_.sv64 = __riscv_vminu_vv_u16m1(a_.sv64, b_.sv64, 4);
+ r_.sv64 = __riscv_vsub_vv_u16m1(max_.sv64, min_.sv64, 4);
+ return simde_uint16x4_from_private(r_);
#else
return simde_vmovn_u32(
simde_vreinterpretq_u32_s32(
@@ -206,6 +283,15 @@ simde_uint32x2_t
simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_u32(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x2_private r_, max_, min_;
+ simde_uint32x2_private a_ = simde_uint32x2_to_private(a);
+ simde_uint32x2_private b_ = simde_uint32x2_to_private(b);
+
+ max_.sv64 = __riscv_vmaxu_vv_u32m1(a_.sv64, b_.sv64, 2);
+ min_.sv64 = __riscv_vminu_vv_u32m1(a_.sv64, b_.sv64, 2);
+ r_.sv64 = __riscv_vsub_vv_u32m1(max_.sv64, min_.sv64, 2);
+ return simde_uint32x2_from_private(r_);
#else
return simde_vmovn_u64(
simde_vreinterpretq_u64_s64(
@@ -220,6 +306,20 @@ simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#define vabd_u32(a, b) simde_vabd_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vabdq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vabdq_f16(a, b);
+ #else
+ return simde_vabsq_f16(simde_vsubq_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdq_f16
+ #define vabdq_f16(a, b) simde_vabdq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vabdq_f32(simde_float32x4_t a, simde_float32x4_t b) {
@@ -277,6 +377,12 @@ simde_vabdq_s8(simde_int8x16_t a, simde_int8x16_t b) {
);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_i8x16_max(a_.v128, b_.v128), wasm_i8x16_min(a_.v128, b_.v128));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int8x16_private max_, min_;
+
+ max_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128, b_.sv128, 16);
+ min_.sv128 = __riscv_vmin_vv_i8m1(a_.sv128, b_.sv128, 16);
+ r_.sv128 = __riscv_vsub_vv_i8m1(max_.sv128, min_.sv128, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -313,6 +419,12 @@ simde_vabdq_s16(simde_int16x8_t a, simde_int16x8_t b) {
r_.m128i = _mm_sub_epi16(_mm_max_epi16(a_.m128i, b_.m128i), _mm_min_epi16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_i16x8_max(a_.v128, b_.v128), wasm_i16x8_min(a_.v128, b_.v128));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int16x8_private max_, min_;
+
+ max_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128, b_.sv128, 8);
+ min_.sv128 = __riscv_vmin_vv_i16m1(a_.sv128, b_.sv128, 8);
+ r_.sv128 = __riscv_vsub_vv_i16m1(max_.sv128, min_.sv128, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -348,6 +460,8 @@ simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epi32(a_.m128i, b_.m128i), _mm_min_epi32(a_.m128i, b_.m128i));
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ r_.v128 = wasm_i32x4_sub(wasm_i32x4_max(a_.v128, b_.v128), wasm_i32x4_min(a_.v128, b_.v128));
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi32(b_.m128i, a_.m128i);
r_.m128i =
@@ -358,6 +472,12 @@ simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) {
),
m
);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int32x4_private max_, min_;
+
+ max_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128, b_.sv128, 4);
+ min_.sv128 = __riscv_vmin_vv_i32m1(a_.sv128, b_.sv128, 4);
+ r_.sv128 = __riscv_vsub_vv_i32m1(max_.sv128, min_.sv128, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -395,6 +515,12 @@ simde_vabdq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
r_.m128i = _mm_sub_epi8(_mm_max_epu8(a_.m128i, b_.m128i), _mm_min_epu8(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_u8x16_max(a_.v128, b_.v128), wasm_u8x16_min(a_.v128, b_.v128));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x16_private max_, min_;
+
+ max_.sv128 = __riscv_vmaxu_vv_u8m1(a_.sv128, b_.sv128, 16);
+ min_.sv128 = __riscv_vminu_vv_u8m1(a_.sv128, b_.sv128, 16);
+ r_.sv128 = __riscv_vsub_vv_u8m1(max_.sv128, min_.sv128, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -432,6 +558,12 @@ simde_vabdq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
r_.m128i = _mm_sub_epi16(_mm_max_epu16(a_.m128i, b_.m128i), _mm_min_epu16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_u16x8_max(a_.v128, b_.v128), wasm_u16x8_min(a_.v128, b_.v128));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x8_private max_, min_;
+
+ max_.sv128 = __riscv_vmaxu_vv_u16m1(a_.sv128, b_.sv128, 8);
+ min_.sv128 = __riscv_vminu_vv_u16m1(a_.sv128, b_.sv128, 8);
+ r_.sv128 = __riscv_vsub_vv_u16m1(max_.sv128, min_.sv128, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -467,6 +599,14 @@ simde_vabdq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epu32(a_.m128i, b_.m128i), _mm_min_epu32(a_.m128i, b_.m128i));
+ #elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ r_.v128 = wasm_i32x4_sub(wasm_u32x4_max(a_.v128, b_.v128), wasm_u32x4_min(a_.v128, b_.v128));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x4_private max_, min_;
+
+ max_.sv128 = __riscv_vmaxu_vv_u32m1(a_.sv128, b_.sv128, 4);
+ min_.sv128 = __riscv_vminu_vv_u32m1(a_.sv128, b_.sv128, 4);
+ r_.sv128 = __riscv_vsub_vv_u32m1(max_.sv128, min_.sv128, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
diff --git a/lib/simd_wrapper/simde/arm/neon/abdl_high.h b/lib/simd_wrapper/simde/arm/neon/abdl_high.h
new file mode 100644
index 00000000000..826b1ba33dd
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/abdl_high.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H)
+#define SIMDE_ARM_NEON_ABDL_HIGH_H
+
+#include "abdl.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabdl_high_s8(a, b);
+ #else
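+ /* Widening absolute difference of the upper halves: vget_high to select them, then vabdl. */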
+ return simde_vabdl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdl_high_s8
+ #define vabdl_high_s8(a, b) simde_vabdl_high_s8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabdl_high_s16(a, b);
+ #else
+ return simde_vabdl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdl_high_s16
+ #define vabdl_high_s16(a, b) simde_vabdl_high_s16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabdl_high_s32(a, b);
+ #else
+ return simde_vabdl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdl_high_s32
+ #define vabdl_high_s32(a, b) simde_vabdl_high_s32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabdl_high_u8(a, b);
+ #else
+ return simde_vabdl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdl_high_u8
+ #define vabdl_high_u8(a, b) simde_vabdl_high_u8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabdl_high_u16(a, b);
+ #else
+ return simde_vabdl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdl_high_u16
+ #define vabdl_high_u16(a, b) simde_vabdl_high_u16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vabdl_high_u32(a, b);
+ #else
+ return simde_vabdl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vabdl_high_u32
+ #define vabdl_high_u32(a, b) simde_vabdl_high_u32((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/abs.h b/lib/simd_wrapper/simde/arm/neon/abs.h
index 3c705e98b4e..16250da787b 100644
--- a/lib/simd_wrapper/simde/arm/neon/abs.h
+++ b/lib/simd_wrapper/simde/arm/neon/abs.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_ABS_H)
@@ -47,6 +48,45 @@ simde_vabsd_s64(int64_t a) {
#define vabsd_s64(a) simde_vabsd_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vabsh_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vabsh_f16(a);
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+
+ return (a_ >= 0.0f) ? simde_float16_from_float32(a_) : simde_float16_from_float32(-a_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vabsh_f16
+ #define vabsh_f16(a) simde_vabsh_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vabs_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vabs_f16(a);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vabsh_f16(a_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vabs_f16
+ #define vabs_f16(a) simde_vabs_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vabs_f32(simde_float32x2_t a) {
@@ -211,6 +251,29 @@ simde_vabs_s64(simde_int64x1_t a) {
#define vabs_s64(a) simde_vabs_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vabsq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vabsq_f16(a);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vabsh_f16(a_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vabsq_f16
+ #define vabsq_f16(a) simde_vabsq_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vabsq_f32(simde_float32x4_t a) {
@@ -374,7 +437,7 @@ simde_vabsq_s32(simde_int32x4_t a) {
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
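+ /* Negate through uint32_t so INT32_MIN does not trigger signed-overflow UB. */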
+ r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int32_t, 0 - HEDLEY_STATIC_CAST(uint32_t, a_.values[i])) : a_.values[i];
}
#endif
@@ -413,7 +476,7 @@ simde_vabsq_s64(simde_int64x2_t a) {
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i];
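+ /* Same unsigned-negation trick, here for INT64_MIN. */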
+ r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int64_t, 0 - HEDLEY_STATIC_CAST(uint64_t, a_.values[i])) : a_.values[i];
}
#endif
diff --git a/lib/simd_wrapper/simde/arm/neon/add.h b/lib/simd_wrapper/simde/arm/neon/add.h
index d3660f66085..8b4fe3499ae 100644
--- a/lib/simd_wrapper/simde/arm/neon/add.h
+++ b/lib/simd_wrapper/simde/arm/neon/add.h
@@ -22,6 +22,8 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Ju-Hung Li (Copyright owned by NTHU pllab)
*/
#if !defined(SIMDE_ARM_NEON_ADD_H)
@@ -35,7 +37,7 @@ SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_float16
-simde_vaddh_f16(simde_float16 a, simde_float16 b) {
+simde_vaddh_f16(simde_float16_t a, simde_float16_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
return vaddh_f16(a, b);
#else
@@ -88,10 +90,14 @@ simde_vadd_f16(simde_float16x4_t a, simde_float16x4_t b) {
a_ = simde_float16x4_to_private(a),
b_ = simde_float16x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]);
- }
+ #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
+ r_.sv64 = __riscv_vfadd_vv_f16m1(a_.sv64, b_.sv64, 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]);
+ }
+ #endif
return simde_float16x4_from_private(r_);
#endif
@@ -112,7 +118,9 @@ simde_vadd_f32(simde_float32x2_t a, simde_float32x2_t b) {
a_ = simde_float32x2_to_private(a),
b_ = simde_float32x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfadd_vv_f32m1(a_.sv64, b_.sv64, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -140,7 +148,9 @@ simde_vadd_f64(simde_float64x1_t a, simde_float64x1_t b) {
a_ = simde_float64x1_to_private(a),
b_ = simde_float64x1_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfadd_vv_f64m1(a_.sv64, b_.sv64, 1);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -168,7 +178,9 @@ simde_vadd_s8(simde_int8x8_t a, simde_int8x8_t b) {
a_ = simde_int8x8_to_private(a),
b_ = simde_int8x8_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_i8m1(a_.sv64, b_.sv64, 8);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#elif defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_add_pi8(a_.m64, b_.m64);
@@ -198,7 +210,9 @@ simde_vadd_s16(simde_int16x4_t a, simde_int16x4_t b) {
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_i16m1(a_.sv64, b_.sv64, 4);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#elif defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_add_pi16(a_.m64, b_.m64);
@@ -228,7 +242,9 @@ simde_vadd_s32(simde_int32x2_t a, simde_int32x2_t b) {
a_ = simde_int32x2_to_private(a),
b_ = simde_int32x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_i32m1(a_.sv64, b_.sv64, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#elif defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_add_pi32(a_.m64, b_.m64);
@@ -258,7 +274,9 @@ simde_vadd_s64(simde_int64x1_t a, simde_int64x1_t b) {
a_ = simde_int64x1_to_private(a),
b_ = simde_int64x1_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_i64m1(a_.sv64, b_.sv64, 1);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -286,7 +304,9 @@ simde_vadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
a_ = simde_uint8x8_to_private(a),
b_ = simde_uint8x8_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_u8m1(a_.sv64, b_.sv64, 8);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -314,7 +334,10 @@ simde_vadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
a_ = simde_uint16x4_to_private(a),
b_ = simde_uint16x4_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_u16m1(a_.sv64, b_.sv64, 4);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -342,7 +365,9 @@ simde_vadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
a_ = simde_uint32x2_to_private(a),
b_ = simde_uint32x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_u32m1(a_.sv64, b_.sv64, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -370,7 +395,9 @@ simde_vadd_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
a_ = simde_uint64x1_to_private(a),
b_ = simde_uint64x1_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vadd_vv_u64m1(a_.sv64, b_.sv64, 1);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -397,10 +424,15 @@ simde_vaddq_f16(simde_float16x8_t a, simde_float16x8_t b) {
r_,
a_ = simde_float16x8_to_private(a),
b_ = simde_float16x8_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]);
- }
+
+ #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
+ r_.sv128 = __riscv_vfadd_vv_f16m1(a_.sv128, b_.sv128, 8);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]);
+ }
+ #endif
return simde_float16x8_from_private(r_);
#endif
@@ -431,6 +463,8 @@ simde_vaddq_f32(simde_float32x4_t a, simde_float32x4_t b) {
r_.m128 = _mm_add_ps(a_.m128, b_.m128);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f32x4_add(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfadd_vv_f32m1(a_.sv128, b_.sv128, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
@@ -465,6 +499,8 @@ simde_vaddq_f64(simde_float64x2_t a, simde_float64x2_t b) {
r_.m128d = _mm_add_pd(a_.m128d, b_.m128d);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f64x2_add(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfadd_vv_f64m1(a_.sv128, b_.sv128, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
@@ -499,6 +535,8 @@ simde_vaddq_s8(simde_int8x16_t a, simde_int8x16_t b) {
r_.m128i = _mm_add_epi8(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_add(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128, b_.sv128, 16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
@@ -533,6 +571,8 @@ simde_vaddq_s16(simde_int16x8_t a, simde_int16x8_t b) {
r_.m128i = _mm_add_epi16(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_add(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_i16m1(a_.sv128, b_.sv128, 8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
@@ -567,6 +607,8 @@ simde_vaddq_s32(simde_int32x4_t a, simde_int32x4_t b) {
r_.m128i = _mm_add_epi32(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i32x4_add(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_i32m1(a_.sv128, b_.sv128, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
@@ -601,6 +643,8 @@ simde_vaddq_s64(simde_int64x2_t a, simde_int64x2_t b) {
r_.m128i = _mm_add_epi64(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i64x2_add(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_i64m1(a_.sv128, b_.sv128, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
@@ -631,7 +675,9 @@ simde_vaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
a_ = simde_uint8x16_to_private(a),
b_ = simde_uint8x16_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_u8m1(a_.sv128, b_.sv128, 16);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -661,7 +707,9 @@ simde_vaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
a_ = simde_uint16x8_to_private(a),
b_ = simde_uint16x8_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_u16m1(a_.sv128, b_.sv128, 8);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -691,7 +739,9 @@ simde_vaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
a_ = simde_uint32x4_to_private(a),
b_ = simde_uint32x4_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_u32m1(a_.sv128, b_.sv128, 4);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -721,7 +771,9 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
a_ = simde_uint64x2_to_private(a),
b_ = simde_uint64x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vadd_vv_u64m1(a_.sv128, b_.sv128, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values + b_.values;
#else
SIMDE_VECTORIZE
@@ -738,6 +790,172 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vaddq_u64(a, b) simde_vaddq_u64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vadd_p8(simde_poly8x8_t a, simde_poly8x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H)
+ return vadd_p8(a, b);
+ #else
+ simde_poly8x8_private
+ r_,
+ a_ = simde_poly8x8_to_private(a),
+ b_ = simde_poly8x8_to_private(b);
+
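+ /* Polynomial addition over GF(2) is XOR; the (0 ^ a) & mask form mirrors the Arm pseudocode and reduces to a ^ b. */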
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFF);
+ }
+
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vadd_p8
+ #define vadd_p8(a, b) simde_vadd_p8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vadd_p16(simde_poly16x4_t a, simde_poly16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H)
+ return vadd_p16(a, b);
+ #else
+ simde_poly16x4_private
+ r_,
+ a_ = simde_poly16x4_to_private(a),
+ b_ = simde_poly16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFF);
+ }
+
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vadd_p16
+ #define vadd_p16(a, b) simde_vadd_p16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vadd_p64(simde_poly64x1_t a, simde_poly64x1_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \
+ !defined(_GCC_ARM_NEON_H)
+ return vadd_p64(a, b);
+ #else
+ simde_poly64x1_private
+ r_,
+ a_ = simde_poly64x1_to_private(a),
+ b_ = simde_poly64x1_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFFFFFFFFFFFFFF);
+ }
+
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vadd_p64
+ #define vadd_p64(a, b) simde_vadd_p64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vaddq_p8(simde_poly8x16_t a, simde_poly8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H)
+ return vaddq_p8(a, b);
+ #else
+ simde_poly8x16_private
+ r_,
+ a_ = simde_poly8x16_to_private(a),
+ b_ = simde_poly8x16_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFF);
+ }
+
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vaddq_p8
+ #define vaddq_p8(a, b) simde_vaddq_p8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vaddq_p16(simde_poly16x8_t a, simde_poly16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H)
+ return vaddq_p16(a, b);
+ #else
+ simde_poly16x8_private
+ r_,
+ a_ = simde_poly16x8_to_private(a),
+ b_ = simde_poly16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFF);
+ }
+
+ return simde_poly16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vaddq_p16
+ #define vaddq_p16(a, b) simde_vaddq_p16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vaddq_p64(simde_poly64x2_t a, simde_poly64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \
+ !defined(_GCC_ARM_NEON_H)
+ return vaddq_p64(a, b);
+ #else
+ simde_poly64x2_private
+ r_,
+ a_ = simde_poly64x2_to_private(a),
+ b_ = simde_poly64x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFFFFFFFFFFFFFF);
+ }
+
+ return simde_poly64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddq_p64
+ #define vaddq_p64(a, b) simde_vaddq_p64((a), (b))
+#endif
+
+#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly128_t
+simde_vaddq_p128(simde_poly128_t a, simde_poly128_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \
+ !defined(_GCC_ARM_NEON_H)
+ return vaddq_p128(a, b);
+ #else
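+ /* Build an all-ones 128-bit mask in two 64-bit halves; the expression reduces to a ^ b. */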
+ simde_poly128_t mask = 0xFFFFFFFFFFFFFFFFull;
+ mask = mask << 64;
+ mask = mask | 0xFFFFFFFFFFFFFFFFull;
+ return b ^ ((0 ^ a) & mask);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddq_p128
+ #define vaddq_p128(a, b) simde_vaddq_p128((a), (b))
+#endif
+#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/addhn_high.h b/lib/simd_wrapper/simde/arm/neon/addhn_high.h
new file mode 100644
index 00000000000..0c96a24d456
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/addhn_high.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_ADDHN_HIGH_H)
+#define SIMDE_ARM_NEON_ADDHN_HIGH_H
+
+#include "addhn.h"
+#include "combine.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16_t
+simde_vaddhn_high_s16(simde_int8x8_t r, simde_int16x8_t a, simde_int16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vaddhn_high_s16(r, a, b);
+ #else
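+ /* vaddhn on a/b produces the upper half of the result; r is kept as the lower half. */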
+ return simde_vcombine_s8(r, simde_vaddhn_s16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddhn_high_s16
+ #define vaddhn_high_s16(r, a, b) simde_vaddhn_high_s16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vaddhn_high_s32(simde_int16x4_t r, simde_int32x4_t a, simde_int32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vaddhn_high_s32(r, a, b);
+ #else
+ return simde_vcombine_s16(r, simde_vaddhn_s32(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddhn_high_s32
+ #define vaddhn_high_s32(r, a, b) simde_vaddhn_high_s32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vaddhn_high_s64(simde_int32x2_t r, simde_int64x2_t a, simde_int64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vaddhn_high_s64(r, a, b);
+ #else
+ return simde_vcombine_s32(r, simde_vaddhn_s64(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddhn_high_s64
+ #define vaddhn_high_s64(r, a, b) simde_vaddhn_high_s64((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vaddhn_high_u16(simde_uint8x8_t r, simde_uint16x8_t a, simde_uint16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vaddhn_high_u16(r, a, b);
+ #else
+ return simde_vcombine_u8(r, simde_vaddhn_u16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddhn_high_u16
+ #define vaddhn_high_u16(r, a, b) simde_vaddhn_high_u16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vaddhn_high_u32(simde_uint16x4_t r, simde_uint32x4_t a, simde_uint32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vaddhn_high_u32(r, a, b);
+ #else
+ return simde_vcombine_u16(r, simde_vaddhn_u32(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddhn_high_u32
+ #define vaddhn_high_u32(r, a, b) simde_vaddhn_high_u32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vaddhn_high_u64(simde_uint32x2_t r, simde_uint64x2_t a, simde_uint64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vaddhn_high_u64(r, a, b);
+ #else
+ return simde_vcombine_u32(r, simde_vaddhn_u64(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vaddhn_high_u64
+ #define vaddhn_high_u64(r, a, b) simde_vaddhn_high_u64((r), (a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_ADDHN_HIGH_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/addlv.h b/lib/simd_wrapper/simde/arm/neon/addlv.h
index 79d9451b0d2..dc7de0c45c9 100644
--- a/lib/simd_wrapper/simde/arm/neon/addlv.h
+++ b/lib/simd_wrapper/simde/arm/neon/addlv.h
@@ -184,6 +184,12 @@ int16_t
simde_vaddlvq_s8(simde_int8x16_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddlvq_s8(a);
+ #elif defined(SIMDE_X86_SSE2_NATIVE)
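+ /* Bias each signed byte by 128 (XOR 0x80), horizontally sum with SAD, then subtract the 16 * 128 = 2048 bias. */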
+ __m128i a_ = simde_int8x16_to_m128i(a);
+ a_ = _mm_xor_si128(a_, _mm_set1_epi8('\x80'));
+ a_ = _mm_sad_epu8(a_, _mm_setzero_si128());
+ a_ = _mm_add_epi16(a_, _mm_shuffle_epi32(a_, 0xEE));
+ return HEDLEY_STATIC_CAST(int16_t, _mm_cvtsi128_si32(a_) - 2048);
#else
simde_int8x16_private a_ = simde_int8x16_to_private(a);
int16_t r = 0;
@@ -206,6 +212,13 @@ int32_t
simde_vaddlvq_s16(simde_int16x8_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddlvq_s16(a);
+ #elif defined(SIMDE_X86_SSSE3_NATIVE) && !defined(HEDLEY_MSVC_VERSION)
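+ /* Split each 16-bit lane into low/high bytes, SAD-sum each set, re-weight the high-byte sum by 256 via a 7-byte shift, then remove the 8 * 0x8000 = 262144 bias. */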
+ __m128i a_ = simde_int16x8_to_m128i(a);
+ a_ = _mm_xor_si128(a_, _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, 0x8000)));
+ a_ = _mm_shuffle_epi8(a_, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0));
+ a_ = _mm_sad_epu8(a_, _mm_setzero_si128());
+ a_ = _mm_add_epi32(a_, _mm_srli_si128(a_, 7));
+ return _mm_cvtsi128_si32(a_) - 262144;
#else
simde_int16x8_private a_ = simde_int16x8_to_private(a);
int32_t r = 0;
@@ -250,6 +263,11 @@ uint16_t
simde_vaddlvq_u8(simde_uint8x16_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddlvq_u8(a);
+ #elif defined(SIMDE_X86_SSE2_NATIVE)
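+ /* _mm_sad_epu8 against zero sums the 16 bytes into two 16-bit partials; add the two halves. */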
+ __m128i a_ = simde_uint8x16_to_m128i(a);
+ a_ = _mm_sad_epu8(a_, _mm_setzero_si128());
+ a_ = _mm_add_epi16(a_, _mm_shuffle_epi32(a_, 0xEE));
+ return HEDLEY_STATIC_CAST(uint16_t, _mm_cvtsi128_si32(a_));
#else
simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
uint16_t r = 0;
@@ -272,6 +290,12 @@ uint32_t
simde_vaddlvq_u16(simde_uint16x8_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vaddlvq_u16(a);
+ #elif defined(SIMDE_X86_SSSE3_NATIVE)
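+ /* Unsigned version of the vaddlvq_s16 trick: byte-split, SAD, recombine with a 7-byte shift; no bias needed. */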
+ __m128i a_ = simde_uint16x8_to_m128i(a);
+ a_ = _mm_shuffle_epi8(a_, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0));
+ a_ = _mm_sad_epu8(a_, _mm_setzero_si128());
+ a_ = _mm_add_epi32(a_, _mm_srli_si128(a_, 7));
+ return HEDLEY_STATIC_CAST(uint32_t, _mm_cvtsi128_si32(a_));
#else
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
uint32_t r = 0;
diff --git a/lib/simd_wrapper/simde/arm/neon/addv.h b/lib/simd_wrapper/simde/arm/neon/addv.h
index bcc082b34f1..6beb9836c48 100644
--- a/lib/simd_wrapper/simde/arm/neon/addv.h
+++ b/lib/simd_wrapper/simde/arm/neon/addv.h
@@ -352,6 +352,11 @@ simde_vaddvq_u8(simde_uint8x16_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
r = vaddvq_u8(a);
+ #elif defined(SIMDE_X86_SSE2_NATIVE)
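+ /* The uint8_t cast keeps only the low byte, so adding the two SAD halves per-byte is exact mod 256. */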
+ __m128i a_ = simde_uint8x16_to_m128i(a);
+ a_ = _mm_sad_epu8(a_, _mm_setzero_si128());
+ a_ = _mm_add_epi8(a_, _mm_shuffle_epi32(a_, 0xEE));
+ return HEDLEY_STATIC_CAST(uint8_t, _mm_cvtsi128_si32(a_));
#else
simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
diff --git a/lib/simd_wrapper/simde/arm/neon/aes.h b/lib/simd_wrapper/simde/arm/neon/aes.h
new file mode 100644
index 00000000000..4e6896fc879
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/aes.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_AES_H)
+#define SIMDE_ARM_NEON_AES_H
+
+#include "types.h"
+#include "../../simde-aes.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
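+/* Doubling in GF(2^8), reduced modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). */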
+static uint8_t simde_xtime(uint8_t x) {
+ return HEDLEY_STATIC_CAST(uint8_t, (x<<1) ^ (((x>>7) & 1) * 0x1b));
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vaeseq_u8(simde_uint8x16_t data, simde_uint8x16_t key) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES)
+ return vaeseq_u8(data, key);
+ #else
+ /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */
+ simde_uint8x16_private
+ r_,
+ a_ = simde_uint8x16_to_private(data),
+ b_ = simde_uint8x16_to_private(key);
+
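+ // AESAddRoundKey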
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i];
+ }
+ // AESShiftRows
+ uint8_t tmp;
+ tmp = r_.values[1];
+ r_.values[1] = r_.values[5];
+ r_.values[5] = r_.values[9];
+ r_.values[9] = r_.values[13];
+ r_.values[13] = tmp;
+
+ tmp = r_.values[2];
+ r_.values[2] = r_.values[10];
+ r_.values[10] = tmp;
+
+ tmp = r_.values[6];
+ r_.values[6] = r_.values[14];
+ r_.values[14] = tmp;
+
+ tmp = r_.values[3];
+ r_.values[3] = r_.values[15];
+ r_.values[15] = r_.values[11];
+ r_.values[11] = r_.values[7];
+ r_.values[7] = tmp;
+
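+ // AESSubBytes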
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_x_aes_s_box[r_.values[i]];
+ }
+ return simde_uint8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vaeseq_u8
+ #define vaeseq_u8(data, key) simde_vaeseq_u8((data), (key))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vaesdq_u8(simde_uint8x16_t data, simde_uint8x16_t key) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES)
+ return vaesdq_u8(data, key);
+ #else
+ /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */
+ simde_uint8x16_private
+ r_,
+ a_ = simde_uint8x16_to_private(data),
+ b_ = simde_uint8x16_to_private(key);
+
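+ // AESAddRoundKey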
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i];
+ }
+ // AESInvShiftRows
+ uint8_t tmp;
+ tmp = r_.values[13];
+ r_.values[13] = r_.values[9];
+ r_.values[9] = r_.values[5];
+ r_.values[5] = r_.values[1];
+ r_.values[1] = tmp;
+
+ tmp = r_.values[2];
+ r_.values[2] = r_.values[10];
+ r_.values[10] = tmp;
+
+ tmp = r_.values[6];
+ r_.values[6] = r_.values[14];
+ r_.values[14] = tmp;
+
+ tmp = r_.values[3];
+ r_.values[3] = r_.values[7];
+ r_.values[7] = r_.values[11];
+ r_.values[11] = r_.values[15];
+ r_.values[15] = tmp;
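+ // AESInvSubBytes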
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_x_aes_inv_s_box[r_.values[i]];
+ }
+ return simde_uint8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vaesdq_u8
+ #define vaesdq_u8(data, key) simde_vaesdq_u8((data), (key))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vaesmcq_u8(simde_uint8x16_t data) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES)
+ return vaesmcq_u8(data);
+ #else
+ /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */
+ simde_uint8x16_private
+ a_ = simde_uint8x16_to_private(data);
+ uint8_t i;
+ uint8_t Tmp, Tm, t;
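+ // AESMixColumns: per column, multiply by {02, 03, 01, 01} in GF(2^8) using xtime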
+ for (i = 0; i < 4; ++i) {
+ t = a_.values[i*4+0];
+ Tmp = a_.values[i*4+0] ^ a_.values[i*4+1] ^ a_.values[i*4+2] ^ a_.values[i*4+3];
+ Tm = a_.values[i*4+0] ^ a_.values[i*4+1]; Tm = simde_xtime(Tm); a_.values[i*4+0] ^= Tm ^ Tmp;
+ Tm = a_.values[i*4+1] ^ a_.values[i*4+2]; Tm = simde_xtime(Tm); a_.values[i*4+1] ^= Tm ^ Tmp;
+ Tm = a_.values[i*4+2] ^ a_.values[i*4+3]; Tm = simde_xtime(Tm); a_.values[i*4+2] ^= Tm ^ Tmp;
+ Tm = a_.values[i*4+3] ^ t; Tm = simde_xtime(Tm); a_.values[i*4+3] ^= Tm ^ Tmp;
+ }
+ return simde_uint8x16_from_private(a_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vaesmcq_u8
+ #define vaesmcq_u8(data) simde_vaesmcq_u8((data))
+#endif
+
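+/* General GF(2^8) multiplication via shift-and-add over simde_xtime(). */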
+static uint8_t Multiply(uint8_t x, uint8_t y) {
+ return (((y & 1) * x) ^
+ ((y>>1 & 1) * simde_xtime(x)) ^
+ ((y>>2 & 1) * simde_xtime(simde_xtime(x))) ^
+ ((y>>3 & 1) * simde_xtime(simde_xtime(simde_xtime(x)))) ^
+ ((y>>4 & 1) * simde_xtime(simde_xtime(simde_xtime(simde_xtime(x)))))); /* this last call to simde_xtime() can be omitted */
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vaesimcq_u8(simde_uint8x16_t data) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES)
+ return vaesimcq_u8(data);
+ #else
+ simde_uint8x16_private
+ a_ = simde_uint8x16_to_private(data),
+ r_;
+ /* ref: simde/simde/x86/aes.h */
+ #if defined(SIMDE_X86_AES_NATIVE)
+ r_.m128i = _mm_aesimc_si128(a_.m128i);
+ #else
+ int Nb = simde_x_aes_Nb;
+ // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x^2 + {0b}x^3
+ uint8_t i, j, col[4], res[4];
+
+ for (j = 0; j < Nb; j++) {
+ for (i = 0; i < 4; i++) {
+ col[i] = a_.values[Nb*j+i];
+ }
+
+ //coef_mult(k, col, res);
+ simde_x_aes_coef_mult_lookup(4, col, res);
+
+ for (i = 0; i < 4; i++) {
+ r_.values[Nb*j+i] = res[i];
+ }
+ }
+ #endif
+ return simde_uint8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vaesimcq_u8
+ #define vaesimcq_u8(data) simde_vaesimcq_u8((data))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_AES_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/and.h b/lib/simd_wrapper/simde/arm/neon/and.h
index 381154228f2..185683d75a8 100644
--- a/lib/simd_wrapper/simde/arm/neon/and.h
+++ b/lib/simd_wrapper/simde/arm/neon/and.h
@@ -47,6 +47,8 @@ simde_vand_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_i8m1(a_.sv64, b_.sv64, 8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -77,6 +79,8 @@ simde_vand_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_i16m1(a_.sv64, b_.sv64, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -107,6 +111,8 @@ simde_vand_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_i32m1(a_.sv64, b_.sv64, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -137,6 +143,8 @@ simde_vand_s64(simde_int64x1_t a, simde_int64x1_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_i64m1(a_.sv64, b_.sv64, 1);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -167,6 +175,8 @@ simde_vand_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_u8m1(a_.sv64, b_.sv64, 8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -197,6 +207,8 @@ simde_vand_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_u16m1(a_.sv64, b_.sv64, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -227,6 +239,8 @@ simde_vand_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_u32m1(a_.sv64, b_.sv64, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -257,6 +271,8 @@ simde_vand_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _mm_and_si64(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vand_vv_u64m1(a_.sv64, b_.sv64, 1);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -291,6 +307,8 @@ simde_vandq_s8(simde_int8x16_t a, simde_int8x16_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128, b_.sv128, 16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -325,6 +343,8 @@ simde_vandq_s16(simde_int16x8_t a, simde_int16x8_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_i16m1(a_.sv128, b_.sv128, 8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -359,6 +379,8 @@ simde_vandq_s32(simde_int32x4_t a, simde_int32x4_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_i32m1(a_.sv128, b_.sv128, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -393,6 +415,8 @@ simde_vandq_s64(simde_int64x2_t a, simde_int64x2_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_i64m1(a_.sv128, b_.sv128, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -427,6 +451,8 @@ simde_vandq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_u8m1(a_.sv128, b_.sv128, 16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -461,6 +487,8 @@ simde_vandq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_u16m1(a_.sv128, b_.sv128, 8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -495,6 +523,8 @@ simde_vandq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_u32m1(a_.sv128, b_.sv128, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
@@ -529,6 +559,8 @@ simde_vandq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
r_.m128i = _mm_and_si128(a_.m128i, b_.m128i);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_and(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vand_vv_u64m1(a_.sv128, b_.sv128, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values & b_.values;
#else
diff --git a/lib/simd_wrapper/simde/arm/neon/bcax.h b/lib/simd_wrapper/simde/arm/neon/bcax.h
index 929d8f8d887..b9e84ccba34 100644
--- a/lib/simd_wrapper/simde/arm/neon/bcax.h
+++ b/lib/simd_wrapper/simde/arm/neon/bcax.h
@@ -39,13 +39,13 @@ SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16_t
simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_u8(a, b, c);
#else
return simde_veorq_u8(a, simde_vbicq_u8(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_u8
#define vbcaxq_u8(a, b, c) simde_vbcaxq_u8(a, b, c)
#endif
@@ -53,13 +53,13 @@ simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_u16(a, b, c);
#else
return simde_veorq_u16(a, simde_vbicq_u16(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_u16
#define vbcaxq_u16(a, b, c) simde_vbcaxq_u16(a, b, c)
#endif
@@ -67,13 +67,13 @@ simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_u32(a, b, c);
#else
return simde_veorq_u32(a, simde_vbicq_u32(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_u32
#define vbcaxq_u32(a, b, c) simde_vbcaxq_u32(a, b, c)
#endif
@@ -81,13 +81,13 @@ simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_u64(a, b, c);
#else
return simde_veorq_u64(a, simde_vbicq_u64(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_u64
#define vbcaxq_u64(a, b, c) simde_vbcaxq_u64(a, b, c)
#endif
@@ -95,13 +95,13 @@ simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int8x16_t
simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_s8(a, b, c);
#else
return simde_veorq_s8(a, simde_vbicq_s8(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_s8
#define vbcaxq_s8(a, b, c) simde_vbcaxq_s8(a, b, c)
#endif
@@ -109,13 +109,13 @@ simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int16x8_t
simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_s16(a, b, c);
#else
return simde_veorq_s16(a,simde_vbicq_s16(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_s16
#define vbcaxq_s16(a, b, c) simde_vbcaxq_s16(a, b, c)
#endif
@@ -123,13 +123,13 @@ simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_s32(a, b, c);
#else
return simde_veorq_s32(a, simde_vbicq_s32(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_s32
#define vbcaxq_s32(a, b, c) simde_vbcaxq_s32(a, b, c)
#endif
@@ -137,13 +137,13 @@ simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int64x2_t
simde_vbcaxq_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
return vbcaxq_s64(a, b, c);
#else
return simde_veorq_s64(a, simde_vbicq_s64(b, c));
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3))
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3))
#undef vbcaxq_s64
#define vbcaxq_s64(a, b, c) simde_vbcaxq_s64(a, b, c)
#endif
diff --git a/lib/simd_wrapper/simde/arm/neon/bsl.h b/lib/simd_wrapper/simde/arm/neon/bsl.h
index 0fc4ff270fd..40cdac89f1e 100644
--- a/lib/simd_wrapper/simde/arm/neon/bsl.h
+++ b/lib/simd_wrapper/simde/arm/neon/bsl.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_BSL_H)
@@ -755,6 +756,156 @@ simde_vbslq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) {
#define vbslq_u64(a, b, c) simde_vbslq_u64((a), (b), (c))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vbsl_p8(simde_uint8x8_t a, simde_poly8x8_t b, simde_poly8x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vbsl_p8(a, b, c);
+ #else
+ simde_poly8x8_private
+ r_,
+ b_ = simde_poly8x8_to_private(b),
+ c_ = simde_poly8x8_to_private(c);
+ simde_uint8x8_private a_ = simde_uint8x8_to_private(a);
+
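+ /* Bitwise select: where a mask bit is 1 take the bit from b, otherwise from c. */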
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]);
+ }
+
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vbsl_p8
+ #define vbsl_p8(a, b, c) simde_vbsl_p8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vbsl_p16(simde_uint16x4_t a, simde_poly16x4_t b, simde_poly16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vbsl_p16(a, b, c);
+ #else
+ simde_poly16x4_private
+ r_,
+ b_ = simde_poly16x4_to_private(b),
+ c_ = simde_poly16x4_to_private(c);
+ simde_uint16x4_private a_ = simde_uint16x4_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]);
+ }
+
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vbsl_p16
+ #define vbsl_p16(a, b, c) simde_vbsl_p16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vbsl_p64(simde_uint64x1_t a, simde_poly64x1_t b, simde_poly64x1_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vbsl_p64(a, b, c);
+ #else
+ simde_poly64x1_private
+ r_,
+ b_ = simde_poly64x1_to_private(b),
+ c_ = simde_poly64x1_to_private(c);
+ simde_uint64x1_private a_ = simde_uint64x1_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]);
+ }
+
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbsl_p64
+ #define vbsl_p64(a, b, c) simde_vbsl_p64((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vbslq_p8(simde_uint8x16_t a, simde_poly8x16_t b, simde_poly8x16_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vbslq_p8(a, b, c);
+ #else
+ simde_poly8x16_private
+ r_,
+ b_ = simde_poly8x16_to_private(b),
+ c_ = simde_poly8x16_to_private(c);
+ simde_uint8x16_private a_ = simde_uint8x16_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]);
+ }
+
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vbslq_p8
+ #define vbslq_p8(a, b, c) simde_vbslq_p8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vbslq_p16(simde_uint16x8_t a, simde_poly16x8_t b, simde_poly16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vbslq_p16(a, b, c);
+ #else
+ simde_poly16x8_private
+ r_,
+ b_ = simde_poly16x8_to_private(b),
+ c_ = simde_poly16x8_to_private(c);
+ simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]);
+ }
+
+ return simde_poly16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vbslq_p16
+ #define vbslq_p16(a, b, c) simde_vbslq_p16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vbslq_p64(simde_uint64x2_t a, simde_poly64x2_t b, simde_poly64x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vbslq_p64(a, b, c);
+ #else
+ simde_poly64x2_private
+ r_,
+ b_ = simde_poly64x2_to_private(b),
+ c_ = simde_poly64x2_to_private(c);
+ simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]);
+ }
+
+ return simde_poly64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbslq_p64
+ #define vbslq_p64(a, b, c) simde_vbslq_p64((a), (b), (c))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/cadd_rot270.h b/lib/simd_wrapper/simde/arm/neon/cadd_rot270.h
new file mode 100644
index 00000000000..17995f48a84
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cadd_rot270.h
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Chi-Wei Chu
+ */
+
+#if !defined(SIMDE_ARM_NEON_CADD_ROT270_H)
+#define SIMDE_ARM_NEON_CADD_ROT270_H
+
+#include "add.h"
+#include "types.h"
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16
+SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_
+_Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"")
+#endif
+
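+/* vcadd_rot270: complex addition with the second operand rotated by 270
+ * degrees. For each (real, imaginary) lane pair:
+ *   r.real = a.real + b.imag
+ *   r.imag = a.imag - b.real
+ * The SIMDE_SHUFFLE_VECTOR_ fast path builds (b.imag, -b.real, ...) in a
+ * single shuffle over the concatenation of -b and b. */
+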
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcadd_rot270_f16(a, b);
+ #else
+ simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]);
+ r_.values[2 * i + 1] =
+ simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]);
+ }
+ #endif
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcadd_rot270_f16
+ #define vcadd_rot270_f16(a, b) simde_vcadd_rot270_f16(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcaddq_rot270_f16(a, b);
+ #else
+ simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]);
+ r_.values[2 * i + 1] =
+ simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]);
+ }
+ #endif
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaddq_rot270_f16
+ #define vcaddq_rot270_f16(a, b) simde_vcaddq_rot270_f16(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcadd_rot270_f32(a, b);
+ #else
+ simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i];
+ r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcadd_rot270_f32
+ #define vcadd_rot270_f32(a, b) simde_vcadd_rot270_f32(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcaddq_rot270_f32(a, b);
+ #else
+ simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i];
+ r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaddq_rot270_f32
+ #define vcaddq_rot270_f32(a, b) simde_vcaddq_rot270_f32(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcaddq_rot270_f64(a, b);
+ #else
+ simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i];
+ r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)

+ #undef vcaddq_rot270_f64
+ #define vcaddq_rot270_f64(a, b) simde_vcaddq_rot270_f64(a, b)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT270_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/cadd_rot90.h b/lib/simd_wrapper/simde/arm/neon/cadd_rot90.h
new file mode 100644
index 00000000000..0c448a52191
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cadd_rot90.h
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Chi-Wei Chu
+ */
+
+#if !defined(SIMDE_ARM_NEON_CADD_ROT90_H)
+#define SIMDE_ARM_NEON_CADD_ROT90_H
+
+#include "add.h"
+#include "types.h"
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16
+SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_
+_Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"")
+#endif
+
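+/* vcadd_rot90: complex addition with the second operand rotated by 90
+ * degrees. For each (real, imaginary) lane pair:
+ *   r.real = a.real - b.imag
+ *   r.imag = a.imag + b.real */
+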
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcadd_rot90_f16(a, b);
+ #else
+ simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] =
+ simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i + 1])), a_.values[2 * i]);
+ r_.values[2 * i + 1] = simde_vaddh_f16(b_.values[2 * i], a_.values[2 * i + 1]);
+ }
+ #endif
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcadd_rot90_f16
+ #define vcadd_rot90_f16(a, b) simde_vcadd_rot90_f16(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcaddq_rot90_f16(a, b);
+ #else
+ simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] =
+ simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i + 1])), a_.values[2 * i]);
+ r_.values[2 * i + 1] = simde_vaddh_f16(b_.values[2 * i], a_.values[2 * i + 1]);
+ }
+ #endif
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaddq_rot90_f16
+ #define vcaddq_rot90_f16(a, b) simde_vcaddq_rot90_f16(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcadd_rot90_f32(a, b);
+ #else
+ simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i];
+ r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcadd_rot90_f32
+ #define vcadd_rot90_f32(a, b) simde_vcadd_rot90_f32(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcaddq_rot90_f32(a, b);
+ #else
+ simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i];
+ r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaddq_rot90_f32
+ #define vcaddq_rot90_f32(a, b) simde_vcaddq_rot90_f32(a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t b)
+{
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ return vcaddq_rot90_f64(a, b);
+ #else
+ simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b);
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2);
+ r_.values = b_.values + a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i];
+ r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+  #undef vcaddq_rot90_f64
+  #define vcaddq_rot90_f64(a, b) simde_vcaddq_rot90_f64(a, b)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT90_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/cale.h b/lib/simd_wrapper/simde/arm/neon/cale.h
new file mode 100644
index 00000000000..f2baa51581a
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cale.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CALE_H)
+#define SIMDE_ARM_NEON_CALE_H
+
+#include "cage.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
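+/* Absolute less-than-or-equal: vcale(a, b) tests |a| <= |b|, which is
+ * implemented by swapping the operands of the absolute
+ * greater-than-or-equal comparison, simde_vcage*(b, a). */
+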
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcaleh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcaleh_f16(a, b);
+ #else
+ return simde_vcageh_f16(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaleh_f16
+ #define vcaleh_f16(a, b) simde_vcaleh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcales_f32(simde_float32_t a, simde_float32_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcales_f32(a, b);
+ #else
+ return simde_vcages_f32(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcales_f32
+ #define vcales_f32(a, b) simde_vcales_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcaled_f64(simde_float64_t a, simde_float64_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcaled_f64(a, b);
+ #else
+ return simde_vcaged_f64(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaled_f64
+ #define vcaled_f64(a, b) simde_vcaled_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcale_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcale_f16(a, b);
+ #else
+ return simde_vcage_f16(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcale_f16
+ #define vcale_f16(a, b) simde_vcale_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcale_f32(simde_float32x2_t a, simde_float32x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcale_f32(a, b);
+ #else
+ return simde_vcage_f32(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcale_f32
+ #define vcale_f32(a, b) simde_vcale_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcale_f64(simde_float64x1_t a, simde_float64x1_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcale_f64(a, b);
+ #else
+ return simde_vcage_f64(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcale_f64
+ #define vcale_f64(a, b) simde_vcale_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcaleq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcaleq_f16(a, b);
+ #else
+ return simde_vcageq_f16(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaleq_f16
+ #define vcaleq_f16(a, b) simde_vcaleq_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcaleq_f32(simde_float32x4_t a, simde_float32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcaleq_f32(a, b);
+ #else
+ return simde_vcageq_f32(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcaleq_f32
+ #define vcaleq_f32(a, b) simde_vcaleq_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcaleq_f64(simde_float64x2_t a, simde_float64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcaleq_f64(a, b);
+ #else
+ return simde_vcageq_f64(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaleq_f64
+ #define vcaleq_f64(a, b) simde_vcaleq_f64((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CALE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/calt.h b/lib/simd_wrapper/simde/arm/neon/calt.h
new file mode 100644
index 00000000000..99fa3841932
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/calt.h
@@ -0,0 +1,165 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CALT_H)
+#define SIMDE_ARM_NEON_CALT_H
+
+#include "cagt.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
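+/* Absolute less-than: vcalt(a, b) tests |a| < |b|, implemented as the
+ * operand-swapped absolute greater-than comparison simde_vcagt*(b, a). */
+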
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcalth_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcalth_f16(a, b);
+ #else
+ return simde_vcagth_f16(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcalth_f16
+ #define vcalth_f16(a, b) simde_vcalth_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcalts_f32(simde_float32_t a, simde_float32_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcalts_f32(a, b);
+ #else
+ return simde_vcagts_f32(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcalts_f32
+ #define vcalts_f32(a, b) simde_vcalts_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcaltd_f64(simde_float64_t a, simde_float64_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcaltd_f64(a, b);
+ #else
+ return simde_vcagtd_f64(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaltd_f64
+ #define vcaltd_f64(a, b) simde_vcaltd_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcalt_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcalt_f16(a, b);
+ #else
+ return simde_vcagt_f16(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcalt_f16
+ #define vcalt_f16(a, b) simde_vcalt_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcalt_f32(simde_float32x2_t a, simde_float32x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcalt_f32(a, b);
+ #else
+ return simde_vcagt_f32(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcalt_f32
+ #define vcalt_f32(a, b) simde_vcalt_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcalt_f64(simde_float64x1_t a, simde_float64x1_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcalt_f64(a, b);
+ #else
+ return simde_vcagt_f64(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcalt_f64
+ #define vcalt_f64(a, b) simde_vcalt_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcaltq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcaltq_f16(a, b);
+ #else
+ return simde_vcagtq_f16(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaltq_f16
+ #define vcaltq_f16(a, b) simde_vcaltq_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcaltq_f32(simde_float32x4_t a, simde_float32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcaltq_f32(a, b);
+ #else
+ return simde_vcagtq_f32(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcaltq_f32
+ #define vcaltq_f32(a, b) simde_vcaltq_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcaltq_f64(simde_float64x2_t a, simde_float64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcaltq_f64(a, b);
+ #else
+ return simde_vcagtq_f64(b, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcaltq_f64
+ #define vcaltq_f64(a, b) simde_vcaltq_f64((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CALT_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ceq.h b/lib/simd_wrapper/simde/arm/neon/ceq.h
index e60a4bf79d2..03a9c861223 100644
--- a/lib/simd_wrapper/simde/arm/neon/ceq.h
+++ b/lib/simd_wrapper/simde/arm/neon/ceq.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CEQ_H)
@@ -766,6 +767,102 @@ simde_vceqq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vceqq_u64(a, b) simde_vceqq_u64((a), (b))
#endif
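+/* Polynomial equality comparisons: each result lane is set to all ones
+ * when the corresponding lanes of a and b compare equal, and to all
+ * zeros otherwise. */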
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8_t
+simde_vceq_p8(simde_poly8x8_t a, simde_poly8x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vceq_p8(a, b);
+ #else
+ simde_uint8x8_private r_;
+ simde_poly8x8_private
+ a_ = simde_poly8x8_to_private(a),
+ b_ = simde_poly8x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (a_.values[i] == b_.values[i]) ? HEDLEY_STATIC_CAST(uint8_t, ~UINT8_C(0)) : HEDLEY_STATIC_CAST(uint8_t, UINT8_C(0));
+ }
+
+ return simde_uint8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vceq_p8
+ #define vceq_p8(a, b) simde_vceq_p8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vceqq_p8(simde_poly8x16_t a, simde_poly8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vceqq_p8(a, b);
+ #else
+ simde_uint8x16_private r_;
+ simde_poly8x16_private
+ a_ = simde_poly8x16_to_private(a),
+ b_ = simde_poly8x16_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (a_.values[i] == b_.values[i]) ? HEDLEY_STATIC_CAST(uint8_t, ~UINT8_C(0)) : HEDLEY_STATIC_CAST(uint8_t, UINT8_C(0));
+ }
+
+ return simde_uint8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vceqq_p8
+ #define vceqq_p8(a, b) simde_vceqq_p8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vceq_p64(simde_poly64x1_t a, simde_poly64x1_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vceq_p64(a, b);
+ #else
+ simde_uint64x1_private r_;
+ simde_poly64x1_private
+ a_ = simde_poly64x1_to_private(a),
+ b_ = simde_poly64x1_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (a_.values[i] == b_.values[i]) ? ~UINT64_C(0) : UINT64_C(0);
+ }
+
+ return simde_uint64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vceq_p64
+ #define vceq_p64(a, b) simde_vceq_p64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vceqq_p64(simde_poly64x2_t a, simde_poly64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vceqq_p64(a, b);
+ #else
+ simde_uint64x2_private r_;
+ simde_poly64x2_private
+ a_ = simde_poly64x2_to_private(a),
+ b_ = simde_poly64x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (a_.values[i] == b_.values[i]) ? ~UINT64_C(0) : UINT64_C(0);
+ }
+
+ return simde_uint64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vceqq_p64
+ #define vceqq_p64(a, b) simde_vceqq_p64((a), (b))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/ceqz.h b/lib/simd_wrapper/simde/arm/neon/ceqz.h
index 176ecce0f8a..54f3ce8fbb4 100644
--- a/lib/simd_wrapper/simde/arm/neon/ceqz.h
+++ b/lib/simd_wrapper/simde/arm/neon/ceqz.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CEQZ_H)
@@ -375,7 +376,7 @@ simde_vceqzd_u64(uint64_t a) {
SIMDE_FUNCTION_ATTRIBUTES
uint16_t
-simde_vceqzh_f16(simde_float16 a) {
+simde_vceqzh_f16(simde_float16_t a) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
return vceqzh_f16(a);
#else
@@ -415,6 +416,62 @@ simde_vceqzd_f64(simde_float64_t a) {
#define vceqzd_f64(a) simde_vceqzd_f64((a))
#endif
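+/* Compare-against-zero for polynomial vectors reuses the vceq fallbacks
+ * above, comparing against a zero vector built with simde_vdup_n_p8 or
+ * simde_vdup_n_p64. */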
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8_t
+simde_vceqz_p8(simde_poly8x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vceqz_p8(a);
+ #else
+ return simde_vceq_p8(a, simde_vdup_n_p8(0));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vceqz_p8
+ #define vceqz_p8(a) simde_vceqz_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vceqzq_p8(simde_poly8x16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vceqzq_p8(a);
+ #else
+ return simde_vceqq_p8(a, simde_vdupq_n_p8(0));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vceqzq_p8
+ #define vceqzq_p8(a) simde_vceqzq_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vceqz_p64(simde_poly64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vceqz_p64(a);
+ #else
+ return simde_vceq_p64(a, simde_vdup_n_p64(0));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vceqz_p64
+ #define vceqz_p64(a) simde_vceqz_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vceqzq_p64(simde_poly64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vceqzq_p64(a);
+ #else
+ return simde_vceqq_p64(a, simde_vdupq_n_p64(0));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vceqzq_p64
+ #define vceqzq_p64(a) simde_vceqzq_p64((a))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/cgez.h b/lib/simd_wrapper/simde/arm/neon/cgez.h
index b8440836165..04024c48ec8 100644
--- a/lib/simd_wrapper/simde/arm/neon/cgez.h
+++ b/lib/simd_wrapper/simde/arm/neon/cgez.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CGEZ_H)
@@ -78,6 +79,42 @@ simde_vcgezs_f32(simde_float32_t a) {
#define vcgezs_f32(a) simde_vcgezs_f32(a)
#endif
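+/* The half-precision helpers promote each f16 lane to float32 for the
+ * compare and return UINT16_MAX (all ones) or 0 per lane. */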
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcgezh_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vcgezh_f16(a));
+ #else
+ return (simde_float16_to_float32(a) >= SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgezh_f16
+ #define vcgezh_f16(a) simde_vcgezh_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcgezq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcgezq_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcgezh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgezq_f16
+ #define vcgezq_f16(a) simde_vcgezq_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcgezq_f32(simde_float32x4_t a) {
@@ -246,6 +283,28 @@ simde_vcgezq_s64(simde_int64x2_t a) {
#define vcgezq_s64(a) simde_vcgezq_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcgez_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcgez_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcgezh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgez_f16
+ #define vcgez_f16(a) simde_vcgez_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vcgez_f32(simde_float32x2_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/cgt.h b/lib/simd_wrapper/simde/arm/neon/cgt.h
index a090dca5b85..465cdb91786 100644
--- a/lib/simd_wrapper/simde/arm/neon/cgt.h
+++ b/lib/simd_wrapper/simde/arm/neon/cgt.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CGT_H)
@@ -78,6 +79,23 @@ simde_vcgtd_u64(uint64_t a, uint64_t b) {
#define vcgtd_u64(a, b) simde_vcgtd_u64((a), (b))
#endif
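+/* The f16 comparison helpers below widen both operands to float32; the
+ * 64-bit and 128-bit vector variants simply apply the scalar helper
+ * simde_vcgth_f16 to each lane. */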
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcgth_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vcgth_f16(a, b));
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+
+ return (a_ > b_) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgth_f16
+ #define vcgth_f16(a, b) simde_vcgth_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_vcgts_f32(simde_float32_t a, simde_float32_t b) {
@@ -92,6 +110,30 @@ simde_vcgts_f32(simde_float32_t a, simde_float32_t b) {
#define vcgts_f32(a, b) simde_vcgts_f32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcgtq_f16(a, b);
+ #else
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgtq_f16
+ #define vcgtq_f16(a, b) simde_vcgtq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) {
@@ -442,6 +484,30 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vcgtq_u64(a, b) simde_vcgtq_u64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcgt_f16(a, b);
+ #else
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgt_f16
+ #define vcgt_f16(a, b) simde_vcgt_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/cgtz.h b/lib/simd_wrapper/simde/arm/neon/cgtz.h
index 125e009b2a1..30c6e5dd04f 100644
--- a/lib/simd_wrapper/simde/arm/neon/cgtz.h
+++ b/lib/simd_wrapper/simde/arm/neon/cgtz.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CGTZ_H)
@@ -66,6 +67,42 @@ simde_vcgtzd_f64(simde_float64_t a) {
#define vcgtzd_f64(a) simde_vcgtzd_f64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcgtzh_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vcgtzh_f16(a));
+ #else
+ return (simde_float16_to_float32(a) > SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgtzh_f16
+ #define vcgtzh_f16(a) simde_vcgtzh_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcgtzq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcgtzq_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcgtzh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgtzq_f16
+ #define vcgtzq_f16(a) simde_vcgtzq_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_vcgtzs_f32(simde_float32_t a) {
@@ -248,6 +285,28 @@ simde_vcgtzq_s64(simde_int64x2_t a) {
#define vcgtzq_s64(a) simde_vcgtzq_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcgtz_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcgtz_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcgtzh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcgtz_f16
+ #define vcgtz_f16(a) simde_vcgtz_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vcgtz_f32(simde_float32x2_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/cle.h b/lib/simd_wrapper/simde/arm/neon/cle.h
index 5a1591b3039..fedfcc52245 100644
--- a/lib/simd_wrapper/simde/arm/neon/cle.h
+++ b/lib/simd_wrapper/simde/arm/neon/cle.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CLE_H)
@@ -90,6 +91,44 @@ simde_vcles_f32(simde_float32_t a, simde_float32_t b) {
#define vcles_f32(a, b) simde_vcles_f32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcleh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vcleh_f16(a, b));
+ #else
+ return (simde_float16_to_float32(a) <= simde_float16_to_float32(b)) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcleh_f16
+ #define vcleh_f16(a, b) simde_vcleh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcleq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcleq_f16(a, b);
+ #else
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcleq_f16
+ #define vcleq_f16(a, b) simde_vcleq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcleq_f32(simde_float32x4_t a, simde_float32x4_t b) {
@@ -475,6 +514,30 @@ simde_vcleq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vcleq_u64(a, b) simde_vcleq_u64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcle_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcle_f16(a, b);
+ #else
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcle_f16
+ #define vcle_f16(a, b) simde_vcle_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vcle_f32(simde_float32x2_t a, simde_float32x2_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/clez.h b/lib/simd_wrapper/simde/arm/neon/clez.h
index ae3eea9b8a7..dd308c7f4ba 100644
--- a/lib/simd_wrapper/simde/arm/neon/clez.h
+++ b/lib/simd_wrapper/simde/arm/neon/clez.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CLEZ_H)
@@ -78,6 +79,44 @@ simde_vclezs_f32(simde_float32_t a) {
#define vclezs_f32(a) simde_vclezs_f32(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vclezh_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vclezh_f16(a));
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+
+ return (a_ <= SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vclezh_f16
+ #define vclezh_f16(a) simde_vclezh_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vclezq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vclezq_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vclezh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vclezq_f16
+ #define vclezq_f16(a) simde_vclezq_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vclezq_f32(simde_float32x4_t a) {
@@ -246,6 +285,28 @@ simde_vclezq_s64(simde_int64x2_t a) {
#define vclezq_s64(a) simde_vclezq_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vclez_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vclez_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vclezh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vclez_f16
+ #define vclez_f16(a) simde_vclez_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vclez_f32(simde_float32x2_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/clt.h b/lib/simd_wrapper/simde/arm/neon/clt.h
index ae36027327b..9d3cf407647 100644
--- a/lib/simd_wrapper/simde/arm/neon/clt.h
+++ b/lib/simd_wrapper/simde/arm/neon/clt.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CLT_H)
@@ -77,6 +78,23 @@ simde_vcltd_u64(uint64_t a, uint64_t b) {
#define vcltd_u64(a, b) simde_vcltd_u64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vclth_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vclth_f16(a, b));
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+
+ return (a_ < b_) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vclth_f16
+ #define vclth_f16(a, b) simde_vclth_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
uint32_t
simde_vclts_f32(simde_float32_t a, simde_float32_t b) {
@@ -91,6 +109,30 @@ simde_vclts_f32(simde_float32_t a, simde_float32_t b) {
#define vclts_f32(a, b) simde_vclts_f32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcltq_f16(a, b);
+ #else
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcltq_f16
+ #define vcltq_f16(a, b) simde_vcltq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) {
@@ -450,6 +492,30 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define vcltq_u64(a, b) simde_vcltq_u64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vclt_f16(a, b);
+ #else
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vclt_f16
+ #define vclt_f16(a, b) simde_vclt_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/cltz.h b/lib/simd_wrapper/simde/arm/neon/cltz.h
index a9c94984e98..2c61d1a1622 100644
--- a/lib/simd_wrapper/simde/arm/neon/cltz.h
+++ b/lib/simd_wrapper/simde/arm/neon/cltz.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
/* TODO: float fallbacks should use vclt(a, vdup_n(0.0)) */
@@ -81,6 +82,42 @@ simde_vcltzs_f32(simde_float32_t a) {
#define vcltzs_f32(a) simde_vcltzs_f32(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcltzh_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return HEDLEY_STATIC_CAST(uint16_t, vcltzh_f16(a));
+ #else
+ return (simde_float16_to_float32(a) < SIMDE_FLOAT32_C(0.0)) ? UINT16_MAX : 0;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcltzh_f16
+ #define vcltzh_f16(a) simde_vcltzh_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcltz_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcltz_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcltzh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcltz_f16
+ #define vcltz_f16(a) simde_vcltz_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vcltz_f32(simde_float32x2_t a) {
@@ -201,6 +238,28 @@ simde_vcltz_s64(simde_int64x1_t a) {
#define vcltz_s64(a) simde_vcltz_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcltzq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcltzq_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcltzh_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcltzq_f16
+ #define vcltzq_f16(a) simde_vcltzq_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vcltzq_f32(simde_float32x4_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla.h b/lib/simd_wrapper/simde/arm/neon/cmla.h
index 559e607032c..68b9a0065c8 100644
--- a/lib/simd_wrapper/simde/arm/neon/cmla.h
+++ b/lib/simd_wrapper/simde/arm/neon/cmla.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Atharva Nimbalkar
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CMLA_H)
@@ -33,12 +34,47 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
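+/* vcmla (rotation 0): complex fused multiply-accumulate of the real part
+ * of a. For each complex lane pair:
+ *   r.real += a.real * b.real
+ *   r.imag += a.real * b.imag */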
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcmla_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmla_f16(r, a, b);
+ #else
+ simde_float16x4_private
+ r_ = simde_float16x4_to_private(r),
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0]) / 2) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) +
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) +
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_f16
+ #define vcmla_f16(r, a, b) simde_vcmla_f16(r, a, b)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmla_f32(r, a, b);
#else
simde_float32x2_private
@@ -64,12 +100,47 @@ simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
#define vcmla_f32(r, a, b) simde_vcmla_f32(r, a, b)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcmlaq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmlaq_f16(r, a, b);
+ #else
+ simde_float16x8_private
+ r_ = simde_float16x8_to_private(r),
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0]) / 2) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) +
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) +
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_f16
+ #define vcmlaq_f16(r, a, b) simde_vcmlaq_f16(r, a, b)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_f32(r, a, b);
#else
simde_float32x4_private
@@ -77,7 +148,9 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b)
a_ = simde_float32x4_to_private(a),
b_ = simde_float32x4_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
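+ /* On WebAssembly, duplicate the even (real) lanes of a with an i32x4
+ * shuffle, multiply by b, and accumulate into r. */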
+ r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, wasm_i32x4_shuffle(a_.v128, a_.v128, 0, 0, 2, 2)));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
r_.values += b_.values * a_.values;
#else
@@ -100,7 +173,8 @@ simde_float64x2_t
simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_f64(r, a, b);
#else
simde_float64x2_private
@@ -108,7 +182,9 @@ simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b)
a_ = simde_float64x2_to_private(a),
b_ = simde_float64x2_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
+ r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, wasm_i64x2_shuffle(a_.v128, a_.v128, 0, 0)));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 0);
r_.values += b_.values * a_.values;
#else
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_lane.h
new file mode 100644
index 00000000000..4355bf7a5f7
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_lane.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Chi-Wei Chu
+ */
+
+#if !defined(SIMDE_ARM_NEON_CMLA_LANE_H)
+#define SIMDE_ARM_NEON_CMLA_LANE_H
+
+#include "add.h"
+#include "combine.h"
+#include "cvt.h"
+#include "dup_lane.h"
+#include "get_high.h"
+#include "get_low.h"
+#include "mul.h"
+#include "types.h"
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
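+/* vcmla_lane (rotation 0): like vcmla, but b is first replaced by a
+ * broadcast of its selected lane. The f16 variants widen to float32 with
+ * simde_vcvt_f32_f16, accumulate there, and narrow back. When a suitable
+ * native compiler and ISA are detected, the simde_* names are redefined
+ * to the native intrinsics in the blocks that follow each function. */
+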
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[lane] * a_.values[2 * i];
+ r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_lane_f16
+ #define vcmla_lane_f16(r, a, b, lane) simde_vcmla_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_lane_f16(r, a, b, lane) vcmla_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[lane] * a_.values[2 * i];
+ r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_lane_f32
+ #define vcmla_lane_f32(r, a, b, lane) simde_vcmla_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_lane_f32(r, a, b, lane) vcmla_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[lane] * a_.values[2 * i];
+ r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_laneq_f16
+ #define vcmla_laneq_f16(r, a, b, lane) simde_vcmla_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_laneq_f16(r, a, b, lane) vcmla_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[lane] * a_.values[2 * i];
+ r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_laneq_f32
+ #define vcmla_laneq_f32(r, a, b, lane) simde_vcmla_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_laneq_f32(r, a, b, lane) vcmla_laneq_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2);
+ a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2);
+ r_low.values += b_.values * a_low.values;
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i];
+ r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i];
+ r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i];
+ r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_lane_f16
+ #define vcmlaq_lane_f16(r, a, b, lane) simde_vcmlaq_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_lane_f16(r, a, b, lane) vcmlaq_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[lane] * a_.values[2 * i];
+ r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_lane_f32
+ #define vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_lane_f32(r, a, b, lane) vcmlaq_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2);
+ r_low.values += b_.values * a_low.values;
+ a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2);
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i];
+ r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i];
+ r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i];
+ r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_laneq_f16
+ #define vcmlaq_laneq_f16(r, a, b, lane) simde_vcmlaq_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_laneq_f16(r, a, b, lane) vcmlaq_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane]));
+
+ #if defined(SIMDE_SHUFFLE_VECTOR_)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[lane] * a_.values[2 * i];
+ r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_laneq_f32
+ #define vcmlaq_laneq_f32(r, a, b, lane) simde_vcmlaq_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_laneq_f32(r, a, b, lane) vcmlaq_laneq_f32(r, a, b, lane)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CMLA_LANE_H) */
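Note (illustration only, not part of the patch): the cmla family treats each
even/odd element pair as one complex number (real, imaginary), and each
rotation variant accumulates a quarter-turn-rotated product. A minimal usage
sketch in C, assuming the usual "simde/arm/neon.h" umbrella include; the
portable simde_* spellings match the headers above:

    #include "simde/arm/neon.h"

    /* Full complex multiply-accumulate r += a * b, composed from the rot0
     * and rot90 primitives -- the standard AArch64 FCMLA pairing. */
    static simde_float32x4_t complex_mla_f32(simde_float32x4_t r,
                                             simde_float32x4_t a,
                                             simde_float32x4_t b) {
      r = simde_vcmlaq_f32(r, a, b);       /* re += a.re*b.re; im += a.re*b.im */
      r = simde_vcmlaq_rot90_f32(r, a, b); /* re -= a.im*b.im; im += a.im*b.re */
      return r;
    }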
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h
index 5a5fa3f85a2..44cf283121c 100644
--- a/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Atharva Nimbalkar
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CMLA_ROT180_H)
@@ -33,12 +34,82 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcmla_rot180_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmla_rot180_f16(r, a, b);
+ #else
+ simde_float16x4_private
+ r_ = simde_float16x4_to_private(r),
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) -
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) -
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot180_f16
+ #define vcmla_rot180_f16(r, a, b) simde_vcmla_rot180_f16(r, a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmlaq_rot180_f16(r, a, b);
+ #else
+ simde_float16x8_private
+ r_ = simde_float16x8_to_private(r),
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) -
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) -
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i]));
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot180_f16
+ #define vcmlaq_rot180_f16(r, a, b) simde_vcmlaq_rot180_f16(r, a, b)
+#endif
+
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmla_rot180_f32(r, a, b);
#else
simde_float32x2_private
@@ -71,7 +142,8 @@ simde_float32x4_t
simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_rot180_f32(r, a, b);
#else
simde_float32x4_private
@@ -79,7 +151,11 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x
a_ = simde_float32x4_to_private(a),
b_ = simde_float32x4_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
+ a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 0, 0, 2, 2);
+ b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), wasm_f32x4_neg(b_.v128), 0, 1, 2, 3);
+ r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, -b_.values, 0, 1, 2, 3);
r_.values += b_.values * a_.values;
@@ -104,7 +180,8 @@ simde_float64x2_t
simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_rot180_f64(r, a, b);
#else
simde_float64x2_private
@@ -112,7 +189,11 @@ simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x
a_ = simde_float64x2_to_private(a),
b_ = simde_float64x2_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
+ a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 0, 0);
+ b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), wasm_f64x2_neg(b_.v128), 0, 1);
+ r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 0);
b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, -b_.values, 0, 1);
r_.values += b_.values * a_.values;
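Note (illustration only, not part of the patch): per the scalar fallbacks
above, the rot180 variants accumulate -a.re * b into r for each complex pair.
A worked example on one pair stored as {re, im}:

    /* a = 1+2i, b = 3+4i; vcmla_rot180 contributes -a.re*b = -3-4i. */
    static void rot180_pair_example(void) {
      float r[2] = {0.0f, 0.0f}, a[2] = {1.0f, 2.0f}, b[2] = {3.0f, 4.0f};
      r[0] += -(b[0]) * a[0];   /* r[0] == -3.0f */
      r[1] += -(b[1]) * a[0];   /* r[1] == -4.0f */
      (void) r;
    }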
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot180_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot180_lane.h
new file mode 100644
index 00000000000..d7222591786
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot180_lane.h
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Chi-Wei Chu
+ */
+
+#if !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H)
+#define SIMDE_ARM_NEON_CMLA_ROT180_LANE_H
+
+#include "add.h"
+#include "combine.h"
+#include "cvt.h"
+#include "dup_lane.h"
+#include "get_high.h"
+#include "get_low.h"
+#include "mul.h"
+#include "types.h"
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
+ r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot180_lane_f16
+ #define vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot180_lane_f16(r, a, b, lane) vcmla_rot180_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_rot180_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
+ r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot180_lane_f32
+ #define vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot180_lane_f32(r, a, b, lane) vcmla_rot180_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2);
+ a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3);
+ r_low.values += b_.values * a_low.values;
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i];
+ r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i];
+ r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i];
+ r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot180_lane_f16
+ #define vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot180_lane_f16(r, a, b, lane) vcmlaq_rot180_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_rot180_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
+ r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot180_lane_f32
+ #define vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot180_lane_f32(r, a, b, lane) vcmlaq_rot180_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
+ r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot180_laneq_f16
+ #define vcmla_rot180_laneq_f16(r, a, b, lane) simde_vcmla_rot180_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot180_laneq_f16(r, a, b, lane) vcmla_rot180_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_rot180_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
+ r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot180_laneq_f32
+ #define vcmla_rot180_laneq_f32(r, a, b, lane) simde_vcmla_rot180_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot180_laneq_f32(r, a, b, lane) vcmla_rot180_laneq_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b,
+ const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 0, 0, 2, 2);
+ a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 0, 0, 2, 2);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3);
+ r_low.values += b_.values * a_low.values;
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i];
+ r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i];
+ r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i];
+ r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot180_laneq_f16
+ #define vcmlaq_rot180_laneq_f16(r, a, b, lane) simde_vcmlaq_rot180_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) vcmlaq_rot180_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_rot180_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b,
+ const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i];
+ r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot180_laneq_f32
+ #define vcmlaq_rot180_laneq_f32(r, a, b, lane) simde_vcmlaq_rot180_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) vcmlaq_rot180_laneq_f32(r, a, b, lane)
+#endif
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h
index cb9835c1fe5..530a30ae95d 100644
--- a/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Atharva Nimbalkar
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CMLA_ROT270_H)
@@ -33,12 +34,81 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcmla_rot270_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmla_rot270_f16(r, a, b);
+ #else
+ simde_float16x4_private
+ r_ = simde_float16x4_to_private(r),
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) +
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) -
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot270_f16
+ #define vcmla_rot270_f16(r, a, b) simde_vcmla_rot270_f16(r, a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcmlaq_rot270_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmlaq_rot270_f16(r, a, b);
+ #else
+ simde_float16x8_private
+ r_ = simde_float16x8_to_private(r),
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) +
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) -
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot270_f16
+ #define vcmlaq_rot270_f16(r, a, b) simde_vcmlaq_rot270_f16(r, a, b)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vcmla_rot270_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmla_rot270_f32(r, a, b);
#else
simde_float32x2_private
@@ -71,7 +141,8 @@ simde_float32x4_t
simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_rot270_f32(r, a, b);
#else
simde_float32x4_private
@@ -79,7 +150,11 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x
a_ = simde_float32x4_to_private(a),
b_ = simde_float32x4_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
+ a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 1, 1, 3, 3);
+ b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), b_.v128, 5, 0, 7, 2);
+ r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
r_.values += b_.values * a_.values;
@@ -104,7 +179,8 @@ simde_float64x2_t
simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_rot270_f64(r, a, b);
#else
simde_float64x2_private
@@ -112,7 +188,11 @@ simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x
a_ = simde_float64x2_to_private(a),
b_ = simde_float64x2_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
+ a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 1, 1);
+ b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), b_.v128, 3, 0);
+ r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 1, 1);
b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0);
r_.values += b_.values * a_.values;
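Note (illustration only, not part of the patch): pairing rot0 with rot270
yields the conjugate multiply-accumulate r += conj(a) * b, the usual
FCMLA #0/#270 idiom; this follows directly from the scalar fallbacks above.
A minimal sketch, with the same include as the earlier example:

    /* re += a.re*b.re + a.im*b.im;  im += a.re*b.im - a.im*b.re */
    static simde_float32x4_t conj_mla_f32(simde_float32x4_t r,
                                          simde_float32x4_t a,
                                          simde_float32x4_t b) {
      r = simde_vcmlaq_f32(r, a, b);
      return simde_vcmlaq_rot270_f32(r, a, b);
    }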
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot270_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot270_lane.h
new file mode 100644
index 00000000000..d8d64dd388e
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot270_lane.h
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Chi-Wei Chu
+ */
+
+#if !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H)
+#define SIMDE_ARM_NEON_CMLA_ROT270_LANE_H
+
+#include "add.h"
+#include "combine.h"
+#include "cvt.h"
+#include "dup_lane.h"
+#include "get_high.h"
+#include "get_low.h"
+#include "mul.h"
+#include "types.h"
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot270_lane_f16
+ #define vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot270_lane_f16(r, a, b, lane) vcmla_rot270_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_rot270_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot270_lane_f32
+ #define vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot270_lane_f32(r, a, b, lane) vcmla_rot270_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3);
+ a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_low.values += b_.values * a_low.values;
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1];
+ r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1];
+ r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1];
+ r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot270_lane_f16
+ #define vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot270_lane_f16(r, a, b, lane) vcmlaq_rot270_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot270_lane_f32
+ #define vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot270_lane_f32(r, a, b, lane) vcmlaq_rot270_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot270_laneq_f16
+ #define vcmla_rot270_laneq_f16(r, a, b, lane) simde_vcmla_rot270_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot270_laneq_f16(r, a, b, lane) vcmla_rot270_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_rot270_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot270_laneq_f32
+ #define vcmla_rot270_laneq_f32(r, a, b, lane) simde_vcmla_rot270_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot270_laneq_f32(r, a, b, lane) vcmla_rot270_laneq_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b,
+ const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+ a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3);
+ a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_high.values += b_.values * a_high.values;
+ r_low.values += b_.values * a_low.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1];
+ r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1];
+ r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1];
+ r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot270_laneq_f16
+ #define vcmlaq_rot270_laneq_f16(r, a, b, lane) simde_vcmlaq_rot270_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) vcmlaq_rot270_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b,
+ const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot270_laneq_f32
+ #define vcmlaq_rot270_laneq_f32(r, a, b, lane) simde_vcmlaq_rot270_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) vcmlaq_rot270_laneq_f32(r, a, b, lane)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H) */
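Note (illustration only, not part of the patch): in the new *_lane headers the
_lane suffix indexes a 64-bit b vector while _laneq indexes a 128-bit one; both
broadcast the selected element of b before the rotated multiply-accumulate.
Hypothetical call shapes under the same include, assuming r2/a2/b2 are
simde_float32x2_t values and b4 is a simde_float32x4_t:

    static simde_float32x2_t lane_shapes(simde_float32x2_t r2, simde_float32x2_t a2,
                                         simde_float32x2_t b2, simde_float32x4_t b4) {
      r2 = simde_vcmla_rot270_lane_f32(r2, a2, b2, 0);    /* lane must be 0 here */
      return simde_vcmla_rot270_laneq_f32(r2, a2, b4, 1); /* lane in 0..1 */
    }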
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h
index f4ebd13df19..d16a09b20dd 100644
--- a/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Atharva Nimbalkar
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CMLA_ROT90_H)
@@ -33,12 +34,81 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcmla_rot90_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmla_rot90_f16(r, a, b);
+ #else
+ simde_float16x4_private
+ r_ = simde_float16x4_to_private(r),
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) -
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) +
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot90_f16
+ #define vcmla_rot90_f16(r, a, b) simde_vcmla_rot90_f16(r, a, b)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcmlaq_rot90_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX)
+ return vcmlaq_rot90_f16(r, a, b);
+ #else
+ simde_float16x8_private
+ r_ = simde_float16x8_to_private(r),
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) {
+ r_.values[2 * i] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i]) -
+ simde_float16_to_float32(b_.values[2 * i + 1]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ r_.values[2 * i + 1] = simde_float16_from_float32(
+ simde_float16_to_float32(r_.values[2 * i + 1]) +
+ simde_float16_to_float32(b_.values[2 * i]) *
+ simde_float16_to_float32(a_.values[2 * i + 1]));
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot90_f16
+ #define vcmlaq_rot90_f16(r, a, b) simde_vcmlaq_rot90_f16(r, a, b)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vcmla_rot90_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmla_rot90_f32(r, a, b);
#else
simde_float32x2_private
@@ -71,7 +141,8 @@ simde_float32x4_t
simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_rot90_f32(r, a, b);
#else
simde_float32x4_private
@@ -79,7 +150,11 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4
a_ = simde_float32x4_to_private(a),
b_ = simde_float32x4_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
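+      /* One shuffle builds { -b1, b0, -b3, b2 } from the negated and
+       * original copies of b (lane indices 4-7 select from the second
+       * operand), so the rot90 update is a single multiply and add. */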
+ a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 1, 1, 3, 3);
+ b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), b_.v128, 1, 4, 3, 6);
+ r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
r_.values += b_.values * a_.values;
@@ -104,7 +179,8 @@ simde_float64x2_t
simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0))
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \
+ defined(SIMDE_ARCH_ARM_COMPLEX)
return vcmlaq_rot90_f64(r, a, b);
#else
simde_float64x2_private
@@ -112,7 +188,11 @@ simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2
a_ = simde_float64x2_to_private(a),
b_ = simde_float64x2_to_private(b);
- #if defined(SIMDE_SHUFFLE_VECTOR_)
+ #if defined(SIMDE_WASM_SIMD128_NATIVE)
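+      /* Two-element analogue: { -b1, b0 } is shuffled out of (-b, b)
+       * with indices 1 and 2. */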
+ a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 1, 1);
+ b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), b_.v128, 1, 2);
+ r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128));
+ #elif defined(SIMDE_SHUFFLE_VECTOR_)
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 1, 1);
b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2);
r_.values += b_.values * a_.values;
diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot90_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot90_lane.h
new file mode 100644
index 00000000000..45df8c0ed48
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot90_lane.h
@@ -0,0 +1,311 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Chi-Wei Chu
+ */
+
+#if !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H)
+#define SIMDE_ARM_NEON_CMLA_ROT90_LANE_H
+
+#include "add.h"
+#include "combine.h"
+#include "cvt.h"
+#include "dup_lane.h"
+#include "get_high.h"
+#include "get_low.h"
+#include "mul.h"
+#include "types.h"
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
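+    /* Portable f16 path: widen everything to float32, do the rotated
+     * multiply-accumulate in f32, and narrow back to f16 on return. */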
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+        a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+        b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot90_lane_f16
+ #define vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_lane_f16(r, a, b, lane)
+#endif
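+/* When the compiler provides the Armv8.3 complex-arithmetic intrinsics
+ * (and is not an excluded GCC/clang release), remap the simde_ name to
+ * the native intrinsic, bypassing the portable definition above. */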
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot90_lane_f16(r, a, b, lane) vcmla_rot90_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_rot90_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot90_lane_f32
+ #define vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot90_lane_f32(r, a, b, lane) vcmla_rot90_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+    SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)),
+ a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+        a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+        b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcvt_f16_f32(simde_float32x4_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot90_laneq_f16
+ #define vcmla_rot90_laneq_f16(r, a, b, lane) simde_vcmla_rot90_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot90_laneq_f16(r, a, b, lane) vcmla_rot90_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t simde_vcmla_rot90_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane]));
+
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmla_rot90_laneq_f32
+ #define vcmla_rot90_laneq_f32(r, a, b, lane) simde_vcmla_rot90_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmla_rot90_laneq_f32(r, a, b, lane) vcmla_rot90_laneq_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane])));
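+    /* The eight f16 elements are handled as two widened f32x4 halves
+     * that share the broadcast b value. */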
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+        a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3);
+        a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3);
+        b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_low.values += b_.values * a_low.values;
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1];
+ r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1];
+ r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1];
+ r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot90_lane_f16
+ #define vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_lane_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) vcmlaq_rot90_lane_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_rot90_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot90_lane_f32
+ #define vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_lane_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) vcmlaq_rot90_lane_f32(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3)
+{
+ simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))),
+ a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))),
+ r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))),
+ a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))),
+ b_ = simde_float32x4_to_private(
+ simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane])));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && \
+ ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16))
+        a_low.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_low.values, a_low.values, 1, 1, 3, 3);
+        a_high.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_high.values, a_high.values, 1, 1, 3, 3);
+        b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_low.values += b_.values * a_low.values;
+ r_high.values += b_.values * a_high.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++)
+ {
+ r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1];
+ r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1];
+ r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1];
+ r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1];
+ }
+ #endif
+ return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)),
+ simde_vcvt_f16_f32(simde_float32x4_from_private(r_high)));
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot90_laneq_f16
+ #define vcmlaq_rot90_laneq_f16(r, a, b, lane) simde_vcmlaq_rot90_laneq_f16(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) vcmlaq_rot90_laneq_f16(r, a, b, lane)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t simde_vcmlaq_rot90_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1)
+{
+ simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane]));
+ #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760)
+ a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3);
+ b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6);
+ r_.values += b_.values * a_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++)
+ {
+ r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1];
+ r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1];
+ }
+ #endif
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcmlaq_rot90_laneq_f32
+ #define vcmlaq_rot90_laneq_f32(r, a, b, lane) simde_vcmlaq_rot90_laneq_f32(r, a, b, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0))
+ #define simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) vcmlaq_rot90_laneq_f32(r, a, b, lane)
+#endif
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/cnt.h b/lib/simd_wrapper/simde/arm/neon/cnt.h
index e1fda38e758..9169f7e24ec 100644
--- a/lib/simd_wrapper/simde/arm/neon/cnt.h
+++ b/lib/simd_wrapper/simde/arm/neon/cnt.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CNT_H)
@@ -164,6 +165,34 @@ simde_vcntq_u8(simde_uint8x16_t a) {
#define vcntq_u8(a) simde_vcntq_u8((a))
#endif
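+
+/* Population count is purely bit-wise, so the polynomial variants simply
+ * reinterpret p8 as s8 and reuse the existing vcnt_s8/vcntq_s8 paths. */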
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vcnt_p8(simde_poly8x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcnt_p8(a);
+ #else
+ return simde_vreinterpret_p8_s8(simde_vcnt_s8(simde_vreinterpret_s8_p8(a)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcnt_p8
+ #define vcnt_p8(a) simde_vcnt_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vcntq_p8(simde_poly8x16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcntq_p8(a);
+ #else
+ return simde_vreinterpretq_p8_s8(simde_vcntq_s8(simde_vreinterpretq_s8_p8(a)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcntq_p8
+ #define vcntq_p8(a) simde_vcntq_p8((a))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/combine.h b/lib/simd_wrapper/simde/arm/neon/combine.h
index 66c1df646cb..1a92187846b 100644
--- a/lib/simd_wrapper/simde/arm/neon/combine.h
+++ b/lib/simd_wrapper/simde/arm/neon/combine.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_COMBINE_H)
@@ -34,6 +35,32 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
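+
+/* Every portable vcombine fallback below follows the same scheme: result
+ * element i takes low[i] for i < n/2 and high[i - n/2] otherwise. */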
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcombine_f16(simde_float16x4_t low, simde_float16x4_t high) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcombine_f16(low, high);
+ #else
+ simde_float16x8_private r_;
+ simde_float16x4_private
+ low_ = simde_float16x4_to_private(low),
+ high_ = simde_float16x4_to_private(high);
+
+ size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2;
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < halfway ; i++) {
+ r_.values[i] = low_.values[i];
+ r_.values[i + halfway] = high_.values[i];
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcombine_f16
+ #define vcombine_f16(low, high) simde_vcombine_f16((low), (high))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vcombine_f32(simde_float32x2_t low, simde_float32x2_t high) {
@@ -337,6 +364,110 @@ simde_vcombine_u64(simde_uint64x1_t low, simde_uint64x1_t high) {
#define vcombine_u64(low, high) simde_vcombine_u64((low), (high))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vcombine_p8(simde_poly8x8_t low, simde_poly8x8_t high) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcombine_p8(low, high);
+ #else
+ simde_poly8x16_private r_;
+ simde_poly8x8_private
+ low_ = simde_poly8x8_to_private(low),
+ high_ = simde_poly8x8_to_private(high);
+
+ size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2;
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < halfway ; i++) {
+ r_.values[i] = low_.values[i];
+ r_.values[i + halfway] = high_.values[i];
+ }
+
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcombine_p8
+ #define vcombine_p8(low, high) simde_vcombine_p8((low), (high))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vcombine_p16(simde_poly16x4_t low, simde_poly16x4_t high) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcombine_p16(low, high);
+ #else
+ simde_poly16x8_private r_;
+ simde_poly16x4_private
+ low_ = simde_poly16x4_to_private(low),
+ high_ = simde_poly16x4_to_private(high);
+
+ size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2;
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < halfway ; i++) {
+ r_.values[i] = low_.values[i];
+ r_.values[i + halfway] = high_.values[i];
+ }
+
+ return simde_poly16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcombine_p16
+ #define vcombine_p16(low, high) simde_vcombine_p16((low), (high))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vcombine_p64(simde_poly64x1_t low, simde_poly64x1_t high) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vcombine_p64(low, high);
+ #else
+ simde_poly64x2_private r_;
+ simde_poly64x1_private
+ low_ = simde_poly64x1_to_private(low),
+ high_ = simde_poly64x1_to_private(high);
+
+ size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2;
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < halfway ; i++) {
+ r_.values[i] = low_.values[i];
+ r_.values[i + halfway] = high_.values[i];
+ }
+
+ return simde_poly64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcombine_p64
+ #define vcombine_p64(low, high) simde_vcombine_p64((low), (high))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vcombine_bf16(simde_bfloat16x4_t low, simde_bfloat16x4_t high) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcombine_bf16(low, high);
+ #else
+ simde_bfloat16x8_private r_;
+ simde_bfloat16x4_private
+ low_ = simde_bfloat16x4_to_private(low),
+ high_ = simde_bfloat16x4_to_private(high);
+
+ size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2;
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < halfway ; i++) {
+ r_.values[i] = low_.values[i];
+ r_.values[i + halfway] = high_.values[i];
+ }
+
+ return simde_bfloat16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcombine_bf16
+ #define vcombine_bf16(low, high) simde_vcombine_bf16((low), (high))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/copy_lane.h b/lib/simd_wrapper/simde/arm/neon/copy_lane.h
new file mode 100644
index 00000000000..7195c8076fb
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/copy_lane.h
@@ -0,0 +1,1184 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_COPY_LANE_H)
+#define SIMDE_ARM_NEON_COPY_LANE_H
+
+#include "types.h"
+#include "cvt.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
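+/* Each vcopy[q]_lane[q] variant replaces element lane1 of a with element
+ * lane2 of b and leaves the rest of a untouched. The portable fallback is
+ * a single scalar store through the private representation; the native
+ * AArch64 intrinsic is substituted by macro where it is available. */
+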
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8_t
+simde_vcopy_lane_s8(simde_int8x8_t a, const int lane1, simde_int8x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_int8x8_private
+ b_ = simde_int8x8_to_private(b),
+ r_ = simde_int8x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int8x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_s8(a, lane1, b, lane2) vcopy_lane_s8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_s8
+ #define vcopy_lane_s8(a, lane1, b, lane2) simde_vcopy_lane_s8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4_t
+simde_vcopy_lane_s16(simde_int16x4_t a, const int lane1, simde_int16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_int16x4_private
+ b_ = simde_int16x4_to_private(b),
+ r_ = simde_int16x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_s16(a, lane1, b, lane2) vcopy_lane_s16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_s16
+ #define vcopy_lane_s16(a, lane1, b, lane2) simde_vcopy_lane_s16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2_t
+simde_vcopy_lane_s32(simde_int32x2_t a, const int lane1, simde_int32x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_int32x2_private
+ b_ = simde_int32x2_to_private(b),
+ r_ = simde_int32x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_s32(a, lane1, b, lane2) vcopy_lane_s32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_s32
+ #define vcopy_lane_s32(a, lane1, b, lane2) simde_vcopy_lane_s32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1_t
+simde_vcopy_lane_s64(simde_int64x1_t a, const int lane1, simde_int64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_int64x1_private
+ b_ = simde_int64x1_to_private(b),
+ r_ = simde_int64x1_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_s64(a, lane1, b, lane2) vcopy_lane_s64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_s64
+ #define vcopy_lane_s64(a, lane1, b, lane2) simde_vcopy_lane_s64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8_t
+simde_vcopy_lane_u8(simde_uint8x8_t a, const int lane1, simde_uint8x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_uint8x8_private
+ b_ = simde_uint8x8_to_private(b),
+ r_ = simde_uint8x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint8x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_u8(a, lane1, b, lane2) vcopy_lane_u8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_u8
+ #define vcopy_lane_u8(a, lane1, b, lane2) simde_vcopy_lane_u8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcopy_lane_u16(simde_uint16x4_t a, const int lane1, simde_uint16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_uint16x4_private
+ b_ = simde_uint16x4_to_private(b),
+ r_ = simde_uint16x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_u16(a, lane1, b, lane2) vcopy_lane_u16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_u16
+ #define vcopy_lane_u16(a, lane1, b, lane2) simde_vcopy_lane_u16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcopy_lane_u32(simde_uint32x2_t a, const int lane1, simde_uint32x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_uint32x2_private
+ b_ = simde_uint32x2_to_private(b),
+ r_ = simde_uint32x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_u32(a, lane1, b, lane2) vcopy_lane_u32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_u32
+ #define vcopy_lane_u32(a, lane1, b, lane2) simde_vcopy_lane_u32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcopy_lane_u64(simde_uint64x1_t a, const int lane1, simde_uint64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_uint64x1_private
+ b_ = simde_uint64x1_to_private(b),
+ r_ = simde_uint64x1_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_u64(a, lane1, b, lane2) vcopy_lane_u64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_u64
+ #define vcopy_lane_u64(a, lane1, b, lane2) simde_vcopy_lane_u64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vcopy_lane_f32(simde_float32x2_t a, const int lane1, simde_float32x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_float32x2_private
+ b_ = simde_float32x2_to_private(b),
+ r_ = simde_float32x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_f32(a, lane1, b, lane2) vcopy_lane_f32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_f32
+ #define vcopy_lane_f32(a, lane1, b, lane2) simde_vcopy_lane_f32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vcopy_lane_f64(simde_float64x1_t a, const int lane1, simde_float64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_float64x1_private
+ b_ = simde_float64x1_to_private(b),
+ r_ = simde_float64x1_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_lane_f64(a, lane1, b, lane2) vcopy_lane_f64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_f64
+ #define vcopy_lane_f64(a, lane1, b, lane2) simde_vcopy_lane_f64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8_t
+simde_vcopy_laneq_s8(simde_int8x8_t a, const int lane1, simde_int8x16_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) {
+ simde_int8x8_private
+ r_ = simde_int8x8_to_private(a);
+ simde_int8x16_private
+ b_ = simde_int8x16_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int8x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_s8(a, lane1, b, lane2) vcopy_laneq_s8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_s8
+ #define vcopy_laneq_s8(a, lane1, b, lane2) simde_vcopy_laneq_s8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4_t
+simde_vcopy_laneq_s16(simde_int16x4_t a, const int lane1, simde_int16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_int16x4_private
+ r_ = simde_int16x4_to_private(a);
+ simde_int16x8_private
+ b_ = simde_int16x8_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_s16(a, lane1, b, lane2) vcopy_laneq_s16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_s16
+ #define vcopy_laneq_s16(a, lane1, b, lane2) simde_vcopy_laneq_s16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2_t
+simde_vcopy_laneq_s32(simde_int32x2_t a, const int lane1, simde_int32x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_int32x2_private
+ r_ = simde_int32x2_to_private(a);
+ simde_int32x4_private
+ b_ = simde_int32x4_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_s32(a, lane1, b, lane2) vcopy_laneq_s32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_s32
+ #define vcopy_laneq_s32(a, lane1, b, lane2) simde_vcopy_laneq_s32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1_t
+simde_vcopy_laneq_s64(simde_int64x1_t a, const int lane1, simde_int64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_int64x1_private
+ r_ = simde_int64x1_to_private(a);
+ simde_int64x2_private
+ b_ = simde_int64x2_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_s64(a, lane1, b, lane2) vcopy_laneq_s64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_s64
+ #define vcopy_laneq_s64(a, lane1, b, lane2) simde_vcopy_laneq_s64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8_t
+simde_vcopy_laneq_u8(simde_uint8x8_t a, const int lane1, simde_uint8x16_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) {
+ simde_uint8x8_private
+ r_ = simde_uint8x8_to_private(a);
+ simde_uint8x16_private
+ b_ = simde_uint8x16_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint8x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_u8(a, lane1, b, lane2) vcopy_laneq_u8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_u8
+ #define vcopy_laneq_u8(a, lane1, b, lane2) simde_vcopy_laneq_u8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcopy_laneq_u16(simde_uint16x4_t a, const int lane1, simde_uint16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_uint16x4_private
+ r_ = simde_uint16x4_to_private(a);
+ simde_uint16x8_private
+ b_ = simde_uint16x8_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_u16(a, lane1, b, lane2) vcopy_laneq_u16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_u16
+ #define vcopy_laneq_u16(a, lane1, b, lane2) simde_vcopy_laneq_u16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcopy_laneq_u32(simde_uint32x2_t a, const int lane1, simde_uint32x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_uint32x2_private
+ r_ = simde_uint32x2_to_private(a);
+ simde_uint32x4_private
+ b_ = simde_uint32x4_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_u32(a, lane1, b, lane2) vcopy_laneq_u32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_u32
+ #define vcopy_laneq_u32(a, lane1, b, lane2) simde_vcopy_laneq_u32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcopy_laneq_u64(simde_uint64x1_t a, const int lane1, simde_uint64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_uint64x1_private
+ r_ = simde_uint64x1_to_private(a);
+ simde_uint64x2_private
+ b_ = simde_uint64x2_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_u64(a, lane1, b, lane2) vcopy_laneq_u64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_u64
+ #define vcopy_laneq_u64(a, lane1, b, lane2) simde_vcopy_laneq_u64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vcopy_laneq_f32(simde_float32x2_t a, const int lane1, simde_float32x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_float32x2_private
+ r_ = simde_float32x2_to_private(a);
+ simde_float32x4_private
+ b_ = simde_float32x4_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_f32(a, lane1, b, lane2) vcopy_laneq_f32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_f32
+ #define vcopy_laneq_f32(a, lane1, b, lane2) simde_vcopy_laneq_f32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vcopy_laneq_f64(simde_float64x1_t a, const int lane1, simde_float64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_float64x1_private
+ r_ = simde_float64x1_to_private(a);
+ simde_float64x2_private
+ b_ = simde_float64x2_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopy_laneq_f64(a, lane1, b, lane2) vcopy_laneq_f64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_f64
+ #define vcopy_laneq_f64(a, lane1, b, lane2) simde_vcopy_laneq_f64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16_t
+simde_vcopyq_lane_s8(simde_int8x16_t a, const int lane1, simde_int8x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_int8x8_private
+ b_ = simde_int8x8_to_private(b);
+ simde_int8x16_private
+ r_ = simde_int8x16_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int8x16_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_s8(a, lane1, b, lane2) vcopyq_lane_s8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_s8
+ #define vcopyq_lane_s8(a, lane1, b, lane2) simde_vcopyq_lane_s8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vcopyq_lane_s16(simde_int16x8_t a, const int lane1, simde_int16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_int16x4_private
+ b_ = simde_int16x4_to_private(b);
+ simde_int16x8_private
+ r_ = simde_int16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_s16(a, lane1, b, lane2) vcopyq_lane_s16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_s16
+ #define vcopyq_lane_s16(a, lane1, b, lane2) simde_vcopyq_lane_s16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vcopyq_lane_s32(simde_int32x4_t a, const int lane1, simde_int32x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_int32x2_private
+ b_ = simde_int32x2_to_private(b);
+ simde_int32x4_private
+ r_ = simde_int32x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_s32(a, lane1, b, lane2) vcopyq_lane_s32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_s32
+ #define vcopyq_lane_s32(a, lane1, b, lane2) simde_vcopyq_lane_s32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vcopyq_lane_s64(simde_int64x2_t a, const int lane1, simde_int64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_int64x1_private
+ b_ = simde_int64x1_to_private(b);
+ simde_int64x2_private
+ r_ = simde_int64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_s64(a, lane1, b, lane2) vcopyq_lane_s64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_s64
+ #define vcopyq_lane_s64(a, lane1, b, lane2) simde_vcopyq_lane_s64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vcopyq_lane_u8(simde_uint8x16_t a, const int lane1, simde_uint8x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_uint8x8_private
+ b_ = simde_uint8x8_to_private(b);
+ simde_uint8x16_private
+ r_ = simde_uint8x16_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint8x16_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_u8(a, lane1, b, lane2) vcopyq_lane_u8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_u8
+ #define vcopyq_lane_u8(a, lane1, b, lane2) simde_vcopyq_lane_u8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcopyq_lane_u16(simde_uint16x8_t a, const int lane1, simde_uint16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_uint16x4_private
+ b_ = simde_uint16x4_to_private(b);
+ simde_uint16x8_private
+ r_ = simde_uint16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_u16(a, lane1, b, lane2) vcopyq_lane_u16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_u16
+ #define vcopyq_lane_u16(a, lane1, b, lane2) simde_vcopyq_lane_u16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcopyq_lane_u32(simde_uint32x4_t a, const int lane1, simde_uint32x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_uint32x2_private
+ b_ = simde_uint32x2_to_private(b);
+ simde_uint32x4_private
+ r_ = simde_uint32x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_u32(a, lane1, b, lane2) vcopyq_lane_u32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_u32
+ #define vcopyq_lane_u32(a, lane1, b, lane2) simde_vcopyq_lane_u32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcopyq_lane_u64(simde_uint64x2_t a, const int lane1, simde_uint64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_uint64x1_private
+ b_ = simde_uint64x1_to_private(b);
+ simde_uint64x2_private
+ r_ = simde_uint64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_u64(a, lane1, b, lane2) vcopyq_lane_u64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_u64
+ #define vcopyq_lane_u64(a, lane1, b, lane2) simde_vcopyq_lane_u64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcopyq_lane_f32(simde_float32x4_t a, const int lane1, simde_float32x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_float32x2_private
+ b_ = simde_float32x2_to_private(b);
+ simde_float32x4_private
+ r_ = simde_float32x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_f32(a, lane1, b, lane2) vcopyq_lane_f32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_f32
+ #define vcopyq_lane_f32(a, lane1, b, lane2) simde_vcopyq_lane_f32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vcopyq_lane_f64(simde_float64x2_t a, const int lane1, simde_float64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_float64x1_private
+ b_ = simde_float64x1_to_private(b);
+ simde_float64x2_private
+ r_ = simde_float64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_lane_f64(a, lane1, b, lane2) vcopyq_lane_f64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_f64
+ #define vcopyq_lane_f64(a, lane1, b, lane2) simde_vcopyq_lane_f64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16_t
+simde_vcopyq_laneq_s8(simde_int8x16_t a, const int lane1, simde_int8x16_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) {
+ simde_int8x16_private
+ b_ = simde_int8x16_to_private(b),
+ r_ = simde_int8x16_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int8x16_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_s8(a, lane1, b, lane2) vcopyq_laneq_s8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_s8
+ #define vcopyq_laneq_s8(a, lane1, b, lane2) simde_vcopyq_laneq_s8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vcopyq_laneq_s16(simde_int16x8_t a, const int lane1, simde_int16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_int16x8_private
+ b_ = simde_int16x8_to_private(b),
+ r_ = simde_int16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_s16(a, lane1, b, lane2) vcopyq_laneq_s16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_s16
+ #define vcopyq_laneq_s16(a, lane1, b, lane2) simde_vcopyq_laneq_s16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vcopyq_laneq_s32(simde_int32x4_t a, const int lane1, simde_int32x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_int32x4_private
+ b_ = simde_int32x4_to_private(b),
+ r_ = simde_int32x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_s32(a, lane1, b, lane2) vcopyq_laneq_s32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_s32
+ #define vcopyq_laneq_s32(a, lane1, b, lane2) simde_vcopyq_laneq_s32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vcopyq_laneq_s64(simde_int64x2_t a, const int lane1, simde_int64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_int64x2_private
+ b_ = simde_int64x2_to_private(b),
+ r_ = simde_int64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_s64(a, lane1, b, lane2) vcopyq_laneq_s64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_s64
+ #define vcopyq_laneq_s64(a, lane1, b, lane2) simde_vcopyq_laneq_s64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vcopyq_laneq_u8(simde_uint8x16_t a, const int lane1, simde_uint8x16_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) {
+ simde_uint8x16_private
+ b_ = simde_uint8x16_to_private(b),
+ r_ = simde_uint8x16_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint8x16_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_u8(a, lane1, b, lane2) vcopyq_laneq_u8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_u8
+ #define vcopyq_laneq_u8(a, lane1, b, lane2) simde_vcopyq_laneq_u8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcopyq_laneq_u16(simde_uint16x8_t a, const int lane1, simde_uint16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_uint16x8_private
+ b_ = simde_uint16x8_to_private(b),
+ r_ = simde_uint16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_u16(a, lane1, b, lane2) vcopyq_laneq_u16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_u16
+ #define vcopyq_laneq_u16(a, lane1, b, lane2) simde_vcopyq_laneq_u16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcopyq_laneq_u32(simde_uint32x4_t a, const int lane1, simde_uint32x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_uint32x4_private
+ b_ = simde_uint32x4_to_private(b),
+ r_ = simde_uint32x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_u32(a, lane1, b, lane2) vcopyq_laneq_u32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_u32
+ #define vcopyq_laneq_u32(a, lane1, b, lane2) simde_vcopyq_laneq_u32((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcopyq_laneq_u64(simde_uint64x2_t a, const int lane1, simde_uint64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_uint64x2_private
+ b_ = simde_uint64x2_to_private(b),
+ r_ = simde_uint64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_uint64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_u64(a, lane1, b, lane2) vcopyq_laneq_u64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_u64
+ #define vcopyq_laneq_u64(a, lane1, b, lane2) simde_vcopyq_laneq_u64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcopyq_laneq_f32(simde_float32x4_t a, const int lane1, simde_float32x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_float32x4_private
+ b_ = simde_float32x4_to_private(b),
+ r_ = simde_float32x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_f32(a, lane1, b, lane2) vcopyq_laneq_f32((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_f32
+ #define vcopyq_laneq_f32(a, lane1, b, lane2) simde_vcopyq_laneq_f32((a), (lane1), (b), (lane2))
+#endif
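+
+/* Usage sketch (hypothetical vectors `a` and `b`): replace lane 0 of `a`
+ * with lane 3 of `b`; both lane arguments must be integer constant
+ * expressions:
+ *
+ *   simde_float32x4_t r = simde_vcopyq_laneq_f32(a, 0, b, 3);
+ */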
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vcopyq_laneq_f64(simde_float64x2_t a, const int lane1, simde_float64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_float64x2_private
+ b_ = simde_float64x2_to_private(b),
+ r_ = simde_float64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_float64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcopyq_laneq_f64(a, lane1, b, lane2) vcopyq_laneq_f64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_f64
+ #define vcopyq_laneq_f64(a, lane1, b, lane2) simde_vcopyq_laneq_f64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vcopy_lane_p8(simde_poly8x8_t a, const int lane1, simde_poly8x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_poly8x8_private
+ b_ = simde_poly8x8_to_private(b),
+ r_ = simde_poly8x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly8x8_from_private(r_);
+}
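+/* When the clang bug tracked by SIMDE_BUG_CLANG_71362 is present, the
+ * native poly copy-lane intrinsics are skipped in favour of the portable
+ * path. */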
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopy_lane_p8(a, lane1, b, lane2) vcopy_lane_p8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_p8
+ #define vcopy_lane_p8(a, lane1, b, lane2) simde_vcopy_lane_p8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vcopy_lane_p16(simde_poly16x4_t a, const int lane1, simde_poly16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_poly16x4_private
+ b_ = simde_poly16x4_to_private(b),
+ r_ = simde_poly16x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopy_lane_p16(a, lane1, b, lane2) vcopy_lane_p16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_p16
+ #define vcopy_lane_p16(a, lane1, b, lane2) simde_vcopy_lane_p16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vcopy_lane_p64(simde_poly64x1_t a, const int lane1, simde_poly64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_poly64x1_private
+ b_ = simde_poly64x1_to_private(b),
+ r_ = simde_poly64x1_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopy_lane_p64(a, lane1, b, lane2) vcopy_lane_p64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_p64
+ #define vcopy_lane_p64(a, lane1, b, lane2) simde_vcopy_lane_p64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vcopy_laneq_p8(simde_poly8x8_t a, const int lane1, simde_poly8x16_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) {
+ simde_poly8x8_private
+ r_ = simde_poly8x8_to_private(a);
+ simde_poly8x16_private
+ b_ = simde_poly8x16_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly8x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopy_laneq_p8(a, lane1, b, lane2) vcopy_laneq_p8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_p8
+ #define vcopy_laneq_p8(a, lane1, b, lane2) simde_vcopy_laneq_p8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vcopy_laneq_p16(simde_poly16x4_t a, const int lane1, simde_poly16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_poly16x4_private
+ r_ = simde_poly16x4_to_private(a);
+ simde_poly16x8_private
+ b_ = simde_poly16x8_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopy_laneq_p16(a, lane1, b, lane2) vcopy_laneq_p16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_p16
+ #define vcopy_laneq_p16(a, lane1, b, lane2) simde_vcopy_laneq_p16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vcopy_laneq_p64(simde_poly64x1_t a, const int lane1, simde_poly64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_poly64x1_private
+ r_ = simde_poly64x1_to_private(a);
+ simde_poly64x2_private
+ b_ = simde_poly64x2_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopy_laneq_p64(a, lane1, b, lane2) vcopy_laneq_p64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_p64
+ #define vcopy_laneq_p64(a, lane1, b, lane2) simde_vcopy_laneq_p64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vcopyq_lane_p8(simde_poly8x16_t a, const int lane1, simde_poly8x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_poly8x8_private
+ b_ = simde_poly8x8_to_private(b);
+ simde_poly8x16_private
+ r_ = simde_poly8x16_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly8x16_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopyq_lane_p8(a, lane1, b, lane2) vcopyq_lane_p8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_p8
+ #define vcopyq_lane_p8(a, lane1, b, lane2) simde_vcopyq_lane_p8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vcopyq_lane_p16(simde_poly16x8_t a, const int lane1, simde_poly16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_poly16x4_private
+ b_ = simde_poly16x4_to_private(b);
+ simde_poly16x8_private
+ r_ = simde_poly16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopyq_lane_p16(a, lane1, b, lane2) vcopyq_lane_p16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_p16
+ #define vcopyq_lane_p16(a, lane1, b, lane2) simde_vcopyq_lane_p16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vcopyq_lane_p64(simde_poly64x2_t a, const int lane1, simde_poly64x1_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) {
+ simde_poly64x1_private
+ b_ = simde_poly64x1_to_private(b);
+ simde_poly64x2_private
+ r_ = simde_poly64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopyq_lane_p64(a, lane1, b, lane2) vcopyq_lane_p64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_p64
+ #define vcopyq_lane_p64(a, lane1, b, lane2) simde_vcopyq_lane_p64((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vcopyq_laneq_p8(simde_poly8x16_t a, const int lane1, simde_poly8x16_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) {
+ simde_poly8x16_private
+ b_ = simde_poly8x16_to_private(b),
+ r_ = simde_poly8x16_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly8x16_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopyq_laneq_p8(a, lane1, b, lane2) vcopyq_laneq_p8((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_p8
+ #define vcopyq_laneq_p8(a, lane1, b, lane2) simde_vcopyq_laneq_p8((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vcopyq_laneq_p16(simde_poly16x8_t a, const int lane1, simde_poly16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_poly16x8_private
+ b_ = simde_poly16x8_to_private(b),
+ r_ = simde_poly16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopyq_laneq_p16(a, lane1, b, lane2) vcopyq_laneq_p16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_p16
+ #define vcopyq_laneq_p16(a, lane1, b, lane2) simde_vcopyq_laneq_p16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vcopyq_laneq_p64(simde_poly64x2_t a, const int lane1, simde_poly64x2_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) {
+ simde_poly64x2_private
+ b_ = simde_poly64x2_to_private(b),
+ r_ = simde_poly64x2_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_poly64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vcopyq_laneq_p64(a, lane1, b, lane2) vcopyq_laneq_p64((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_p64
+ #define vcopyq_laneq_p64(a, lane1, b, lane2) simde_vcopyq_laneq_p64((a), (lane1), (b), (lane2))
+#endif
+
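+/* The bf16 copy-lane forms only have native implementations when the
+ * compiler exposes the NEON bfloat16 types, hence the additional
+ * SIMDE_ARM_NEON_BF16 check below. */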
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vcopy_lane_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_bfloat16x4_private
+ b_ = simde_bfloat16x4_to_private(b),
+ r_ = simde_bfloat16x4_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_bfloat16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vcopy_lane_bf16(a, lane1, b, lane2) vcopy_lane_bf16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_lane_bf16
+ #define vcopy_lane_bf16(a, lane1, b, lane2) simde_vcopy_lane_bf16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vcopy_laneq_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(a);
+ simde_bfloat16x8_private b_ = simde_bfloat16x8_to_private(b);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_bfloat16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vcopy_laneq_bf16(a, lane1, b, lane2) vcopy_laneq_bf16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopy_laneq_bf16
+ #define vcopy_laneq_bf16(a, lane1, b, lane2) simde_vcopy_laneq_bf16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vcopyq_lane_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x4_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) {
+ simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b);
+ simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_bfloat16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vcopyq_lane_bf16(a, lane1, b, lane2) vcopyq_lane_bf16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_lane_bf16
+ #define vcopyq_lane_bf16(a, lane1, b, lane2) simde_vcopyq_lane_bf16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vcopyq_laneq_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x8_t b, const int lane2)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) {
+ simde_bfloat16x8_private
+ b_ = simde_bfloat16x8_to_private(b),
+ r_ = simde_bfloat16x8_to_private(a);
+
+ r_.values[lane1] = b_.values[lane2];
+ return simde_bfloat16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vcopyq_laneq_bf16(a, lane1, b, lane2) vcopyq_laneq_bf16((a), (lane1), (b), (lane2))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcopyq_laneq_bf16
+ #define vcopyq_laneq_bf16(a, lane1, b, lane2) simde_vcopyq_laneq_bf16((a), (lane1), (b), (lane2))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* SIMDE_ARM_NEON_COPY_LANE_H */
diff --git a/lib/simd_wrapper/simde/arm/neon/crc32.h b/lib/simd_wrapper/simde/arm/neon/crc32.h
new file mode 100644
index 00000000000..1223190c189
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/crc32.h
@@ -0,0 +1,282 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CRC32_H)
+#define SIMDE_ARM_NEON_CRC32_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
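+/* Portable emulation of the ARMv8 CRC32 extension: accumulator and input
+ * are bit-reflected, XORed together, and reduced by schoolbook polynomial
+ * division (polynomial 0x04C11DB7 for __crc32*, Castagnoli 0x1EDC6F41 for
+ * __crc32c*), and the remainder is reflected back at the end. */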
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t simde_crc32_reverseBits(uint64_t num, int num_of_bits)
+{
+ uint64_t reverse_num = 0;
+ for (int i = 0; i < num_of_bits; i++) {
+ if (num & (1ULL << i))
+ reverse_num |= 1ULL << (num_of_bits - 1 - i);
+ }
+ return reverse_num;
+}
+
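+/* XOR `b` into the bits of `a` selected by `mask`; bits of `a` outside the
+ * mask are preserved (callers keep `b` within the mask). */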
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t simde_crc32_eor_mask(uint32_t a, uint32_t b, uint32_t mask) {
+ uint32_t part_a = a & mask;
+ uint32_t part_result = part_a ^ b;
+ uint32_t result = (a & ~mask) | part_result;
+ return result;
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32b(uint32_t a, uint8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32b(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 8) << 24));
+ uint32_t head = r_acc ^ r_val;
+ uint32_t tail = 0;
+ const uint32_t poly = 0x04C11DB7;
+ for(int i = 31; i >= 24; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ uint32_t result = ((head & 0x00FFFFFF) << 8) | ((tail & 0xFF000000) >> 24);
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32b
+ #define __crc32b(a, b) simde___crc32b((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32h(uint32_t a, uint16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32h(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 16) << 16));
+ uint32_t head = r_acc ^ r_val;
+ uint32_t tail = 0;
+ const uint32_t poly = 0x04C11DB7;
+ for(int i = 31; i >= 16; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ uint32_t result = ((head & 0x0000FFFF) << 16) | ((tail & 0xFFFF0000) >> 16);
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32h
+ #define __crc32h(a, b) simde___crc32h((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32w(uint32_t a, uint32_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32w(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 32));
+ uint32_t head = r_acc ^ r_val;
+ uint32_t tail = 0;
+ const uint32_t poly = 0x04C11DB7;
+ for(int i = 31; i >= 0; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32w
+ #define __crc32w(a, b) simde___crc32w((a), (b))
+#endif
+
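+/* The 64-bit variants split the reflected input into two 32-bit words: a
+ * first reduction pass folds the high word into the middle word, and a
+ * second pass folds the middle word into the tail. */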
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32d(uint32_t a, uint64_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32d(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint64_t r_val = simde_crc32_reverseBits(b, 64);
+ uint32_t val_head = HEDLEY_STATIC_CAST(uint32_t, r_val >> 32);
+ uint32_t val_mid = HEDLEY_STATIC_CAST(uint32_t, r_val & 0x00000000FFFFFFFF);
+ uint32_t head = r_acc ^ val_head;
+ uint32_t mid = 0u ^ val_mid;
+ uint32_t tail = 0u;
+ const uint32_t poly = 0x04C11DB7;
+ for(int i = 31; i >= 0; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ mid = simde_crc32_eor_mask(mid, poly << i, 0xFFFFFFFF);
+ tail = simde_crc32_eor_mask(tail, 0x0, 0xFFFFFFFF);
+ }
+ }
+ for(int i = 31; i >= 0; --i) {
+ if ((mid>>i) & 1) {
+ mid = simde_crc32_eor_mask(mid, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32d
+ #define __crc32d(a, b) simde___crc32d((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32cb(uint32_t a, uint8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32cb(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 8) << 24));
+ uint32_t head = r_acc ^ r_val;
+ uint32_t tail = 0;
+ const uint32_t poly = 0x1EDC6F41;
+ for(int i = 31; i >= 24; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ uint32_t result = ((head & 0x00FFFFFF) << 8) | ((tail & 0xFF000000) >> 24);
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32cb
+ #define __crc32cb(a, b) simde___crc32cb((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32ch(uint32_t a, uint16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32ch(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 16) << 16);
+ uint32_t head = r_acc ^ r_val;
+ uint32_t tail = 0;
+ const uint32_t poly = 0x1EDC6F41;
+ for(int i = 31; i >= 16; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ uint32_t result = ((head & 0x0000FFFF) << 16) | ((tail & 0xFFFF0000) >> 16);
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32ch
+ #define __crc32ch(a, b) simde___crc32ch((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32cw(uint32_t a, uint32_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32cw(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 32));
+ uint32_t head = r_acc ^ r_val;
+ uint32_t tail = 0;
+ const uint32_t poly = 0x1EDC6F41;
+ for(int i = 31; i >= 0; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32cw
+ #define __crc32cw(a, b) simde___crc32cw((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde___crc32cd(uint32_t a, uint64_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE)
+ return __crc32cd(a, b);
+ #else
+ uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32));
+ uint64_t r_val = simde_crc32_reverseBits(b, 64);
+ uint32_t val_head = HEDLEY_STATIC_CAST(uint32_t, r_val >> 32);
+ uint32_t val_mid = HEDLEY_STATIC_CAST(uint32_t, r_val & 0x00000000FFFFFFFF);
+ uint32_t head = r_acc ^ val_head;
+ uint32_t mid = 0u ^ val_mid;
+ uint32_t tail = 0u;
+ const uint32_t poly = 0x1EDC6F41;
+ for(int i = 31; i >= 0; --i) {
+ if ((head>>i) & 1) {
+ head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1);
+ mid = simde_crc32_eor_mask(mid, poly << i, 0xFFFFFFFF);
+ tail = simde_crc32_eor_mask(tail, 0x0, 0xFFFFFFFF);
+ }
+ }
+ for(int i = 31; i >= 0; --i) {
+ if ((mid>>i) & 1) {
+ mid = simde_crc32_eor_mask(mid, poly >> (32-i), (1u << (i)) - 1);
+ tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF);
+ }
+ }
+ return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef __crc32cd
+ #define __crc32cd(a, b) simde___crc32cd((a), (b))
+#endif
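+
+/* Usage sketch (hypothetical `buf` and `len`, not part of this header):
+ * a CRC-32C over a byte buffer with the customary initial value and
+ * final inversion:
+ *
+ *   uint32_t crc = 0xFFFFFFFFu;
+ *   for (size_t i = 0 ; i < len ; i++) crc = simde___crc32cb(crc, buf[i]);
+ *   crc = ~crc;
+ */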
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_CRC32_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/create.h b/lib/simd_wrapper/simde/arm/neon/create.h
index 57f6f6ebaa9..5954922bb14 100644
--- a/lib/simd_wrapper/simde/arm/neon/create.h
+++ b/lib/simd_wrapper/simde/arm/neon/create.h
@@ -23,12 +23,9 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
-/* N.B. CM: vcreate_f16 and vcreate_bf16 are omitted as
- * SIMDe has no 16-bit floating point support.
- * Idem for the poly types. */
-
#if !defined(SIMDE_ARM_NEON_CREATE_H)
#define SIMDE_ARM_NEON_CREATE_H
@@ -152,6 +149,20 @@ simde_vcreate_u64(uint64_t a) {
#define vcreate_u64(a) simde_vcreate_u64(a)
#endif
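+/* Each portable fallback below duplicates the 64-bit scalar into a
+ * one-element 64-bit vector and, where the result type differs,
+ * reinterprets it as the requested element type. */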
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcreate_f16(uint64_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcreate_f16(a);
+ #else
+ return simde_vreinterpret_f16_u64(simde_vdup_n_u64(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcreate_f16
+ #define vcreate_f16(a) simde_vcreate_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vcreate_f32(uint64_t a) {
@@ -180,6 +191,62 @@ simde_vcreate_f64(uint64_t a) {
#define vcreate_f64(a) simde_vcreate_f64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vcreate_p8(simde_poly64_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcreate_p8(a);
+ #else
+ return simde_vreinterpret_p8_p64(simde_vdup_n_p64(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcreate_p8
+ #define vcreate_p8(a) simde_vcreate_p8(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vcreate_p16(simde_poly64_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vcreate_p16(a);
+ #else
+ return simde_vreinterpret_p16_p64(simde_vdup_n_p64(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcreate_p16
+ #define vcreate_p16(a) simde_vcreate_p16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vcreate_p64(simde_poly64_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vcreate_p64(a);
+ #else
+ return simde_vdup_n_p64(a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcreate_p64
+ #define vcreate_p64(a) simde_vcreate_p64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vcreate_bf16(uint64_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcreate_bf16(a);
+ #else
+ return simde_vreinterpret_bf16_u64(simde_vdup_n_u64(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcreate_bf16
+ #define vcreate_bf16(a) simde_vcreate_bf16(a)
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/cvt.h b/lib/simd_wrapper/simde/arm/neon/cvt.h
index 55693c86943..ab5122527f4 100644
--- a/lib/simd_wrapper/simde/arm/neon/cvt.h
+++ b/lib/simd_wrapper/simde/arm/neon/cvt.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Sean Maher
* 2020-2021 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_CVT_H)
@@ -43,7 +44,7 @@ simde_vcvt_f16_f32(simde_float32x4_t a) {
simde_float32x4_private a_ = simde_float32x4_to_private(a);
simde_float16x4_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
@@ -69,7 +70,7 @@ simde_vcvt_f32_f16(simde_float16x4_t a) {
simde_float16x4_private a_ = simde_float16x4_to_private(a);
simde_float32x4_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
@@ -139,42 +140,134 @@ simde_vcvt_f64_f32(simde_float32x2_t a) {
#endif
SIMDE_FUNCTION_ATTRIBUTES
-int16_t
-simde_x_vcvts_s16_f16(simde_float16 a) {
- #if defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_ARM_NEON_FP16)
- return HEDLEY_STATIC_CAST(int16_t, a);
+uint16_t
+simde_vcvth_u16_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_u16_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint16_t,
+ simde_float16_to_float32(a));
#else
simde_float32 af = simde_float16_to_float32(a);
- if (HEDLEY_UNLIKELY(af < HEDLEY_STATIC_CAST(simde_float32, INT16_MIN))) {
- return INT16_MIN;
- } else if (HEDLEY_UNLIKELY(af > HEDLEY_STATIC_CAST(simde_float32, INT16_MAX))) {
- return INT16_MAX;
- } else if (HEDLEY_UNLIKELY(simde_math_isnanf(af))) {
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) {
+ return UINT16_MAX;
+ } else if (simde_isnanhf(a)) {
return 0;
} else {
- return HEDLEY_STATIC_CAST(int16_t, af);
+ return HEDLEY_STATIC_CAST(uint16_t, af);
}
#endif
}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_u16_f16
+ #define vcvth_u16_f16(a) simde_vcvth_u16_f16(a)
+#endif
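+/* The portable paths of these scalar converts mirror the saturating
+ * behaviour of the hardware instructions: out-of-range inputs clamp to
+ * the type limits and NaN converts to zero. */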
SIMDE_FUNCTION_ATTRIBUTES
-uint16_t
-simde_x_vcvts_u16_f16(simde_float16 a) {
- #if defined(SIMDE_FAST_CONVERSION_RANGE)
- return HEDLEY_STATIC_CAST(uint16_t, simde_float16_to_float32(a));
+int32_t
+simde_vcvth_s32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_s32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t,
+ simde_float16_to_float32(a));
#else
simde_float32 af = simde_float16_to_float32(a);
- if (HEDLEY_UNLIKELY(af < SIMDE_FLOAT32_C(0.0))) {
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
return 0;
- } else if (HEDLEY_UNLIKELY(af > HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) {
- return UINT16_MAX;
- } else if (simde_math_isnanf(af)) {
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, af);
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_s32_f16
+ #define vcvth_s32_f16(a) simde_vcvth_s32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvth_u32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_u32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t,
+ simde_float16_to_float32(a));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
return 0;
} else {
- return HEDLEY_STATIC_CAST(uint16_t, af);
+ return HEDLEY_STATIC_CAST(uint32_t, af);
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_u32_f16
+ #define vcvth_u32_f16(a) simde_vcvth_u32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvth_s64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_s64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t,
+ simde_float16_to_float32(a));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, af);
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_s64_f16
+ #define vcvth_s64_f16(a) simde_vcvth_s64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvth_u64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_u64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t,
+ simde_float16_to_float32(a));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, af);
}
#endif
}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_u64_f16
+ #define vcvth_u64_f16(a) simde_vcvth_u64_f16(a)
+#endif
SIMDE_FUNCTION_ATTRIBUTES
int32_t
@@ -265,7 +358,7 @@ simde_vcvtd_s64_f64(simde_float64 a) {
return INT64_MIN;
} else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) {
return INT64_MAX;
- } else if (simde_math_isnanf(a)) {
+ } else if (simde_math_isnan(a)) {
return 0;
} else {
return HEDLEY_STATIC_CAST(int64_t, a);
@@ -330,29 +423,99 @@ simde_vcvtd_f64_u64(uint64_t a) {
#endif
SIMDE_FUNCTION_ATTRIBUTES
-simde_int16x4_t
-simde_vcvt_s16_f16(simde_float16x4_t a) {
+simde_float16_t
+simde_vcvth_f16_u32(uint32_t a) {
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
- return vcvt_s16_f16(a);
+ return vcvth_f16_u32(a);
+ #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI
+ return HEDLEY_STATIC_CAST(simde_float16_t, a);
#else
- simde_float16x4_private a_ = simde_float16x4_to_private(a);
- simde_int16x4_private r_;
+ return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_f16_u32
+ #define vcvth_f16_u32(a) simde_vcvth_f16_u32(a)
+#endif
- #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
- SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
- #else
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = simde_x_vcvts_s16_f16(a_.values[i]);
- }
- #endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_f16_u64(uint64_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_f16_u64(a);
+ #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI
+ return HEDLEY_STATIC_CAST(simde_float16_t, a);
+ #else
+ return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_f16_u64
+ #define vcvth_f16_u64(a) simde_vcvth_f16_u64(a)
+#endif
- return simde_int16x4_from_private(r_);
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_f16_s32(int32_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_f16_s32(a);
+ #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI
+ return HEDLEY_STATIC_CAST(simde_float16_t, a);
+ #else
+ return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a));
#endif
}
#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
- #undef vcvt_s16_f16
- #define vcvt_s16_f16(a) simde_vcvt_s16_f16(a)
+ #undef vcvth_f16_s32
+ #define vcvth_f16_s32(a) simde_vcvth_f16_s32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_f16_s64(int64_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_f16_s64(a);
+ #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI
+ return HEDLEY_STATIC_CAST(simde_float16_t, a);
+ #else
+ return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_f16_s64
+ #define vcvth_f16_s64(a) simde_vcvth_f16_s64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_f16_s16(int16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_f16_s16(a);
+ #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI
+ return HEDLEY_STATIC_CAST(simde_float16_t, a);
+ #else
+ return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_f16_s16
+ #define vcvth_f16_s16(a) simde_vcvth_f16_s16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_f16_u16(uint16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvth_f16_u16(a);
+ #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI
+ return HEDLEY_STATIC_CAST(simde_float16_t, a);
+ #else
+ return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_f16_u16
+ #define vcvth_f16_u16(a) simde_vcvth_f16_u16(a)
#endif
SIMDE_FUNCTION_ATTRIBUTES
@@ -390,12 +553,12 @@ simde_vcvt_u16_f16(simde_float16x4_t a) {
simde_float16x4_private a_ = simde_float16x4_to_private(a);
simde_uint16x4_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = simde_x_vcvts_u16_f16(a_.values[i]);
+ r_.values[i] = simde_vcvth_u16_f16(a_.values[i]);
}
#endif
@@ -486,33 +649,6 @@ simde_vcvt_u64_f64(simde_float64x1_t a) {
#define vcvt_u64_f64(a) simde_vcvt_u64_f64(a)
#endif
-
-SIMDE_FUNCTION_ATTRIBUTES
-simde_int16x8_t
-simde_vcvtq_s16_f16(simde_float16x8_t a) {
- #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
- return vcvtq_s16_f16(a);
- #else
- simde_float16x8_private a_ = simde_float16x8_to_private(a);
- simde_int16x8_private r_;
-
- #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
- SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
- #else
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = simde_x_vcvts_s16_f16(a_.values[i]);
- }
- #endif
-
- return simde_int16x8_from_private(r_);
- #endif
-}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
- #undef vcvtq_s16_f16
- #define vcvtq_s16_f16(a) simde_vcvtq_s16_f16(a)
-#endif
-
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vcvtq_s32_f32(simde_float32x4_t a) {
@@ -600,12 +736,12 @@ simde_vcvtq_u16_f16(simde_float16x8_t a) {
simde_float16x8_private a_ = simde_float16x8_to_private(a);
simde_uint16x8_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = simde_x_vcvts_u16_f16(a_.values[i]);
+ r_.values[i] = simde_vcvth_u16_f16(a_.values[i]);
}
#endif
@@ -850,7 +986,7 @@ simde_vcvt_f16_s16(simde_int16x4_t a) {
simde_int16x4_private a_ = simde_int16x4_to_private(a);
simde_float16x4_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
@@ -1010,7 +1146,7 @@ simde_vcvtq_f16_s16(simde_int16x8_t a) {
simde_int16x8_private a_ = simde_int16x8_to_private(a);
simde_float16x8_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
@@ -1066,7 +1202,7 @@ simde_vcvtq_f16_u16(simde_uint16x8_t a) {
simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
simde_float16x8_private r_;
- #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16)
+ #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR)
SIMDE_CONVERT_VECTOR_(r_.values, a_.values);
#else
SIMDE_VECTORIZE
@@ -1169,6 +1305,785 @@ simde_vcvtq_f64_u64(simde_uint64x2_t a) {
#define vcvtq_f64_u64(a) simde_vcvtq_f64_u64(a)
#endif
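+/* The vcvta* family converts with rounding to nearest, ties away from
+ * zero; the portable fallbacks use simde_math_round*() together with the
+ * same saturation rules as the plain vcvt* helpers. */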
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcvtah_u16_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtah_u16_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint16_t,
+ simde_math_roundf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) {
+ return UINT16_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtah_u16_f16
+ #define vcvtah_u16_f16(a) simde_vcvtah_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvtah_s32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtah_s32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t,
+ simde_math_roundf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtah_s32_f16
+ #define vcvtah_s32_f16(a) simde_vcvtah_s32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtah_u32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtah_u32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t,
+ simde_math_roundf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtah_u32_f16
+ #define vcvtah_u32_f16(a) simde_vcvtah_u32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtah_s64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtah_s64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t,
+ simde_math_roundf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_roundf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtah_s64_f16
+ #define vcvtah_s64_f16(a) simde_vcvtah_s64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtah_u64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtah_u64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t,
+ simde_math_roundf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtah_u64_f16
+ #define vcvtah_u64_f16(a) simde_vcvtah_u64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtad_s64_f64(simde_float64 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtad_s64_f64(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_round(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= HEDLEY_STATIC_CAST(simde_float64, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnan(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_round(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtad_s64_f64
+ #define vcvtad_s64_f64(a) simde_vcvtad_s64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtad_u64_f64(simde_float64 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ return vcvtad_u64_f64(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_round(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (simde_math_isnan(a)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_round(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtad_u64_f64
+ #define vcvtad_u64_f64(a) simde_vcvtad_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvtas_s32_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtas_s32_f32(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtas_s32_f32
+ #define vcvtas_s32_f32(a) simde_vcvtas_s32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtas_u32_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtas_u32_f32(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(a));
+ #else
+ if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtas_u32_f32
+ #define vcvtas_u32_f32(a) simde_vcvtas_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcvta_u16_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvta_u16_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtah_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvta_u16_f16
+ #define vcvta_u16_f16(a) simde_vcvta_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1_t
+simde_vcvta_s64_f64(simde_float64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvta_s64_f64(a);
+ #else
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_int64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtad_s64_f64(a_.values[i]);
+ }
+
+ return simde_int64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvta_s64_f64
+ #define vcvta_s64_f64(a) simde_vcvta_s64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcvta_u64_f64(simde_float64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvta_u64_f64(a);
+ #else
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_uint64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtad_u64_f64(a_.values[i]);
+ }
+
+ return simde_uint64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvta_u64_f64
+ #define vcvta_u64_f64(a) simde_vcvta_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2_t
+simde_vcvta_s32_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvta_s32_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_int32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtas_s32_f32(a_.values[i]);
+ }
+
+ return simde_int32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvta_s32_f32
+ #define vcvta_s32_f32(a) simde_vcvta_s32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcvtaq_u16_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtaq_u16_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtah_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtaq_u16_f16
+ #define vcvtaq_u16_f16(a) simde_vcvtaq_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vcvtaq_s32_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtaq_s32_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_int32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtas_s32_f32(a_.values[i]);
+ }
+
+ return simde_int32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtaq_s32_f32
+ #define vcvtaq_s32_f32(a) simde_vcvtaq_s32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vcvtaq_s64_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtaq_s64_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_int64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtad_s64_f64(a_.values[i]);
+ }
+
+ return simde_int64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtaq_s64_f64
+ #define vcvtaq_s64_f64(a) simde_vcvtaq_s64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcvtaq_u64_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtaq_u64_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_uint64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtad_u64_f64(a_.values[i]);
+ }
+
+ return simde_uint64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtaq_u64_f64
+ #define vcvtaq_u64_f64(a) simde_vcvtaq_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcvta_u32_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvta_u32_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_uint32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtas_u32_f32(a_.values[i]);
+ }
+
+ return simde_uint32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvta_u32_f32
+ #define vcvta_u32_f32(a) simde_vcvta_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcvtaq_u32_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtaq_u32_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_uint32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtas_u32_f32(a_.values[i]);
+ }
+
+ return simde_uint32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtaq_u32_f32
+ #define vcvtaq_u32_f32(a) simde_vcvtaq_u32_f32(a)
+#endif
+
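+/* The vcvt_high_* variants operate on the upper half of a 128-bit vector:
+ * narrowing forms place their result in the lanes above those copied from
+ * `r`, while widening forms read their input from the upper lanes of `a`. */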
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcvt_high_f16_f32(simde_float16x4_t r, simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvt_high_f16_f32(r, a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_float16x4_private b_ = simde_float16x4_to_private(r);
+ simde_float16x8_private r_;
+
+ size_t half_pos = (sizeof(r_.values) / sizeof(r_.values[0]) / 2);
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < half_pos; i++) {
+ r_.values[i] = b_.values[i];
+ }
+ SIMDE_VECTORIZE
+ for (size_t i = half_pos; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(a_.values[i-half_pos]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_high_f16_f32
+ #define vcvt_high_f16_f32(r, a) simde_vcvt_high_f16_f32((r), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvt_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvt_high_f32_f64(r, a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_float32x2_private b_ = simde_float32x2_to_private(r);
+ simde_float32x4_private r_;
+
+ size_t half_pos = (sizeof(r_.values) / sizeof(r_.values[0]) / 2);
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < half_pos; i++) {
+ r_.values[i] = b_.values[i];
+ }
+ SIMDE_VECTORIZE
+ for (size_t i = half_pos; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float32, a_.values[i-half_pos]);
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_high_f32_f64
+ #define vcvt_high_f32_f64(r, a) simde_vcvt_high_f32_f64((r), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvt_high_f32_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvt_high_f32_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_float32x4_private r_;
+
+ size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0]));
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < rsize; i++) {
+ r_.values[i] = simde_float16_to_float32(a_.values[i+rsize]);
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_high_f32_f16
+ #define vcvt_high_f32_f16(a) simde_vcvt_high_f32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vcvt_high_f64_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvt_high_f64_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_float64x2_private r_;
+
+ size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0]));
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < rsize ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float64, a_.values[i+rsize]);
+ }
+
+ return simde_float64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_high_f64_f32
+ #define vcvt_high_f64_f32(a) simde_vcvt_high_f64_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vcvtxd_f32_f64(simde_float64_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtxd_f32_f64(a);
+ #else
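+ /* Note: the AArch64 FCVTX* instructions narrow with round-to-odd; this
+  * portable fallback uses an ordinary narrowing cast (round-to-nearest),
+  * which can differ from hardware in the last mantissa bit. */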
+ return HEDLEY_STATIC_CAST(simde_float32_t, a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtxd_f32_f64
+ #define vcvtxd_f32_f64(a) simde_vcvtxd_f32_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vcvtx_f32_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtx_f32_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_float32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtxd_f32_f64(a_.values[i]);
+ }
+
+ return simde_float32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtx_f32_f64
+ #define vcvtx_f32_f64(a) simde_vcvtx_f32_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvtx_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtx_high_f32_f64(r, a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_float32x2_private r_ = simde_float32x2_to_private(r);
+ simde_float32x4_private ret;
+
+ size_t half_pos = (sizeof(ret.values) / sizeof(ret.values[0]) / 2);
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < half_pos; i++) {
+ ret.values[i] = r_.values[i];
+ }
+ SIMDE_VECTORIZE
+ for (size_t i = half_pos; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = simde_vcvtxd_f32_f64(a_.values[i-half_pos]);
+ }
+
+ return simde_float32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtx_high_f32_f64
+ #define vcvtx_high_f32_f64(r, a) simde_vcvtx_high_f32_f64((r), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vcvt_bf16_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvt_bf16_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_bfloat16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_bfloat16_from_float32(a_.values[i]);
+ }
+
+ return simde_bfloat16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_bf16_f32
+ #define vcvt_bf16_f32(a) simde_vcvt_bf16_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvt_f32_bf16(simde_bfloat16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvt_f32_bf16(a);
+ #else
+ simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a);
+ simde_float32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_bfloat16_to_float32(a_.values[i]);
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_f32_bf16
+ #define vcvt_f32_bf16(a) simde_vcvt_f32_bf16(a)
+#endif
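
Since bfloat16 keeps float32's exponent range but only 8 significand bits, a round trip through these helpers quantizes the value. A small sketch (hypothetical snippet; simde_vdupq_n_f32 comes from SIMDe's dup_n.h):

    /* 1.2345f -> bf16 -> roughly 1.234375f in every lane */
    simde_bfloat16x4_t b = simde_vcvt_bf16_f32(simde_vdupq_n_f32(SIMDE_FLOAT32_C(1.2345)));
    simde_float32x4_t back = simde_vcvt_f32_bf16(b);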
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vcvtah_f32_bf16(simde_bfloat16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvtah_f32_bf16(a);
+ #else
+ return simde_bfloat16_to_float32(a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtah_f32_bf16
+ #define vcvtah_f32_bf16(a) simde_vcvtah_f32_bf16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16_t
+simde_vcvth_bf16_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvth_bf16_f32(a);
+ #else
+ return simde_bfloat16_from_float32(a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_bf16_f32
+ #define vcvth_bf16_f32(a) simde_vcvth_bf16_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvtq_low_f32_bf16(simde_bfloat16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvtq_low_f32_bf16(a);
+ #else
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+ simde_float32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_bfloat16_to_float32(a_.values[i]);
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_low_f32_bf16
+ #define vcvtq_low_f32_bf16(a) simde_vcvtq_low_f32_bf16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvtq_high_f32_bf16(simde_bfloat16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvtq_high_f32_bf16(a);
+ #else
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+ simde_float32x4_private r_;
+
+ size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0]));
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < rsize ; i++) {
+ r_.values[i] = simde_bfloat16_to_float32(a_.values[i + rsize]);
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_high_f32_bf16
+ #define vcvtq_high_f32_bf16(a) simde_vcvtq_high_f32_bf16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vcvtq_low_bf16_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvtq_low_bf16_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_bfloat16x8_private r_;
+
+ size_t asize = (sizeof(a_.values) / sizeof(a_.values[0]));
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < asize; i++) {
+ r_.values[i] = simde_bfloat16_from_float32(a_.values[i]);
+ r_.values[i + asize] = SIMDE_BFLOAT16_VALUE(0.0);
+ }
+
+ return simde_bfloat16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_low_bf16_f32
+ #define vcvtq_low_bf16_f32(a) simde_vcvtq_low_bf16_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vcvtq_high_bf16_f32(simde_bfloat16x8_t inactive, simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vcvtq_high_bf16_f32(inactive, a);
+ #else
+ simde_bfloat16x8_private inactive_ = simde_bfloat16x8_to_private(inactive);
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_bfloat16x8_private r_;
+
+ size_t asize = (sizeof(a_.values) / sizeof(a_.values[0]));
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < asize ; i++) {
+ r_.values[i] = inactive_.values[i];
+ r_.values[i + asize] = simde_bfloat16_from_float32(a_.values[i]);
+ }
+ return simde_bfloat16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_high_bf16_f32
+ #define vcvtq_high_bf16_f32(inactive, a) simde_vcvtq_high_bf16_f32((inactive), (a))
+#endif
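
Together the low/high pairs split a bfloat16x8 into two float32x4 halves and back. A sketch chaining the helpers defined above (hypothetical snippet):

    /* pack four lanes into the low half; the high half is zero-filled */
    simde_bfloat16x8_t packed = simde_vcvtq_low_bf16_f32(simde_vdupq_n_f32(SIMDE_FLOAT32_C(1.0)));
    simde_float32x4_t lows = simde_vcvtq_low_f32_bf16(packed);   /* all 1.0f */
    simde_float32x4_t highs = simde_vcvtq_high_f32_bf16(packed); /* all 0.0f */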
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/cvt_n.h b/lib/simd_wrapper/simde/arm/neon/cvt_n.h
new file mode 100644
index 00000000000..677751525b0
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cvt_n.h
@@ -0,0 +1,691 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CVT_N_H)
+#define SIMDE_ARM_NEON_CVT_N_H
+
+#include "types.h"
+#include "cvt.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcvth_n_u16_f16(simde_float16_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ return simde_vcvth_u16_f16(
+ simde_float16_from_float32(
+ simde_float16_to_float32(a) * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvth_n_u16_f16(a, n) vcvth_n_u16_f16(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_n_u16_f16
+ #define vcvth_n_u16_f16(a, n) simde_vcvth_n_u16_f16(a, n)
+#endif
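
The _n_ conversions treat the integer side as fixed point with n fractional bits: converting to an integer multiplies by 2^n first, and converting to a float divides by 2^n afterwards. A scalar sketch (hypothetical snippet):

    /* 1.5 in half precision with n == 8 becomes 1.5 * 256 == 384 */
    uint16_t q = simde_vcvth_n_u16_f16(simde_float16_from_float32(SIMDE_FLOAT32_C(1.5)), 8);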
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_n_f16_s16(int16_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ return simde_float16_from_float32(
+ HEDLEY_STATIC_CAST(simde_float32_t,
+ HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n)));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvth_n_f16_s16(a, n) vcvth_n_f16_s16(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_n_f16_s16
+ #define vcvth_n_f16_s16(a, n) simde_vcvth_n_f16_s16(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vcvth_n_f16_u16(uint16_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ return simde_float16_from_float32(
+ HEDLEY_STATIC_CAST(simde_float32_t,
+ HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n)));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvth_n_f16_u16(a, n) vcvth_n_f16_u16(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvth_n_f16_u16
+ #define vcvth_n_f16_u16(a, n) simde_vcvth_n_f16_u16(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvts_n_s32_f32(simde_float32_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ return simde_vcvts_s32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvts_n_s32_f32(a, n) vcvts_n_s32_f32(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvts_n_s32_f32
+ #define vcvts_n_s32_f32(a, n) simde_vcvts_n_s32_f32(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvts_n_u32_f32(simde_float32_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ return simde_vcvts_u32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvts_n_u32_f32(a, n) vcvts_n_u32_f32(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvts_n_u32_f32
+ #define vcvts_n_u32_f32(a, n) simde_vcvts_n_u32_f32(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vcvts_n_f32_s32(int32_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ return HEDLEY_STATIC_CAST(simde_float32_t,
+ HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvts_n_f32_s32(a, n) vcvts_n_f32_s32(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvts_n_f32_s32
+ #define vcvts_n_f32_s32(a, n) simde_vcvts_n_f32_s32(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vcvts_n_f32_u32(uint32_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ return HEDLEY_STATIC_CAST(simde_float32_t,
+ HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvts_n_f32_u32(a, n) vcvts_n_f32_u32(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvts_n_f32_u32
+ #define vcvts_n_f32_u32(a, n) simde_vcvts_n_f32_u32(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtd_n_s64_f64(simde_float64_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ return simde_vcvtd_s64_f64(a * pow(2, n));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtd_n_s64_f64(a, n) vcvtd_n_s64_f64(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtd_n_s64_f64
+ #define vcvtd_n_s64_f64(a, n) simde_vcvtd_n_s64_f64(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtd_n_u64_f64(simde_float64_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ return simde_vcvtd_u64_f64(a * pow(2, n));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtd_n_u64_f64(a, n) vcvtd_n_u64_f64(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtd_n_u64_f64
+ #define vcvtd_n_u64_f64(a, n) simde_vcvtd_n_u64_f64(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vcvtd_n_f64_s64(int64_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ return HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtd_n_f64_s64(a, n) vcvtd_n_f64_s64(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtd_n_f64_s64
+ #define vcvtd_n_f64_s64(a, n) simde_vcvtd_n_f64_s64(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vcvtd_n_f64_u64(uint64_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ return HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtd_n_f64_u64(a, n) vcvtd_n_f64_u64(a, n)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtd_n_f64_u64
+ #define vcvtd_n_f64_u64(a, n) simde_vcvtd_n_f64_u64(a, n)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2_t
+simde_vcvt_n_s32_f32(simde_float32x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_int32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)));
+ }
+
+ return simde_int32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvt_n_s32_f32(a, n) vcvt_n_s32_f32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_s32_f32
+ #define vcvt_n_s32_f32(a, n) simde_vcvt_n_s32_f32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1_t
+simde_vcvt_n_s64_f64(simde_float64x1_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_int64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * pow(2, n));
+ }
+
+ return simde_int64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvt_n_s64_f64(a, n) vcvt_n_s64_f64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_s64_f64
+ #define vcvt_n_s64_f64(a, n) simde_vcvt_n_s64_f64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcvt_n_u16_f16(simde_float16x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32(
+ simde_float16_to_float32(a_.values[i]) *
+ HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))));
+ }
+
+ return simde_uint16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvt_n_u16_f16(a, n) vcvt_n_u16_f16((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_u16_f16
+ #define vcvt_n_u16_f16(a, n) simde_vcvt_n_u16_f16((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcvt_n_u32_f32(simde_float32x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_uint32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)));
+ }
+
+ return simde_uint32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvt_n_u32_f32(a, n) vcvt_n_u32_f32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_u32_f32
+ #define vcvt_n_u32_f32(a, n) simde_vcvt_n_u32_f32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcvt_n_u64_f64(simde_float64x1_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_uint64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * pow(2, n));
+ }
+
+ return simde_uint64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ #define simde_vcvt_n_u64_f64(a, n) vcvt_n_u64_f64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_u64_f64
+ #define vcvt_n_u64_f64(a, n) simde_vcvt_n_u64_f64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vcvtq_n_s32_f32(simde_float32x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_int32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)));
+ }
+
+ return simde_int32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvtq_n_s32_f32(a, n) vcvtq_n_s32_f32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_s32_f32
+ #define vcvtq_n_s32_f32(a, n) simde_vcvtq_n_s32_f32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vcvtq_n_s64_f64(simde_float64x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_int64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * pow(2, n));
+ }
+
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtq_n_s64_f64(a, n) vcvtq_n_s64_f64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_s64_f64
+ #define vcvtq_n_s64_f64(a, n) simde_vcvtq_n_s64_f64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcvtq_n_u16_f16(simde_float16x8_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32(
+ simde_float16_to_float32(a_.values[i]) *
+ HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))));
+ }
+
+ return simde_uint16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvtq_n_u16_f16(a, n) vcvtq_n_u16_f16((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_u16_f16
+ #define vcvtq_n_u16_f16(a, n) simde_vcvtq_n_u16_f16((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcvtq_n_u32_f32(simde_float32x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_uint32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)));
+ }
+
+ return simde_uint32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ #define simde_vcvtq_n_u32_f32(a, n) vcvtq_n_u32_f32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_u32_f32
+ #define vcvtq_n_u32_f32(a, n) simde_vcvtq_n_u32_f32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcvtq_n_u64_f64(simde_float64x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_uint64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * pow(2, n));
+ }
+
+ return simde_uint64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ #define simde_vcvtq_n_u64_f64(a, n) vcvtq_n_u64_f64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_u64_f64
+ #define vcvtq_n_u64_f64(a, n) simde_vcvtq_n_u64_f64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcvt_n_f16_u16(simde_uint16x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ simde_uint16x4_private a_ = simde_uint16x4_to_private(a);
+ simde_float16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)));
+ }
+
+ return simde_float16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvt_n_f16_u16(a, n) vcvt_n_f16_u16((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_f16_u16
+ #define vcvt_n_f16_u16(a, n) simde_vcvt_n_f16_u16((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vcvt_n_f16_s16(simde_int16x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ simde_int16x4_private a_ = simde_int16x4_to_private(a);
+ simde_float16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)));
+ }
+
+ return simde_float16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvt_n_f16_s16(a, n) vcvt_n_f16_s16((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_f16_s16
+ #define vcvt_n_f16_s16(a, n) simde_vcvt_n_f16_s16((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcvtq_n_f16_u16(simde_uint16x8_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ simde_uint16x8_private a_ = simde_uint16x8_to_private(a);
+ simde_float16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)));
+ }
+
+ return simde_float16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvtq_n_f16_u16(a, n) vcvtq_n_f16_u16((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_f16_u16
+ #define vcvtq_n_f16_u16(a, n) simde_vcvtq_n_f16_u16((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vcvtq_n_f16_s16(simde_int16x8_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) {
+ simde_int16x8_private a_ = simde_int16x8_to_private(a);
+ simde_float16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)));
+ }
+
+ return simde_float16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vcvtq_n_f16_s16(a, n) vcvtq_n_f16_s16((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_f16_s16
+ #define vcvtq_n_f16_s16(a, n) simde_vcvtq_n_f16_s16((a), (n))
+#endif
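
The float-producing direction divides by 2^n, so a Q8.8 lane value of 384 with n == 8 maps back to 1.5. A vector sketch (hypothetical snippet; simde_vdupq_n_s16 comes from SIMDe's dup_n.h):

    simde_float16x8_t f = simde_vcvtq_n_f16_s16(simde_vdupq_n_s16(384), 8); /* all lanes 1.5 */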
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vcvt_n_f32_u32(simde_uint32x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_uint32x2_private a_ = simde_uint32x2_to_private(a);
+ simde_float32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n));
+ }
+
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvt_n_f32_u32(a, n) vcvt_n_f32_u32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_f32_u32
+ #define vcvt_n_f32_u32(a, n) simde_vcvt_n_f32_u32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vcvt_n_f32_s32(simde_int32x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_int32x2_private a_ = simde_int32x2_to_private(a);
+ simde_float32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n));
+ }
+
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvt_n_f32_s32(a, n) vcvt_n_f32_s32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_f32_s32
+ #define vcvt_n_f32_s32(a, n) simde_vcvt_n_f32_s32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vcvt_n_f64_u64(simde_uint64x1_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_uint64x1_private a_ = simde_uint64x1_to_private(a);
+ simde_float64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n);
+ }
+
+ return simde_float64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvt_n_f64_u64(a, n) vcvt_n_f64_u64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_f64_u64
+ #define vcvt_n_f64_u64(a, n) simde_vcvt_n_f64_u64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vcvtq_n_f64_u64(simde_uint64x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_uint64x2_private a_ = simde_uint64x2_to_private(a);
+ simde_float64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n);
+ }
+
+ return simde_float64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtq_n_f64_u64(a, n) vcvtq_n_f64_u64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_f64_u64
+ #define vcvtq_n_f64_u64(a, n) simde_vcvtq_n_f64_u64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vcvt_n_f64_s64(simde_int64x1_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_int64x1_private a_ = simde_int64x1_to_private(a);
+ simde_float64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n);
+ }
+
+ return simde_float64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvt_n_f64_s64(a, n) vcvt_n_f64_s64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvt_n_f64_s64
+ #define vcvt_n_f64_s64(a, n) simde_vcvt_n_f64_s64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vcvtq_n_f64_s64(simde_int64x2_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) {
+ simde_int64x2_private a_ = simde_int64x2_to_private(a);
+ simde_float64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n);
+ }
+
+ return simde_float64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vcvtq_n_f64_s64(a, n) vcvtq_n_f64_s64((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_f64_s64
+ #define vcvtq_n_f64_s64(a, n) simde_vcvtq_n_f64_s64((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvtq_n_f32_s32(simde_int32x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_int32x4_private a_ = simde_int32x4_to_private(a);
+ simde_float32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n));
+ }
+
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvtq_n_f32_s32(a, n) vcvtq_n_f32_s32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_f32_s32
+ #define vcvtq_n_f32_s32(a, n) simde_vcvtq_n_f32_s32((a), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vcvtq_n_f32_u32(simde_uint32x4_t a, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) {
+ simde_uint32x4_private a_ = simde_uint32x4_to_private(a);
+ simde_float32x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n));
+ }
+
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vcvtq_n_f32_u32(a, n) vcvtq_n_f32_u32((a), (n))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vcvtq_n_f32_u32
+ #define vcvtq_n_f32_u32(a, n) simde_vcvtq_n_f32_u32((a), (n))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* SIMDE_ARM_NEON_CVT_N_H */
diff --git a/lib/simd_wrapper/simde/arm/neon/cvtm.h b/lib/simd_wrapper/simde/arm/neon/cvtm.h
new file mode 100644
index 00000000000..ae2c98ae02f
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cvtm.h
@@ -0,0 +1,381 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CVTM_H)
+#define SIMDE_ARM_NEON_CVTM_H
+
+#include "types.h"
+#include "cvt.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtmh_s64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtmh_s64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t,
+ simde_math_floorf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_floorf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmh_s64_f16
+ #define vcvtmh_s64_f16(a) simde_vcvtmh_s64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvtmh_s32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtmh_s32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t,
+ simde_math_floorf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_floorf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmh_s32_f16
+ #define vcvtmh_s32_f16(a) simde_vcvtmh_s32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtmh_u64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtmh_u64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t,
+ simde_math_floorf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_floorf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmh_u64_f16
+ #define vcvtmh_u64_f16(a) simde_vcvtmh_u64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtmh_u32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtmh_u32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t,
+ simde_math_floorf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmh_u32_f16
+ #define vcvtmh_u32_f16(a) simde_vcvtmh_u32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcvtmh_u16_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtmh_u16_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint16_t,
+ simde_math_floorf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) {
+ return UINT16_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint16_t, simde_math_floorf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmh_u16_f16
+ #define vcvtmh_u16_f16(a) simde_vcvtmh_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtms_u32_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtms_u32_f32(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtms_u32_f32
+ #define vcvtms_u32_f32(a) simde_vcvtms_u32_f32(a)
+#endif
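
The vcvtm* family rounds toward minus infinity before converting, and the unsigned fallbacks clamp rather than wrap. A scalar sketch (hypothetical snippet):

    uint32_t m = simde_vcvtms_u32_f32(SIMDE_FLOAT32_C(2.9));  /* floor -> 2 */
    uint32_t z = simde_vcvtms_u32_f32(SIMDE_FLOAT32_C(-0.5)); /* clamps to 0 */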
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtmd_u64_f64(simde_float64 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtmd_u64_f64(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_floor(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (simde_math_isnan(a)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_floor(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmd_u64_f64
+ #define vcvtmd_u64_f64(a) simde_vcvtmd_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcvtmq_u16_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtmq_u16_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtmh_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmq_u16_f16
+ #define vcvtmq_u16_f16(a) simde_vcvtmq_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcvtmq_u32_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ return vcvtmq_u32_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_uint32x4_private r_;
+
+ #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ // Note: unlike the signed conversions, this unsigned path does not produce correct results, so it is compiled out via the "#if 0" above.
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtps_epu32(a_.m128);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtps_epu32(a_.m128);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtms_u32_f32(a_.values[i]);
+ }
+ #endif
+
+ return simde_uint32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmq_u32_f32
+ #define vcvtmq_u32_f32(a) simde_vcvtmq_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcvtmq_u64_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtmq_u64_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_uint64x2_private r_;
+
+ #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ // Note: unlike the signed conversions, this unsigned path does not produce correct results, so it is compiled out via the "#if 0" above.
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtpd_epu64(a_.m128d);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtpd_epu64(a_.m128d);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtmd_u64_f64(a_.values[i]);
+ }
+ #endif
+
+ return simde_uint64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtmq_u64_f64
+ #define vcvtmq_u64_f64(a) simde_vcvtmq_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcvtm_u16_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtm_u16_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtmh_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtm_u16_f16
+ #define vcvtm_u16_f16(a) simde_vcvtm_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcvtm_u32_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtm_u32_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_uint32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtms_u32_f32(a_.values[i]);
+ }
+
+ return simde_uint32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtm_u32_f32
+ #define vcvtm_u32_f32(a) simde_vcvtm_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcvtm_u64_f64(simde_float64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtm_u64_f64(a);
+ #else
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_uint64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtmd_u64_f64(a_.values[i]);
+ }
+
+ return simde_uint64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtm_u64_f64
+ #define vcvtm_u64_f64(a) simde_vcvtm_u64_f64(a)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* SIMDE_ARM_NEON_CVTM_H */
diff --git a/lib/simd_wrapper/simde/arm/neon/cvtn.h b/lib/simd_wrapper/simde/arm/neon/cvtn.h
new file mode 100644
index 00000000000..8198a9721b7
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cvtn.h
@@ -0,0 +1,530 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Michael R. Crusoe
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CVTN_H)
+#define SIMDE_ARM_NEON_CVTN_H
+
+#include "types.h"
+#include "cvt.h"
+#include "calt.h"
+#include "cagt.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vcvtnq_s32_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtnq_s32_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_int32x4_private r_;
+
+ #if defined(SIMDE_X86_SSE2_NATIVE)
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtps_epi32(a_.m128);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtps_epi32(a_.m128);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a_.values[i]));
+ }
+ #endif
+
+ return simde_int32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnq_s32_f32
+ #define vcvtnq_s32_f32(a) simde_vcvtnq_s32_f32(a)
+#endif
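
The vcvtn* family rounds to nearest with ties to even regardless of the current rounding mode, which is why the x86 path above pins MXCSR to round-nearest around _mm_cvtps_epi32. A tie-breaking sketch (hypothetical snippet; simde_vld1q_f32 comes from SIMDe's ld1.h):

    simde_float32 in[4] = { 0.5f, 1.5f, 2.5f, 3.5f };
    simde_int32x4_t out = simde_vcvtnq_s32_f32(simde_vld1q_f32(in));
    /* lanes: 0, 2, 2, 4 */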
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vcvtnq_s64_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtnq_s64_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_int64x2_private r_;
+
+ #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtpd_epi64(a_.m128d);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtpd_epi64(a_.m128d);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a_.values[i]));
+ }
+ #endif
+
+ return simde_int64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnq_s64_f64
+ #define vcvtnq_s64_f64(a) simde_vcvtnq_s64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtnh_s64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtnh_s64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_roundevenf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 a_ = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (simde_math_isnanf(a_)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_roundevenf(a_));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnh_s64_f16
+ #define vcvtnh_s64_f16(a) simde_vcvtnh_s64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvtnh_s32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtnh_s32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 a_ = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (simde_math_isnanf(a_)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a_));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnh_s32_f16
+ #define vcvtnh_s32_f16(a) simde_vcvtnh_s32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtnh_u64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtnh_u64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundevenf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 a_ = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (simde_math_isnanf(a_)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundevenf(a_));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnh_u64_f16
+ #define vcvtnh_u64_f16(a) simde_vcvtnh_u64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtnh_u32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtnh_u32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 a_ = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (simde_math_isnanf(a_)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a_));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnh_u32_f16
+ #define vcvtnh_u32_f16(a) simde_vcvtnh_u32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcvtnh_u16_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtnh_u16_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundevenf(simde_float16_to_float32(a)));
+ #else
+ simde_float32 a_ = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) {
+ return UINT16_MAX;
+ } else if (simde_math_isnanf(a_)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundevenf(a_));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnh_u16_f16
+ #define vcvtnh_u16_f16(a) simde_vcvtnh_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvtns_s32_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtns_s32_f32(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a));
+ #else
+ if (HEDLEY_UNLIKELY(a < HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtns_s32_f32
+ #define vcvtns_s32_f32(a) simde_vcvtns_s32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtns_u32_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtns_u32_f32(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a));
+ #else
+ if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtns_u32_f32
+ #define vcvtns_u32_f32(a) simde_vcvtns_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcvtnq_u32_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ return vcvtnq_u32_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_uint32x4_private r_;
+
+ #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ // Note: unlike the signed conversions, this unsigned path does not produce correct results, so it is compiled out via the "#if 0" above.
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtps_epu32(a_.m128);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtps_epu32(a_.m128);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtns_u32_f32(a_.values[i]);
+ }
+ #endif
+
+ return simde_uint32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnq_u32_f32
+ #define vcvtnq_u32_f32(a) simde_vcvtnq_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtnd_s64_f64(simde_float64 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtnd_s64_f64(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a));
+ #else
+ if (HEDLEY_UNLIKELY(a < HEDLEY_STATIC_CAST(simde_float64, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (simde_math_isnan(a)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnd_s64_f64
+ #define vcvtnd_s64_f64(a) simde_vcvtnd_s64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtnd_u64_f64(simde_float64 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtnd_u64_f64(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundeven(a));
+ #else
+ if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT64_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (simde_math_isnan(a)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundeven(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnd_u64_f64
+ #define vcvtnd_u64_f64(a) simde_vcvtnd_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcvtnq_u64_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtnq_u64_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_uint64x2_private r_;
+
+ #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+ // Note: unlike the signed conversions, this unsigned path does not produce correct results, so it is compiled out via the "#if 0" above.
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtpd_epu64(a_.m128d);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtpd_epu64(a_.m128d);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtnd_u64_f64(a_.values[i]);
+ }
+ #endif
+
+ return simde_uint64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnq_u64_f64
+ #define vcvtnq_u64_f64(a) simde_vcvtnq_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcvtnq_u16_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtnq_u16_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtnh_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtnq_u16_f16
+ #define vcvtnq_u16_f16(a) simde_vcvtnq_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcvtn_u16_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtn_u16_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtnh_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtn_u16_f16
+ #define vcvtn_u16_f16(a) simde_vcvtn_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcvtn_u32_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtn_u32_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_uint32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtns_u32_f32(a_.values[i]);
+ }
+
+ return simde_uint32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtn_u32_f32
+ #define vcvtn_u32_f32(a) simde_vcvtn_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2_t
+simde_vcvtn_s32_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtn_s32_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_int32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtns_s32_f32(a_.values[i]);
+ }
+
+ return simde_int32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtn_s32_f32
+ #define vcvtn_s32_f32(a) simde_vcvtn_s32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1_t
+simde_vcvtn_s64_f64(simde_float64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtn_s64_f64(a);
+ #else
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_int64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtnd_s64_f64(a_.values[i]);
+ }
+
+ return simde_int64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtn_s64_f64
+ #define vcvtn_s64_f64(a) simde_vcvtn_s64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcvtn_u64_f64(simde_float64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtn_u64_f64(a);
+ #else
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_uint64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtnd_u64_f64(a_.values[i]);
+ }
+
+ return simde_uint64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtn_u64_f64
+ #define vcvtn_u64_f64(a) simde_vcvtn_u64_f64(a)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* SIMDE_ARM_NEON_CVTN_H */
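The vcvtn family above rounds to nearest with ties to even, and the portable fallbacks saturate out-of-range inputs and map NaN to zero rather than hitting undefined behaviour in the cast. A minimal standalone sketch of that per-lane rule, not part of the patch (cvtn_u32 is a hypothetical name; roundevenf assumes a C23-level libm such as glibc >= 2.25):

#include <inttypes.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Saturating float -> uint32_t with round-to-nearest-even, mirroring the
 * portable fallback of simde_vcvtns_u32_f32 above. */
static uint32_t cvtn_u32(float a) {
  if (isnan(a) || a < 0.0f) return 0;               /* NaN and negatives clamp to 0 */
  if (a >= (float) UINT32_MAX) return UINT32_MAX;   /* saturate at or above 2^32 */
  return (uint32_t) roundevenf(a);                  /* ties go to the even integer */
}

int main(void) {
  /* Prints "2 4 0": 2.5 ties to 2, 3.5 ties to 4, negatives clamp. */
  printf("%" PRIu32 " %" PRIu32 " %" PRIu32 "\n",
         cvtn_u32(2.5f), cvtn_u32(3.5f), cvtn_u32(-7.0f));
  return 0;
}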
diff --git a/lib/simd_wrapper/simde/arm/neon/cvtp.h b/lib/simd_wrapper/simde/arm/neon/cvtp.h
new file mode 100644
index 00000000000..92bcb2b99f9
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/cvtp.h
@@ -0,0 +1,379 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_CVTP_H)
+#define SIMDE_ARM_NEON_CVTP_H
+
+#include "types.h"
+#include "cvt.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vcvtph_s64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtph_s64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int64_t,
+ simde_math_ceilf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) {
+ return INT64_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) {
+ return INT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int64_t, simde_math_ceilf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtph_s64_f16
+ #define vcvtph_s64_f16(a) simde_vcvtph_s64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vcvtph_s32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtph_s32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(int32_t,
+ simde_math_ceilf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) {
+ return INT32_MIN;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) {
+ return INT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(int32_t, simde_math_ceilf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtph_s32_f16
+ #define vcvtph_s32_f16(a) simde_vcvtph_s32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtph_u64_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtph_u64_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t,
+ simde_math_ceilf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceilf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtph_u64_f16
+ #define vcvtph_u64_f16(a) simde_vcvtph_u64_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtph_u32_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtph_u32_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t,
+ simde_math_ceilf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtph_u32_f16
+ #define vcvtph_u32_f16(a) simde_vcvtph_u32_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vcvtph_u16_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtph_u16_f16(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint16_t,
+ simde_math_ceilf(
+ simde_float16_to_float32(a)));
+ #else
+ simde_float32 af = simde_float16_to_float32(a);
+ if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) {
+ return UINT16_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint16_t, simde_math_ceilf(af));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtph_u16_f16
+ #define vcvtph_u16_f16(a) simde_vcvtph_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint32_t
+simde_vcvtps_u32_f32(simde_float32 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtps_u32_f32(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT32_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) {
+ return UINT32_MAX;
+ } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtps_u32_f32
+ #define vcvtps_u32_f32(a) simde_vcvtps_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint64_t
+simde_vcvtpd_u64_f64(simde_float64 a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtpd_u64_f64(a);
+ #elif defined(SIMDE_FAST_CONVERSION_RANGE)
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceil(a));
+ #else
+ if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) {
+ return 0;
+ } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) {
+ return UINT64_MAX;
+ } else if (simde_math_isnan(a)) {
+ return 0;
+ } else {
+ return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceil(a));
+ }
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtpd_u64_f64
+ #define vcvtpd_u64_f64(a) simde_vcvtpd_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vcvtpq_u16_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtpq_u16_f16(a);
+ #else
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtph_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtpq_u16_f16
+ #define vcvtpq_u16_f16(a) simde_vcvtpq_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vcvtpq_u32_f32(simde_float32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844)
+ return vcvtpq_u32_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+ simde_uint32x4_private r_;
+
+ #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+      // Disabled with the "#if 0" above: unlike the signed versions, this path does not produce correct results.
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtps_epu32(a_.m128);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtps_epu32(a_.m128);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtps_u32_f32(a_.values[i]);
+ }
+ #endif
+
+ return simde_uint32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtpq_u32_f32
+ #define vcvtpq_u32_f32(a) simde_vcvtpq_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vcvtpq_u64_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtpq_u64_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ simde_uint64x2_private r_;
+
+ #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
+      // Disabled with the "#if 0" above: unlike the signed versions, this path does not produce correct results.
+ if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) {
+ unsigned int rounding_mode = _MM_GET_ROUNDING_MODE();
+ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+ r_.m128i = _mm_cvtpd_epu64(a_.m128d);
+ _MM_SET_ROUNDING_MODE(rounding_mode);
+ } else {
+ r_.m128i = _mm_cvtpd_epu64(a_.m128d);
+ }
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtpd_u64_f64(a_.values[i]);
+ }
+ #endif
+
+ return simde_uint64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtpq_u64_f64
+ #define vcvtpq_u64_f64(a) simde_vcvtpq_u64_f64(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4_t
+simde_vcvtp_u16_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vcvtp_u16_f16(a);
+ #else
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+ simde_uint16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtph_u16_f16(a_.values[i]);
+ }
+
+ return simde_uint16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtp_u16_f16
+ #define vcvtp_u16_f16(a) simde_vcvtp_u16_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2_t
+simde_vcvtp_u32_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtp_u32_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ simde_uint32x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtps_u32_f32(a_.values[i]);
+ }
+
+ return simde_uint32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtp_u32_f32
+ #define vcvtp_u32_f32(a) simde_vcvtp_u32_f32(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1_t
+simde_vcvtp_u64_f64(simde_float64x1_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vcvtp_u64_f64(a);
+ #else
+ simde_float64x1_private a_ = simde_float64x1_to_private(a);
+ simde_uint64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vcvtpd_u64_f64(a_.values[i]);
+ }
+
+ return simde_uint64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vcvtp_u64_f64
+ #define vcvtp_u64_f64(a) simde_vcvtp_u64_f64(a)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* SIMDE_ARM_NEON_CVTP_H */
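cvtp.h is the round-toward-plus-infinity counterpart: the same clamping of NaN and out-of-range values, but with ceil in place of round-to-nearest-even. A scalar sketch under the same assumptions as above (cvtp_u32 is a hypothetical name):

#include <math.h>
#include <stdint.h>

/* Saturating float -> uint32_t rounding toward +infinity, mirroring the
 * portable fallback of simde_vcvtps_u32_f32 above. */
static uint32_t cvtp_u32(float a) {
  if (isnan(a) || a <= 0.0f) return 0;              /* NaN and non-positives clamp to 0 */
  if (a >= (float) UINT32_MAX) return UINT32_MAX;   /* saturate at or above 2^32 */
  return (uint32_t) ceilf(a);                       /* e.g. 0.1 -> 1, 2.5 -> 3 */
}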
diff --git a/lib/simd_wrapper/simde/arm/neon/div.h b/lib/simd_wrapper/simde/arm/neon/div.h
new file mode 100644
index 00000000000..05a59084b76
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/div.h
@@ -0,0 +1,199 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_DIV_H)
+#define SIMDE_ARM_NEON_DIV_H
+
+#include "types.h"
+
+#include "reinterpret.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vdivh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vdivh_f16(a, b);
+ #else
+ return simde_float16_from_float32(simde_float16_to_float32(a) / simde_float16_to_float32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdivh_f16
+ #define vdivh_f16(a, b) simde_vdivh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vdiv_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vdiv_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vdivh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdiv_f16
+ #define vdiv_f16(a, b) simde_vdiv_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vdivq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vdivq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vdivh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdivq_f16
+ #define vdivq_f16(a, b) simde_vdivq_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vdiv_f32(simde_float32x2_t a, simde_float32x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vdiv_f32(a, b);
+ #else
+ simde_float32x2_private
+ r_,
+ a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] / b_.values[i];
+ }
+
+ return simde_float32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdiv_f32
+ #define vdiv_f32(a, b) simde_vdiv_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vdivq_f32(simde_float32x4_t a, simde_float32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vdivq_f32(a, b);
+ #else
+ simde_float32x4_private
+ r_,
+ a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] / b_.values[i];
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdivq_f32
+ #define vdivq_f32(a, b) simde_vdivq_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vdiv_f64(simde_float64x1_t a, simde_float64x1_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vdiv_f64(a, b);
+ #else
+ simde_float64x1_private
+ r_,
+ a_ = simde_float64x1_to_private(a),
+ b_ = simde_float64x1_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] / b_.values[i];
+ }
+
+ return simde_float64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdiv_f64
+ #define vdiv_f64(a, b) simde_vdiv_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vdivq_f64(simde_float64x2_t a, simde_float64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vdivq_f64(a, b);
+ #else
+ simde_float64x2_private
+ r_,
+ a_ = simde_float64x2_to_private(a),
+ b_ = simde_float64x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] / b_.values[i];
+ }
+
+ return simde_float64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdivq_f64
+ #define vdivq_f64(a, b) simde_vdivq_f64((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_DIV_H) */
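div.h backfills the AArch64-only vdiv* intrinsics; every fallback is a plain per-lane divide, with float16 lanes promoted to float32 for the division itself. A sketch of the four-lane float32 shape (f32x4 is a hypothetical stand-in for simde_float32x4_private):

#include <stddef.h>

typedef struct { float values[4]; } f32x4; /* stand-in for simde_float32x4_private */

/* Element-wise division, the shape of the simde_vdivq_f32 fallback loop above. */
static f32x4 divq_f32(f32x4 a, f32x4 b) {
  f32x4 r;
  for (size_t i = 0; i < sizeof(r.values) / sizeof(r.values[0]); i++) {
    r.values[i] = a.values[i] / b.values[i];
  }
  return r;
}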
diff --git a/lib/simd_wrapper/simde/arm/neon/dot.h b/lib/simd_wrapper/simde/arm/neon/dot.h
index fa7febe0364..a05d32d47e8 100644
--- a/lib/simd_wrapper/simde/arm/neon/dot.h
+++ b/lib/simd_wrapper/simde/arm/neon/dot.h
@@ -46,7 +46,7 @@ SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) {
- #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
return vdot_s32(r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return simde_vadd_s32(r, simde_vmovn_s64(simde_vpaddlq_s32(simde_vpaddlq_s16(simde_vmull_s8(a, b)))));
@@ -67,7 +67,7 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) {
return simde_vadd_s32(r, simde_int32x2_from_private(r_));
#endif
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdot_s32
#define vdot_s32(r, a, b) simde_vdot_s32((r), (a), (b))
#endif
@@ -75,7 +75,7 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x2_t
simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) {
- #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
return vdot_u32(r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return simde_vadd_u32(r, simde_vmovn_u64(simde_vpaddlq_u32(simde_vpaddlq_u16(simde_vmull_u8(a, b)))));
@@ -97,7 +97,7 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) {
return simde_vadd_u32(r, simde_uint32x2_from_private(r_));
#endif
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdot_u32
#define vdot_u32(r, a, b) simde_vdot_u32((r), (a), (b))
#endif
@@ -105,7 +105,7 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) {
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x4_t
simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) {
- #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
return vdotq_s32(r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return simde_vaddq_s32(r,
@@ -128,7 +128,7 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) {
return simde_vaddq_s32(r, simde_int32x4_from_private(r_));
#endif
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdotq_s32
#define vdotq_s32(r, a, b) simde_vdotq_s32((r), (a), (b))
#endif
@@ -136,7 +136,7 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint32x4_t
simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) {
- #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
return vdotq_u32(r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return simde_vaddq_u32(r,
@@ -159,11 +159,64 @@ simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) {
return simde_vaddq_u32(r, simde_uint32x4_from_private(r_));
#endif
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdotq_u32
#define vdotq_u32(r, a, b) simde_vdotq_u32((r), (a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vbfdot_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ return vbfdot_f32(r, a, b);
+ #else
+ simde_float32x2_private r_ = simde_float32x2_to_private(r);
+ simde_bfloat16x4_private
+ a_ = simde_bfloat16x4_to_private(a),
+ b_ = simde_bfloat16x4_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]);
+ simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]);
+ simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * i + 0]);
+ simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * i + 1]);
+ r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b;
+ }
+ return simde_float32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfdot_f32
+ #define vbfdot_f32(r, a, b) simde_vbfdot_f32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfdotq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ return vbfdotq_f32(r, a, b);
+ #else
+ simde_float32x4_private r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]);
+ simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]);
+ simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * i + 0]);
+ simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * i + 1]);
+ r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b;
+ }
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfdotq_f32
+ #define vbfdotq_f32(r, a, b) simde_vbfdotq_f32((r), (a), (b))
+#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
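The new vbfdot fallbacks widen each bfloat16 element to float32 and accumulate one adjacent pair per output lane, i.e. r[i] += a[2i]*b[2i] + a[2i+1]*b[2i+1]. A self-contained sketch of that rule (hypothetical helper names; bfloat16 values are carried as raw uint16_t bits):

#include <stdint.h>
#include <string.h>

/* A bfloat16 is the top 16 bits of an IEEE binary32, so widening is a shift. */
static float bf16_to_f32(uint16_t h) {
  uint32_t bits = (uint32_t) h << 16;
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}

/* Per-lane rule of the simde_vbfdot_f32 fallback above: each of the two
 * float32 result lanes accumulates the dot product of one adjacent pair. */
static void bfdot_f32(float r[2], const uint16_t a[4], const uint16_t b[4]) {
  for (int i = 0; i < 2; i++) {
    r[i] += bf16_to_f32(a[2 * i])     * bf16_to_f32(b[2 * i])
          + bf16_to_f32(a[2 * i + 1]) * bf16_to_f32(b[2 * i + 1]);
  }
}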
diff --git a/lib/simd_wrapper/simde/arm/neon/dot_lane.h b/lib/simd_wrapper/simde/arm/neon/dot_lane.h
index 84f706948bd..a7d570b4ab9 100644
--- a/lib/simd_wrapper/simde/arm/neon/dot_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/dot_lane.h
@@ -45,7 +45,7 @@ simde_int32x2_t
simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
simde_int32x2_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_2_(vdot_lane_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_int32x2_t
@@ -86,7 +86,7 @@ simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdot_lane_s32
#define vdot_lane_s32(r, a, b, lane) simde_vdot_lane_s32((r), (a), (b), (lane))
#endif
@@ -96,7 +96,7 @@ simde_uint32x2_t
simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
simde_uint32x2_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_2_(vdot_lane_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_uint32x2_t
@@ -137,7 +137,7 @@ simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, co
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdot_lane_u32
#define vdot_lane_u32(r, a, b, lane) simde_vdot_lane_u32((r), (a), (b), (lane))
#endif
@@ -147,7 +147,7 @@ simde_int32x2_t
simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
simde_int32x2_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_4_(vdot_laneq_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_int32x2_t b_lane;
@@ -186,7 +186,7 @@ simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, con
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdot_laneq_s32
#define vdot_laneq_s32(r, a, b, lane) simde_vdot_laneq_s32((r), (a), (b), (lane))
#endif
@@ -196,7 +196,7 @@ simde_uint32x2_t
simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
simde_uint32x2_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_4_(vdot_laneq_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_uint32x2_t b_lane;
@@ -234,7 +234,7 @@ simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b,
#endif
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdot_laneq_u32
#define vdot_laneq_u32(r, a, b, lane) simde_vdot_laneq_u32((r), (a), (b), (lane))
#endif
@@ -244,7 +244,7 @@ simde_uint32x4_t
simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
simde_uint32x4_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_4_(vdotq_laneq_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_uint32x4_t
@@ -296,7 +296,7 @@ simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b
#endif
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdotq_laneq_u32
#define vdotq_laneq_u32(r, a, b, lane) simde_vdotq_laneq_u32((r), (a), (b), (lane))
#endif
@@ -306,7 +306,7 @@ simde_int32x4_t
simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
simde_int32x4_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_4_(vdotq_laneq_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_int32x4_t
@@ -358,7 +358,7 @@ simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, c
#endif
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdotq_laneq_s32
#define vdotq_laneq_s32(r, a, b, lane) simde_vdotq_laneq_s32((r), (a), (b), (lane))
#endif
@@ -368,7 +368,7 @@ simde_uint32x4_t
simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
simde_uint32x4_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_2_(vdotq_lane_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_uint32x2_t
@@ -419,7 +419,7 @@ simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b,
#endif
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdotq_lane_u32
#define vdotq_lane_u32(r, a, b, lane) simde_vdotq_lane_u32((r), (a), (b), (lane))
#endif
@@ -429,7 +429,7 @@ simde_int32x4_t
simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, const int lane)
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
simde_int32x4_t result;
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD)
SIMDE_CONSTIFY_2_(vdotq_lane_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
simde_int32x2_t
@@ -480,11 +480,137 @@ simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, con
#endif
return result;
}
-#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD))
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
#undef vdotq_lane_s32
#define vdotq_lane_s32(r, a, b, lane) simde_vdotq_lane_s32((r), (a), (b), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vbfdot_lane_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float32x2_t result;
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ SIMDE_CONSTIFY_2_(vbfdot_lane_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
+ #else
+ simde_float32x2_private r_ = simde_float32x2_to_private(r);
+ simde_bfloat16x4_private
+ a_ = simde_bfloat16x4_to_private(a),
+ b_ = simde_bfloat16x4_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]);
+ simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]);
+ simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]);
+ simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]);
+ r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b;
+ }
+
+ result = simde_float32x2_from_private(r_);
+ #endif
+
+ return result;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfdot_lane_f32
+ #define vbfdot_lane_f32(r, a, b, lane) simde_vbfdot_lane_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfdotq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float32x4_t result;
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ SIMDE_CONSTIFY_2_(vbfdotq_lane_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
+ #else
+ simde_float32x4_private r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+ simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]);
+ simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]);
+ simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]);
+ simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]);
+ r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b;
+ }
+
+ result = simde_float32x4_from_private(r_);
+ #endif
+
+ return result;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfdotq_lane_f32
+ #define vbfdotq_lane_f32(r, a, b, lane) simde_vbfdotq_lane_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vbfdot_laneq_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x2_t result;
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ SIMDE_CONSTIFY_4_(vbfdot_laneq_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
+ #else
+ simde_float32x2_private r_ = simde_float32x2_to_private(r);
+ simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a);
+ simde_bfloat16x8_private b_ = simde_bfloat16x8_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]);
+ simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]);
+ simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]);
+ simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]);
+ r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b;
+ }
+
+ result = simde_float32x2_from_private(r_);
+ #endif
+
+ return result;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfdot_laneq_f32
+ #define vbfdot_laneq_f32(r, a, b, lane) simde_vbfdot_laneq_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfdotq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_t result;
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ SIMDE_CONSTIFY_4_(vbfdotq_laneq_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b);
+ #else
+ simde_float32x4_private r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]);
+ simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]);
+ simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]);
+ simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]);
+ r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b;
+ }
+
+ result = simde_float32x4_from_private(r_);
+ #endif
+
+ return result;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfdotq_laneq_f32
+ #define vbfdotq_laneq_f32(r, a, b, lane) simde_vbfdotq_laneq_f32((r), (a), (b), (lane))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
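The _lane/_laneq vbfdot variants differ only in that every output lane reuses the single element pair of b selected by the constant lane index, while a's pair still varies per lane. A sketch under the same assumptions as the dot.h example, with the widening helper repeated so it stays self-contained:

#include <stdint.h>
#include <string.h>

static float bf16_to_f32(uint16_t h) { /* bf16 = top half of a binary32 */
  uint32_t bits = (uint32_t) h << 16;
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;
}

/* simde_vbfdot_lane_f32 fallback rule: b's pair is fixed by lane (0 or 1
 * for a four-element vector), a's pair advances with the result lane. */
static void bfdot_lane_f32(float r[2], const uint16_t a[4],
                           const uint16_t b[4], int lane) {
  for (int i = 0; i < 2; i++) {
    r[i] += bf16_to_f32(a[2 * i])     * bf16_to_f32(b[2 * lane])
          + bf16_to_f32(a[2 * i + 1]) * bf16_to_f32(b[2 * lane + 1]);
  }
}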
diff --git a/lib/simd_wrapper/simde/arm/neon/dup_lane.h b/lib/simd_wrapper/simde/arm/neon/dup_lane.h
index bc1720518a4..44db662be64 100644
--- a/lib/simd_wrapper/simde/arm/neon/dup_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/dup_lane.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020-2021 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_DUP_LANE_H)
@@ -146,6 +147,59 @@ simde_vdupd_lane_u64(simde_uint64x1_t vec, const int lane)
#define vdupd_lane_u64(vec, lane) simde_vdupd_lane_u64((vec), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vduph_lane_f16(simde_float16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_float16x4_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vduph_lane_f16(vec, lane) vduph_lane_f16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_lane_f16
+ #define vduph_lane_f16(vec, lane) simde_vduph_lane_f16((vec), (lane))
+#endif
+
+// simde_vdup_lane_f16
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vdup_lane_f16(vec, lane) vdup_lane_f16(vec, lane)
+#else
+ #define simde_vdup_lane_f16(vec, lane) simde_vdup_n_f16(simde_vduph_lane_f16(vec, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_lane_f16
+ #define vdup_lane_f16(vec, lane) simde_vdup_lane_f16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vdup_laneq_f16(simde_float16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdup_n_f16(simde_float16x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vdup_laneq_f16(vec, lane) vdup_laneq_f16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_laneq_f16
+ #define vdup_laneq_f16(vec, lane) simde_vdup_laneq_f16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vdupq_lane_f16(simde_float16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vdupq_n_f16(simde_float16x4_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vdupq_lane_f16(vec, lane) vdupq_lane_f16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_lane_f16
+ #define vdupq_lane_f16(vec, lane) simde_vdupq_lane_f16((vec), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float64_t
simde_vdupd_lane_f64(simde_float64x1_t vec, const int lane)
@@ -924,6 +978,20 @@ simde_vdupq_lane_u64(simde_uint64x1_t vec, const int lane)
#define vdupq_lane_u64(vec, lane) simde_vdupq_lane_u64((vec), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vdupq_laneq_f16(simde_float16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdupq_n_f16(simde_float16x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vdupq_laneq_f16(vec, lane) vdupq_laneq_f16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_laneq_f16
+ #define vdupq_laneq_f16(vec, lane) simde_vdupq_laneq_f16((vec), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vdupq_laneq_f32(simde_float32x4_t vec, const int lane)
@@ -1194,6 +1262,437 @@ simde_vdupq_laneq_u64(simde_uint64x2_t vec, const int lane)
#define vdupq_laneq_u64(vec, lane) simde_vdupq_laneq_u64((vec), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+int8_t
+simde_vdupb_lane_s8(simde_int8x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_int8x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupb_lane_s8(vec, lane) vdupb_lane_s8(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupb_lane_s8
+ #define vdupb_lane_s8(vec, lane) simde_vdupb_lane_s8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint8_t
+simde_vdupb_lane_u8(simde_uint8x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_uint8x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupb_lane_u8(vec, lane) vdupb_lane_u8(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupb_lane_u8
+ #define vdupb_lane_u8(vec, lane) simde_vdupb_lane_u8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int8_t
+simde_vdupb_laneq_s8(simde_int8x16_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ return simde_int8x16_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupb_laneq_s8(vec, lane) vdupb_laneq_s8(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupb_laneq_s8
+ #define vdupb_laneq_s8(vec, lane) simde_vdupb_laneq_s8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint8_t
+simde_vdupb_laneq_u8(simde_uint8x16_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ return simde_uint8x16_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupb_laneq_u8(vec, lane) vdupb_laneq_u8(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupb_laneq_u8
+ #define vdupb_laneq_u8(vec, lane) simde_vdupb_laneq_u8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int16_t
+simde_vduph_lane_s16(simde_int16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_int16x4_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vduph_lane_s16(vec, lane) vduph_lane_s16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_lane_s16
+ #define vduph_lane_s16(vec, lane) simde_vduph_lane_s16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vduph_lane_u16(simde_uint16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_uint16x4_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vduph_lane_u16(vec, lane) vduph_lane_u16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_lane_u16
+ #define vduph_lane_u16(vec, lane) simde_vduph_lane_u16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int16_t
+simde_vduph_laneq_s16(simde_int16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_int16x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vduph_laneq_s16(vec, lane) vduph_laneq_s16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_laneq_s16
+ #define vduph_laneq_s16(vec, lane) simde_vduph_laneq_s16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+uint16_t
+simde_vduph_laneq_u16(simde_uint16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_uint16x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vduph_laneq_u16(vec, lane) vduph_laneq_u16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_laneq_u16
+ #define vduph_laneq_u16(vec, lane) simde_vduph_laneq_u16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vduph_laneq_f16(simde_float16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_float16x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vduph_laneq_f16(vec, lane) vduph_laneq_f16(vec, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_laneq_f16
+ #define vduph_laneq_f16(vec, lane) simde_vduph_laneq_f16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vdup_lane_p8(simde_poly8x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdup_n_p8(simde_poly8x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vdup_lane_p8(vec, lane) vdup_lane_p8((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_lane_p8
+ #define vdup_lane_p8(vec, lane) simde_vdup_lane_p8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vdup_lane_p16(simde_poly16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vdup_n_p16(simde_poly16x4_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vdup_lane_p16(vec, lane) vdup_lane_p16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_lane_p16
+ #define vdup_lane_p16(vec, lane) simde_vdup_lane_p16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vdup_lane_p64(simde_poly64x1_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ return simde_vdup_n_p64(simde_poly64x1_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ #define simde_vdup_lane_p64(vec, lane) vdup_lane_p64((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_lane_p64
+ #define vdup_lane_p64(vec, lane) simde_vdup_lane_p64((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vdup_laneq_p8(simde_poly8x16_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ return simde_vdup_n_p8(simde_poly8x16_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdup_laneq_p8(vec, lane) vdup_laneq_p8((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_laneq_p8
+ #define vdup_laneq_p8(vec, lane) simde_vdup_laneq_p8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vdup_laneq_p16(simde_poly16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdup_n_p16(simde_poly16x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdup_laneq_p16(vec, lane) vdup_laneq_p16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_laneq_p16
+ #define vdup_laneq_p16(vec, lane) simde_vdup_laneq_p16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vdup_laneq_p64(simde_poly64x2_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return simde_vdup_n_p64(simde_poly64x2_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdup_laneq_p64(vec, lane) vdup_laneq_p64((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_laneq_p64
+ #define vdup_laneq_p64(vec, lane) simde_vdup_laneq_p64((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vdupq_lane_p8(simde_poly8x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdupq_n_p8(simde_poly8x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vdupq_lane_p8(vec, lane) vdupq_lane_p8((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_lane_p8
+ #define vdupq_lane_p8(vec, lane) simde_vdupq_lane_p8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vdupq_lane_p16(simde_poly16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vdupq_n_p16(simde_poly16x4_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vdupq_lane_p16(vec, lane) vdupq_lane_p16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_lane_p16
+ #define vdupq_lane_p16(vec, lane) simde_vdupq_lane_p16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vdupq_lane_p64(simde_poly64x1_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ return simde_vdupq_n_p64(simde_poly64x1_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ #define simde_vdupq_lane_p64(vec, lane) vdupq_lane_p64((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_lane_p64
+ #define vdupq_lane_p64(vec, lane) simde_vdupq_lane_p64((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vdupq_laneq_p8(simde_poly8x16_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ return simde_vdupq_n_p8(simde_poly8x16_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupq_laneq_p8(vec, lane) vdupq_laneq_p8((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_laneq_p8
+ #define vdupq_laneq_p8(vec, lane) simde_vdupq_laneq_p8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vdupq_laneq_p16(simde_poly16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdupq_n_p16(simde_poly16x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupq_laneq_p16(vec, lane) vdupq_laneq_p16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_laneq_p16
+ #define vdupq_laneq_p16(vec, lane) simde_vdupq_laneq_p16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vdupq_laneq_p64(simde_poly64x2_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return simde_vdupq_n_p64(simde_poly64x2_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vdupq_laneq_p64(vec, lane) vdupq_laneq_p64((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_laneq_p64
+ #define vdupq_laneq_p64(vec, lane) simde_vdupq_laneq_p64((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8_t
+simde_vdupb_lane_p8(simde_poly8x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_poly8x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vdupb_lane_p8(vec, lane) vdupb_lane_p8((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupb_lane_p8
+ #define vdupb_lane_p8(vec, lane) simde_vdupb_lane_p8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8_t
+simde_vdupb_laneq_p8(simde_poly8x16_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ return simde_poly8x16_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vdupb_laneq_p8(vec, lane) vdupb_laneq_p8((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupb_laneq_p8
+ #define vdupb_laneq_p8(vec, lane) simde_vdupb_laneq_p8((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16_t
+simde_vduph_lane_p16(simde_poly16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_poly16x4_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vduph_lane_p16(vec, lane) vduph_lane_p16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_lane_p16
+ #define vduph_lane_p16(vec, lane) simde_vduph_lane_p16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16_t
+simde_vduph_laneq_p16(simde_poly16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_poly16x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vduph_laneq_p16(vec, lane) vduph_laneq_p16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_laneq_p16
+ #define vduph_laneq_p16(vec, lane) simde_vduph_laneq_p16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16_t
+simde_vduph_lane_bf16(simde_bfloat16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_bfloat16x4_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+  #define simde_vduph_lane_bf16(vec, lane) vduph_lane_bf16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_lane_bf16
+ #define vduph_lane_bf16(vec, lane) simde_vduph_lane_bf16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16_t
+simde_vduph_laneq_bf16(simde_bfloat16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_bfloat16x8_to_private(vec).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+  #define simde_vduph_laneq_bf16(vec, lane) vduph_laneq_bf16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vduph_laneq_bf16
+ #define vduph_laneq_bf16(vec, lane) simde_vduph_laneq_bf16((vec), (lane))
+#endif
+
+// simde_vdup_lane_bf16
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+  #define simde_vdup_lane_bf16(vec, lane) vdup_lane_bf16((vec), (lane))
+#else
+  #define simde_vdup_lane_bf16(vec, lane) simde_vdup_n_bf16(simde_vduph_lane_bf16((vec), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_lane_bf16
+ #define vdup_lane_bf16(vec, lane) simde_vdup_lane_bf16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vdup_laneq_bf16(simde_bfloat16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdup_n_bf16(simde_bfloat16x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+  #define simde_vdup_laneq_bf16(vec, lane) vdup_laneq_bf16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_laneq_bf16
+ #define vdup_laneq_bf16(vec, lane) simde_vdup_laneq_bf16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vdupq_lane_bf16(simde_bfloat16x4_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vdupq_n_bf16(simde_bfloat16x4_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+  #define simde_vdupq_lane_bf16(vec, lane) vdupq_lane_bf16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_lane_bf16
+ #define vdupq_lane_bf16(vec, lane) simde_vdupq_lane_bf16((vec), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vdupq_laneq_bf16(simde_bfloat16x8_t vec, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vdupq_n_bf16(simde_bfloat16x8_to_private(vec).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+  #define simde_vdupq_laneq_bf16(vec, lane) vdupq_laneq_bf16((vec), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_laneq_bf16
+ #define vdupq_laneq_bf16(vec, lane) simde_vdupq_laneq_bf16((vec), (lane))
+#endif
+
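+/* Editor's note: a minimal usage sketch for the poly lane-dup wrappers added
+ * above. It is compiled only when the hypothetical, editor-defined
+ * SIMDE_EXAMPLE_SNIPPETS macro is set, so the header's API surface is
+ * unchanged. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_example_broadcast_lane2_p16(simde_poly16x4_t v) {
+  /* Every lane of the result holds lane 2 of v; on ARMv7 NEON builds this
+   * lowers to a single vdupq_lane_p16. */
+  return simde_vdupq_lane_p16(v, 2);
+}
+#endif
+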
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/dup_n.h b/lib/simd_wrapper/simde/arm/neon/dup_n.h
index e945e99c902..365293edf87 100644
--- a/lib/simd_wrapper/simde/arm/neon/dup_n.h
+++ b/lib/simd_wrapper/simde/arm/neon/dup_n.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Sean Maher (Copyright owned by Google, LLC)
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_DUP_N_H)
@@ -36,7 +37,7 @@ SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x4_t
-simde_vdup_n_f16(simde_float16 value) {
+simde_vdup_n_f16(simde_float16_t value) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
return vdup_n_f16(value);
#else
@@ -324,7 +325,7 @@ simde_vdup_n_u64(uint64_t value) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x8_t
-simde_vdupq_n_f16(simde_float16 value) {
+simde_vdupq_n_f16(simde_float16_t value) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
return vdupq_n_f16(value);
#else
@@ -338,7 +339,7 @@ simde_vdupq_n_f16(simde_float16 value) {
return simde_float16x8_from_private(r_);
#endif
}
-#define simde_vmovq_n_f32 simde_vdupq_n_f32
+#define simde_vmovq_n_f16 simde_vdupq_n_f16
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vdupq_n_f16
#define vdupq_n_f16(value) simde_vdupq_n_f16((value))
@@ -668,6 +669,186 @@ simde_vdupq_n_u64(uint64_t value) {
#define vmovq_n_u64(value) simde_vmovq_n_u64((value))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vdup_n_p8(simde_poly8_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vdup_n_p8(value);
+ #else
+ simde_poly8x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#define simde_vmov_n_p8 simde_vdup_n_p8
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_n_p8
+ #define vdup_n_p8(value) simde_vdup_n_p8((value))
+ #undef vmov_n_p8
+ #define vmov_n_p8(value) simde_vmov_n_p8((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vdup_n_p16(simde_poly16_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vdup_n_p16(value);
+ #else
+ simde_poly16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#define simde_vmov_n_p16 simde_vdup_n_p16
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdup_n_p16
+ #define vdup_n_p16(value) simde_vdup_n_p16((value))
+ #undef vmov_n_p16
+ #define vmov_n_p16(value) simde_vmov_n_p16((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vdup_n_p64(simde_poly64_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vdup_n_p64(value);
+ #else
+ simde_poly64x1_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_n_p64
+ #define vdup_n_p64(value) simde_vdup_n_p64((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vdupq_n_p8(simde_poly8_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vdupq_n_p8(value);
+ #else
+ simde_poly8x16_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#define simde_vmovq_n_p8 simde_vdupq_n_p8
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_n_p8
+ #define vdupq_n_p8(value) simde_vdupq_n_p8((value))
+ #undef vmovq_n_p8
+ #define vmovq_n_p8(value) simde_vmovq_n_p8((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vdupq_n_p16(simde_poly16_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vdupq_n_p16(value);
+ #else
+ simde_poly16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_poly16x8_from_private(r_);
+ #endif
+}
+#define simde_vmovq_n_p16 simde_vdupq_n_p16
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_n_p16
+ #define vdupq_n_p16(value) simde_vdupq_n_p16((value))
+ #undef vmovq_n_p16
+ #define vmovq_n_p16(value) simde_vmovq_n_p16((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vdupq_n_p64(simde_poly64_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vdupq_n_p64(value);
+ #else
+ simde_poly64x2_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_poly64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_n_p64
+ #define vdupq_n_p64(value) simde_vdupq_n_p64((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vdup_n_bf16(simde_bfloat16_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vdup_n_bf16(value);
+ #else
+ simde_bfloat16x4_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_bfloat16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdup_n_bf16
+ #define vdup_n_bf16(value) simde_vdup_n_bf16((value))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vdupq_n_bf16(simde_bfloat16_t value) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vdupq_n_bf16(value);
+ #else
+ simde_bfloat16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = value;
+ }
+
+ return simde_bfloat16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vdupq_n_bf16
+ #define vdupq_n_bf16(value) simde_vdupq_n_bf16((value))
+#endif
+
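+/* Editor's note: a hedged sketch of the new poly splat wrappers, guarded by
+ * the hypothetical, editor-defined SIMDE_EXAMPLE_SNIPPETS macro. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_example_splat_p8(simde_poly8_t value) {
+  /* vmovq_n_p8 is an alias of vdupq_n_p8; both fill all 16 lanes. */
+  return simde_vmovq_n_p8(value);
+}
+#endif
+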
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/eor.h b/lib/simd_wrapper/simde/arm/neon/eor.h
index bf5a66d3b6a..9514760251c 100644
--- a/lib/simd_wrapper/simde/arm/neon/eor.h
+++ b/lib/simd_wrapper/simde/arm/neon/eor.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_EOR_H)
@@ -546,6 +547,207 @@ simde_veorq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
#define veorq_u64(a, b) simde_veorq_u64((a), (b))
#endif
+// Note: the EOR3 instructions are available only when FEAT_SHA3 is implemented.
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16_t
+simde_veor3q_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_s8(a, b, c);
+ #else
+ simde_int8x16_private
+ r_,
+ a_ = simde_int8x16_to_private(a),
+ b_ = simde_int8x16_to_private(b),
+ c_ = simde_int8x16_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_int8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_s8
+ #define veor3q_s8(a, b, c) simde_veor3q_s8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_veor3q_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_s16(a, b, c);
+ #else
+ simde_int16x8_private
+ r_,
+ a_ = simde_int16x8_to_private(a),
+ b_ = simde_int16x8_to_private(b),
+ c_ = simde_int16x8_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_int16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_s16
+ #define veor3q_s16(a, b, c) simde_veor3q_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_veor3q_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_s32(a, b, c);
+ #else
+ simde_int32x4_private
+ r_,
+ a_ = simde_int32x4_to_private(a),
+ b_ = simde_int32x4_to_private(b),
+ c_ = simde_int32x4_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_int32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_s32
+ #define veor3q_s32(a, b, c) simde_veor3q_s32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_veor3q_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_s64(a, b, c);
+ #else
+ simde_int64x2_private
+ r_,
+ a_ = simde_int64x2_to_private(a),
+ b_ = simde_int64x2_to_private(b),
+ c_ = simde_int64x2_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_int64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_s64
+ #define veor3q_s64(a, b, c) simde_veor3q_s64((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_veor3q_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_u8(a, b, c);
+ #else
+ simde_uint8x16_private
+ r_,
+ a_ = simde_uint8x16_to_private(a),
+ b_ = simde_uint8x16_to_private(b),
+ c_ = simde_uint8x16_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_uint8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_u8
+ #define veor3q_u8(a, b, c) simde_veor3q_u8((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_veor3q_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_u16(a, b, c);
+ #else
+ simde_uint16x8_private
+ r_,
+ a_ = simde_uint16x8_to_private(a),
+ b_ = simde_uint16x8_to_private(b),
+ c_ = simde_uint16x8_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_uint16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_u16
+ #define veor3q_u16(a, b, c) simde_veor3q_u16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_veor3q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_u32(a, b, c);
+ #else
+ simde_uint32x4_private
+ r_,
+ a_ = simde_uint32x4_to_private(a),
+ b_ = simde_uint32x4_to_private(b),
+ c_ = simde_uint32x4_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_uint32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_u32
+ #define veor3q_u32(a, b, c) simde_veor3q_u32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_veor3q_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3)
+ return veor3q_u64(a, b, c);
+ #else
+ simde_uint64x2_private
+ r_,
+ a_ = simde_uint64x2_to_private(a),
+ b_ = simde_uint64x2_to_private(b),
+ c_ = simde_uint64x2_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i];
+ }
+
+ return simde_uint64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef veor3q_u64
+ #define veor3q_u64(a, b, c) simde_veor3q_u64((a), (b), (c))
+#endif
+
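+/* Editor's note: veor3q_*(a, b, c) is semantically a ^ b ^ c, so the sketch
+ * below (guarded by the hypothetical, editor-defined SIMDE_EXAMPLE_SNIPPETS
+ * macro) shows the two-EOR equivalent used when FEAT_SHA3 is unavailable. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_example_eor3_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) {
+  /* Same result as simde_veor3q_u64(a, b, c). */
+  return simde_veorq_u64(simde_veorq_u64(a, b), c);
+}
+#endif
+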
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/ext.h b/lib/simd_wrapper/simde/arm/neon/ext.h
index 0768e9d1a77..45c5aa0f009 100644
--- a/lib/simd_wrapper/simde/arm/neon/ext.h
+++ b/lib/simd_wrapper/simde/arm/neon/ext.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_EXT_H)
@@ -33,6 +34,32 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 3) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ simde_float16x4_t r;
+ SIMDE_CONSTIFY_4_(vext_f16, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3];
+ }
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vext_f16
+ #define vext_f16(a, b, n) simde_vext_f16((a), (b), (n))
+#endif
+
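+/* Editor's note: vext_f16(a, b, n) behaves as if a and b were concatenated
+ * and a 4-lane window starting at lane n were extracted; for n == 1 the
+ * result is {a[1], a[2], a[3], b[0]}. Sketch guarded by the hypothetical,
+ * editor-defined SIMDE_EXAMPLE_SNIPPETS macro. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_example_ext1_f16(simde_float16x4_t a, simde_float16x4_t b) {
+  return simde_vext_f16(a, b, 1);
+}
+#endif
+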
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n)
@@ -54,7 +81,7 @@ simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n)
return simde_float32x2_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_f32(a, b, n) simde_float32x2_from_m64(_mm_alignr_pi8(simde_float32x2_to_m64(b), simde_float32x2_to_m64(a), n * sizeof(simde_float32)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_f32(a, b, n) (__extension__ ({ \
@@ -89,7 +116,7 @@ simde_vext_f64(simde_float64x1_t a, simde_float64x1_t b, const int n)
return simde_float64x1_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_f64(a, b, n) simde_float64x1_from_m64(_mm_alignr_pi8(simde_float64x1_to_m64(b), simde_float64x1_to_m64(a), n * sizeof(simde_float64)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_f64(a, b, n) (__extension__ ({ \
@@ -125,7 +152,7 @@ simde_vext_s8(simde_int8x8_t a, simde_int8x8_t b, const int n)
return simde_int8x8_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s8(a, b, n) simde_int8x8_from_m64(_mm_alignr_pi8(simde_int8x8_to_m64(b), simde_int8x8_to_m64(a), n * sizeof(int8_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_s8(a, b, n) (__extension__ ({ \
@@ -164,7 +191,7 @@ simde_vext_s16(simde_int16x4_t a, simde_int16x4_t b, const int n)
return simde_int16x4_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s16(a, b, n) simde_int16x4_from_m64(_mm_alignr_pi8(simde_int16x4_to_m64(b), simde_int16x4_to_m64(a), n * sizeof(int16_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_s16(a, b, n) (__extension__ ({ \
@@ -201,7 +228,7 @@ simde_vext_s32(simde_int32x2_t a, simde_int32x2_t b, const int n)
return simde_int32x2_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s32(a, b, n) simde_int32x2_from_m64(_mm_alignr_pi8(simde_int32x2_to_m64(b), simde_int32x2_to_m64(a), n * sizeof(int32_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_s32(a, b, n) (__extension__ ({ \
@@ -236,7 +263,7 @@ simde_vext_s64(simde_int64x1_t a, simde_int64x1_t b, const int n)
return simde_int64x1_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_s64(a, b, n) simde_int64x1_from_m64(_mm_alignr_pi8(simde_int64x1_to_m64(b), simde_int64x1_to_m64(a), n * sizeof(int64_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_s64(a, b, n) (__extension__ ({ \
@@ -272,7 +299,7 @@ simde_vext_u8(simde_uint8x8_t a, simde_uint8x8_t b, const int n)
return simde_uint8x8_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u8(a, b, n) simde_uint8x8_from_m64(_mm_alignr_pi8(simde_uint8x8_to_m64(b), simde_uint8x8_to_m64(a), n * sizeof(uint8_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_u8(a, b, n) (__extension__ ({ \
@@ -311,7 +338,7 @@ simde_vext_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int n)
return simde_uint16x4_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u16(a, b, n) simde_uint16x4_from_m64(_mm_alignr_pi8(simde_uint16x4_to_m64(b), simde_uint16x4_to_m64(a), n * sizeof(uint16_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_u16(a, b, n) (__extension__ ({ \
@@ -348,7 +375,7 @@ simde_vext_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int n)
return simde_uint32x2_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u32(a, b, n) simde_uint32x2_from_m64(_mm_alignr_pi8(simde_uint32x2_to_m64(b), simde_uint32x2_to_m64(a), n * sizeof(uint32_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760)
#define simde_vext_u32(a, b, n) (__extension__ ({ \
@@ -383,7 +410,7 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n)
return simde_uint64x1_from_private(r_);
#endif
}
-#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
+#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vext_u64(a, b, n) simde_uint64x1_from_m64(_mm_alignr_pi8(simde_uint64x1_to_m64(b), simde_uint64x1_to_m64(a), n * sizeof(uint64_t)))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vext_u64(a, b, n) (__extension__ ({ \
@@ -398,6 +425,32 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n)
#define vext_u64(a, b, n) simde_vext_u64((a), (b), (n))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ simde_float16x8_t r;
+ SIMDE_CONSTIFY_8_(vextq_f16, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7];
+ }
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vextq_f16
+ #define vextq_f16(a, b, n) simde_vextq_f16((a), (b), (n))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n)
@@ -420,7 +473,15 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n)
#endif
}
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
- #define simde_vextq_f32(a, b, n) simde_float32x4_from_m128(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(simde_float32x4_to_m128(b)), _mm_castps_si128(simde_float32x4_to_m128(a)), n * sizeof(simde_float32))))
+ #define simde_vextq_f32(a, b, n) simde_float32x4_from_m128(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(simde_float32x4_to_m128(b)), _mm_castps_si128(simde_float32x4_to_m128(a)), (n) * sizeof(simde_float32))))
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define simde_vextq_f32(a, b, n) (__extension__ ({ \
+ simde_float32x4_private simde_vextq_f32_r_; \
+ simde_vextq_f32_r_.v128 = wasm_i32x4_shuffle(simde_float32x4_to_private(a).v128, simde_float32x4_to_private(b).v128, \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \
+ simde_float32x4_from_private(simde_vextq_f32_r_); \
+ }))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_f32(a, b, n) (__extension__ ({ \
simde_float32x4_private simde_vextq_f32_r_; \
@@ -457,7 +518,14 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n)
#endif
}
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
- #define simde_vextq_f64(a, b, n) simde_float64x2_from_m128d(_mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(simde_float64x2_to_m128d(b)), _mm_castpd_si128(simde_float64x2_to_m128d(a)), n * sizeof(simde_float64))))
+ #define simde_vextq_f64(a, b, n) simde_float64x2_from_m128d(_mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(simde_float64x2_to_m128d(b)), _mm_castpd_si128(simde_float64x2_to_m128d(a)), (n) * sizeof(simde_float64))))
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define simde_vextq_f64(a, b, n) (__extension__ ({ \
+ simde_float64x2_private simde_vextq_f64_r_; \
+ simde_vextq_f64_r_.v128 = wasm_i64x2_shuffle(simde_float64x2_to_private(a).v128, simde_float64x2_to_private(b).v128, \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \
+ simde_float64x2_from_private(simde_vextq_f64_r_); \
+ }))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_f64(a, b, n) (__extension__ ({ \
simde_float64x2_private simde_vextq_f64_r_; \
@@ -494,6 +562,20 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n)
}
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s8(a, b, n) simde_int8x16_from_m128i(_mm_alignr_epi8(simde_int8x16_to_m128i(b), simde_int8x16_to_m128i(a), n * sizeof(int8_t)))
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define simde_vextq_s8(a, b, n) (__extension__ ({ \
+ simde_int8x16_private simde_vextq_s8_r_; \
+ simde_vextq_s8_r_.v128 = wasm_i8x16_shuffle(simde_int8x16_to_private(a).v128, simde_int8x16_to_private(b).v128, \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 4)), HEDLEY_STATIC_CAST(int8_t, ((n) + 5)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 8)), HEDLEY_STATIC_CAST(int8_t, ((n) + 9)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 10)), HEDLEY_STATIC_CAST(int8_t, ((n) + 11)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 12)), HEDLEY_STATIC_CAST(int8_t, ((n) + 13)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 14)), HEDLEY_STATIC_CAST(int8_t, ((n) + 15))); \
+ simde_int8x16_from_private(simde_vextq_s8_r_); \
+ }))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s8(a, b, n) (__extension__ ({ \
simde_int8x16_private simde_vextq_s8_r_; \
@@ -537,6 +619,16 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n)
}
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s16(a, b, n) simde_int16x8_from_m128i(_mm_alignr_epi8(simde_int16x8_to_m128i(b), simde_int16x8_to_m128i(a), n * sizeof(int16_t)))
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define simde_vextq_s16(a, b, n) (__extension__ ({ \
+ simde_int16x8_private simde_vextq_s16_r_; \
+ simde_vextq_s16_r_.v128 = wasm_i16x8_shuffle(simde_int16x8_to_private(a).v128, simde_int16x8_to_private(b).v128, \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 4)), HEDLEY_STATIC_CAST(int8_t, ((n) + 5)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \
+ simde_int16x8_from_private(simde_vextq_s16_r_); \
+ }))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s16(a, b, n) (__extension__ ({ \
simde_int16x8_private simde_vextq_s16_r_; \
@@ -576,6 +668,14 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n)
}
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s32(a, b, n) simde_int32x4_from_m128i(_mm_alignr_epi8(simde_int32x4_to_m128i(b), simde_int32x4_to_m128i(a), n * sizeof(int32_t)))
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define simde_vextq_s32(a, b, n) (__extension__ ({ \
+ simde_int32x4_private simde_vextq_s32_r_; \
+ simde_vextq_s32_r_.v128 = wasm_i32x4_shuffle(simde_int32x4_to_private(a).v128, simde_int32x4_to_private(b).v128, \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \
+ simde_int32x4_from_private(simde_vextq_s32_r_); \
+ }))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s32(a, b, n) (__extension__ ({ \
simde_int32x4_private simde_vextq_s32_r_; \
@@ -613,6 +713,13 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n)
}
#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE)
#define simde_vextq_s64(a, b, n) simde_int64x2_from_m128i(_mm_alignr_epi8(simde_int64x2_to_m128i(b), simde_int64x2_to_m128i(a), n * sizeof(int64_t)))
+#elif defined(SIMDE_WASM_SIMD128_NATIVE)
+ #define simde_vextq_s64(a, b, n) (__extension__ ({ \
+ simde_int64x2_private simde_vextq_s64_r_; \
+ simde_vextq_s64_r_.v128 = wasm_i64x2_shuffle(simde_int64x2_to_private(a).v128, simde_int64x2_to_private(b).v128, \
+ HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \
+ simde_int64x2_from_private(simde_vextq_s64_r_); \
+ }))
#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32)
#define simde_vextq_s64(a, b, n) (__extension__ ({ \
simde_int64x2_private simde_vextq_s64_r_; \
@@ -790,6 +897,161 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n)
#define vextq_u64(a, b, n) simde_vextq_u64((a), (b), (n))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vext_p8(simde_poly8x8_t a, simde_poly8x8_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ simde_poly8x8_t r;
+ SIMDE_CONSTIFY_8_(vext_p8, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_poly8x8_private
+ a_ = simde_poly8x8_to_private(a),
+ b_ = simde_poly8x8_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7];
+ }
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vext_p8
+ #define vext_p8(a, b, n) simde_vext_p8((a), (b), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vext_p16(simde_poly16x4_t a, simde_poly16x4_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 3) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ simde_poly16x4_t r;
+ SIMDE_CONSTIFY_4_(vext_p16, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_poly16x4_private
+ a_ = simde_poly16x4_to_private(a),
+ b_ = simde_poly16x4_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3];
+ }
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vext_p16
+ #define vext_p16(a, b, n) simde_vext_p16((a), (b), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vext_p64(simde_poly64x1_t a, simde_poly64x1_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 0) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ (void) n;
+ return vext_p64(a, b, 0);
+ #else
+ simde_poly64x1_private
+ a_ = simde_poly64x1_to_private(a),
+ b_ = simde_poly64x1_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0];
+ }
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vext_p64
+ #define vext_p64(a, b, n) simde_vext_p64((a), (b), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vextq_p8(simde_poly8x16_t a, simde_poly8x16_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ simde_poly8x16_t r;
+ SIMDE_CONSTIFY_16_(vextq_p8, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_poly8x16_private
+ a_ = simde_poly8x16_to_private(a),
+ b_ = simde_poly8x16_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15];
+ }
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vextq_p8
+ #define vextq_p8(a, b, n) simde_vextq_p8((a), (b), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vextq_p16(simde_poly16x8_t a, simde_poly16x8_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ simde_poly16x8_t r;
+ SIMDE_CONSTIFY_8_(vextq_p16, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_poly16x8_private
+ a_ = simde_poly16x8_to_private(a),
+ b_ = simde_poly16x8_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7];
+ }
+ return simde_poly16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vextq_p16
+ #define vextq_p16(a, b, n) simde_vextq_p16((a), (b), (n))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vextq_p64(simde_poly64x2_t a, simde_poly64x2_t b, const int n)
+ SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 1) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ simde_poly64x2_t r;
+ SIMDE_CONSTIFY_2_(vextq_p64, r, (HEDLEY_UNREACHABLE(), a), n, a, b);
+ return r;
+ #else
+ simde_poly64x2_private
+ a_ = simde_poly64x2_to_private(a),
+ b_ = simde_poly64x2_to_private(b),
+ r_ = a_;
+ const size_t n_ = HEDLEY_STATIC_CAST(size_t, n);
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ size_t src = i + n_;
+ r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 1];
+ }
+ return simde_poly64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vextq_p64
+ #define vextq_p64(a, b, n) simde_vextq_p64((a), (b), (n))
+#endif
+
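+/* Editor's note: for the single-lane poly64 vector the only legal shift is
+ * n == 0, so vext_p64(a, b, 0) simply returns a. Guarded by the hypothetical,
+ * editor-defined SIMDE_EXAMPLE_SNIPPETS macro. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_example_ext0_p64(simde_poly64x1_t a, simde_poly64x1_t b) {
+  return simde_vext_p64(a, b, 0);
+}
+#endif
+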
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/fma.h b/lib/simd_wrapper/simde/arm/neon/fma.h
index 4ee30d1d677..aaf9e04e056 100644
--- a/lib/simd_wrapper/simde/arm/neon/fma.h
+++ b/lib/simd_wrapper/simde/arm/neon/fma.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Atharva Nimbalkar
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_FMA_H)
@@ -34,10 +35,24 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vfmah_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ return vfmah_f16(a, b, c);
+ #else
+ return simde_vaddh_f16(a, simde_vmulh_f16(b, c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmah_f16
+ #define vfmah_f16(a, b, c) simde_vfmah_f16(a, b, c)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) {
- #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
return vfma_f32(a, b, c);
#else
return simde_vadd_f32(a, simde_vmul_f32(b, c));
@@ -51,7 +66,7 @@ simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x1_t
simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
return vfma_f64(a, b, c);
#else
return simde_vadd_f64(a, simde_vmul_f64(b, c));
@@ -62,10 +77,38 @@ simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) {
#define vfma_f64(a, b, c) simde_vfma_f64(a, b, c)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vfma_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ return vfma_f16(a, b, c);
+ #else
+ return simde_vadd_f16(a, simde_vmul_f16(b, c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfma_f16
+ #define vfma_f16(a, b, c) simde_vfma_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ return vfmaq_f16(a, b, c);
+ #else
+ return simde_vaddq_f16(a, simde_vmulq_f16(b, c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmaq_f16
+ #define vfmaq_f16(a, b, c) simde_vfmaq_f16(a, b, c)
+#endif
+
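+/* Editor's note: on targets without native FMA the f16 wrappers above fall
+ * back to a separate multiply and add, so results can differ from a fused
+ * operation by one rounding step. Sketch guarded by the hypothetical,
+ * editor-defined SIMDE_EXAMPLE_SNIPPETS macro. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_example_fma_h(simde_float16_t a, simde_float16_t b, simde_float16_t c) {
+  /* Computes a + b * c (fused only when vfmah_f16 is available). */
+  return simde_vfmah_f16(a, b, c);
+}
+#endif
+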
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) {
- #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
return vfmaq_f32(a, b, c);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_madd(b, c, a);
@@ -94,7 +137,7 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x2_t
simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
return vfmaq_f64(a, b, c);
#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_madd(b, c, a);
diff --git a/lib/simd_wrapper/simde/arm/neon/fma_lane.h b/lib/simd_wrapper/simde/arm/neon/fma_lane.h
index 6100ed78ca0..e937f715cb3 100644
--- a/lib/simd_wrapper/simde/arm/neon/fma_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/fma_lane.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Atharva Nimbalkar
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_FMA_LANE_H)
@@ -38,7 +39,7 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
/* simde_vfmad_lane_f64 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
#define simde_vfmad_lane_f64(a, b, v, lane) \
SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_lane_f64(a, b, v, lane))
@@ -61,7 +62,7 @@ SIMDE_BEGIN_DECLS_
#endif
/* simde_vfmad_laneq_f64 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
#define simde_vfmad_laneq_f64(a, b, v, lane) \
SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_laneq_f64(a, b, v, lane))
@@ -83,8 +84,54 @@ SIMDE_BEGIN_DECLS_
#define vfmad_laneq_f64(a, b, v, lane) simde_vfmad_laneq_f64(a, b, v, lane)
#endif
+/* simde_vfmah_lane_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmah_lane_f16(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmah_lane_f16(a, b, v, lane))
+ #else
+ #define simde_vfmah_lane_f16(a, b, v, lane) vfmah_lane_f16((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmah_lane_f16(a, b, v, lane) \
+ simde_vget_lane_f16( \
+ simde_vadd_f16( \
+ simde_vdup_n_f16(a), \
+ simde_vdup_n_f16(simde_vmulh_lane_f16(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmah_lane_f16
+ #define vfmah_lane_f16(a, b, v, lane) simde_vfmah_lane_f16(a, b, v, lane)
+#endif
+
+/* simde_vfmah_laneq_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmah_laneq_f16(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmah_laneq_f16(a, b, v, lane))
+ #else
+ #define simde_vfmah_laneq_f16(a, b, v, lane) vfmah_laneq_f16((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmah_laneq_f16(a, b, v, lane) \
+ simde_vget_lane_f16( \
+ simde_vadd_f16( \
+ simde_vdup_n_f16(a), \
+ simde_vdup_n_f16(simde_vmulh_laneq_f16(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmah_laneq_f16
+ #define vfmah_laneq_f16(a, b, v, lane) simde_vfmah_laneq_f16(a, b, v, lane)
+#endif
+
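+/* Editor's note: the lane forms multiply b by one broadcast lane of v before
+ * accumulating, e.g. vfmah_lane_f16(a, b, v, 2) computes a + b * v[2].
+ * Sketch guarded by the hypothetical, editor-defined SIMDE_EXAMPLE_SNIPPETS
+ * macro. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_example_fmah_lane(simde_float16_t a, simde_float16_t b, simde_float16x4_t v) {
+  return simde_vfmah_lane_f16(a, b, v, 2);
+}
+#endif
+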
/* simde_vfmas_lane_f32 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
#define simde_vfmas_lane_f32(a, b, v, lane) \
SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_lane_f32(a, b, v, lane))
@@ -107,7 +154,7 @@ SIMDE_BEGIN_DECLS_
#endif
/* simde_vfmas_laneq_f32 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
#define simde_vfmas_laneq_f32(a, b, v, lane) \
SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_laneq_f32(a, b, v, lane))
@@ -129,8 +176,19 @@ SIMDE_BEGIN_DECLS_
#define vfmas_laneq_f32(a, b, v, lane) simde_vfmas_laneq_f32(a, b, v, lane)
#endif
+/* simde_vfma_lane_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfma_lane_f16(a, b, v, lane) vfma_lane_f16(a, b, v, lane)
+#else
+ #define simde_vfma_lane_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_lane_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfma_lane_f16
+ #define vfma_lane_f16(a, b, v, lane) simde_vfma_lane_f16(a, b, v, lane)
+#endif
+
/* simde_vfma_lane_f32 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfma_lane_f32(a, b, v, lane) vfma_lane_f32(a, b, v, lane)
#else
#define simde_vfma_lane_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_lane_f32(b, v, lane))
@@ -141,7 +199,7 @@ SIMDE_BEGIN_DECLS_
#endif
/* simde_vfma_lane_f64 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfma_lane_f64(a, b, v, lane) vfma_lane_f64((a), (b), (v), (lane))
#else
#define simde_vfma_lane_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_lane_f64(b, v, lane))
@@ -151,8 +209,19 @@ SIMDE_BEGIN_DECLS_
#define vfma_lane_f64(a, b, v, lane) simde_vfma_lane_f64(a, b, v, lane)
#endif
+/* simde_vfma_laneq_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfma_laneq_f16(a, b, v, lane) vfma_laneq_f16((a), (b), (v), (lane))
+#else
+ #define simde_vfma_laneq_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_laneq_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfma_laneq_f16
+ #define vfma_laneq_f16(a, b, v, lane) simde_vfma_laneq_f16(a, b, v, lane)
+#endif
+
/* simde_vfma_laneq_f32 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfma_laneq_f32(a, b, v, lane) vfma_laneq_f32((a), (b), (v), (lane))
#else
#define simde_vfma_laneq_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_laneq_f32(b, v, lane))
@@ -163,7 +232,7 @@ SIMDE_BEGIN_DECLS_
#endif
/* simde_vfma_laneq_f64 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfma_laneq_f64(a, b, v, lane) vfma_laneq_f64((a), (b), (v), (lane))
#else
#define simde_vfma_laneq_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_laneq_f64(b, v, lane))
@@ -174,7 +243,7 @@ SIMDE_BEGIN_DECLS_
#endif
/* simde_vfmaq_lane_f64 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfmaq_lane_f64(a, b, v, lane) vfmaq_lane_f64((a), (b), (v), (lane))
#else
#define simde_vfmaq_lane_f64(a, b, v, lane) simde_vaddq_f64(a, simde_vmulq_lane_f64(b, v, lane))
@@ -184,8 +253,19 @@ SIMDE_BEGIN_DECLS_
#define vfmaq_lane_f64(a, b, v, lane) simde_vfmaq_lane_f64(a, b, v, lane)
#endif
+/* simde_vfmaq_lane_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfmaq_lane_f16(a, b, v, lane) vfmaq_lane_f16((a), (b), (v), (lane))
+#else
+ #define simde_vfmaq_lane_f16(a, b, v, lane) simde_vaddq_f16(a, simde_vmulq_lane_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmaq_lane_f16
+ #define vfmaq_lane_f16(a, b, v, lane) simde_vfmaq_lane_f16(a, b, v, lane)
+#endif
+
/* simde_vfmaq_lane_f32 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfmaq_lane_f32(a, b, v, lane) vfmaq_lane_f32((a), (b), (v), (lane))
#else
#define simde_vfmaq_lane_f32(a, b, v, lane) simde_vaddq_f32(a, simde_vmulq_lane_f32(b, v, lane))
@@ -195,8 +275,20 @@ SIMDE_BEGIN_DECLS_
#define vfmaq_lane_f32(a, b, v, lane) simde_vfmaq_lane_f32(a, b, v, lane)
#endif
+/* simde_vfmaq_laneq_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfmaq_laneq_f16(a, b, v, lane) vfmaq_laneq_f16((a), (b), (v), (lane))
+#else
+ #define simde_vfmaq_laneq_f16(a, b, v, lane) \
+ simde_vaddq_f16(a, simde_vmulq_laneq_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmaq_laneq_f16
+ #define vfmaq_laneq_f16(a, b, v, lane) simde_vfmaq_laneq_f16(a, b, v, lane)
+#endif
+
/* simde_vfmaq_laneq_f32 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfmaq_laneq_f32(a, b, v, lane) vfmaq_laneq_f32((a), (b), (v), (lane))
#else
#define simde_vfmaq_laneq_f32(a, b, v, lane) \
@@ -208,7 +300,7 @@ SIMDE_BEGIN_DECLS_
#endif
/* simde_vfmaq_laneq_f64 */
-#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA)
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
#define simde_vfmaq_laneq_f64(a, b, v, lane) vfmaq_laneq_f64((a), (b), (v), (lane))
#else
#define simde_vfmaq_laneq_f64(a, b, v, lane) \
diff --git a/lib/simd_wrapper/simde/arm/neon/fma_n.h b/lib/simd_wrapper/simde/arm/neon/fma_n.h
index 6cf58259c06..0a23407c6cb 100644
--- a/lib/simd_wrapper/simde/arm/neon/fma_n.h
+++ b/lib/simd_wrapper/simde/arm/neon/fma_n.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Evan Nemerson
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_FMA_N_H)
@@ -35,10 +36,38 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vfma_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16)
+ return vfma_n_f16(a, b, c);
+ #else
+ return simde_vfma_f16(a, b, simde_vdup_n_f16(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfma_n_f16
+ #define vfma_n_f16(a, b, c) simde_vfma_n_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vfmaq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16)
+ return vfmaq_n_f16(a, b, c);
+ #else
+ return simde_vfmaq_f16(a, b, simde_vdupq_n_f16(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmaq_n_f16
+ #define vfmaq_n_f16(a, b, c) simde_vfmaq_n_f16(a, b, c)
+#endif
+
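+/* Editor's note: the _n_ variants are equivalent to broadcasting the scalar
+ * and calling the vector form, exactly as their fallback paths above do.
+ * Sketch guarded by the hypothetical, editor-defined SIMDE_EXAMPLE_SNIPPETS
+ * macro. */
+#if defined(SIMDE_EXAMPLE_SNIPPETS)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_example_fma_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) {
+  /* Same result as simde_vfmaq_n_f16(a, b, c). */
+  return simde_vfmaq_f16(a, b, simde_vdupq_n_f16(c));
+}
+#endif
+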
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) {
- #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399)
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399)
return vfma_n_f32(a, b, c);
#else
return simde_vfma_f32(a, b, simde_vdup_n_f32(c));
@@ -52,7 +81,7 @@ simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x1_t
simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
return vfma_n_f64(a, b, c);
#else
return simde_vfma_f64(a, b, simde_vdup_n_f64(c));
@@ -66,7 +95,7 @@ simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) {
- #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399)
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399)
return vfmaq_n_f32(a, b, c);
#else
return simde_vfmaq_f32(a, b, simde_vdupq_n_f32(c));
@@ -80,7 +109,7 @@ simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x2_t
simde_vfmaq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) {
- #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
return vfmaq_n_f64(a, b, c);
#else
return simde_vfmaq_f64(a, b, simde_vdupq_n_f64(c));
diff --git a/lib/simd_wrapper/simde/arm/neon/fmlal.h b/lib/simd_wrapper/simde/arm/neon/fmlal.h
new file mode 100644
index 00000000000..f71d3019c8f
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/fmlal.h
@@ -0,0 +1,527 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_FMLAL_H)
+#define SIMDE_ARM_NEON_FMLAL_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlal_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlal_low_f16(r, a, b);
+ #else
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]);
+ }
+ return simde_float32x2_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlal_low_f16
+ #define vfmlal_low_f16(r, a, b) simde_vfmlal_low_f16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlalq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlalq_low_f16(r, a, b);
+ #else
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]);
+ }
+ return simde_float32x4_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlalq_low_f16
+ #define vfmlalq_low_f16(r, a, b) simde_vfmlalq_low_f16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlal_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlal_high_f16(r, a, b);
+ #else
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]);
+ }
+ return simde_float32x2_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlal_high_f16
+ #define vfmlal_high_f16(r, a, b) simde_vfmlal_high_f16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlalq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlalq_high_f16(r, a, b);
+ #else
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]);
+ }
+ return simde_float32x4_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlalq_high_f16
+ #define vfmlalq_high_f16(r, a, b) simde_vfmlalq_high_f16((r), (a), (b))
+#endif
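+
+/* Editor's note, not from upstream SIMDe: the vfmlal_{low,high} family widens
+ * the f16 operands to f32 before the multiply-accumulate, so the product is
+ * formed at full f32 precision. A minimal sketch, assuming simde's
+ * simde_float16_from_float32 helper:
+ *
+ *   simde_float32x2_t acc = simde_vdup_n_f32(0.0f);
+ *   simde_float16x4_t a = simde_vdup_n_f16(simde_float16_from_float32(2.0f));
+ *   simde_float16x4_t b = simde_vdup_n_f16(simde_float16_from_float32(3.0f));
+ *   acc = simde_vfmlal_low_f16(acc, a, b);  // uses lanes 0..1 of a and b
+ *   acc = simde_vfmlal_high_f16(acc, a, b); // uses lanes 2..3 of a and b
+ *   // every lane of acc is now 2.0f*3.0f + 2.0f*3.0f == 12.0f
+ */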
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlal_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlal_lane_low_f16(r, a, b, lane) vfmlal_lane_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlal_lane_low_f16
+ #define vfmlal_lane_low_f16(r, a, b, lane) simde_vfmlal_lane_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlal_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a);
+ simde_float16x8_private
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlal_laneq_low_f16(r, a, b, lane) vfmlal_laneq_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlal_laneq_low_f16
+ #define vfmlal_laneq_low_f16(r, a, b, lane) simde_vfmlal_laneq_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlalq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x4_private
+ b_ = simde_float16x4_to_private(b);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlalq_lane_low_f16(r, a, b, lane) vfmlalq_lane_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlalq_lane_low_f16
+ #define vfmlalq_lane_low_f16(r, a, b, lane) simde_vfmlalq_lane_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlalq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlalq_laneq_low_f16(r, a, b, lane) vfmlalq_laneq_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlalq_laneq_low_f16
+ #define vfmlalq_laneq_low_f16(r, a, b, lane) simde_vfmlalq_laneq_low_f16((r), (a), (b), (lane))
+#endif
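+
+/* Editor's note, not from upstream SIMDe: the _lane forms pick the multiplier
+ * from a 4-lane f16 vector and the _laneq forms from an 8-lane one, while
+ * low/high still selects which half of a is widened, e.g.:
+ *
+ *   acc = simde_vfmlalq_laneq_low_f16(acc, a, b, 5); // acc[i] += a[i] * b[5]
+ */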
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlal_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlal_lane_high_f16(r, a, b, lane) vfmlal_lane_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlal_lane_high_f16
+ #define vfmlal_lane_high_f16(r, a, b, lane) simde_vfmlal_lane_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlal_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a);
+ simde_float16x8_private
+ b_ = simde_float16x8_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlal_laneq_high_f16(r, a, b, lane) vfmlal_laneq_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlal_laneq_high_f16
+ #define vfmlal_laneq_high_f16(r, a, b, lane) simde_vfmlal_laneq_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlalq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x4_private
+ b_ = simde_float16x4_to_private(b);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlalq_lane_high_f16(r, a, b, lane) vfmlalq_lane_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlalq_lane_high_f16
+ #define vfmlalq_lane_high_f16(r, a, b, lane) simde_vfmlalq_lane_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlalq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] +
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlalq_laneq_high_f16(r, a, b, lane) vfmlalq_laneq_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlalq_laneq_high_f16
+ #define vfmlalq_laneq_high_f16(r, a, b, lane) simde_vfmlalq_laneq_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmlalbq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vbfmlalbq_f32(r, a, b);
+ #else
+ simde_float32x4_private
+ ret,
+ r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = r_.values[i] +
+ simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[i * 2]);
+ }
+ return simde_float32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmlalbq_f32
+ #define vbfmlalbq_f32(r, a, b) simde_vbfmlalbq_f32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmlaltq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vbfmlaltq_f32(r, a, b);
+ #else
+ simde_float32x4_private
+ ret,
+ r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = r_.values[i] +
+ simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[i * 2 + 1]);
+ }
+ return simde_float32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmlaltq_f32
+ #define vbfmlaltq_f32(r, a, b) simde_vbfmlaltq_f32((r), (a), (b))
+#endif
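+
+/* Editor's note, not from upstream SIMDe: vbfmlalbq accumulates products of
+ * the even-numbered ("bottom") bf16 lanes and vbfmlaltq the odd-numbered
+ * ("top") lanes, so covering all eight lanes of a bf16x8 pair takes both
+ * calls. Assumes a and b are initialized simde_bfloat16x8_t values:
+ *
+ *   simde_float32x4_t acc = simde_vdupq_n_f32(0.0f);
+ *   acc = simde_vbfmlalbq_f32(acc, a, b); // a[0]*b[0], a[2]*b[2], ...
+ *   acc = simde_vbfmlaltq_f32(acc, a, b); // a[1]*b[1], a[3]*b[3], ...
+ */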
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmlalbq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ ret,
+ r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+ simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = r_.values[i] +
+ simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vbfmlalbq_lane_f32(r, a, b, lane) vbfmlalbq_lane_f32((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmlalbq_lane_f32
+ #define vbfmlalbq_lane_f32(r, a, b, lane) simde_vbfmlalbq_lane_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmlalbq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x4_private
+ ret,
+ r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = r_.values[i] +
+ simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vbfmlalbq_laneq_f32(r, a, b, lane) vbfmlalbq_laneq_f32((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmlalbq_laneq_f32
+ #define vbfmlalbq_laneq_f32(r, a, b, lane) simde_vbfmlalbq_laneq_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmlaltq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ ret,
+ r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+ simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = r_.values[i] +
+ simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vbfmlaltq_lane_f32(r, a, b, lane) vbfmlaltq_lane_f32((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmlaltq_lane_f32
+ #define vbfmlaltq_lane_f32(r, a, b, lane) simde_vbfmlaltq_lane_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmlaltq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x4_private
+ ret,
+ r_ = simde_float32x4_to_private(r);
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) {
+ ret.values[i] = r_.values[i] +
+ simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vbfmlaltq_laneq_f32(r, a, b, lane) vbfmlaltq_laneq_f32((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmlaltq_laneq_f32
+ #define vbfmlaltq_laneq_f32(r, a, b, lane) simde_vbfmlaltq_laneq_f32((r), (a), (b), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_FMLAL_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/fmlsl.h b/lib/simd_wrapper/simde/arm/neon/fmlsl.h
new file mode 100644
index 00000000000..8a5be5461c3
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/fmlsl.h
@@ -0,0 +1,373 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_FMLSL_H)
+#define SIMDE_ARM_NEON_FMLSL_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlsl_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlsl_low_f16(r, a, b);
+ #else
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]);
+ }
+ return simde_float32x2_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlsl_low_f16
+ #define vfmlsl_low_f16(r, a, b) simde_vfmlsl_low_f16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlslq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlslq_low_f16(r, a, b);
+ #else
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]);
+ }
+ return simde_float32x4_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlslq_low_f16
+ #define vfmlslq_low_f16(r, a, b) simde_vfmlslq_low_f16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlsl_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlsl_high_f16(r, a, b);
+ #else
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]);
+ }
+ return simde_float32x2_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlsl_high_f16
+ #define vfmlsl_high_f16(r, a, b) simde_vfmlsl_high_f16((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlslq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ return vfmlslq_high_f16(r, a, b);
+ #else
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]);
+ }
+ return simde_float32x4_from_private(ret_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlslq_high_f16
+ #define vfmlslq_high_f16(r, a, b) simde_vfmlslq_high_f16((r), (a), (b))
+#endif
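+
+/* Editor's note, not from upstream SIMDe: vfmlsl is the subtracting
+ * counterpart of vfmlal, computing r[i] - widen(a[i]) * widen(b[i]), e.g.:
+ *
+ *   acc = simde_vfmlsl_low_f16(acc, a, b); // acc[i] -= a[i]*b[i], lanes 0..1
+ */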
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlsl_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlsl_lane_low_f16(r, a, b, lane) vfmlsl_lane_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlsl_lane_low_f16
+ #define vfmlsl_lane_low_f16(r, a, b, lane) simde_vfmlsl_lane_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlsl_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a);
+ simde_float16x8_private
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlsl_laneq_low_f16(r, a, b, lane) vfmlsl_laneq_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlsl_laneq_low_f16
+ #define vfmlsl_laneq_low_f16(r, a, b, lane) simde_vfmlsl_laneq_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlslq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x4_private
+ b_ = simde_float16x4_to_private(b);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlslq_lane_low_f16(r, a, b, lane) vfmlslq_lane_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlslq_lane_low_f16
+ #define vfmlslq_lane_low_f16(r, a, b, lane) simde_vfmlslq_lane_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlslq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlslq_laneq_low_f16(r, a, b, lane) vfmlslq_laneq_low_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlslq_laneq_low_f16
+ #define vfmlslq_laneq_low_f16(r, a, b, lane) simde_vfmlslq_laneq_low_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlsl_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlsl_lane_high_f16(r, a, b, lane) vfmlsl_lane_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlsl_lane_high_f16
+ #define vfmlsl_lane_high_f16(r, a, b, lane) simde_vfmlsl_lane_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfmlsl_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x2_private
+ ret_,
+ r_ = simde_float32x2_to_private(r);
+ simde_float16x4_private
+ a_ = simde_float16x4_to_private(a);
+ simde_float16x8_private
+ b_ = simde_float16x8_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x2_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlsl_laneq_high_f16(r, a, b, lane) vfmlsl_laneq_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlsl_laneq_high_f16
+ #define vfmlsl_laneq_high_f16(r, a, b, lane) simde_vfmlsl_laneq_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlslq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x4_private
+ b_ = simde_float16x4_to_private(b);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlslq_lane_high_f16(r, a, b, lane) vfmlslq_lane_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlslq_lane_high_f16
+ #define vfmlslq_lane_high_f16(r, a, b, lane) simde_vfmlslq_lane_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmlslq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float32x4_private
+ ret_,
+ r_ = simde_float32x4_to_private(r);
+ simde_float16x8_private
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) {
+ ret_.values[i] = r_.values[i] -
+ simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]);
+ }
+ return simde_float32x4_from_private(ret_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ defined(SIMDE_ARCH_ARM_FP16_FML)
+ #define simde_vfmlslq_laneq_high_f16(r, a, b, lane) vfmlslq_laneq_high_f16((r), (a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmlslq_laneq_high_f16
+ #define vfmlslq_laneq_high_f16(r, a, b, lane) simde_vfmlslq_laneq_high_f16((r), (a), (b), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_FMLSL_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/fms.h b/lib/simd_wrapper/simde/arm/neon/fms.h
new file mode 100644
index 00000000000..0ad265c3d09
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/fms.h
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use, copy,
+* modify, merge, publish, distribute, sublicense, and/or sell copies
+* of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+* Copyright:
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+*/
+
+#if !defined(SIMDE_ARM_NEON_FMS_H)
+#define SIMDE_ARM_NEON_FMS_H
+
+#include "add.h"
+#include "mul.h"
+#include "neg.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vfmsh_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ return vfmsh_f16(a, b, c);
+ #else
+ return simde_vaddh_f16(a, simde_vnegh_f16(simde_vmulh_f16(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsh_f16
+ #define vfmsh_f16(a, b, c) simde_vfmsh_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfms_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ return vfms_f32(a, b, c);
+ #else
+ return simde_vadd_f32(a, simde_vneg_f32(simde_vmul_f32(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vfms_f32
+ #define vfms_f32(a, b, c) simde_vfms_f32(a, b, c)
+#endif
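+
+/* Editor's note, not from upstream SIMDe: vfms computes a - (b * c). A
+ * minimal usage sketch, assuming a, b and c are initialized
+ * simde_float32x2_t values:
+ *
+ *   simde_float32x2_t d = simde_vfms_f32(a, b, c); // d[i] = a[i] - b[i]*c[i]
+ *
+ * Note the portable fallback above is add(a, neg(mul(b, c))), which rounds
+ * the product before subtracting, so the last bit of each lane may differ
+ * from a native fused FMLS.
+ */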
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vfms_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ return vfms_f64(a, b, c);
+ #else
+ return simde_vadd_f64(a, simde_vneg_f64(simde_vmul_f64(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_f64
+ #define vfms_f64(a, b, c) simde_vfms_f64(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vfms_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ return vfms_f16(a, b, c);
+ #else
+ return simde_vadd_f16(a, simde_vneg_f16(simde_vmul_f16(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_f16
+ #define vfms_f16(a, b, c) simde_vfms_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vfmsq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ return vfmsq_f16(a, b, c);
+ #else
+ return simde_vaddq_f16(a, simde_vnegq_f16(simde_vmulq_f16(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_f16
+ #define vfmsq_f16(a, b, c) simde_vfmsq_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ return vfmsq_f32(a, b, c);
+ #else
+ return simde_vaddq_f32(a, simde_vnegq_f32(simde_vmulq_f32(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_f32
+ #define vfmsq_f32(a, b, c) simde_vfmsq_f32(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vfmsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ return vfmsq_f64(a, b, c);
+ #else
+ return simde_vaddq_f64(a, simde_vnegq_f64(simde_vmulq_f64(b, c)));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_f64
+ #define vfmsq_f64(a, b, c) simde_vfmsq_f64(a, b, c)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_FMS_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/fms_lane.h b/lib/simd_wrapper/simde/arm/neon/fms_lane.h
new file mode 100644
index 00000000000..05ef96ae3d0
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/fms_lane.h
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use, copy,
+* modify, merge, publish, distribute, sublicense, and/or sell copies
+* of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+* Copyright:
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+*/
+
+#if !defined(SIMDE_ARM_NEON_FMS_LANE_H)
+#define SIMDE_ARM_NEON_FMS_LANE_H
+
+#include "sub.h"
+#include "dup_n.h"
+#include "get_lane.h"
+#include "mul.h"
+#include "mul_lane.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+/* simde_vfmsd_lane_f64 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmsd_lane_f64(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsd_lane_f64(a, b, v, lane))
+ #else
+ #define simde_vfmsd_lane_f64(a, b, v, lane) vfmsd_lane_f64((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmsd_lane_f64(a, b, v, lane) \
+ simde_vget_lane_f64( \
+ simde_vsub_f64( \
+ simde_vdup_n_f64(a), \
+ simde_vdup_n_f64(simde_vmuld_lane_f64(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsd_lane_f64
+ #define vfmsd_lane_f64(a, b, v, lane) simde_vfmsd_lane_f64(a, b, v, lane)
+#endif
+
+/* simde_vfmsd_laneq_f64 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmsd_laneq_f64(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsd_laneq_f64(a, b, v, lane))
+ #else
+ #define simde_vfmsd_laneq_f64(a, b, v, lane) vfmsd_laneq_f64((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmsd_laneq_f64(a, b, v, lane) \
+ simde_vget_lane_f64( \
+ simde_vsub_f64( \
+ simde_vdup_n_f64(a), \
+ simde_vdup_n_f64(simde_vmuld_laneq_f64(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsd_laneq_f64
+ #define vfmsd_laneq_f64(a, b, v, lane) simde_vfmsd_laneq_f64(a, b, v, lane)
+#endif
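+
+/* Editor's note, not from upstream SIMDe: the scalar fallbacks above
+ * broadcast the operands with vdup_n_f64 so the existing vector subtract can
+ * be reused, then read lane 0 back out. A usage sketch, assuming d and n are
+ * simde_float64_t and v is an initialized simde_float64x2_t:
+ *
+ *   simde_float64_t r = simde_vfmsd_laneq_f64(d, n, v, 1); // d - n * v[1]
+ */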
+
+/* simde_vfmsh_lane_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmsh_lane_f16(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsh_lane_f16(a, b, v, lane))
+ #else
+ #define simde_vfmsh_lane_f16(a, b, v, lane) vfmsh_lane_f16((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmsh_lane_f16(a, b, v, lane) \
+ simde_vget_lane_f16( \
+ simde_vsub_f16( \
+ simde_vdup_n_f16(a), \
+ simde_vdup_n_f16(simde_vmulh_lane_f16(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsh_lane_f16
+ #define vfmsh_lane_f16(a, b, v, lane) simde_vfmsh_lane_f16(a, b, v, lane)
+#endif
+
+/* simde_vfmsh_laneq_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmsh_laneq_f16(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsh_laneq_f16(a, b, v, lane))
+ #else
+ #define simde_vfmsh_laneq_f16(a, b, v, lane) vfmsh_laneq_f16((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmsh_laneq_f16(a, b, v, lane) \
+ simde_vget_lane_f16( \
+ simde_vsub_f16( \
+ simde_vdup_n_f16(a), \
+ simde_vdup_n_f16(simde_vmulh_laneq_f16(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsh_laneq_f16
+ #define vfmsh_laneq_f16(a, b, v, lane) simde_vfmsh_laneq_f16(a, b, v, lane)
+#endif
+
+/* simde_vfmss_lane_f32 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmss_lane_f32(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmss_lane_f32(a, b, v, lane))
+ #else
+ #define simde_vfmss_lane_f32(a, b, v, lane) vfmss_lane_f32((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmss_lane_f32(a, b, v, lane) \
+ simde_vget_lane_f32( \
+ simde_vsub_f32( \
+ simde_vdup_n_f32(a), \
+ simde_vdup_n_f32(simde_vmuls_lane_f32(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmss_lane_f32
+ #define vfmss_lane_f32(a, b, v, lane) simde_vfmss_lane_f32(a, b, v, lane)
+#endif
+
+/* simde_vfmss_laneq_f32 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vfmss_laneq_f32(a, b, v, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmss_laneq_f32(a, b, v, lane))
+ #else
+ #define simde_vfmss_laneq_f32(a, b, v, lane) vfmss_laneq_f32((a), (b), (v), (lane))
+ #endif
+#else
+ #define simde_vfmss_laneq_f32(a, b, v, lane) \
+ simde_vget_lane_f32( \
+ simde_vsub_f32( \
+ simde_vdup_n_f32(a), \
+ simde_vdup_n_f32(simde_vmuls_laneq_f32(b, v, lane)) \
+ ), \
+ 0 \
+ )
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmss_laneq_f32
+ #define vfmss_laneq_f32(a, b, v, lane) simde_vfmss_laneq_f32(a, b, v, lane)
+#endif
+
+/* simde_vfms_lane_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfms_lane_f16(a, b, v, lane) vfms_lane_f16(a, b, v, lane)
+#else
+ #define simde_vfms_lane_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_lane_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_lane_f16
+ #define vfms_lane_f16(a, b, v, lane) simde_vfms_lane_f16(a, b, v, lane)
+#endif
+
+/* simde_vfms_lane_f32 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfms_lane_f32(a, b, v, lane) vfms_lane_f32(a, b, v, lane)
+#else
+ #define simde_vfms_lane_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_lane_f32(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_lane_f32
+ #define vfms_lane_f32(a, b, v, lane) simde_vfms_lane_f32(a, b, v, lane)
+#endif
+
+/* simde_vfms_lane_f64 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfms_lane_f64(a, b, v, lane) vfms_lane_f64((a), (b), (v), (lane))
+#else
+ #define simde_vfms_lane_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_lane_f64(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_lane_f64
+ #define vfms_lane_f64(a, b, v, lane) simde_vfms_lane_f64(a, b, v, lane)
+#endif
+
+/* simde_vfms_laneq_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfms_laneq_f16(a, b, v, lane) vfms_laneq_f16((a), (b), (v), (lane))
+#else
+ #define simde_vfms_laneq_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_laneq_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_laneq_f16
+ #define vfms_laneq_f16(a, b, v, lane) simde_vfms_laneq_f16(a, b, v, lane)
+#endif
+
+/* simde_vfms_laneq_f32 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfms_laneq_f32(a, b, v, lane) vfms_laneq_f32((a), (b), (v), (lane))
+#else
+ #define simde_vfms_laneq_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_laneq_f32(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_laneq_f32
+ #define vfms_laneq_f32(a, b, v, lane) simde_vfms_laneq_f32(a, b, v, lane)
+#endif
+
+/* simde_vfms_laneq_f64 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfms_laneq_f64(a, b, v, lane) vfms_laneq_f64((a), (b), (v), (lane))
+#else
+ #define simde_vfms_laneq_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_laneq_f64(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_laneq_f64
+ #define vfms_laneq_f64(a, b, v, lane) simde_vfms_laneq_f64(a, b, v, lane)
+#endif
+
+/* simde_vfmsq_lane_f64 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfmsq_lane_f64(a, b, v, lane) vfmsq_lane_f64((a), (b), (v), (lane))
+#else
+ #define simde_vfmsq_lane_f64(a, b, v, lane) simde_vsubq_f64(a, simde_vmulq_lane_f64(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_lane_f64
+ #define vfmsq_lane_f64(a, b, v, lane) simde_vfmsq_lane_f64(a, b, v, lane)
+#endif
+
+/* simde_vfmsq_lane_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfmsq_lane_f16(a, b, v, lane) vfmsq_lane_f16((a), (b), (v), (lane))
+#else
+ #define simde_vfmsq_lane_f16(a, b, v, lane) simde_vsubq_f16(a, simde_vmulq_lane_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_lane_f16
+ #define vfmsq_lane_f16(a, b, v, lane) simde_vfmsq_lane_f16(a, b, v, lane)
+#endif
+
+/* simde_vfmsq_lane_f32 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfmsq_lane_f32(a, b, v, lane) vfmsq_lane_f32((a), (b), (v), (lane))
+#else
+ #define simde_vfmsq_lane_f32(a, b, v, lane) simde_vsubq_f32(a, simde_vmulq_lane_f32(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_lane_f32
+ #define vfmsq_lane_f32(a, b, v, lane) simde_vfmsq_lane_f32(a, b, v, lane)
+#endif
+
+/* simde_vfmsq_laneq_f16 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vfmsq_laneq_f16(a, b, v, lane) vfmsq_laneq_f16((a), (b), (v), (lane))
+#else
+ #define simde_vfmsq_laneq_f16(a, b, v, lane) \
+ simde_vsubq_f16(a, simde_vmulq_laneq_f16(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_laneq_f16
+ #define vfmsq_laneq_f16(a, b, v, lane) simde_vfmsq_laneq_f16(a, b, v, lane)
+#endif
+
+/* simde_vfmsq_laneq_f32 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfmsq_laneq_f32(a, b, v, lane) vfmsq_laneq_f32((a), (b), (v), (lane))
+#else
+ #define simde_vfmsq_laneq_f32(a, b, v, lane) \
+ simde_vsubq_f32(a, simde_vmulq_laneq_f32(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_laneq_f32
+ #define vfmsq_laneq_f32(a, b, v, lane) simde_vfmsq_laneq_f32(a, b, v, lane)
+#endif
+
+/* simde_vfmsq_laneq_f64 */
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA)
+ #define simde_vfmsq_laneq_f64(a, b, v, lane) vfmsq_laneq_f64((a), (b), (v), (lane))
+#else
+ #define simde_vfmsq_laneq_f64(a, b, v, lane) \
+ simde_vsubq_f64(a, simde_vmulq_laneq_f64(b, v, lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_laneq_f64
+ #define vfmsq_laneq_f64(a, b, v, lane) simde_vfmsq_laneq_f64(a, b, v, lane)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_FMS_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/fms_n.h b/lib/simd_wrapper/simde/arm/neon/fms_n.h
new file mode 100644
index 00000000000..6011ae41539
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/fms_n.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: MIT
+*
+* Permission is hereby granted, free of charge, to any person
+* obtaining a copy of this software and associated documentation
+* files (the "Software"), to deal in the Software without
+* restriction, including without limitation the rights to use, copy,
+* modify, merge, publish, distribute, sublicense, and/or sell copies
+* of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+* Copyright:
+* 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+*/
+
+#if !defined(SIMDE_ARM_NEON_FMS_N_H)
+#define SIMDE_ARM_NEON_FMS_N_H
+
+#include "types.h"
+#include "dup_n.h"
+#include "fms.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vfms_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16)
+ return vfms_n_f16(a, b, c);
+ #else
+ return simde_vfms_f16(a, b, simde_vdup_n_f16(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_n_f16
+ #define vfms_n_f16(a, b, c) simde_vfms_n_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vfmsq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16)
+ return vfmsq_n_f16(a, b, c);
+ #else
+ return simde_vfmsq_f16(a, b, simde_vdupq_n_f16(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_n_f16
+ #define vfmsq_n_f16(a, b, c) simde_vfmsq_n_f16(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vfms_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399)
+ return vfms_n_f32(a, b, c);
+ #else
+ return simde_vfms_f32(a, b, simde_vdup_n_f32(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_n_f32
+ #define vfms_n_f32(a, b, c) simde_vfms_n_f32(a, b, c)
+#endif
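+
+/* Editor's note, not from upstream SIMDe: the _n forms broadcast the scalar
+ * third operand, so the call below is shorthand for
+ * simde_vfms_f32(a, b, simde_vdup_n_f32(2.0f)):
+ *
+ *   simde_float32x2_t d = simde_vfms_n_f32(a, b, 2.0f); // a[i] - b[i]*2.0f
+ */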
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vfms_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vfms_n_f64(a, b, c);
+ #else
+ return simde_vfms_f64(a, b, simde_vdup_n_f64(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfms_n_f64
+ #define vfms_n_f64(a, b, c) simde_vfms_n_f64(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vfmsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399)
+ return vfmsq_n_f32(a, b, c);
+ #else
+ return simde_vfmsq_f32(a, b, simde_vdupq_n_f32(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_n_f32
+ #define vfmsq_n_f32(a, b, c) simde_vfmsq_n_f32(a, b, c)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vfmsq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vfmsq_n_f64(a, b, c);
+ #else
+ return simde_vfmsq_f64(a, b, simde_vdupq_n_f64(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vfmsq_n_f64
+ #define vfmsq_n_f64(a, b, c) simde_vfmsq_n_f64(a, b, c)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_FMS_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/get_high.h b/lib/simd_wrapper/simde/arm/neon/get_high.h
index 654c63bd609..df37ccccae0 100644
--- a/lib/simd_wrapper/simde/arm/neon/get_high.h
+++ b/lib/simd_wrapper/simde/arm/neon/get_high.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_GET_HIGH_H)
@@ -34,6 +35,28 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vget_high_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vget_high_f16(a);
+ #else
+ simde_float16x4_private r_;
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))];
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_high_f16
+ #define vget_high_f16(a) simde_vget_high_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vget_high_f32(simde_float32x4_t a) {
@@ -294,6 +317,94 @@ simde_vget_high_u64(simde_uint64x2_t a) {
#define vget_high_u64(a) simde_vget_high_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vget_high_p8(simde_poly8x16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vget_high_p8(a);
+ #else
+ simde_poly8x8_private r_;
+ simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))];
+ }
+
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_high_p8
+ #define vget_high_p8(a) simde_vget_high_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vget_high_p16(simde_poly16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vget_high_p16(a);
+ #else
+ simde_poly16x4_private r_;
+ simde_poly16x8_private a_ = simde_poly16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))];
+ }
+
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_high_p16
+ #define vget_high_p16(a) simde_vget_high_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vget_high_p64(simde_poly64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vget_high_p64(a);
+ #else
+ simde_poly64x1_private r_;
+ simde_poly64x2_private a_ = simde_poly64x2_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))];
+ }
+
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_high_p64
+ #define vget_high_p64(a) simde_vget_high_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vget_high_bf16(simde_bfloat16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vget_high_bf16(a);
+ #else
+ simde_bfloat16x4_private r_;
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))];
+ }
+
+ return simde_bfloat16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_high_bf16
+ #define vget_high_bf16(a) simde_vget_high_bf16((a))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
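
The portable loop in each of these new functions copies the upper N lanes of a 2N-lane vector into an N-lane result. A hedged usage sketch (it assumes the simde/arm/neon.h umbrella header resolves against lib/simd_wrapper in this tree; adjust the include to the build's -I paths):

#include "simde/arm/neon.h"  /* assumed include path */

/* Usage sketch (not part of the patch): lanes 8..15 of v become
 * lanes 0..7 of the result. */
static simde_poly8x8_t upper_half(simde_poly8x16_t v) {
  return simde_vget_high_p8(v);
}
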
diff --git a/lib/simd_wrapper/simde/arm/neon/get_lane.h b/lib/simd_wrapper/simde/arm/neon/get_lane.h
index 2dbeb55c6e0..06040eb2c32 100644
--- a/lib/simd_wrapper/simde/arm/neon/get_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/get_lane.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_GET_LANE_H)
@@ -34,6 +35,27 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vget_lane_f16(simde_float16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16_t r;
+
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ SIMDE_CONSTIFY_4_(vget_lane_f16, r, (HEDLEY_UNREACHABLE(), SIMDE_FLOAT16_VALUE(0.0)), lane, v);
+ #else
+ simde_float16x4_private v_ = simde_float16x4_to_private(v);
+
+ r = v_.values[lane];
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_lane_f16
+ #define vget_lane_f16(v, lane) simde_vget_lane_f16((v), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vget_lane_f32(simde_float32x2_t v, const int lane)
@@ -247,6 +269,27 @@ simde_vget_lane_u64(simde_uint64x1_t v, const int lane)
#define vget_lane_u64(v, lane) simde_vget_lane_u64((v), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vgetq_lane_f16(simde_float16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16_t r;
+
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ SIMDE_CONSTIFY_8_(vgetq_lane_f16, r, (HEDLEY_UNREACHABLE(), SIMDE_FLOAT16_VALUE(0.0)), lane, v);
+ #else
+ simde_float16x8_private v_ = simde_float16x8_to_private(v);
+
+ r = v_.values[lane];
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vgetq_lane_f16
+ #define vgetq_lane_f16(v, lane) simde_vgetq_lane_f16((v), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vgetq_lane_f32(simde_float32x4_t v, const int lane)
@@ -513,6 +556,161 @@ simde_vgetq_lane_u64(simde_uint64x2_t v, const int lane)
#define vgetq_lane_u64(v, lane) simde_vgetq_lane_u64((v), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8_t
+simde_vget_lane_p8(simde_poly8x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly8_t r;
+ simde_poly8x8_private v_ = simde_poly8x8_to_private(v);
+ r = v_.values[lane];
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vget_lane_p8(v, lane) vget_lane_p8((v), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_lane_p8
+ #define vget_lane_p8(v, lane) simde_vget_lane_p8((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16_t
+simde_vget_lane_p16(simde_poly16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_poly16_t r;
+ simde_poly16x4_private v_ = simde_poly16x4_to_private(v);
+
+ r = v_.values[lane];
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vget_lane_p16(v, lane) vget_lane_p16((v), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_lane_p16
+ #define vget_lane_p16(v, lane) simde_vget_lane_p16((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64_t
+simde_vget_lane_p64(simde_poly64x1_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_poly64_t r;
+ simde_poly64x1_private v_ = simde_poly64x1_to_private(v);
+
+ r = v_.values[lane];
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vget_lane_p64(v, lane) vget_lane_p64((v), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_lane_p64
+ #define vget_lane_p64(v, lane) simde_vget_lane_p64((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8_t
+simde_vgetq_lane_p8(simde_poly8x16_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_poly8_t r;
+ simde_poly8x16_private v_ = simde_poly8x16_to_private(v);
+
+ r = v_.values[lane];
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vgetq_lane_p8(v, lane) vgetq_lane_p8((v), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vgetq_lane_p8
+ #define vgetq_lane_p8(v, lane) simde_vgetq_lane_p8((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16_t
+simde_vgetq_lane_p16(simde_poly16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly16_t r;
+ simde_poly16x8_private v_ = simde_poly16x8_to_private(v);
+
+ r = v_.values[lane];
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vgetq_lane_p16(v, lane) vgetq_lane_p16((v), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vgetq_lane_p16
+ #define vgetq_lane_p16(v, lane) simde_vgetq_lane_p16((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64_t
+simde_vgetq_lane_p64(simde_poly64x2_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_poly64_t r;
+ simde_poly64x2_private v_ = simde_poly64x2_to_private(v);
+
+ r = v_.values[lane];
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362)
+ #define simde_vgetq_lane_p64(v, lane) vgetq_lane_p64((v), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vgetq_lane_p64
+ #define vgetq_lane_p64(v, lane) simde_vgetq_lane_p64((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16_t
+simde_vget_lane_bf16(simde_bfloat16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_bfloat16_t r;
+
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ SIMDE_CONSTIFY_4_(vget_lane_bf16, r, (HEDLEY_UNREACHABLE(), SIMDE_BFLOAT16_VALUE(0.0)), lane, v);
+ #else
+ simde_bfloat16x4_private v_ = simde_bfloat16x4_to_private(v);
+
+ r = v_.values[lane];
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_lane_bf16
+ #define vget_lane_bf16(v, lane) simde_vget_lane_bf16((v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16_t
+simde_vgetq_lane_bf16(simde_bfloat16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_bfloat16_t r;
+
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ SIMDE_CONSTIFY_8_(vgetq_lane_bf16, r, (HEDLEY_UNREACHABLE(), SIMDE_BFLOAT16_VALUE(0.0)), lane, v);
+ #else
+ simde_bfloat16x8_private v_ = simde_bfloat16x8_to_private(v);
+
+ r = v_.values[lane];
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vgetq_lane_bf16
+ #define vgetq_lane_bf16(v, lane) simde_vgetq_lane_bf16((v), (lane))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
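
One property worth noting: the lane argument must be an integer constant expression. On native builds SIMDE_CONSTIFY_4_/SIMDE_CONSTIFY_8_ expand to a switch over every legal lane so the real intrinsic always sees a literal, and SIMDE_REQUIRE_CONSTANT_RANGE diagnoses out-of-range values on compilers that support it. A hedged usage sketch (assumes the simde/arm/neon.h umbrella header):

#include "simde/arm/neon.h"  /* assumed include path */

/* Usage sketch (not part of the patch): the lane index is a literal,
 * never a runtime variable.  Lanes run 0..7 for a poly8x8 vector. */
static simde_poly8_t third_lane(simde_poly8x8_t v) {
  return simde_vget_lane_p8(v, 2);
}
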
diff --git a/lib/simd_wrapper/simde/arm/neon/get_low.h b/lib/simd_wrapper/simde/arm/neon/get_low.h
index 84e17783c3a..4594a3064ef 100644
--- a/lib/simd_wrapper/simde/arm/neon/get_low.h
+++ b/lib/simd_wrapper/simde/arm/neon/get_low.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_GET_LOW_H)
@@ -34,6 +35,28 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vget_low_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vget_low_f16(a);
+ #else
+ simde_float16x4_private r_;
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i];
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_low_f16
+ #define vget_low_f16(a) simde_vget_low_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vget_low_f32(simde_float32x4_t a) {
@@ -326,6 +349,94 @@ simde_vget_low_u64(simde_uint64x2_t a) {
#define vget_low_u64(a) simde_vget_low_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vget_low_p8(simde_poly8x16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vget_low_p8(a);
+ #else
+ simde_poly8x8_private r_;
+ simde_poly8x16_private a_ = simde_poly8x16_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i];
+ }
+
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_low_p8
+ #define vget_low_p8(a) simde_vget_low_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vget_low_p16(simde_poly16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vget_low_p16(a);
+ #else
+ simde_poly16x4_private r_;
+ simde_poly16x8_private a_ = simde_poly16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i];
+ }
+
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vget_low_p16
+ #define vget_low_p16(a) simde_vget_low_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vget_low_p64(simde_poly64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vget_low_p64(a);
+ #else
+ simde_poly64x1_private r_;
+ simde_poly64x2_private a_ = simde_poly64x2_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i];
+ }
+
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_low_p64
+ #define vget_low_p64(a) simde_vget_low_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vget_low_bf16(simde_bfloat16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vget_low_bf16(a);
+ #else
+ simde_bfloat16x4_private r_;
+ simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i];
+ }
+
+ return simde_bfloat16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vget_low_bf16
+ #define vget_low_bf16(a) simde_vget_low_bf16((a))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
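
get_low.h mirrors get_high.h: the loop copies lanes 0..N-1 instead of N..2N-1, so splitting a vector with both halves and recombining is the identity. A hedged sketch (assumes simde/arm/neon.h, and that the matching simde_vcombine_p8 is available in this SIMDe snapshot):

#include "simde/arm/neon.h"  /* assumed include path */

/* Sketch (not part of the patch): the low/high halves recombine to the
 * original vector.  simde_vcombine_p8 is assumed to exist here. */
static simde_poly8x16_t split_roundtrip(simde_poly8x16_t v) {
  return simde_vcombine_p8(simde_vget_low_p8(v), simde_vget_high_p8(v));
}
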
diff --git a/lib/simd_wrapper/simde/arm/neon/hadd.h b/lib/simd_wrapper/simde/arm/neon/hadd.h
index 53e26d71698..7e72ba3f794 100644
--- a/lib/simd_wrapper/simde/arm/neon/hadd.h
+++ b/lib/simd_wrapper/simde/arm/neon/hadd.h
@@ -46,6 +46,14 @@ simde_int8x8_t
simde_vhadd_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_s8(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int8x8_private
+ r_,
+ a_ = simde_int8x8_to_private(a),
+ b_ = simde_int8x8_to_private(b);
+
+ r_.sv64 = __riscv_vaadd_vv_i8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8);
+ return simde_int8x8_from_private(r_);
#else
return simde_vmovn_s16(simde_vshrq_n_s16(simde_vaddl_s8(a, b), 1));
#endif
@@ -60,6 +68,14 @@ simde_int16x4_t
simde_vhadd_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_s16(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int16x4_private
+ r_,
+ a_ = simde_int16x4_to_private(a),
+ b_ = simde_int16x4_to_private(b);
+
+ r_.sv64 = __riscv_vaadd_vv_i16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4);
+ return simde_int16x4_from_private(r_);
#else
return simde_vmovn_s32(simde_vshrq_n_s32(simde_vaddl_s16(a, b), 1));
#endif
@@ -74,6 +90,14 @@ simde_int32x2_t
simde_vhadd_s32(simde_int32x2_t a, simde_int32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_s32(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int32x2_private
+ r_,
+ a_ = simde_int32x2_to_private(a),
+ b_ = simde_int32x2_to_private(b);
+
+ r_.sv64 = __riscv_vaadd_vv_i32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2);
+ return simde_int32x2_from_private(r_);
#else
return simde_vmovn_s64(simde_vshrq_n_s64(simde_vaddl_s32(a, b), 1));
#endif
@@ -88,6 +112,14 @@ simde_uint8x8_t
simde_vhadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_u8(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x8_private
+ r_,
+ a_ = simde_uint8x8_to_private(a),
+ b_ = simde_uint8x8_to_private(b);
+
+ r_.sv64 = __riscv_vaaddu_vv_u8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8);
+ return simde_uint8x8_from_private(r_);
#else
return simde_vmovn_u16(simde_vshrq_n_u16(simde_vaddl_u8(a, b), 1));
#endif
@@ -102,6 +134,14 @@ simde_uint16x4_t
simde_vhadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_u16(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x4_private
+ r_,
+ a_ = simde_uint16x4_to_private(a),
+ b_ = simde_uint16x4_to_private(b);
+
+ r_.sv64 = __riscv_vaaddu_vv_u16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4);
+ return simde_uint16x4_from_private(r_);
#else
return simde_vmovn_u32(simde_vshrq_n_u32(simde_vaddl_u16(a, b), 1));
#endif
@@ -116,6 +156,14 @@ simde_uint32x2_t
simde_vhadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vhadd_u32(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x2_private
+ r_,
+ a_ = simde_uint32x2_to_private(a),
+ b_ = simde_uint32x2_to_private(b);
+
+ r_.sv64 = __riscv_vaaddu_vv_u32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2);
+ return simde_uint32x2_from_private(r_);
#else
return simde_vmovn_u64(simde_vshrq_n_u64(simde_vaddl_u32(a, b), 1));
#endif
@@ -138,6 +186,8 @@ simde_vhaddq_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_add_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vaadd_vv_i8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -166,6 +216,8 @@ simde_vhaddq_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_add_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vaadd_vv_i16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -194,6 +246,8 @@ simde_vhaddq_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_add_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vaadd_vv_i32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -233,6 +287,8 @@ simde_vhaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
1);
r_.v128 = wasm_i8x16_shuffle(lo, hi, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20,
22, 24, 26, 28, 30);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vaaddu_vv_u8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -261,6 +317,8 @@ simde_vhaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_add_epi32(_mm256_cvtepu16_epi32(a_.m128i), _mm256_cvtepu16_epi32(b_.m128i)), 1));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vaaddu_vv_u16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -289,6 +347,8 @@ simde_vhaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_X86_AVX512VL_NATIVE)
r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_add_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1));
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vaaddu_vv_u32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
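The RISC-V branches map vhadd onto vaadd/vaaddu with the __RISCV_VXRM_RDN (round-down) mode, which matches NEON's halving add exactly: widen, add, then shift right by one with truncation toward negative infinity, so the intermediate sum can never overflow. A scalar reference of that contract (plain C; illustrative only):

#include <stdint.h>

/* Scalar reference for vhadd_s8 (not part of the patch): widen before
 * adding so the sum cannot overflow, then arithmetic-shift right by one,
 * i.e. floor((a + b) / 2) on the two's-complement targets SIMDe supports.
 * __RISCV_VXRM_RDN selects the same round-down behaviour on the RVV path. */
static int8_t hadd_s8_ref(int8_t a, int8_t b) {
  int16_t sum = (int16_t) ((int16_t) a + (int16_t) b);
  return (int8_t) (sum >> 1);
}
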
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1.h b/lib/simd_wrapper/simde/arm/neon/ld1.h
index 2fa8d1f5608..5dd2d17c6e5 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld1.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld1.h
@@ -23,6 +23,8 @@
* Copyright:
* 2020 Evan Nemerson
* 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
*/
#if !defined(SIMDE_ARM_NEON_LD1_H)
@@ -36,12 +38,16 @@ SIMDE_BEGIN_DECLS_
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x4_t
-simde_vld1_f16(simde_float16 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+simde_vld1_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
return vld1_f16(ptr);
#else
simde_float16x4_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
+ r_.sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_float16x4_from_private(r_);
#endif
}
@@ -57,7 +63,11 @@ simde_vld1_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(2)]) {
return vld1_f32(ptr);
#else
simde_float32x2_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle32_v_f32m1(ptr , 2);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_float32x2_from_private(r_);
#endif
}
@@ -73,7 +83,11 @@ simde_vld1_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(1)]) {
return vld1_f64(ptr);
#else
simde_float64x1_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle64_v_f64m1(ptr , 1);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_float64x1_from_private(r_);
#endif
}
@@ -89,7 +103,11 @@ simde_vld1_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld1_s8(ptr);
#else
simde_int8x8_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle8_v_i8m1(ptr , 8);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_int8x8_from_private(r_);
#endif
}
@@ -105,7 +123,11 @@ simde_vld1_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
return vld1_s16(ptr);
#else
simde_int16x4_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle16_v_i16m1(ptr , 4);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_int16x4_from_private(r_);
#endif
}
@@ -121,7 +143,11 @@ simde_vld1_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
return vld1_s32(ptr);
#else
simde_int32x2_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle32_v_i32m1(ptr , 2);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_int32x2_from_private(r_);
#endif
}
@@ -137,7 +163,11 @@ simde_vld1_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) {
return vld1_s64(ptr);
#else
simde_int64x1_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle64_v_i64m1(ptr , 1);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_int64x1_from_private(r_);
#endif
}
@@ -153,7 +183,11 @@ simde_vld1_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld1_u8(ptr);
#else
simde_uint8x8_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_uint8x8_from_private(r_);
#endif
}
@@ -169,7 +203,11 @@ simde_vld1_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
return vld1_u16(ptr);
#else
simde_uint16x4_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_uint16x4_from_private(r_);
#endif
}
@@ -185,7 +223,11 @@ simde_vld1_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
return vld1_u32(ptr);
#else
simde_uint32x2_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle32_v_u32m1(ptr , 2);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_uint32x2_from_private(r_);
#endif
}
@@ -201,7 +243,11 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) {
return vld1_u64(ptr);
#else
simde_uint64x1_private r_;
- simde_memcpy(&r_, ptr, sizeof(r_));
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
return simde_uint64x1_from_private(r_);
#endif
}
@@ -212,13 +258,15 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float16x8_t
-simde_vld1q_f16(simde_float16 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+simde_vld1q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
return vld1q_f16(ptr);
#else
simde_float16x8_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
+ r_.sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -239,6 +287,8 @@ simde_vld1q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
simde_float32x4_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle32_v_f32m1(ptr , 4);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -259,6 +309,8 @@ simde_vld1q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) {
simde_float64x2_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle64_v_f64m1(ptr , 2);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -279,6 +331,8 @@ simde_vld1q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
simde_int8x16_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle8_v_i8m1(ptr , 16);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -299,6 +353,8 @@ simde_vld1q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_int16x8_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle16_v_i16m1(ptr , 8);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -319,6 +375,8 @@ simde_vld1q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
simde_int32x4_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle32_v_i32m1(ptr , 4);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -339,6 +397,8 @@ simde_vld1q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
simde_int64x2_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle64_v_i64m1(ptr , 2);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -359,6 +419,8 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
simde_uint8x16_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -370,82 +432,6 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
#define vld1q_u8(a) simde_vld1q_u8((a))
#endif
-#if !defined(SIMDE_BUG_INTEL_857088)
-
-SIMDE_FUNCTION_ATTRIBUTES
-simde_uint8x16x2_t
-simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
- #if \
- defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
- (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
- return vld1q_u8_x2(ptr);
- #else
- simde_uint8x16_private a_[2];
- for (size_t i = 0; i < 32; i++) {
- a_[i / 16].values[i % 16] = ptr[i];
- }
- simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]),
- simde_uint8x16_from_private(a_[1]) } };
- return s_;
- #endif
-}
-#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
- #undef vld1q_u8_x2
- #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a))
-#endif
-
-SIMDE_FUNCTION_ATTRIBUTES
-simde_uint8x16x3_t
-simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) {
- #if \
- defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
- (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
- return vld1q_u8_x3(ptr);
- #else
- simde_uint8x16_private a_[3];
- for (size_t i = 0; i < 48; i++) {
- a_[i / 16].values[i % 16] = ptr[i];
- }
- simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]),
- simde_uint8x16_from_private(a_[1]),
- simde_uint8x16_from_private(a_[2]) } };
- return s_;
- #endif
-}
-#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
- #undef vld1q_u8_x3
- #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a))
-#endif
-
-SIMDE_FUNCTION_ATTRIBUTES
-simde_uint8x16x4_t
-simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
- #if \
- defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
- (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
- (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
- return vld1q_u8_x4(ptr);
- #else
- simde_uint8x16_private a_[4];
- for (size_t i = 0; i < 64; i++) {
- a_[i / 16].values[i % 16] = ptr[i];
- }
- simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]),
- simde_uint8x16_from_private(a_[1]),
- simde_uint8x16_from_private(a_[2]),
- simde_uint8x16_from_private(a_[3]) } };
- return s_;
- #endif
-}
-#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
- #undef vld1q_u8_x4
- #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a))
-#endif
-
-#endif /* !defined(SIMDE_BUG_INTEL_857088) */
-
SIMDE_FUNCTION_ATTRIBUTES
simde_uint16x8_t
simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
@@ -455,6 +441,8 @@ simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_uint16x8_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -475,6 +463,8 @@ simde_vld1q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
simde_uint32x4_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle32_v_u32m1(ptr , 4);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -495,6 +485,8 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
simde_uint64x2_private r_;
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_v128_load(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2);
#else
simde_memcpy(&r_, ptr, sizeof(r_));
#endif
@@ -506,6 +498,177 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
#define vld1q_u64(a) simde_vld1q_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vld1_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1_p8(ptr);
+ #else
+ simde_poly8x8_private r_;
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p8
+ #define vld1_p8(a) simde_vld1_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vld1_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1_p16(ptr);
+ #else
+ simde_poly16x4_private r_;
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
+ return simde_poly16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p16
+ #define vld1_p16(a) simde_vld1_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vld1_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld1_p64(ptr);
+ #else
+ simde_poly64x1_private r_;
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
+ return simde_poly64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p64
+ #define vld1_p64(a) simde_vld1_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vld1q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1q_p8(ptr);
+ #else
+ simde_poly8x16_private r_;
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p8
+ #define vld1q_p8(a) simde_vld1q_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vld1q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1q_p16(ptr);
+ #else
+ simde_poly16x8_private r_;
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
+ return simde_poly16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p16
+ #define vld1q_p16(a) simde_vld1q_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vld1q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld1q_p64(ptr);
+ #else
+ simde_poly64x2_private r_;
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ #else
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ #endif
+ return simde_poly64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p64
+ #define vld1q_p64(a) simde_vld1q_p64((a))
+#endif
+
+#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly128_t
+simde_vldrq_p128(simde_poly128_t const ptr[HEDLEY_ARRAY_PARAM(1)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)
+ return vldrq_p128(ptr);
+ #else
+ simde_poly128_t r_;
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ return r_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vldrq_p128
+ #define vldrq_p128(a) simde_vldrq_p128((a))
+#endif
+
+#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vld1_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1_bf16(ptr);
+ #else
+ simde_bfloat16x4_private r_;
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ return simde_bfloat16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_bf16
+ #define vld1_bf16(a) simde_vld1_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vld1q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1q_bf16(ptr);
+ #else
+ simde_bfloat16x8_private r_;
+ simde_memcpy(&r_, ptr, sizeof(r_));
+ return simde_bfloat16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_bf16
+ #define vld1q_bf16(a) simde_vld1q_bf16((a))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
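
On the RVV path every vld1/vld1q variant becomes a single unit-stride load (__riscv_vle*_v_*m1) with the element count passed as the explicit vector length, in place of the generic memcpy into the private struct. A hedged usage sketch (assumes the simde/arm/neon.h umbrella header):

#include <stdint.h>
#include "simde/arm/neon.h"  /* assumed include path */

/* Usage sketch (not part of the patch): one call loads 16 bytes; on
 * RISC-V this lowers to __riscv_vle8_v_u8m1(buf, 16), elsewhere to the
 * memcpy fallback. */
static simde_uint8x16_t load16(const uint8_t buf[16]) {
  return simde_vld1q_u8(buf);
}
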
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_dup.h b/lib/simd_wrapper/simde/arm/neon/ld1_dup.h
index 9df7477b7de..cc15cf98230 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld1_dup.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld1_dup.h
@@ -23,6 +23,7 @@
* Copyright:
* 2021 Zhi An Ng (Copyright owned by Google, LLC)
* 2021 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_LD1_DUP_H)
@@ -35,6 +36,20 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vld1_dup_f16(simde_float16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld1_dup_f16(ptr);
+ #else
+ return simde_vdup_n_f16(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_dup_f16
+ #define vld1_dup_f16(a) simde_vld1_dup_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vld1_dup_f32(simde_float32 const * ptr) {
@@ -177,6 +192,20 @@ simde_vld1_dup_u64(uint64_t const * ptr) {
#define vld1_dup_u64(a) simde_vld1_dup_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vld1q_dup_f16(simde_float16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld1q_dup_f16(ptr);
+ #else
+ return simde_vdupq_n_f16(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_dup_f16
+ #define vld1q_dup_f16(a) simde_vld1q_dup_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vld1q_dup_f32(simde_float32 const * ptr) {
@@ -401,6 +430,118 @@ simde_vld1q_dup_u64(uint64_t const * ptr) {
#define vld1q_dup_u64(a) simde_vld1q_dup_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vld1_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1_dup_p8(ptr);
+ #else
+ return simde_vdup_n_p8(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_dup_p8
+ #define vld1_dup_p8(a) simde_vld1_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vld1_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1_dup_p16(ptr);
+ #else
+ return simde_vdup_n_p16(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_dup_p16
+ #define vld1_dup_p16(a) simde_vld1_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vld1_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld1_dup_p64(ptr);
+ #else
+ return simde_vdup_n_p64(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_dup_p64
+ #define vld1_dup_p64(a) simde_vld1_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vld1q_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1q_dup_p8(ptr);
+ #else
+ return simde_vdupq_n_p8(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_dup_p8
+ #define vld1q_dup_p8(a) simde_vld1q_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vld1q_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld1q_dup_p16(ptr);
+ #else
+ return simde_vdupq_n_p16(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_dup_p16
+ #define vld1q_dup_p16(a) simde_vld1q_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vld1q_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld1q_dup_p64(ptr);
+ #else
+ return simde_vdupq_n_p64(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_dup_p64
+ #define vld1q_dup_p64(a) simde_vld1q_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t
+simde_vld1_dup_bf16(simde_bfloat16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1_dup_bf16(ptr);
+ #else
+ return simde_vdup_n_bf16(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_dup_bf16
+ #define vld1_dup_bf16(a) simde_vld1_dup_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t
+simde_vld1q_dup_bf16(simde_bfloat16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1q_dup_bf16(ptr);
+ #else
+ return simde_vdupq_n_bf16(*ptr);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_dup_bf16
+ #define vld1q_dup_bf16(a) simde_vld1q_dup_bf16((a))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
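
Each new vld1_dup/vld1q_dup wrapper is a one-liner on the portable path: dereference the pointer once and splat with the matching vdup_n. A hedged sketch (assumes simde/arm/neon.h):

#include "simde/arm/neon.h"  /* assumed include path */

/* Sketch (not part of the patch): load one scalar and broadcast it to
 * all four lanes; the fallback is literally simde_vdupq_n_f32(*p). */
static simde_float32x4_t splat_from_mem(simde_float32 const *p) {
  return simde_vld1q_dup_f32(p);
}
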
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_lane.h b/lib/simd_wrapper/simde/arm/neon/ld1_lane.h
index 4e36caf5249..5818ead64df 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld1_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld1_lane.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_LD1_LANE_H)
@@ -161,6 +162,22 @@ simde_uint64x1_t simde_vld1_lane_u64(uint64_t const *ptr, simde_uint64x1_t src,
#define vld1_lane_u64(ptr, src, lane) simde_vld1_lane_u64((ptr), (src), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t simde_vld1_lane_f16(simde_float16_t const *ptr, simde_float16x4_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x4_private r = simde_float16x4_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_float16x4_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vld1_lane_f16(ptr, src, lane) vld1_lane_f16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_lane_f16
+ #define vld1_lane_f16(ptr, src, lane) simde_vld1_lane_f16((ptr), (src), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t simde_vld1_lane_f32(simde_float32_t const *ptr, simde_float32x2_t src,
const int lane)
@@ -321,6 +338,22 @@ simde_uint64x2_t simde_vld1q_lane_u64(uint64_t const *ptr, simde_uint64x2_t src,
#define vld1q_lane_u64(ptr, src, lane) simde_vld1q_lane_u64((ptr), (src), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t simde_vld1q_lane_f16(simde_float16_t const *ptr, simde_float16x8_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x8_private r = simde_float16x8_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_float16x8_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vld1q_lane_f16(ptr, src, lane) vld1q_lane_f16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_lane_f16
+ #define vld1q_lane_f16(ptr, src, lane) simde_vld1q_lane_f16((ptr), (src), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t simde_vld1q_lane_f32(simde_float32_t const *ptr, simde_float32x4_t src,
const int lane)
@@ -353,6 +386,139 @@ simde_float64x2_t simde_vld1q_lane_f64(simde_float64_t const *ptr, simde_float64
#define vld1q_lane_f64(ptr, src, lane) simde_vld1q_lane_f64((ptr), (src), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vld1_lane_p8(simde_poly8_t const *ptr, simde_poly8x8_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly8x8_private r = simde_poly8x8_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_poly8x8_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld1_lane_p8(ptr, src, lane) vld1_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_lane_p8
+ #define vld1_lane_p8(ptr, src, lane) simde_vld1_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4_t
+simde_vld1_lane_p16(simde_poly16_t const *ptr, simde_poly16x4_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_poly16x4_private r = simde_poly16x4_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_poly16x4_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld1_lane_p16(ptr, src, lane) vld1_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_lane_p16
+ #define vld1_lane_p16(ptr, src, lane) simde_vld1_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1_t
+simde_vld1_lane_p64(simde_poly64_t const *ptr, simde_poly64x1_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_poly64x1_private r = simde_poly64x1_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_poly64x1_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ #define simde_vld1_lane_p64(ptr, src, lane) vld1_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_lane_p64
+ #define vld1_lane_p64(ptr, src, lane) simde_vld1_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vld1q_lane_p8(simde_poly8_t const *ptr, simde_poly8x16_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_poly8x16_private r = simde_poly8x16_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_poly8x16_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld1q_lane_p8(ptr, src, lane) vld1q_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_lane_p8
+ #define vld1q_lane_p8(ptr, src, lane) simde_vld1q_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vld1q_lane_p16(simde_poly16_t const *ptr, simde_poly16x8_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly16x8_private r = simde_poly16x8_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_poly16x8_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld1q_lane_p16(ptr, src, lane) vld1q_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_lane_p16
+ #define vld1q_lane_p16(ptr, src, lane) simde_vld1q_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2_t
+simde_vld1q_lane_p64(simde_poly64_t const *ptr, simde_poly64x2_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_poly64x2_private r = simde_poly64x2_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_poly64x2_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ #define simde_vld1q_lane_p64(ptr, src, lane) vld1q_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_lane_p64
+ #define vld1q_lane_p64(ptr, src, lane) simde_vld1q_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4_t simde_vld1_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloat16x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_bfloat16x4_private r = simde_bfloat16x4_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_bfloat16x4_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld1_lane_bf16(ptr, src, lane) vld1_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_lane_bf16
+ #define vld1_lane_bf16(ptr, src, lane) simde_vld1_lane_bf16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8_t simde_vld1q_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloat16x8_t src,
+ const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_bfloat16x8_private r = simde_bfloat16x8_to_private(src);
+ r.values[lane] = *ptr;
+ return simde_bfloat16x8_from_private(r);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld1q_lane_bf16(ptr, src, lane) vld1q_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_lane_bf16
+ #define vld1q_lane_bf16(ptr, src, lane) simde_vld1q_lane_bf16((ptr), (src), (lane))
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
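
vld1_lane is the read-modify counterpart of vld1_dup: it loads one scalar into a single lane of an existing vector and leaves the other lanes untouched, with the lane index again required to be a constant. A hedged sketch (assumes simde/arm/neon.h):

#include "simde/arm/neon.h"  /* assumed include path */

/* Sketch (not part of the patch): replace lane 0 of src with *p; lanes
 * 1..3 pass through unchanged. */
static simde_float32x4_t patch_lane0(simde_float32 const *p, simde_float32x4_t src) {
  return simde_vld1q_lane_f32(p, src, 0);
}
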
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_x2.h b/lib/simd_wrapper/simde/arm/neon/ld1_x2.h
new file mode 100644
index 00000000000..75ce61d10b3
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld1_x2.h
@@ -0,0 +1,456 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2020 Evan Nemerson
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2021 Décio Luiz Gazzoni Filho
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD1_X2_H)
+#define SIMDE_ARM_NEON_LD1_X2_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
+SIMDE_BEGIN_DECLS_
+
+#if !defined(SIMDE_BUG_INTEL_857088)
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x2_t
+simde_vld1_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_f16_x2(ptr);
+ #else
+ simde_float16x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH)
+ a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_float16x4x2_t s_ = { { simde_float16x4_from_private(a_[0]),
+ simde_float16x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f16_x2
+ #define vld1_f16_x2(a) simde_vld1_f16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x2_t
+simde_vld1_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_f32_x2(ptr);
+ #else
+ simde_float32x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_float32x2x2_t s_ = { { simde_float32x2_from_private(a_[0]),
+ simde_float32x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f32_x2
+ #define vld1_f32_x2(a) simde_vld1_f32_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x2_t
+simde_vld1_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vld1_f64_x2(ptr);
+ #else
+ simde_float64x1_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1);
+ #else
+ for (size_t i = 0; i < 2; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_float64x1x2_t s_ = { { simde_float64x1_from_private(a_[0]),
+ simde_float64x1_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f64_x2
+ #define vld1_f64_x2(a) simde_vld1_f64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x2_t
+simde_vld1_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s8_x2(ptr);
+ #else
+ simde_int8x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_int8x8x2_t s_ = { { simde_int8x8_from_private(a_[0]),
+ simde_int8x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s8_x2
+ #define vld1_s8_x2(a) simde_vld1_s8_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x2_t
+simde_vld1_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s16_x2(ptr);
+ #else
+ simde_int16x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_int16x4x2_t s_ = { { simde_int16x4_from_private(a_[0]),
+ simde_int16x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s16_x2
+ #define vld1_s16_x2(a) simde_vld1_s16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x2_t
+simde_vld1_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s32_x2(ptr);
+ #else
+ simde_int32x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_int32x2x2_t s_ = { { simde_int32x2_from_private(a_[0]),
+ simde_int32x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s32_x2
+ #define vld1_s32_x2(a) simde_vld1_s32_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x2_t
+simde_vld1_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s64_x2(ptr);
+ #else
+ simde_int64x1_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1);
+ #else
+ for (size_t i = 0; i < 2; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_int64x1x2_t s_ = { { simde_int64x1_from_private(a_[0]),
+ simde_int64x1_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s64_x2
+ #define vld1_s64_x2(a) simde_vld1_s64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x2_t
+simde_vld1_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u8_x2(ptr);
+ #else
+ simde_uint8x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_uint8x8x2_t s_ = { { simde_uint8x8_from_private(a_[0]),
+ simde_uint8x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u8_x2
+ #define vld1_u8_x2(a) simde_vld1_u8_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x2_t
+simde_vld1_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u16_x2(ptr);
+ #else
+ simde_uint16x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_uint16x4x2_t s_ = { { simde_uint16x4_from_private(a_[0]),
+ simde_uint16x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u16_x2
+ #define vld1_u16_x2(a) simde_vld1_u16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x2_t
+simde_vld1_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u32_x2(ptr);
+ #else
+ simde_uint32x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_uint32x2x2_t s_ = { { simde_uint32x2_from_private(a_[0]),
+ simde_uint32x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u32_x2
+ #define vld1_u32_x2(a) simde_vld1_u32_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x2_t
+simde_vld1_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u64_x2(ptr);
+ #else
+ simde_uint64x1_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1);
+ #else
+ for (size_t i = 0; i < 2; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_uint64x1x2_t s_ = { { simde_uint64x1_from_private(a_[0]),
+ simde_uint64x1_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u64_x2
+ #define vld1_u64_x2(a) simde_vld1_u64_x2((a))
+#endif
+
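+/* The polynomial variants are additionally guarded by SIMDE_BUG_GCC_95399,
+ * which appears to reference GCC bug 95399 (broken or missing polynomial
+ * intrinsics on some targets); when that flag is set the portable path
+ * below is used instead. */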
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x2_t
+simde_vld1_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld1_p8_x2(ptr);
+ #else
+ simde_poly8x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_poly8x8x2_t s_ = { { simde_poly8x8_from_private(a_[0]),
+ simde_poly8x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p8_x2
+ #define vld1_p8_x2(a) simde_vld1_p8_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x2_t
+simde_vld1_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld1_p16_x2(ptr);
+ #else
+ simde_poly16x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_poly16x4x2_t s_ = { { simde_poly16x4_from_private(a_[0]),
+ simde_poly16x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p16_x2
+ #define vld1_p16_x2(a) simde_vld1_p16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x2_t
+simde_vld1_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p64_x2(ptr);
+ #else
+ simde_poly64x1_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1);
+ #else
+ for (size_t i = 0; i < 2; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_poly64x1x2_t s_ = { { simde_poly64x1_from_private(a_[0]),
+ simde_poly64x1_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p64_x2
+ #define vld1_p64_x2(a) simde_vld1_p64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x2_t
+simde_vld1_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1_bf16_x2(ptr);
+ #else
+ simde_bfloat16x4_private a_[2];
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ simde_bfloat16x4x2_t s_ = { { simde_bfloat16x4_from_private(a_[0]),
+ simde_bfloat16x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_bf16_x2
+ #define vld1_bf16_x2(a) simde_vld1_bf16_x2((a))
+#endif
+
+#endif /* !defined(SIMDE_BUG_INTEL_857088) */
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD1_X2_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_x3.h b/lib/simd_wrapper/simde/arm/neon/ld1_x3.h
new file mode 100644
index 00000000000..bdaf8e527a2
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld1_x3.h
@@ -0,0 +1,486 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2020 Evan Nemerson
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD1_X3_H)
+#define SIMDE_ARM_NEON_LD1_X3_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
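+  /* (sic: the macro is spelled this way in simde-diagnostic.h) */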
+#endif
+SIMDE_BEGIN_DECLS_
+
+#if !defined(SIMDE_BUG_INTEL_857088)
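+/* SIMDE_BUG_INTEL_857088 appears to be an Intel compiler (ICC) issue ID;
+ * when it is defined this whole family is omitted rather than miscompiled. */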
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x3_t
+simde_vld1_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_f16_x3(ptr);
+ #else
+ simde_float16x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH
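+    /* ZVFH (the RISC-V half-precision vector extension) is required for
+       the _Float16 loads below, hence the extra guard above. */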
+ a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4);
+ a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_float16x4x3_t s_ = { { simde_float16x4_from_private(a_[0]),
+ simde_float16x4_from_private(a_[1]),
+ simde_float16x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f16_x3
+ #define vld1_f16_x3(a) simde_vld1_f16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x3_t
+simde_vld1_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_f32_x3(ptr);
+ #else
+ simde_float32x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2);
+ a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_float32x2x3_t s_ = { { simde_float32x2_from_private(a_[0]),
+ simde_float32x2_from_private(a_[1]),
+ simde_float32x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f32_x3
+ #define vld1_f32_x3(a) simde_vld1_f32_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x3_t
+simde_vld1_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vld1_f64_x3(ptr);
+ #else
+ simde_float64x1_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1);
+ #else
+ for (size_t i = 0; i < 3; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_float64x1x3_t s_ = { { simde_float64x1_from_private(a_[0]),
+ simde_float64x1_from_private(a_[1]),
+ simde_float64x1_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f64_x3
+ #define vld1_f64_x3(a) simde_vld1_f64_x3((a))
+#endif
+
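+/* Note the stricter HEDLEY_GCC_VERSION_CHECK(12,0,0) gate here and on the
+ * s32 variant below: earlier GCC releases apparently mishandled these
+ * particular x3 intrinsics, so older GCC takes the portable path. */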
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x3_t
+simde_vld1_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s8_x3(ptr);
+ #else
+ simde_int8x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8);
+ a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_int8x8x3_t s_ = { { simde_int8x8_from_private(a_[0]),
+ simde_int8x8_from_private(a_[1]),
+ simde_int8x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s8_x3
+ #define vld1_s8_x3(a) simde_vld1_s8_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x3_t
+simde_vld1_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s16_x3(ptr);
+ #else
+ simde_int16x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4);
+ a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_int16x4x3_t s_ = { { simde_int16x4_from_private(a_[0]),
+ simde_int16x4_from_private(a_[1]),
+ simde_int16x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s16_x3
+ #define vld1_s16_x3(a) simde_vld1_s16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x3_t
+simde_vld1_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s32_x3(ptr);
+ #else
+ simde_int32x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2);
+ a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_int32x2x3_t s_ = { { simde_int32x2_from_private(a_[0]),
+ simde_int32x2_from_private(a_[1]),
+ simde_int32x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s32_x3
+ #define vld1_s32_x3(a) simde_vld1_s32_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x3_t
+simde_vld1_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s64_x3(ptr);
+ #else
+ simde_int64x1_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1);
+ #else
+ for (size_t i = 0; i < 3; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_int64x1x3_t s_ = { { simde_int64x1_from_private(a_[0]),
+ simde_int64x1_from_private(a_[1]),
+ simde_int64x1_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s64_x3
+ #define vld1_s64_x3(a) simde_vld1_s64_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x3_t
+simde_vld1_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u8_x3(ptr);
+ #else
+ simde_uint8x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8);
+ a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_uint8x8x3_t s_ = { { simde_uint8x8_from_private(a_[0]),
+ simde_uint8x8_from_private(a_[1]),
+ simde_uint8x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u8_x3
+ #define vld1_u8_x3(a) simde_vld1_u8_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x3_t
+simde_vld1_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u16_x3(ptr);
+ #else
+ simde_uint16x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4);
+ a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_uint16x4x3_t s_ = { { simde_uint16x4_from_private(a_[0]),
+ simde_uint16x4_from_private(a_[1]),
+ simde_uint16x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u16_x3
+ #define vld1_u16_x3(a) simde_vld1_u16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x3_t
+simde_vld1_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u32_x3(ptr);
+ #else
+ simde_uint32x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2);
+ a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_uint32x2x3_t s_ = { { simde_uint32x2_from_private(a_[0]),
+ simde_uint32x2_from_private(a_[1]),
+ simde_uint32x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u32_x3
+ #define vld1_u32_x3(a) simde_vld1_u32_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x3_t
+simde_vld1_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u64_x3(ptr);
+ #else
+ simde_uint64x1_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1);
+ #else
+ for (size_t i = 0; i < 3; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_uint64x1x3_t s_ = { { simde_uint64x1_from_private(a_[0]),
+ simde_uint64x1_from_private(a_[1]),
+ simde_uint64x1_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u64_x3
+ #define vld1_u64_x3(a) simde_vld1_u64_x3((a))
+#endif
+
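+/* The polynomial x3 loads gate on GCC >= 8.5 with AArch64 only.  In the
+ * scalar fallback, i / 8 selects the destination vector and i % 8 the
+ * lane, splitting 24 contiguous bytes across three 8-lane vectors. */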
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x3_t
+simde_vld1_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p8_x3(ptr);
+ #else
+ simde_poly8x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8);
+ a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_poly8x8x3_t s_ = { { simde_poly8x8_from_private(a_[0]),
+ simde_poly8x8_from_private(a_[1]),
+ simde_poly8x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p8_x3
+ #define vld1_p8_x3(a) simde_vld1_p8_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x3_t
+simde_vld1_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p16_x3(ptr);
+ #else
+ simde_poly16x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4);
+ a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_poly16x4x3_t s_ = { { simde_poly16x4_from_private(a_[0]),
+ simde_poly16x4_from_private(a_[1]),
+ simde_poly16x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p16_x3
+ #define vld1_p16_x3(a) simde_vld1_p16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x3_t
+simde_vld1_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p64_x3(ptr);
+ #else
+ simde_poly64x1_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1);
+ #else
+ for (size_t i = 0; i < 3; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_poly64x1x3_t s_ = { { simde_poly64x1_from_private(a_[0]),
+ simde_poly64x1_from_private(a_[1]),
+ simde_poly64x1_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p64_x3
+ #define vld1_p64_x3(a) simde_vld1_p64_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x3_t
+simde_vld1_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1_bf16_x3(ptr);
+ #else
+ simde_bfloat16x4_private a_[3];
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ simde_bfloat16x4x3_t s_ = { { simde_bfloat16x4_from_private(a_[0]),
+ simde_bfloat16x4_from_private(a_[1]),
+ simde_bfloat16x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_bf16_x3
+ #define vld1_bf16_x3(a) simde_vld1_bf16_x3((a))
+#endif
+
+#endif /* !defined(SIMDE_BUG_INTEL_857088) */
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD1_X3_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_x4.h b/lib/simd_wrapper/simde/arm/neon/ld1_x4.h
new file mode 100644
index 00000000000..1d797364b60
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld1_x4.h
@@ -0,0 +1,516 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2020 Evan Nemerson
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2021 Décio Luiz Gazzoni Filho
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD1_X4_H)
+#define SIMDE_ARM_NEON_LD1_X4_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
+SIMDE_BEGIN_DECLS_
+
+#if !defined(SIMDE_BUG_INTEL_857088)
+
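+/* The x4 variants extend the same pattern to four 64-bit vectors: 4*N
+ * contiguous elements land in val[0]..val[3], equivalent to four
+ * consecutive vld1 loads with no deinterleaving (unlike vld4). */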
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x4_t
+simde_vld1_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_f16_x4(ptr);
+ #else
+ simde_float16x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH
+ a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4);
+ a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4);
+ a_[3].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+12) , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]),
+ simde_float16x4_from_private(a_[1]),
+ simde_float16x4_from_private(a_[2]),
+ simde_float16x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f16_x4
+ #define vld1_f16_x4(a) simde_vld1_f16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x4_t
+simde_vld1_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_f32_x4(ptr);
+ #else
+ simde_float32x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2);
+ a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2);
+ a_[3].sv64 = __riscv_vle32_v_f32m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]),
+ simde_float32x2_from_private(a_[1]),
+ simde_float32x2_from_private(a_[2]),
+ simde_float32x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f32_x4
+ #define vld1_f32_x4(a) simde_vld1_f32_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x4_t
+simde_vld1_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vld1_f64_x4(ptr);
+ #else
+ simde_float64x1_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1);
+ a_[3].sv64 = __riscv_vle64_v_f64m1(ptr+3 , 1);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]),
+ simde_float64x1_from_private(a_[1]),
+ simde_float64x1_from_private(a_[2]),
+ simde_float64x1_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_f64_x4
+ #define vld1_f64_x4(a) simde_vld1_f64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x4_t
+simde_vld1_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s8_x4(ptr);
+ #else
+ simde_int8x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8);
+ a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8);
+ a_[3].sv64 = __riscv_vle8_v_i8m1(ptr+24 , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]),
+ simde_int8x8_from_private(a_[1]),
+ simde_int8x8_from_private(a_[2]),
+ simde_int8x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s8_x4
+ #define vld1_s8_x4(a) simde_vld1_s8_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x4_t
+simde_vld1_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s16_x4(ptr);
+ #else
+ simde_int16x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4);
+ a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4);
+ a_[3].sv64 = __riscv_vle16_v_i16m1(ptr+12 , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]),
+ simde_int16x4_from_private(a_[1]),
+ simde_int16x4_from_private(a_[2]),
+ simde_int16x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s16_x4
+ #define vld1_s16_x4(a) simde_vld1_s16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x4_t
+simde_vld1_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s32_x4(ptr);
+ #else
+ simde_int32x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2);
+ a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2);
+ a_[3].sv64 = __riscv_vle32_v_i32m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]),
+ simde_int32x2_from_private(a_[1]),
+ simde_int32x2_from_private(a_[2]),
+ simde_int32x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s32_x4
+ #define vld1_s32_x4(a) simde_vld1_s32_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x4_t
+simde_vld1_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_s64_x4(ptr);
+ #else
+ simde_int64x1_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1);
+ a_[3].sv64 = __riscv_vle64_v_i64m1(ptr+3 , 1);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]),
+ simde_int64x1_from_private(a_[1]),
+ simde_int64x1_from_private(a_[2]),
+ simde_int64x1_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_s64_x4
+ #define vld1_s64_x4(a) simde_vld1_s64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x4_t
+simde_vld1_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u8_x4(ptr);
+ #else
+ simde_uint8x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8);
+ a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8);
+ a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]),
+ simde_uint8x8_from_private(a_[1]),
+ simde_uint8x8_from_private(a_[2]),
+ simde_uint8x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u8_x4
+ #define vld1_u8_x4(a) simde_vld1_u8_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x4_t
+simde_vld1_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u16_x4(ptr);
+ #else
+ simde_uint16x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4);
+ a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4);
+ a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]),
+ simde_uint16x4_from_private(a_[1]),
+ simde_uint16x4_from_private(a_[2]),
+ simde_uint16x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u16_x4
+ #define vld1_u16_x4(a) simde_vld1_u16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x4_t
+simde_vld1_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u32_x4(ptr);
+ #else
+ simde_uint32x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2);
+ a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2);
+ a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2);
+ a_[3].sv64 = __riscv_vle32_v_u32m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]),
+ simde_uint32x2_from_private(a_[1]),
+ simde_uint32x2_from_private(a_[2]),
+ simde_uint32x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u32_x4
+ #define vld1_u32_x4(a) simde_vld1_u32_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x4_t
+simde_vld1_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_u64_x4(ptr);
+ #else
+ simde_uint64x1_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1);
+ a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]),
+ simde_uint64x1_from_private(a_[1]),
+ simde_uint64x1_from_private(a_[2]),
+ simde_uint64x1_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_u64_x4
+ #define vld1_u64_x4(a) simde_vld1_u64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x4_t
+simde_vld1_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p8_x4(ptr);
+ #else
+ simde_poly8x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8);
+ a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8);
+ a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8);
+ a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]),
+ simde_poly8x8_from_private(a_[1]),
+ simde_poly8x8_from_private(a_[2]),
+ simde_poly8x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p8_x4
+ #define vld1_p8_x4(a) simde_vld1_p8_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x4_t
+simde_vld1_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p16_x4(ptr);
+ #else
+ simde_poly16x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4);
+ a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4);
+ a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4);
+ a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]),
+ simde_poly16x4_from_private(a_[1]),
+ simde_poly16x4_from_private(a_[2]),
+ simde_poly16x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p16_x4
+ #define vld1_p16_x4(a) simde_vld1_p16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x4_t
+simde_vld1_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1_p64_x4(ptr);
+ #else
+ simde_poly64x1_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1);
+ a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1);
+ a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1);
+ a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i].values[0] = ptr[i];
+ }
+ #endif
+ simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]),
+ simde_poly64x1_from_private(a_[1]),
+ simde_poly64x1_from_private(a_[2]),
+ simde_poly64x1_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_p64_x4
+ #define vld1_p64_x4(a) simde_vld1_p64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x4_t
+simde_vld1_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1_bf16_x4(ptr);
+ #else
+ simde_bfloat16x4_private a_[4];
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ simde_bfloat16x4x4_t s_ = { { simde_bfloat16x4_from_private(a_[0]),
+ simde_bfloat16x4_from_private(a_[1]),
+ simde_bfloat16x4_from_private(a_[2]),
+ simde_bfloat16x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1_bf16_x4
+ #define vld1_bf16_x4(a) simde_vld1_bf16_x4((a))
+#endif
+
+#endif /* !defined(SIMDE_BUG_INTEL_857088) */
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD1_X4_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1q_x2.h b/lib/simd_wrapper/simde/arm/neon/ld1q_x2.h
new file mode 100644
index 00000000000..da1da866af7
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld1q_x2.h
@@ -0,0 +1,461 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2020 Evan Nemerson
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2021 Décio Luiz Gazzoni Filho
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD1Q_X2_H)
+#define SIMDE_ARM_NEON_LD1Q_X2_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
+SIMDE_BEGIN_DECLS_
+
+#if !defined(SIMDE_BUG_INTEL_857088)
+
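+/* The q-suffixed loads target 128-bit registers: the private structs use
+ * the sv128 member and the RISC-V paths load twice as many lanes per
+ * vector, but the native/RVV/scalar fallback structure is unchanged. */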
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x2_t
+simde_vld1q_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+      defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+      (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+      (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_f16_x2(ptr);
+ #else
+ simde_float16x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH
+ a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_float16x8x2_t s_ = { { simde_float16x8_from_private(a_[0]),
+ simde_float16x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f16_x2
+ #define vld1q_f16_x2(a) simde_vld1q_f16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x2_t
+simde_vld1q_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_f32_x2(ptr);
+ #else
+ simde_float32x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_float32x4x2_t s_ = { { simde_float32x4_from_private(a_[0]),
+ simde_float32x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f32_x2
+ #define vld1q_f32_x2(a) simde_vld1q_f32_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x2_t
+simde_vld1q_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vld1q_f64_x2(ptr);
+ #else
+ simde_float64x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_float64x2x2_t s_ = { { simde_float64x2_from_private(a_[0]),
+ simde_float64x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f64_x2
+ #define vld1q_f64_x2(a) simde_vld1q_f64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x2_t
+simde_vld1q_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s8_x2(ptr);
+ #else
+ simde_int8x16_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_int8x16x2_t s_ = { { simde_int8x16_from_private(a_[0]),
+ simde_int8x16_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s8_x2
+ #define vld1q_s8_x2(a) simde_vld1q_s8_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x2_t
+simde_vld1q_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s16_x2(ptr);
+ #else
+ simde_int16x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_int16x8x2_t s_ = { { simde_int16x8_from_private(a_[0]),
+ simde_int16x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s16_x2
+ #define vld1q_s16_x2(a) simde_vld1q_s16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x2_t
+simde_vld1q_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s32_x2(ptr);
+ #else
+ simde_int32x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_int32x4x2_t s_ = { { simde_int32x4_from_private(a_[0]),
+ simde_int32x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s32_x2
+ #define vld1q_s32_x2(a) simde_vld1q_s32_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x2_t
+simde_vld1q_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s64_x2(ptr);
+ #else
+ simde_int64x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_int64x2x2_t s_ = { { simde_int64x2_from_private(a_[0]),
+ simde_int64x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s64_x2
+ #define vld1q_s64_x2(a) simde_vld1q_s64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x2_t
+simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u8_x2(ptr);
+ #else
+ simde_uint8x16_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]),
+ simde_uint8x16_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u8_x2
+ #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x2_t
+simde_vld1q_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u16_x2(ptr);
+ #else
+ simde_uint16x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_uint16x8x2_t s_ = { { simde_uint16x8_from_private(a_[0]),
+ simde_uint16x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u16_x2
+ #define vld1q_u16_x2(a) simde_vld1q_u16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x2_t
+simde_vld1q_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u32_x2(ptr);
+ #else
+ simde_uint32x4_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_uint32x4x2_t s_ = { { simde_uint32x4_from_private(a_[0]),
+ simde_uint32x4_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u32_x2
+ #define vld1q_u32_x2(a) simde_vld1q_u32_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x2_t
+simde_vld1q_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u64_x2(ptr);
+ #else
+ simde_uint64x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_uint64x2x2_t s_ = { { simde_uint64x2_from_private(a_[0]),
+ simde_uint64x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u64_x2
+ #define vld1q_u64_x2(a) simde_vld1q_u64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x2_t
+simde_vld1q_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p8_x2(ptr);
+ #else
+ simde_poly8x16_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_poly8x16x2_t s_ = { { simde_poly8x16_from_private(a_[0]),
+ simde_poly8x16_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p8_x2
+ #define vld1q_p8_x2(a) simde_vld1q_p8_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x2_t
+simde_vld1q_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p16_x2(ptr);
+ #else
+ simde_poly16x8_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_poly16x8x2_t s_ = { { simde_poly16x8_from_private(a_[0]),
+ simde_poly16x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p16_x2
+ #define vld1q_p16_x2(a) simde_vld1q_p16_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x2_t
+simde_vld1q_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p64_x2(ptr);
+ #else
+ simde_poly64x2_private a_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2);
+ #else
+ for (size_t i = 0; i < 4; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_poly64x2x2_t s_ = { { simde_poly64x2_from_private(a_[0]),
+ simde_poly64x2_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p64_x2
+ #define vld1q_p64_x2(a) simde_vld1q_p64_x2((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x2_t
+simde_vld1q_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1q_bf16_x2(ptr);
+ #else
+ simde_bfloat16x8_private a_[2];
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ simde_bfloat16x8x2_t s_ = { { simde_bfloat16x8_from_private(a_[0]),
+ simde_bfloat16x8_from_private(a_[1]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_bf16_x2
+ #define vld1q_bf16_x2(a) simde_vld1q_bf16_x2((a))
+#endif
+
+#endif /* !defined(SIMDE_BUG_INTEL_857088) */
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X2_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1q_x3.h b/lib/simd_wrapper/simde/arm/neon/ld1q_x3.h
new file mode 100644
index 00000000000..ec82989e74c
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld1q_x3.h
@@ -0,0 +1,487 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2020 Evan Nemerson
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD1Q_X3_H)
+#define SIMDE_ARM_NEON_LD1Q_X3_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
+SIMDE_BEGIN_DECLS_
+
+#if !defined(SIMDE_BUG_INTEL_857088)
+
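+/* Each loader below follows the same layering: the native vld1q_*_x3
+ * intrinsic is used when the toolchain is known to provide it, a RISC-V
+ * vector (RVV) unit-stride load is used when available, and otherwise a
+ * scalar loop fills the three 128-bit vectors from 3 * lane_count
+ * contiguous elements. */
+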
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x3_t
+simde_vld1q_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_f16_x3(ptr);
+ #else
+ simde_float16x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH
+ a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8);
+ a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_float16x8x3_t s_ = { { simde_float16x8_from_private(a_[0]),
+ simde_float16x8_from_private(a_[1]),
+ simde_float16x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f16_x3
+ #define vld1q_f16_x3(a) simde_vld1q_f16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x3_t
+simde_vld1q_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_f32_x3(ptr);
+ #else
+ simde_float32x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4);
+ a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_float32x4x3_t s_ = { { simde_float32x4_from_private(a_[0]),
+ simde_float32x4_from_private(a_[1]),
+ simde_float32x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f32_x3
+ #define vld1q_f32_x3(a) simde_vld1q_f32_x3((a))
+#endif
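+
+/* Illustrative sketch (not from the upstream header): twelve contiguous
+ * floats become three 128-bit vectors, so for a hypothetical `buf`:
+ *
+ *   simde_float32x4x3_t v = simde_vld1q_f32_x3(buf);
+ *   // v.val[0] = buf[0..3], v.val[1] = buf[4..7], v.val[2] = buf[8..11]
+ */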
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x3_t
+simde_vld1q_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vld1q_f64_x3(ptr);
+ #else
+ simde_float64x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_float64x2x3_t s_ = { { simde_float64x2_from_private(a_[0]),
+ simde_float64x2_from_private(a_[1]),
+ simde_float64x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f64_x3
+ #define vld1q_f64_x3(a) simde_vld1q_f64_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x3_t
+simde_vld1q_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s8_x3(ptr);
+ #else
+ simde_int8x16_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16);
+ a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16);
+ #else
+ for (size_t i = 0; i < 48; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_int8x16x3_t s_ = { { simde_int8x16_from_private(a_[0]),
+ simde_int8x16_from_private(a_[1]),
+ simde_int8x16_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s8_x3
+ #define vld1q_s8_x3(a) simde_vld1q_s8_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x3_t
+simde_vld1q_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s16_x3(ptr);
+ #else
+ simde_int16x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8);
+ a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_int16x8x3_t s_ = { { simde_int16x8_from_private(a_[0]),
+ simde_int16x8_from_private(a_[1]),
+ simde_int16x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s16_x3
+ #define vld1q_s16_x3(a) simde_vld1q_s16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x3_t
+simde_vld1q_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s32_x3(ptr);
+ #else
+ simde_int32x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4);
+ a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_int32x4x3_t s_ = { { simde_int32x4_from_private(a_[0]),
+ simde_int32x4_from_private(a_[1]),
+ simde_int32x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s32_x3
+ #define vld1q_s32_x3(a) simde_vld1q_s32_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x3_t
+simde_vld1q_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s64_x3(ptr);
+ #else
+ simde_int64x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_int64x2x3_t s_ = { { simde_int64x2_from_private(a_[0]),
+ simde_int64x2_from_private(a_[1]),
+ simde_int64x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s64_x3
+ #define vld1q_s64_x3(a) simde_vld1q_s64_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x3_t
+simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u8_x3(ptr);
+ #else
+ simde_uint8x16_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16);
+ a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16);
+ #else
+ for (size_t i = 0; i < 48; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]),
+ simde_uint8x16_from_private(a_[1]),
+ simde_uint8x16_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u8_x3
+ #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x3_t
+simde_vld1q_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u16_x3(ptr);
+ #else
+ simde_uint16x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8);
+ a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_uint16x8x3_t s_ = { { simde_uint16x8_from_private(a_[0]),
+ simde_uint16x8_from_private(a_[1]),
+ simde_uint16x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u16_x3
+ #define vld1q_u16_x3(a) simde_vld1q_u16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x3_t
+simde_vld1q_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(12)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u32_x3(ptr);
+ #else
+ simde_uint32x4_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4);
+ a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4);
+ #else
+ for (size_t i = 0; i < 12; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_uint32x4x3_t s_ = { { simde_uint32x4_from_private(a_[0]),
+ simde_uint32x4_from_private(a_[1]),
+ simde_uint32x4_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u32_x3
+ #define vld1q_u32_x3(a) simde_vld1q_u32_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x3_t
+simde_vld1q_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u64_x3(ptr);
+ #else
+ simde_uint64x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_uint64x2x3_t s_ = { { simde_uint64x2_from_private(a_[0]),
+ simde_uint64x2_from_private(a_[1]),
+ simde_uint64x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u64_x3
+ #define vld1q_u64_x3(a) simde_vld1q_u64_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x3_t
+simde_vld1q_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p8_x3(ptr);
+ #else
+ simde_poly8x16_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16);
+ a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16);
+ #else
+ for (size_t i = 0; i < 48; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_poly8x16x3_t s_ = { { simde_poly8x16_from_private(a_[0]),
+ simde_poly8x16_from_private(a_[1]),
+ simde_poly8x16_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p8_x3
+ #define vld1q_p8_x3(a) simde_vld1q_p8_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x3_t
+simde_vld1q_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p16_x3(ptr);
+ #else
+ simde_poly16x8_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8);
+ a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8);
+ #else
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_poly16x8x3_t s_ = { { simde_poly16x8_from_private(a_[0]),
+ simde_poly16x8_from_private(a_[1]),
+ simde_poly16x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p16_x3
+ #define vld1q_p16_x3(a) simde_vld1q_p16_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x3_t
+simde_vld1q_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(6)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p64_x3(ptr);
+ #else
+ simde_poly64x2_private a_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2);
+ #else
+ for (size_t i = 0; i < 6; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_poly64x2x3_t s_ = { { simde_poly64x2_from_private(a_[0]),
+ simde_poly64x2_from_private(a_[1]),
+ simde_poly64x2_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p64_x3
+ #define vld1q_p64_x3(a) simde_vld1q_p64_x3((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x3_t
+simde_vld1q_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(24)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1q_bf16_x3(ptr);
+ #else
+ simde_bfloat16x8_private a_[3];
+ for (size_t i = 0; i < 24; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ simde_bfloat16x8x3_t s_ = { { simde_bfloat16x8_from_private(a_[0]),
+ simde_bfloat16x8_from_private(a_[1]),
+ simde_bfloat16x8_from_private(a_[2]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_bf16_x3
+ #define vld1q_bf16_x3(a) simde_vld1q_bf16_x3((a))
+#endif
+
+#endif /* !defined(SIMDE_BUG_INTEL_857088) */
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X3_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld1q_x4.h b/lib/simd_wrapper/simde/arm/neon/ld1q_x4.h
new file mode 100644
index 00000000000..2fa4c1a6996
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld1q_x4.h
@@ -0,0 +1,517 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2020 Evan Nemerson
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2021 Décio Luiz Gazzoni Filho
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD1Q_X4_H)
+#define SIMDE_ARM_NEON_LD1Q_X4_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+#if HEDLEY_GCC_VERSION_CHECK(7,0,0)
+ SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_
+#endif
+SIMDE_BEGIN_DECLS_
+
+#if !defined(SIMDE_BUG_INTEL_857088)
+
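+/* Same layering as the x2/x3 headers: prefer the native vld1q_*_x4
+ * intrinsic where the compiler is known to support it, then RVV
+ * unit-stride loads, then a scalar loop distributing 4 * lane_count
+ * contiguous elements across the four 128-bit vectors. */
+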
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x4_t
+simde_vld1q_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_f16_x4(ptr);
+ #else
+ simde_float16x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH
+ a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8);
+ a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8);
+ a_[3].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+24) , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]),
+ simde_float16x8_from_private(a_[1]),
+ simde_float16x8_from_private(a_[2]),
+ simde_float16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f16_x4
+ #define vld1q_f16_x4(a) simde_vld1q_f16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x4_t
+simde_vld1q_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_f32_x4(ptr);
+ #else
+ simde_float32x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4);
+ a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4);
+ a_[3].sv128 = __riscv_vle32_v_f32m1(ptr+12 , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]),
+ simde_float32x4_from_private(a_[1]),
+ simde_float32x4_from_private(a_[2]),
+ simde_float32x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f32_x4
+ #define vld1q_f32_x4(a) simde_vld1q_f32_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x4_t
+simde_vld1q_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \
+ (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))
+ return vld1q_f64_x4(ptr);
+ #else
+ simde_float64x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2);
+ a_[3].sv128 = __riscv_vle64_v_f64m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]),
+ simde_float64x2_from_private(a_[1]),
+ simde_float64x2_from_private(a_[2]),
+ simde_float64x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_f64_x4
+ #define vld1q_f64_x4(a) simde_vld1q_f64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x4_t
+simde_vld1q_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s8_x4(ptr);
+ #else
+ simde_int8x16_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16);
+ a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16);
+ a_[3].sv128 = __riscv_vle8_v_i8m1(ptr+48 , 16);
+ #else
+ for (size_t i = 0; i < 64; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]),
+ simde_int8x16_from_private(a_[1]),
+ simde_int8x16_from_private(a_[2]),
+ simde_int8x16_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s8_x4
+ #define vld1q_s8_x4(a) simde_vld1q_s8_x4((a))
+#endif
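+
+/* Illustrative mapping (not from the upstream header): for a hypothetical
+ * simde_int8x16x4_t v = simde_vld1q_s8_x4(ptr), lane m of v.val[n] is
+ * ptr[16 * n + m]; the 64 bytes are split contiguously, not interleaved. */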
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x4_t
+simde_vld1q_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s16_x4(ptr);
+ #else
+ simde_int16x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8);
+ a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8);
+ a_[3].sv128 = __riscv_vle16_v_i16m1(ptr+24 , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]),
+ simde_int16x8_from_private(a_[1]),
+ simde_int16x8_from_private(a_[2]),
+ simde_int16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s16_x4
+ #define vld1q_s16_x4(a) simde_vld1q_s16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x4_t
+simde_vld1q_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s32_x4(ptr);
+ #else
+ simde_int32x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4);
+ a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4);
+ a_[3].sv128 = __riscv_vle32_v_i32m1(ptr+12 , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]),
+ simde_int32x4_from_private(a_[1]),
+ simde_int32x4_from_private(a_[2]),
+ simde_int32x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s32_x4
+ #define vld1q_s32_x4(a) simde_vld1q_s32_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x4_t
+simde_vld1q_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_s64_x4(ptr);
+ #else
+ simde_int64x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2);
+ a_[3].sv128 = __riscv_vle64_v_i64m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]),
+ simde_int64x2_from_private(a_[1]),
+                              simde_int64x2_from_private(a_[2]),
+                              simde_int64x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_s64_x4
+ #define vld1q_s64_x4(a) simde_vld1q_s64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x4_t
+simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u8_x4(ptr);
+ #else
+ simde_uint8x16_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16);
+ a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16);
+ a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16);
+ #else
+ for (size_t i = 0; i < 64; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]),
+ simde_uint8x16_from_private(a_[1]),
+ simde_uint8x16_from_private(a_[2]),
+ simde_uint8x16_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u8_x4
+ #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x4_t
+simde_vld1q_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u16_x4(ptr);
+ #else
+ simde_uint16x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8);
+ a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8);
+ a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]),
+ simde_uint16x8_from_private(a_[1]),
+ simde_uint16x8_from_private(a_[2]),
+ simde_uint16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u16_x4
+ #define vld1q_u16_x4(a) simde_vld1q_u16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x4_t
+simde_vld1q_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u32_x4(ptr);
+ #else
+ simde_uint32x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4);
+ a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4);
+ a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4);
+ a_[3].sv128 = __riscv_vle32_v_u32m1(ptr+12 , 4);
+ #else
+ for (size_t i = 0; i < 16; i++) {
+ a_[i / 4].values[i % 4] = ptr[i];
+ }
+ #endif
+ simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]),
+ simde_uint32x4_from_private(a_[1]),
+ simde_uint32x4_from_private(a_[2]),
+ simde_uint32x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u32_x4
+ #define vld1q_u32_x4(a) simde_vld1q_u32_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x4_t
+simde_vld1q_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \
+ (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_u64_x4(ptr);
+ #else
+ simde_uint64x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2);
+ a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]),
+ simde_uint64x2_from_private(a_[1]),
+ simde_uint64x2_from_private(a_[2]),
+ simde_uint64x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_u64_x4
+ #define vld1q_u64_x4(a) simde_vld1q_u64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x4_t
+simde_vld1q_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p8_x4(ptr);
+ #else
+ simde_poly8x16_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16);
+ a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16);
+ a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16);
+ a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16);
+ #else
+ for (size_t i = 0; i < 64; i++) {
+ a_[i / 16].values[i % 16] = ptr[i];
+ }
+ #endif
+ simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]),
+ simde_poly8x16_from_private(a_[1]),
+ simde_poly8x16_from_private(a_[2]),
+ simde_poly8x16_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p8_x4
+ #define vld1q_p8_x4(a) simde_vld1q_p8_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x4_t
+simde_vld1q_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p16_x4(ptr);
+ #else
+ simde_poly16x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8);
+ a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8);
+ a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8);
+ a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8);
+ #else
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ #endif
+ simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]),
+ simde_poly16x8_from_private(a_[1]),
+ simde_poly16x8_from_private(a_[2]),
+ simde_poly16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p16_x4
+ #define vld1q_p16_x4(a) simde_vld1q_p16_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x4_t
+simde_vld1q_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if \
+ defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \
+ (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE)))
+ return vld1q_p64_x4(ptr);
+ #else
+ simde_poly64x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2);
+ a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2);
+ a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2);
+ a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2);
+ #else
+ for (size_t i = 0; i < 8; i++) {
+ a_[i / 2].values[i % 2] = ptr[i];
+ }
+ #endif
+ simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]),
+ simde_poly64x2_from_private(a_[1]),
+ simde_poly64x2_from_private(a_[2]),
+ simde_poly64x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_p64_x4
+ #define vld1q_p64_x4(a) simde_vld1q_p64_x4((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x4_t
+simde_vld1q_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld1q_bf16_x4(ptr);
+ #else
+ simde_bfloat16x8_private a_[4];
+ for (size_t i = 0; i < 32; i++) {
+ a_[i / 8].values[i % 8] = ptr[i];
+ }
+ simde_bfloat16x8x4_t s_ = { { simde_bfloat16x8_from_private(a_[0]),
+ simde_bfloat16x8_from_private(a_[1]),
+ simde_bfloat16x8_from_private(a_[2]),
+ simde_bfloat16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld1q_bf16_x4
+ #define vld1q_bf16_x4(a) simde_vld1q_bf16_x4((a))
+#endif
+
+#endif /* !defined(SIMDE_BUG_INTEL_857088) */
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X4_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld2.h b/lib/simd_wrapper/simde/arm/neon/ld2.h
index 70cb39af7c8..5d0be9f33fe 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld2.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld2.h
@@ -22,6 +22,8 @@
*
* Copyright:
* 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
*/
#if !defined(SIMDE_ARM_NEON_LD2_H)
@@ -57,6 +59,16 @@ simde_vld2_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
simde_vget_high_s8(q)
};
return u;
+ #elif defined(SIMDE_RISCV_V_NATIVE)
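+    /* RVV segmented load: vlseg2e8 de-interleaves the stream, so field 0
+     * receives ptr[0], ptr[2], ... and field 1 receives ptr[1], ptr[3], ...,
+     * matching NEON vld2 semantics. */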
+ simde_int8x8_private a_[2];
+ vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 8);
+ a_[0].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 1);
+ simde_int8x8x2_t r = { {
+ simde_int8x8_from_private(a_[0]),
+ simde_int8x8_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_int8x16_private a_ = simde_int8x16_to_private(simde_vld1q_s8(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
@@ -90,6 +102,16 @@ simde_int16x4x2_t
simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_s16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int16x4_private a_[2];
+ vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 4);
+ a_[0].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 1);
+ simde_int16x4x2_t r = { {
+ simde_int16x4_from_private(a_[0]),
+ simde_int16x4_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_int16x8_private a_ = simde_int16x8_to_private(simde_vld1q_s16(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7);
@@ -97,6 +119,10 @@ simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_memcpy(&r, &a_, sizeof(r));
return r;
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_int16x4_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -104,6 +130,9 @@ simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
}
}
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
simde_int16x4x2_t r = { {
simde_int16x4_from_private(r_[0]),
@@ -123,6 +152,16 @@ simde_int32x2x2_t
simde_vld2_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_s32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int32x2_private a_[2];
+ vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 2);
+ a_[0].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 1);
+ simde_int32x2x2_t r = { {
+ simde_int32x2_from_private(a_[0]),
+ simde_int32x2_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_int32x4_private a_ = simde_int32x4_to_private(simde_vld1q_s32(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3);
@@ -156,6 +195,16 @@ simde_int64x1x2_t
simde_vld2_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_s64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int64x1_private a_[2];
+ vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 1);
+ a_[0].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 1);
+ simde_int64x1x2_t r = { {
+ simde_int64x1_from_private(a_[0]),
+ simde_int64x1_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_int64x2_private a_ = simde_int64x2_to_private(simde_vld1q_s64(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1);
@@ -200,6 +249,16 @@ simde_vld2_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
simde_vget_high_u8(q)
};
return u;
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x8_private a_[2];
+ vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8);
+ a_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1);
+ simde_uint8x8x2_t r = { {
+ simde_uint8x8_from_private(a_[0]),
+ simde_uint8x8_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_uint8x16_private a_ = simde_uint8x16_to_private(simde_vld1q_u8(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
@@ -233,6 +292,16 @@ simde_uint16x4x2_t
simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_u16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x4_private a_[2];
+ vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4);
+ a_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1);
+ simde_uint16x4x2_t r = { {
+ simde_uint16x4_from_private(a_[0]),
+ simde_uint16x4_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_uint16x8_private a_ = simde_uint16x8_to_private(simde_vld1q_u16(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7);
@@ -240,6 +309,10 @@ simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_memcpy(&r, &a_, sizeof(r));
return r;
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_uint16x4_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -247,6 +320,9 @@ simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
}
}
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
simde_uint16x4x2_t r = { {
simde_uint16x4_from_private(r_[0]),
@@ -266,6 +342,16 @@ simde_uint32x2x2_t
simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_u32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x2_private a_[2];
+ vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 2);
+ a_[0].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 1);
+ simde_uint32x2x2_t r = { {
+ simde_uint32x2_from_private(a_[0]),
+ simde_uint32x2_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_uint32x4_private a_ = simde_uint32x4_to_private(simde_vld1q_u32(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3);
@@ -296,9 +382,19 @@ simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x1x2_t
-simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_u64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint64x1_private a_[2];
+ vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1);
+ a_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1);
+ simde_uint64x1x2_t r = { {
+ simde_uint64x1_from_private(a_[0]),
+ simde_uint64x1_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_uint64x2_private a_ = simde_uint64x2_to_private(simde_vld1q_u64(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1);
@@ -327,11 +423,58 @@ simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#define vld2_u64(a) simde_vld2_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x2_t
+simde_vld2_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld2_f16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
+ simde_float16x4_private r_[2];
+ vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 4);
+ r_[0].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 1);
+ simde_float16x4x2_t r = { {
+ simde_float16x4_from_private(r_[0]),
+ simde_float16x4_from_private(r_[1]),
+ } };
+ return r;
+ #else
+ simde_float16x4_private r_[2];
+
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+
+ simde_float16x4x2_t r = { {
+ simde_float16x4_from_private(r_[0]),
+ simde_float16x4_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_f16
+ #define vld2_f16(a) simde_vld2_f16((a))
+#endif
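+
+/* Note that vld2, unlike the vld1_*_x2 loaders, de-interleaves: element j of
+ * r.val[i] comes from ptr[i + 2 * j]. Interleaved {re, im, re, im, ...}
+ * data, for example, splits into one vector of real parts and one of
+ * imaginary parts. */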
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2x2_t
simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2_f32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_float32x2_private r_[2];
+ vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 2);
+ r_[0].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 1);
+ simde_float32x2x2_t r = { {
+ simde_float32x2_from_private(r_[0]),
+ simde_float32x2_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_float32x4_private a_ = simde_float32x4_to_private(simde_vld1q_f32(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3);
@@ -362,9 +505,19 @@ simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
SIMDE_FUNCTION_ATTRIBUTES
simde_float64x1x2_t
-simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld2_f64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_float64x1_private r_[2];
+ vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 1);
+ r_[0].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 1);
+ simde_float64x1x2_t r = { {
+ simde_float64x1_from_private(r_[0]),
+ simde_float64x1_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_)
simde_float64x2_private a_ = simde_float64x2_to_private(simde_vld1q_f64(ptr));
a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1);
@@ -398,6 +551,16 @@ simde_int8x16x2_t
simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_s8(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int8x16_private a_[2];
+ vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 16);
+ a_[0].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 1);
+ simde_int8x16x2_t r = { {
+ simde_int8x16_from_private(a_[0]),
+ simde_int8x16_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_s8(
@@ -405,6 +568,10 @@ simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
simde_vld1q_s8(&(ptr[16]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_int8x16_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -419,6 +586,9 @@ simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
} };
return r;
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
@@ -431,6 +601,16 @@ simde_int32x4x2_t
simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_s32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int32x4_private a_[2];
+ vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 4);
+ a_[0].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 1);
+ simde_int32x4x2_t r = { {
+ simde_int32x4_from_private(a_[0]),
+ simde_int32x4_from_private(a_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_s32(
@@ -438,6 +618,10 @@ simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_vld1q_s32(&(ptr[4]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_int32x4_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -445,6 +629,9 @@ simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
}
}
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
simde_int32x4x2_t r = { {
simde_int32x4_from_private(r_[0]),
@@ -464,6 +651,16 @@ simde_int16x8x2_t
simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_s16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int16x8_private r_[2];
+ vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 1);
+ simde_int16x8x2_t r = { {
+ simde_int16x8_from_private(r_[0]),
+ simde_int16x8_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_s16(
@@ -471,6 +668,10 @@ simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
simde_vld1q_s16(&(ptr[8]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_int16x8_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -485,6 +686,9 @@ simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
} };
return r;
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
@@ -497,6 +701,16 @@ simde_int64x2x2_t
simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld2q_s64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int64x2_private r_[2];
+ vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 1);
+ simde_int64x2x2_t r = { {
+ simde_int64x2_from_private(r_[0]),
+ simde_int64x2_from_private(r_[1]),
+ } };
+ return r;
#else
simde_int64x2_private r_[2];
@@ -524,6 +738,16 @@ simde_uint8x16x2_t
simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_u8(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x16_private r_[2];
+ vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16);
+ r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1);
+ simde_uint8x16x2_t r = { {
+ simde_uint8x16_from_private(r_[0]),
+ simde_uint8x16_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_u8(
@@ -531,6 +755,10 @@ simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
simde_vld1q_u8(&(ptr[16]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_uint8x16_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -545,6 +773,9 @@ simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
} };
return r;
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
@@ -557,6 +788,16 @@ simde_uint16x8x2_t
simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_u16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x8_private r_[2];
+ vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1);
+ simde_uint16x8x2_t r = { {
+ simde_uint16x8_from_private(r_[0]),
+ simde_uint16x8_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_u16(
@@ -564,6 +805,10 @@ simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
simde_vld1q_u16(&(ptr[8]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_uint16x8_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -578,6 +823,9 @@ simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
} };
return r;
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
@@ -590,6 +838,16 @@ simde_uint32x4x2_t
simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_u32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x4_private r_[2];
+ vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 4);
+ r_[0].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 1);
+ simde_uint32x4x2_t r = { {
+ simde_uint32x4_from_private(r_[0]),
+ simde_uint32x4_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_u32(
@@ -597,6 +855,10 @@ simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_vld1q_u32(&(ptr[4]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_uint32x4_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
@@ -604,6 +866,9 @@ simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
}
}
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
simde_uint32x4x2_t r = { {
simde_uint32x4_from_private(r_[0]),
@@ -623,6 +888,16 @@ simde_uint64x2x2_t
simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld2q_u64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint64x2_private r_[2];
+ vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1);
+ simde_uint64x2x2_t r = { {
+ simde_uint64x2_from_private(r_[0]),
+ simde_uint64x2_from_private(r_[1]),
+ } };
+ return r;
#else
simde_uint64x2_private r_[2];
@@ -645,11 +920,65 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#define vld2q_u64(a) simde_vld2q_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x2_t
+simde_vld2q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld2q_f16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
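+    /* Zvfh gives native _Float16 vectors; the cast below assumes
+     * simde_float16_t is bit-compatible with _Float16 on this path, so it
+     * merely reinterprets the buffer for the segment load. */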
+ simde_float16x8_private r_[2];
+ vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 1);
+ simde_float16x8x2_t r = { {
+ simde_float16x8_from_private(r_[0]),
+ simde_float16x8_from_private(r_[1]),
+ } };
+ return r;
+ #else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
+ simde_float16x8_private r_[2];
+
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
+
+ simde_float16x8x2_t r = { {
+ simde_float16x8_from_private(r_[0]),
+ simde_float16x8_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_f16
+ #define vld2q_f16(a) simde_vld2q_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4x2_t
simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld2q_f32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_float32x4_private r_[2];
+ vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 4);
+ r_[0].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 1);
+ simde_float32x4x2_t r = { {
+ simde_float32x4_from_private(r_[0]),
+ simde_float32x4_from_private(r_[1]),
+ } };
+ return r;
#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
return
simde_vuzpq_f32(
@@ -657,6 +986,10 @@ simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
simde_vld1q_f32(&(ptr[4]))
);
#else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
simde_float32x4_private r_[2];
for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) {
@@ -664,6 +997,9 @@ simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
}
}
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
simde_float32x4x2_t r = { {
simde_float32x4_from_private(r_[0]),
@@ -683,6 +1019,16 @@ simde_float64x2x2_t
simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld2q_f64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_float64x2_private r_[2];
+ vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 1);
+ simde_float64x2x2_t r = { {
+ simde_float64x2_from_private(r_[0]),
+ simde_float64x2_from_private(r_[1]),
+ } };
+ return r;
#else
simde_float64x2_private r_[2];
@@ -705,6 +1051,276 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
#define vld2q_f64(a) simde_vld2q_f64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x2_t
+simde_vld2_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_p8(ptr);
+ #else
+ simde_poly8x8_private r_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
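+      /* simde_poly8_t shares its bit representation with uint8_t, so the
+       * unsigned segment load is reused for the polynomial type. */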
+ vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8);
+ r_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+ simde_poly8x8x2_t r = { {
+ simde_poly8x8_from_private(r_[0]),
+ simde_poly8x8_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_p8
+ #define vld2_p8(a) simde_vld2_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x2_t
+simde_vld2_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_p16(ptr);
+ #else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
+ simde_poly16x4_private r_[2];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4);
+ r_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
+
+ simde_poly16x4x2_t r = { {
+ simde_poly16x4_from_private(r_[0]),
+ simde_poly16x4_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_p16
+ #define vld2_p16(a) simde_vld2_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x2_t
+simde_vld2_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld2_p64(ptr);
+ #else
+ simde_poly64x1_private r_[2];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1);
+ r_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly64x1x2_t r = { {
+ simde_poly64x1_from_private(r_[0]),
+ simde_poly64x1_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_p64
+ #define vld2_p64(a) simde_vld2_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x2_t
+simde_vld2q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2q_p8(ptr);
+ #else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
+ simde_poly8x16_private r_[2];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16);
+ r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly8x16x2_t r = { {
+ simde_poly8x16_from_private(r_[0]),
+ simde_poly8x16_from_private(r_[1]),
+ } };
+
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_p8
+ #define vld2q_p8(a) simde_vld2q_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x2_t
+simde_vld2q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2q_p16(ptr);
+ #else
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_PUSH
+ SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_
+ #endif
+ simde_poly16x8_private r_[2];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly16x8x2_t r = { {
+ simde_poly16x8_from_private(r_[0]),
+ simde_poly16x8_from_private(r_[1]),
+ } };
+ #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64)
+ HEDLEY_DIAGNOSTIC_POP
+ #endif
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_p16
+ #define vld2q_p16(a) simde_vld2q_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x2_t
+simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_p64(ptr);
+ #else
+ simde_poly64x2_private r_[2];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1);
+ #else
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly64x2x2_t r = { {
+ simde_poly64x2_from_private(r_[0]),
+ simde_poly64x2_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_p64
+ #define vld2q_p64(a) simde_vld2q_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x2_t
+simde_vld2_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld2_bf16(ptr);
+ #else
+ simde_bfloat16x4_private r_[2];
+
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+
+ simde_bfloat16x4x2_t r = { {
+ simde_bfloat16x4_from_private(r_[0]),
+ simde_bfloat16x4_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_bf16
+ #define vld2_bf16(a) simde_vld2_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x2_t
+simde_vld2q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld2q_bf16(ptr);
+ #else
+ simde_bfloat16x8_private r_[2];
+
+ for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+
+ simde_bfloat16x8x2_t r = { {
+ simde_bfloat16x8_from_private(r_[0]),
+ simde_bfloat16x8_from_private(r_[1]),
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_bf16
+ #define vld2q_bf16(a) simde_vld2q_bf16((a))
+#endif
+
#endif /* !defined(SIMDE_BUG_INTEL_857088) */
SIMDE_END_DECLS_
diff --git a/lib/simd_wrapper/simde/arm/neon/ld2_dup.h b/lib/simd_wrapper/simde/arm/neon/ld2_dup.h
new file mode 100644
index 00000000000..238807ab743
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld2_dup.h
@@ -0,0 +1,612 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD2_DUP_H)
+#define SIMDE_ARM_NEON_LD2_DUP_H
+
+#include "dup_n.h"
+#include "reinterpret.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
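+/* Portable fallback pattern used throughout this file: each vld2_dup or
+ * vld2q_dup intrinsic loads the two adjacent scalars ptr[0] and ptr[1]
+ * and broadcasts them with the matching vdup_n helper, so val[0] holds
+ * copies of ptr[0] and val[1] copies of ptr[1]. */
+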
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x2_t
+simde_vld2_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld2_dup_f16(ptr);
+ #else
+ simde_float16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_f16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_f16
+ #define vld2_dup_f16(a) simde_vld2_dup_f16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x2_t
+simde_vld2_dup_f32(simde_float32 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_f32(ptr);
+ #else
+ simde_float32x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_f32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_f32
+ #define vld2_dup_f32(a) simde_vld2_dup_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x2_t
+simde_vld2_dup_f64(simde_float64 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2_dup_f64(ptr);
+ #else
+ simde_float64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_f64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_f64
+ #define vld2_dup_f64(a) simde_vld2_dup_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x2_t
+simde_vld2_dup_s8(int8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_s8(ptr);
+ #else
+ simde_int8x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_s8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_s8
+ #define vld2_dup_s8(a) simde_vld2_dup_s8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x2_t
+simde_vld2_dup_s16(int16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_s16(ptr);
+ #else
+ simde_int16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_s16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_s16
+ #define vld2_dup_s16(a) simde_vld2_dup_s16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x2_t
+simde_vld2_dup_s32(int32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_s32(ptr);
+ #else
+ simde_int32x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_s32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_s32
+ #define vld2_dup_s32(a) simde_vld2_dup_s32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x2_t
+simde_vld2_dup_s64(int64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_s64(ptr);
+ #else
+ simde_int64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_s64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_s64
+ #define vld2_dup_s64(a) simde_vld2_dup_s64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x2_t
+simde_vld2_dup_u8(uint8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_u8(ptr);
+ #else
+ simde_uint8x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_u8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_u8
+ #define vld2_dup_u8(a) simde_vld2_dup_u8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x2_t
+simde_vld2_dup_u16(uint16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_u16(ptr);
+ #else
+ simde_uint16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_u16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_u16
+ #define vld2_dup_u16(a) simde_vld2_dup_u16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x2_t
+simde_vld2_dup_u32(uint32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_u32(ptr);
+ #else
+ simde_uint32x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_u32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_u32
+ #define vld2_dup_u32(a) simde_vld2_dup_u32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x2_t
+simde_vld2_dup_u64(uint64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_u64(ptr);
+ #else
+ simde_uint64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_u64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_u64
+ #define vld2_dup_u64(a) simde_vld2_dup_u64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x2_t
+simde_vld2q_dup_f16(simde_float16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld2q_dup_f16(ptr);
+ #else
+ simde_float16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_f16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_f16
+ #define vld2q_dup_f16(a) simde_vld2q_dup_f16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x2_t
+simde_vld2q_dup_f32(simde_float32 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_f32(ptr);
+ #else
+ simde_float32x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_f32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_f32
+ #define vld2q_dup_f32(a) simde_vld2q_dup_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x2_t
+simde_vld2q_dup_f64(simde_float64 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_f64(ptr);
+ #else
+ simde_float64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_f64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_f64
+ #define vld2q_dup_f64(a) simde_vld2q_dup_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x2_t
+simde_vld2q_dup_s8(int8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_s8(ptr);
+ #else
+ simde_int8x16x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_s8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_s8
+ #define vld2q_dup_s8(a) simde_vld2q_dup_s8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x2_t
+simde_vld2q_dup_s16(int16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_s16(ptr);
+ #else
+ simde_int16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_s16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_s16
+ #define vld2q_dup_s16(a) simde_vld2q_dup_s16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x2_t
+simde_vld2q_dup_s32(int32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_s32(ptr);
+ #else
+ simde_int32x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_s32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_s32
+ #define vld2q_dup_s32(a) simde_vld2q_dup_s32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x2_t
+simde_vld2q_dup_s64(int64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_s64(ptr);
+ #else
+ simde_int64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_s64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_s64
+ #define vld2q_dup_s64(a) simde_vld2q_dup_s64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x2_t
+simde_vld2q_dup_u8(uint8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_u8(ptr);
+ #else
+ simde_uint8x16x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_u8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_u8
+ #define vld2q_dup_u8(a) simde_vld2q_dup_u8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x2_t
+simde_vld2q_dup_u16(uint16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_u16(ptr);
+ #else
+ simde_uint16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_u16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_u16
+ #define vld2q_dup_u16(a) simde_vld2q_dup_u16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x2_t
+simde_vld2q_dup_u32(uint32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_u32(ptr);
+ #else
+ simde_uint32x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_u32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_u32
+ #define vld2q_dup_u32(a) simde_vld2q_dup_u32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x2_t
+simde_vld2q_dup_u64(uint64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_u64(ptr);
+ #else
+ simde_uint64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_u64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_u64
+ #define vld2q_dup_u64(a) simde_vld2q_dup_u64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x2_t
+simde_vld2_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_p8(ptr);
+ #else
+ simde_poly8x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_p8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_p8
+ #define vld2_dup_p8(a) simde_vld2_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x2_t
+simde_vld2_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld2_dup_p16(ptr);
+ #else
+ simde_poly16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_p16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_p16
+ #define vld2_dup_p16(a) simde_vld2_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x2_t
+simde_vld2_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld2_dup_p64(ptr);
+ #else
+ simde_poly64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_p64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_p64
+ #define vld2_dup_p64(a) simde_vld2_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x2_t
+simde_vld2q_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && \
+ !defined(SIMDE_BUG_CLANG_71763)
+ return vld2q_dup_p8(ptr);
+ #else
+ simde_poly8x16x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_p8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_p8
+ #define vld2q_dup_p8(a) simde_vld2q_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x2_t
+simde_vld2q_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && \
+ !defined(SIMDE_BUG_CLANG_71763)
+ return vld2q_dup_p16(ptr);
+ #else
+ simde_poly16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_p16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_p16
+ #define vld2q_dup_p16(a) simde_vld2q_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x2_t
+simde_vld2q_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld2q_dup_p64(ptr);
+ #else
+ simde_poly64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_p64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_p64
+ #define vld2q_dup_p64(a) simde_vld2q_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x2_t
+simde_vld2_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld2_dup_bf16(ptr);
+ #else
+ simde_bfloat16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdup_n_bf16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_dup_bf16
+ #define vld2_dup_bf16(a) simde_vld2_dup_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x2_t
+simde_vld2q_dup_bf16(simde_bfloat16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld2q_dup_bf16(ptr);
+ #else
+ simde_bfloat16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ r.val[i] = simde_vdupq_n_bf16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_dup_bf16
+ #define vld2q_dup_bf16(a) simde_vld2q_dup_bf16((a))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD2_DUP_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld2_lane.h b/lib/simd_wrapper/simde/arm/neon/ld2_lane.h
new file mode 100644
index 00000000000..81b29dd2005
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld2_lane.h
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD2_LANE_H)
+#define SIMDE_ARM_NEON_LD2_LANE_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
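+/* Fallback pattern used throughout this file: copy the two source
+ * vectors, then overwrite lane `lane` of val[i] with ptr[i]. The lane
+ * index must be a compile-time constant within the range enforced by
+ * SIMDE_REQUIRE_CONSTANT_RANGE. */
+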
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x2_t simde_vld2_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int8x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int8x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int8x8_private tmp_ = simde_int8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int8x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_s8(ptr, src, lane) vld2_lane_s8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_s8
+ #define vld2_lane_s8(ptr, src, lane) simde_vld2_lane_s8((ptr), (src), (lane))
+#endif
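+/* Usage sketch (hypothetical values): with int8_t buf[2] = { 10, 20 } and
+ * an existing pair src, simde_vld2_lane_s8(buf, src, 3) yields src with
+ * lane 3 of val[0] replaced by 10 and lane 3 of val[1] replaced by 20. */
+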
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x2_t simde_vld2_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int16x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int16x4_private tmp_ = simde_int16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_s16(ptr, src, lane) vld2_lane_s16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_s16
+ #define vld2_lane_s16(ptr, src, lane) simde_vld2_lane_s16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x2_t simde_vld2_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int32x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int32x2_private tmp_ = simde_int32x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int32x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_s32(ptr, src, lane) vld2_lane_s32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_s32
+ #define vld2_lane_s32(ptr, src, lane) simde_vld2_lane_s32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x2_t simde_vld2_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x1x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_int64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int64x1_private tmp_ = simde_int64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2_lane_s64(ptr, src, lane) vld2_lane_s64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_s64
+ #define vld2_lane_s64(ptr, src, lane) simde_vld2_lane_s64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x2_t simde_vld2_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint8x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_uint8x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint8x8_private tmp_ = simde_uint8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint8x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_u8(ptr, src, lane) vld2_lane_u8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_u8
+ #define vld2_lane_u8(ptr, src, lane) simde_vld2_lane_u8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x2_t simde_vld2_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint16x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_uint16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint16x4_private tmp_ = simde_uint16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_u16(ptr, src, lane) vld2_lane_u16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_u16
+ #define vld2_lane_u16(ptr, src, lane) simde_vld2_lane_u16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x2_t simde_vld2_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_uint32x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint32x2_private tmp_ = simde_uint32x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint32x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_u32(ptr, src, lane) vld2_lane_u32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_u32
+ #define vld2_lane_u32(ptr, src, lane) simde_vld2_lane_u32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x2_t simde_vld2_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_uint64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint64x1_private tmp_ = simde_uint64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2_lane_u64(ptr, src, lane) vld2_lane_u64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_u64
+ #define vld2_lane_u64(ptr, src, lane) simde_vld2_lane_u64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x2_t simde_vld2_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vld2_lane_f16(ptr, src, lane) vld2_lane_f16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_f16
+ #define vld2_lane_f16(ptr, src, lane) simde_vld2_lane_f16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x2_t simde_vld2_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float32x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_float32x2_private tmp_ = simde_float32x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float32x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_f32(ptr, src, lane) vld2_lane_f32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_f32
+ #define vld2_lane_f32(ptr, src, lane) simde_vld2_lane_f32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x2_t simde_vld2_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x1x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_float64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_float64x1_private tmp_ = simde_float64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2_lane_f64(ptr, src, lane) vld2_lane_f64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_f64
+ #define vld2_lane_f64(ptr, src, lane) simde_vld2_lane_f64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x2_t simde_vld2q_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int8x16x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_int8x16x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int8x16_private tmp_ = simde_int8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int8x16_from_private(tmp_);
+ }
+ return r;
+}
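+/* The 8-bit q-form and all 64-bit lane loads have no AArch32 intrinsic,
+ * hence the A64V8-only native paths. */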
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_s8(ptr, src, lane) vld2q_lane_s8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_s8
+ #define vld2q_lane_s8(ptr, src, lane) simde_vld2q_lane_s8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x2_t simde_vld2q_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int16x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int16x8_private tmp_ = simde_int16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_s16
+ #define vld2q_lane_s16(ptr, src, lane) simde_vld2q_lane_s16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x2_t simde_vld2q_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int32x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int32x4_private tmp_ = simde_int32x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int32x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_s32
+ #define vld2q_lane_s32(ptr, src, lane) simde_vld2q_lane_s32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x2_t simde_vld2q_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_int64x2_private tmp_ = simde_int64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_s64(ptr, src, lane) vld2q_lane_s64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_s64
+ #define vld2q_lane_s64(ptr, src, lane) simde_vld2q_lane_s64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x2_t simde_vld2q_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint8x16x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_uint8x16x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint8x16_private tmp_ = simde_uint8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint8x16_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_u8(ptr, src, lane) vld2q_lane_u8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_u8
+ #define vld2q_lane_u8(ptr, src, lane) simde_vld2q_lane_u8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x2_t simde_vld2q_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint16x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_uint16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint16x8_private tmp_ = simde_uint16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_u16
+ #define vld2q_lane_u16(ptr, src, lane) simde_vld2q_lane_u16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x2_t simde_vld2q_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_uint32x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint32x4_private tmp_ = simde_uint32x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint32x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_u32
+ #define vld2q_lane_u32(ptr, src, lane) simde_vld2q_lane_u32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x2_t simde_vld2q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_uint64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_uint64x2_private tmp_ = simde_uint64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_u64(ptr, src, lane) vld2q_lane_u64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_u64
+ #define vld2q_lane_u64(ptr, src, lane) simde_vld2q_lane_u64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x2_t simde_vld2q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vld2q_lane_f16(ptr, src, lane) vld2q_lane_f16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_f16
+ #define vld2q_lane_f16(ptr, src, lane) simde_vld2q_lane_f16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x2_t simde_vld2q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_float32x4_private tmp_ = simde_float32x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float32x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_f32
+ #define vld2q_lane_f32(ptr, src, lane) simde_vld2q_lane_f32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x2_t simde_vld2q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_float64x2_private tmp_ = simde_float64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_f64(ptr, src, lane) vld2q_lane_f64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_f64
+ #define vld2q_lane_f64(ptr, src, lane) simde_vld2q_lane_f64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x2_t simde_vld2_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly8x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly8x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_p8(ptr, src, lane) vld2_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_p8
+ #define vld2_lane_p8(ptr, src, lane) simde_vld2_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x2_t simde_vld2_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_poly16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2_lane_p16(ptr, src, lane) vld2_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_p16
+ #define vld2_lane_p16(ptr, src, lane) simde_vld2_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x2_t simde_vld2_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_poly64x1x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2_lane_p64(ptr, src, lane) vld2_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_p64
+ #define vld2_lane_p64(ptr, src, lane) simde_vld2_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x2_t simde_vld2q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x16x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_poly8x16x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly8x16_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_p8(ptr, src, lane) vld2q_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_p8
+ #define vld2q_lane_p8(ptr, src, lane) simde_vld2q_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x2_t simde_vld2q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld2q_lane_p16(ptr, src, lane) vld2q_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_p16
+ #define vld2q_lane_p16(ptr, src, lane) simde_vld2q_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x2_t simde_vld2q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_poly64x2x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld2q_lane_p64(ptr, src, lane) vld2q_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_p64
+ #define vld2q_lane_p64(ptr, src, lane) simde_vld2q_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x2_t simde_vld2_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x4x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_bfloat16x4x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_bfloat16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld2_lane_bf16(ptr, src, lane) vld2_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2_lane_bf16
+ #define vld2_lane_bf16(ptr, src, lane) simde_vld2_lane_bf16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x2_t simde_vld2q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x8x2_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_bfloat16x8x2_t r;
+
+ for (size_t i = 0 ; i < 2 ; i++) {
+ simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_bfloat16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld2q_lane_bf16(ptr, src, lane) vld2q_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld2q_lane_bf16
+ #define vld2q_lane_bf16(ptr, src, lane) simde_vld2q_lane_bf16((ptr), (src), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD2_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld3.h b/lib/simd_wrapper/simde/arm/neon/ld3.h
index e13eff1dbc1..a60c2aa0d9d 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld3.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld3.h
@@ -23,6 +23,8 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
*/
#if !defined(SIMDE_ARM_NEON_LD3_H)
@@ -40,6 +42,39 @@ SIMDE_BEGIN_DECLS_
#if !defined(SIMDE_BUG_INTEL_857088)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x3_t
+simde_vld3_f16(simde_float16_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld3_f16(ptr);
+ #else
+ simde_float16x4_private r_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
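+      /* vlseg3e16 loads 4 segments of 3 contiguous fp16 values and splits
+       * them across three fields, so r_[k] receives ptr[(3 * j) + k]: the
+       * three-way de-interleave vld3_f16 performs on NEON. */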
+ vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 4);
+ r_[0].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+ simde_float16x4x3_t r = { {
+ simde_float16x4_from_private(r_[0]),
+ simde_float16x4_from_private(r_[1]),
+ simde_float16x4_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_f16
+ #define vld3_f16(a) simde_vld3_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2x3_t
simde_vld3_f32(simde_float32 const *ptr) {
@@ -47,13 +82,18 @@ simde_vld3_f32(simde_float32 const *ptr) {
return vld3_f32(ptr);
#else
simde_float32x2_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 2);
+ r_[0].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_float32x2x3_t r = { {
simde_float32x2_from_private(r_[0]),
simde_float32x2_from_private(r_[1]),
@@ -75,13 +115,18 @@ simde_vld3_f64(simde_float64 const *ptr) {
return vld3_f64(ptr);
#else
simde_float64x1_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 1);
+ r_[0].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_float64x1x3_t r = { {
simde_float64x1_from_private(r_[0]),
simde_float64x1_from_private(r_[1]),
@@ -103,13 +148,18 @@ simde_vld3_s8(int8_t const *ptr) {
return vld3_s8(ptr);
#else
simde_int8x8_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 8);
+ r_[0].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_int8x8x3_t r = { {
simde_int8x8_from_private(r_[0]),
simde_int8x8_from_private(r_[1]),
@@ -131,13 +181,18 @@ simde_vld3_s16(int16_t const *ptr) {
return vld3_s16(ptr);
#else
simde_int16x4_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 4);
+ r_[0].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_int16x4x3_t r = { {
simde_int16x4_from_private(r_[0]),
simde_int16x4_from_private(r_[1]),
@@ -159,13 +214,18 @@ simde_vld3_s32(int32_t const *ptr) {
return vld3_s32(ptr);
#else
simde_int32x2_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 2);
+ r_[0].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_int32x2x3_t r = { {
simde_int32x2_from_private(r_[0]),
simde_int32x2_from_private(r_[1]),
@@ -187,13 +247,18 @@ simde_vld3_s64(int64_t const *ptr) {
return vld3_s64(ptr);
#else
simde_int64x1_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 1);
+ r_[0].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_int64x1x3_t r = { {
simde_int64x1_from_private(r_[0]),
simde_int64x1_from_private(r_[1]),
@@ -203,7 +268,7 @@ simde_vld3_s64(int64_t const *ptr) {
return r;
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vld3_s64
#define vld3_s64(a) simde_vld3_s64((a))
#endif
@@ -215,13 +280,18 @@ simde_vld3_u8(uint8_t const *ptr) {
return vld3_u8(ptr);
#else
simde_uint8x8_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8);
+ r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_uint8x8x3_t r = { {
simde_uint8x8_from_private(r_[0]),
simde_uint8x8_from_private(r_[1]),
@@ -243,13 +313,18 @@ simde_vld3_u16(uint16_t const *ptr) {
return vld3_u16(ptr);
#else
simde_uint16x4_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4);
+ r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_uint16x4x3_t r = { {
simde_uint16x4_from_private(r_[0]),
simde_uint16x4_from_private(r_[1]),
@@ -271,13 +346,18 @@ simde_vld3_u32(uint32_t const *ptr) {
return vld3_u32(ptr);
#else
simde_uint32x2_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 2);
+ r_[0].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_uint32x2x3_t r = { {
simde_uint32x2_from_private(r_[0]),
simde_uint32x2_from_private(r_[1]),
@@ -299,13 +379,18 @@ simde_vld3_u64(uint64_t const *ptr) {
return vld3_u64(ptr);
#else
simde_uint64x1_private r_[3];
-
- for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
- for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
- r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1);
+ r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
}
- }
-
+ #endif
simde_uint64x1x3_t r = { {
simde_uint64x1_from_private(r_[0]),
simde_uint64x1_from_private(r_[1]),
@@ -315,16 +400,61 @@ simde_vld3_u64(uint64_t const *ptr) {
return r;
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vld3_u64
#define vld3_u64(a) simde_vld3_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x3_t
+simde_vld3q_f16(simde_float16_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld3q_f16(ptr);
+ #else
+ simde_float16x8_private r_[3];
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
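+    /* This RVV path additionally needs Zvfh (vector half-precision
+     * floats).  simde_float16_t is not necessarily _Float16, so the
+     * pointer is cast to the native element type the intrinsic takes. */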
+ vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+ simde_float16x8x3_t r = { {
+ simde_float16x8_from_private(r_[0]),
+ simde_float16x8_from_private(r_[1]),
+ simde_float16x8_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_f16
+ #define vld3q_f16(a) simde_vld3q_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4x3_t
simde_vld3q_f32(simde_float32 const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_f32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
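+    /* Q-register form: the same segmented load with vl = 4 (a full
+     * 128-bit vector of f32), stored through the sv128 members. */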
+ simde_float32x4_private r_[3];
+ vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 4);
+ r_[0].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 2);
+ simde_float32x4x3_t r = { {
+ simde_float32x4_from_private(r_[0]),
+ simde_float32x4_from_private(r_[1]),
+ simde_float32x4_from_private(r_[2])
+ } };
+ return r;
#else
simde_float32x4_private r_[3];
@@ -353,6 +483,18 @@ simde_float64x2x3_t
simde_vld3q_f64(simde_float64 const *ptr) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld3q_f64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_float64x2_private r_[3];
+ vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 2);
+ simde_float64x2x3_t r = { {
+ simde_float64x2_from_private(r_[0]),
+ simde_float64x2_from_private(r_[1]),
+ simde_float64x2_from_private(r_[2])
+ } };
+ return r;
#else
simde_float64x2_private r_[3];
@@ -381,6 +523,18 @@ simde_int8x16x3_t
simde_vld3q_s8(int8_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_s8(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int8x16_private r_[3];
+ vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 16);
+ r_[0].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 2);
+ simde_int8x16x3_t r = { {
+ simde_int8x16_from_private(r_[0]),
+ simde_int8x16_from_private(r_[1]),
+ simde_int8x16_from_private(r_[2])
+ } };
+ return r;
#else
simde_int8x16_private r_[3];
@@ -409,6 +563,18 @@ simde_int16x8x3_t
simde_vld3q_s16(int16_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_s16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int16x8_private r_[3];
+ vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 2);
+ simde_int16x8x3_t r = { {
+ simde_int16x8_from_private(r_[0]),
+ simde_int16x8_from_private(r_[1]),
+ simde_int16x8_from_private(r_[2])
+ } };
+ return r;
#else
simde_int16x8_private r_[3];
@@ -437,6 +603,18 @@ simde_int32x4x3_t
simde_vld3q_s32(int32_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_s32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int32x4_private r_[3];
+ vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 4);
+ r_[0].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 2);
+ simde_int32x4x3_t r = { {
+ simde_int32x4_from_private(r_[0]),
+ simde_int32x4_from_private(r_[1]),
+ simde_int32x4_from_private(r_[2])
+ } };
+ return r;
#else
simde_int32x4_private r_[3];
@@ -465,6 +643,18 @@ simde_int64x2x3_t
simde_vld3q_s64(int64_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld3q_s64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_int64x2_private r_[3];
+ vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 2);
+ simde_int64x2x3_t r = { {
+ simde_int64x2_from_private(r_[0]),
+ simde_int64x2_from_private(r_[1]),
+ simde_int64x2_from_private(r_[2])
+ } };
+ return r;
#else
simde_int64x2_private r_[3];
@@ -494,6 +684,18 @@ simde_uint8x16x3_t
simde_vld3q_u8(uint8_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_u8(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x16_private r_[3];
+ vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16);
+ r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2);
+ simde_uint8x16x3_t r = { {
+ simde_uint8x16_from_private(r_[0]),
+ simde_uint8x16_from_private(r_[1]),
+ simde_uint8x16_from_private(r_[2])
+ } };
+ return r;
#else
simde_uint8x16_private r_[3];
@@ -522,6 +724,18 @@ simde_uint16x8x3_t
simde_vld3q_u16(uint16_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_u16(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x8_private r_[3];
+ vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2);
+ simde_uint16x8x3_t r = { {
+ simde_uint16x8_from_private(r_[0]),
+ simde_uint16x8_from_private(r_[1]),
+ simde_uint16x8_from_private(r_[2])
+ } };
+ return r;
#else
simde_uint16x8_private r_[3];
@@ -550,6 +764,18 @@ simde_uint32x4x3_t
simde_vld3q_u32(uint32_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vld3q_u32(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x4_private r_[3];
+ vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 4);
+ r_[0].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 2);
+ simde_uint32x4x3_t r = { {
+ simde_uint32x4_from_private(r_[0]),
+ simde_uint32x4_from_private(r_[1]),
+ simde_uint32x4_from_private(r_[2])
+ } };
+ return r;
#else
simde_uint32x4_private r_[3];
@@ -578,6 +804,18 @@ simde_uint64x2x3_t
simde_vld3q_u64(uint64_t const *ptr) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vld3q_u64(ptr);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint64x2_private r_[3];
+ vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2);
+ simde_uint64x2x3_t r = { {
+ simde_uint64x2_from_private(r_[0]),
+ simde_uint64x2_from_private(r_[1]),
+ simde_uint64x2_from_private(r_[2])
+ } };
+ return r;
#else
simde_uint64x2_private r_[3];
@@ -601,6 +839,272 @@ simde_vld3q_u64(uint64_t const *ptr) {
#define vld3q_u64(a) simde_vld3q_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x3_t
+simde_vld3_p8(simde_poly8_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_p8(ptr);
+ #else
+ simde_poly8x8_private r_[3];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
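+    /* Polynomial vectors share their storage with the corresponding
+     * unsigned vectors, so the u8 segment load is reused here. */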
+ vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8);
+ r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly8x8x3_t r = { {
+ simde_poly8x8_from_private(r_[0]),
+ simde_poly8x8_from_private(r_[1]),
+ simde_poly8x8_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_p8
+ #define vld3_p8(a) simde_vld3_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x3_t
+simde_vld3_p16(simde_poly16_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_p16(ptr);
+ #else
+ simde_poly16x4_private r_[3];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4);
+ r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly16x4x3_t r = { {
+ simde_poly16x4_from_private(r_[0]),
+ simde_poly16x4_from_private(r_[1]),
+ simde_poly16x4_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_p16
+ #define vld3_p16(a) simde_vld3_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x3_t
+simde_vld3_p64(simde_poly64_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld3_p64(ptr);
+ #else
+ simde_poly64x1_private r_[3];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1);
+ r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0);
+ r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1);
+ r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly64x1x3_t r = { {
+ simde_poly64x1_from_private(r_[0]),
+ simde_poly64x1_from_private(r_[1]),
+ simde_poly64x1_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_p64
+ #define vld3_p64(a) simde_vld3_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x3_t
+simde_vld3q_p8(simde_poly8_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3q_p8(ptr);
+ #else
+ simde_poly8x16_private r_[3];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16);
+ r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly8x16x3_t r = { {
+ simde_poly8x16_from_private(r_[0]),
+ simde_poly8x16_from_private(r_[1]),
+ simde_poly8x16_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_p8
+ #define vld3q_p8(a) simde_vld3q_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x3_t
+simde_vld3q_p16(simde_poly16_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3q_p16(ptr);
+ #else
+ simde_poly16x8_private r_[3];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8);
+ r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly16x8x3_t r = { {
+ simde_poly16x8_from_private(r_[0]),
+ simde_poly16x8_from_private(r_[1]),
+ simde_poly16x8_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_p16
+ #define vld3q_p16(a) simde_vld3q_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x3_t
+simde_vld3q_p64(simde_poly64_t const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_p64(ptr);
+ #else
+ simde_poly64x2_private r_[3];
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2);
+ r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2);
+ #else
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+ #endif
+
+ simde_poly64x2x3_t r = { {
+ simde_poly64x2_from_private(r_[0]),
+ simde_poly64x2_from_private(r_[1]),
+ simde_poly64x2_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_p64
+ #define vld3q_p64(a) simde_vld3q_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x3_t
+simde_vld3_bf16(simde_bfloat16 const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld3_bf16(ptr);
+ #else
+ simde_bfloat16x4_private r_[3];
+
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+
+ simde_bfloat16x4x3_t r = { {
+ simde_bfloat16x4_from_private(r_[0]),
+ simde_bfloat16x4_from_private(r_[1]),
+ simde_bfloat16x4_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_bf16
+ #define vld3_bf16(a) simde_vld3_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x3_t
+simde_vld3q_bf16(simde_bfloat16 const *ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld3q_bf16(ptr);
+ #else
+ simde_bfloat16x8_private r_[3];
+
+ for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) {
+ for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) {
+ r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))];
+ }
+ }
+
+ simde_bfloat16x8x3_t r = { {
+ simde_bfloat16x8_from_private(r_[0]),
+ simde_bfloat16x8_from_private(r_[1]),
+ simde_bfloat16x8_from_private(r_[2])
+ } };
+
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_bf16
+ #define vld3q_bf16(a) simde_vld3q_bf16((a))
+#endif
+
#endif /* !defined(SIMDE_BUG_INTEL_857088) */
SIMDE_END_DECLS_
diff --git a/lib/simd_wrapper/simde/arm/neon/ld3_dup.h b/lib/simd_wrapper/simde/arm/neon/ld3_dup.h
new file mode 100644
index 00000000000..25f133b694a
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld3_dup.h
@@ -0,0 +1,610 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD3_DUP_H)
+#define SIMDE_ARM_NEON_LD3_DUP_H
+
+#include "dup_n.h"
+#include "reinterpret.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
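+/* The vld3*_dup family (LD3R) loads one 3-element structure from ptr and
+ * broadcasts element i to every lane of result vector i.  The portable
+ * fallbacks below build each result with the matching vdup_n. */
+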
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x3_t
+simde_vld3_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld3_dup_f16(ptr);
+ #else
+ simde_float16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_f16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_f16
+ #define vld3_dup_f16(a) simde_vld3_dup_f16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x3_t
+simde_vld3_dup_f32(simde_float32 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_f32(ptr);
+ #else
+ simde_float32x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_f32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_f32
+ #define vld3_dup_f32(a) simde_vld3_dup_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x3_t
+simde_vld3_dup_f64(simde_float64 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3_dup_f64(ptr);
+ #else
+ simde_float64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_f64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_f64
+ #define vld3_dup_f64(a) simde_vld3_dup_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x3_t
+simde_vld3_dup_s8(int8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_s8(ptr);
+ #else
+ simde_int8x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_s8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_s8
+ #define vld3_dup_s8(a) simde_vld3_dup_s8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x3_t
+simde_vld3_dup_s16(int16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_s16(ptr);
+ #else
+ simde_int16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_s16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_s16
+ #define vld3_dup_s16(a) simde_vld3_dup_s16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x3_t
+simde_vld3_dup_s32(int32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_s32(ptr);
+ #else
+ simde_int32x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_s32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_s32
+ #define vld3_dup_s32(a) simde_vld3_dup_s32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x3_t
+simde_vld3_dup_s64(int64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_s64(ptr);
+ #else
+ simde_int64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_s64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_s64
+ #define vld3_dup_s64(a) simde_vld3_dup_s64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x3_t
+simde_vld3_dup_u8(uint8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_u8(ptr);
+ #else
+ simde_uint8x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_u8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_u8
+ #define vld3_dup_u8(a) simde_vld3_dup_u8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x3_t
+simde_vld3_dup_u16(uint16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_u16(ptr);
+ #else
+ simde_uint16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_u16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_u16
+ #define vld3_dup_u16(a) simde_vld3_dup_u16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x3_t
+simde_vld3_dup_u32(uint32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_u32(ptr);
+ #else
+ simde_uint32x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_u32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_u32
+ #define vld3_dup_u32(a) simde_vld3_dup_u32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x3_t
+simde_vld3_dup_u64(uint64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld3_dup_u64(ptr);
+ #else
+ simde_uint64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_u64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_u64
+ #define vld3_dup_u64(a) simde_vld3_dup_u64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x3_t
+simde_vld3q_dup_f16(simde_float16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld3q_dup_f16(ptr);
+ #else
+ simde_float16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_f16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_f16
+ #define vld3q_dup_f16(a) simde_vld3q_dup_f16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x3_t
+simde_vld3q_dup_f32(simde_float32 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_f32(ptr);
+ #else
+ simde_float32x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_f32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_f32
+ #define vld3q_dup_f32(a) simde_vld3q_dup_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x3_t
+simde_vld3q_dup_f64(simde_float64 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_f64(ptr);
+ #else
+ simde_float64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_f64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_f64
+ #define vld3q_dup_f64(a) simde_vld3q_dup_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x3_t
+simde_vld3q_dup_s8(int8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_s8(ptr);
+ #else
+ simde_int8x16x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_s8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_s8
+ #define vld3q_dup_s8(a) simde_vld3q_dup_s8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x3_t
+simde_vld3q_dup_s16(int16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_s16(ptr);
+ #else
+ simde_int16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_s16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_s16
+ #define vld3q_dup_s16(a) simde_vld3q_dup_s16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x3_t
+simde_vld3q_dup_s32(int32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_s32(ptr);
+ #else
+ simde_int32x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_s32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_s32
+ #define vld3q_dup_s32(a) simde_vld3q_dup_s32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x3_t
+simde_vld3q_dup_s64(int64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_s64(ptr);
+ #else
+ simde_int64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_s64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_s64
+ #define vld3q_dup_s64(a) simde_vld3q_dup_s64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x3_t
+simde_vld3q_dup_u8(uint8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_u8(ptr);
+ #else
+ simde_uint8x16x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_u8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_u8
+ #define vld3q_dup_u8(a) simde_vld3q_dup_u8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x3_t
+simde_vld3q_dup_u16(uint16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_u16(ptr);
+ #else
+ simde_uint16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_u16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_u16
+ #define vld3q_dup_u16(a) simde_vld3q_dup_u16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x3_t
+simde_vld3q_dup_u32(uint32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_u32(ptr);
+ #else
+ simde_uint32x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_u32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_u32
+ #define vld3q_dup_u32(a) simde_vld3q_dup_u32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x3_t
+simde_vld3q_dup_u64(uint64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_u64(ptr);
+ #else
+ simde_uint64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_u64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_u64
+ #define vld3q_dup_u64(a) simde_vld3q_dup_u64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x3_t
+simde_vld3_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld3_dup_p8(ptr);
+ #else
+ simde_poly8x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_p8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_p8
+ #define vld3_dup_p8(a) simde_vld3_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x3_t
+simde_vld3_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld3_dup_p16(ptr);
+ #else
+ simde_poly16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_p16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_p16
+ #define vld3_dup_p16(a) simde_vld3_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x3_t
+simde_vld3_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld3_dup_p64(ptr);
+ #else
+ simde_poly64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_p64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_p64
+ #define vld3_dup_p64(a) simde_vld3_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x3_t
+simde_vld3q_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld3q_dup_p8(ptr);
+ #else
+ simde_poly8x16x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_p8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_p8
+ #define vld3q_dup_p8(a) simde_vld3q_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x3_t
+simde_vld3q_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld3q_dup_p16(ptr);
+ #else
+ simde_poly16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_p16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_p16
+ #define vld3q_dup_p16(a) simde_vld3q_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x3_t
+simde_vld3q_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld3q_dup_p64(ptr);
+ #else
+ simde_poly64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_p64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_p64
+ #define vld3q_dup_p64(a) simde_vld3q_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x3_t
+simde_vld3_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld3_dup_bf16(ptr);
+ #else
+ simde_bfloat16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdup_n_bf16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_dup_bf16
+ #define vld3_dup_bf16(a) simde_vld3_dup_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x3_t
+simde_vld3q_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld3q_dup_bf16(ptr);
+ #else
+ simde_bfloat16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ r.val[i] = simde_vdupq_n_bf16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_dup_bf16
+ #define vld3q_dup_bf16(a) simde_vld3q_dup_bf16((a))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD3_DUP_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld3_lane.h b/lib/simd_wrapper/simde/arm/neon/ld3_lane.h
new file mode 100644
index 00000000000..4950792a8a6
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld3_lane.h
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD3_LANE_H)
+#define SIMDE_ARM_NEON_LD3_LANE_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
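+/* The vld3*_lane family loads one 3-element structure from ptr and inserts
+ * element i into lane `lane` of src.val[i]; all other lanes are carried
+ * over from src unchanged. */
+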
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x3_t simde_vld3_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int8x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int8x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int8x8_private tmp_ = simde_int8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int8x8_from_private(tmp_);
+ }
+ return r;
+}
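+/* When the real NEON intrinsic is available, dispatch through a macro
+ * rather than a wrapper function so `lane` stays a constant expression,
+ * as the underlying instruction encodes it as an immediate. */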
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_s8(ptr, src, lane) vld3_lane_s8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_s8
+ #define vld3_lane_s8(ptr, src, lane) simde_vld3_lane_s8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x3_t simde_vld3_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int16x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int16x4_private tmp_ = simde_int16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_s16(ptr, src, lane) vld3_lane_s16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_s16
+ #define vld3_lane_s16(ptr, src, lane) simde_vld3_lane_s16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x3_t simde_vld3_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int32x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int32x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int32x2_private tmp_ = simde_int32x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int32x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_s32(ptr, src, lane) vld3_lane_s32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_s32
+ #define vld3_lane_s32(ptr, src, lane) simde_vld3_lane_s32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x3_t simde_vld3_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_int64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int64x1_private tmp_ = simde_int64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3_lane_s64(ptr, src, lane) vld3_lane_s64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_s64
+ #define vld3_lane_s64(ptr, src, lane) simde_vld3_lane_s64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x3_t simde_vld3_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint8x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_uint8x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint8x8_private tmp_ = simde_uint8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint8x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_u8(ptr, src, lane) vld3_lane_u8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_u8
+ #define vld3_lane_u8(ptr, src, lane) simde_vld3_lane_u8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x3_t simde_vld3_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint16x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_uint16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint16x4_private tmp_ = simde_uint16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_u16(ptr, src, lane) vld3_lane_u16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_u16
+ #define vld3_lane_u16(ptr, src, lane) simde_vld3_lane_u16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x3_t simde_vld3_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint32x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_uint32x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint32x2_private tmp_ = simde_uint32x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint32x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_u32(ptr, src, lane) vld3_lane_u32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_u32
+ #define vld3_lane_u32(ptr, src, lane) simde_vld3_lane_u32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x3_t simde_vld3_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_uint64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint64x1_private tmp_ = simde_uint64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3_lane_u64(ptr, src, lane) vld3_lane_u64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_u64
+ #define vld3_lane_u64(ptr, src, lane) simde_vld3_lane_u64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x3_t simde_vld3_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vld3_lane_f16(ptr, src, lane) vld3_lane_f16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_f16
+ #define vld3_lane_f16(ptr, src, lane) simde_vld3_lane_f16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x3_t simde_vld3_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float32x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_float32x2_private tmp_ = simde_float32x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float32x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_f32(ptr, src, lane) vld3_lane_f32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_f32
+ #define vld3_lane_f32(ptr, src, lane) simde_vld3_lane_f32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x3_t simde_vld3_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_float64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_float64x1_private tmp_ = simde_float64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3_lane_f64(ptr, src, lane) vld3_lane_f64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_f64
+ #define vld3_lane_f64(ptr, src, lane) simde_vld3_lane_f64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x3_t simde_vld3q_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int8x16x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_int8x16x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int8x16_private tmp_ = simde_int8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int8x16_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_s8(ptr, src, lane) vld3q_lane_s8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_s8
+ #define vld3q_lane_s8(ptr, src, lane) simde_vld3q_lane_s8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x3_t simde_vld3q_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int16x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int16x8_private tmp_ = simde_int16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_s16
+ #define vld3q_lane_s16(ptr, src, lane) simde_vld3q_lane_s16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x3_t simde_vld3q_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int32x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int32x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int32x4_private tmp_ = simde_int32x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int32x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_s32
+ #define vld3q_lane_s32(ptr, src, lane) simde_vld3q_lane_s32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x3_t simde_vld3q_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_int64x2_private tmp_ = simde_int64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_int64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_s64(ptr, src, lane) vld3q_lane_s64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_s64
+ #define vld3q_lane_s64(ptr, src, lane) simde_vld3q_lane_s64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x3_t simde_vld3q_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint8x16x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_uint8x16x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint8x16_private tmp_ = simde_uint8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint8x16_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_u8(ptr, src, lane) vld3q_lane_u8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_u8
+ #define vld3q_lane_u8(ptr, src, lane) simde_vld3q_lane_u8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x3_t simde_vld3q_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint16x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_uint16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint16x8_private tmp_ = simde_uint16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_u16
+ #define vld3q_lane_u16(ptr, src, lane) simde_vld3q_lane_u16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x3_t simde_vld3q_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint32x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_uint32x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint32x4_private tmp_ = simde_uint32x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint32x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_u32
+ #define vld3q_lane_u32(ptr, src, lane) simde_vld3q_lane_u32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x3_t simde_vld3q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_uint64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_uint64x2_private tmp_ = simde_uint64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_uint64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_u64(ptr, src, lane) vld3q_lane_u64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_u64
+ #define vld3q_lane_u64(ptr, src, lane) simde_vld3q_lane_u64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x3_t simde_vld3q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_f16
+ #define vld3q_lane_f16(ptr, src, lane) simde_vld3q_lane_f16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x3_t simde_vld3q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_float32x4_private tmp_ = simde_float32x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float32x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_f32
+ #define vld3q_lane_f32(ptr, src, lane) simde_vld3q_lane_f32((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x3_t simde_vld3q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_float64x2_private tmp_ = simde_float64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_f64(ptr, src, lane) vld3q_lane_f64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_f64
+ #define vld3q_lane_f64(ptr, src, lane) simde_vld3q_lane_f64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x3_t simde_vld3_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly8x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly8x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_p8(ptr, src, lane) vld3_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_p8
+ #define vld3_lane_p8(ptr, src, lane) simde_vld3_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x3_t simde_vld3_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_poly16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3_lane_p16(ptr, src, lane) vld3_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_p16
+ #define vld3_lane_p16(ptr, src, lane) simde_vld3_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x3_t simde_vld3_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_poly64x1x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly64x1_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3_lane_p64(ptr, src, lane) vld3_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_p64
+ #define vld3_lane_p64(ptr, src, lane) simde_vld3_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x3_t simde_vld3q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x16x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_poly8x16x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly8x16_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_p8(ptr, src, lane) vld3q_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_p8
+ #define vld3q_lane_p8(ptr, src, lane) simde_vld3q_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x3_t simde_vld3q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld3q_lane_p16(ptr, src, lane) vld3q_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_p16
+ #define vld3q_lane_p16(ptr, src, lane) simde_vld3q_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x3_t simde_vld3q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x2x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_poly64x2x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly64x2_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld3q_lane_p64(ptr, src, lane) vld3q_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_p64
+ #define vld3q_lane_p64(ptr, src, lane) simde_vld3q_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x3_t simde_vld3_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x4x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_bfloat16x4x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_bfloat16x4_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld3_lane_bf16(ptr, src, lane) vld3_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3_lane_bf16
+ #define vld3_lane_bf16(ptr, src, lane) simde_vld3_lane_bf16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x3_t simde_vld3q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x8x3_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_bfloat16x8x3_t r;
+
+ for (size_t i = 0 ; i < 3 ; i++) {
+ simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_bfloat16x8_from_private(tmp_);
+ }
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld3q_lane_bf16(ptr, src, lane) vld3q_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld3q_lane_bf16
+ #define vld3q_lane_bf16(ptr, src, lane) simde_vld3q_lane_bf16((ptr), (src), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD3_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld4.h b/lib/simd_wrapper/simde/arm/neon/ld4.h
index b936182485f..777c24f73c4 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld4.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld4.h
@@ -23,6 +23,8 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab)
*/
#if !defined(SIMDE_ARM_NEON_LD4_H)
@@ -39,6 +41,34 @@ SIMDE_BEGIN_DECLS_
#if !defined(SIMDE_BUG_INTEL_857088)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x4_t
+simde_vld4_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld4_f16(ptr);
+ #else
+ simde_float16x4_private a_[4];
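+    /* RVV segment loads deinterleave structure-of-four data in one
+     * operation: vlseg4e16 with vl = 4 fills four f16m1 registers,
+     * register k receiving ptr[4*j + k] at index j, which matches the
+     * scalar fallback's a_[i % 4].values[i / 4] = ptr[i]. */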
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
+    vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((const _Float16 *)&ptr[0], 4);
+ a_[0].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_float16x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]),
+ simde_float16x4_from_private(a_[2]), simde_float16x4_from_private(a_[3]) } };
+    return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_f16
+ #define vld4_f16(a) simde_vld4_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2x4_t
simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
@@ -46,9 +76,17 @@ simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld4_f32(ptr);
#else
simde_float32x2_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 2);
+ a_[0].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]),
simde_float32x2_from_private(a_[2]), simde_float32x2_from_private(a_[3]) } };
return (s_);
@@ -66,9 +104,17 @@ simde_vld4_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) {
return vld4_f64(ptr);
#else
simde_float64x1_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 1);
+ a_[0].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]),
simde_float64x1_from_private(a_[2]), simde_float64x1_from_private(a_[3]) } };
return s_;
@@ -86,9 +132,17 @@ simde_vld4_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
return vld4_s8(ptr);
#else
simde_int8x8_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 8);
+ a_[0].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]),
simde_int8x8_from_private(a_[2]), simde_int8x8_from_private(a_[3]) } };
return s_;
@@ -106,9 +160,17 @@ simde_vld4_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
return vld4_s16(ptr);
#else
simde_int16x4_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 4);
+ a_[0].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]),
simde_int16x4_from_private(a_[2]), simde_int16x4_from_private(a_[3]) } };
return s_;
@@ -126,9 +188,17 @@ simde_vld4_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld4_s32(ptr);
#else
simde_int32x2_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 2);
+ a_[0].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]),
simde_int32x2_from_private(a_[2]), simde_int32x2_from_private(a_[3]) } };
return s_;
@@ -146,15 +216,23 @@ simde_vld4_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
return vld4_s64(ptr);
#else
simde_int64x1_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 1);
+ a_[0].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]),
simde_int64x1_from_private(a_[2]), simde_int64x1_from_private(a_[3]) } };
return s_;
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vld4_s64
#define vld4_s64(a) simde_vld4_s64((a))
#endif
@@ -166,9 +244,17 @@ simde_vld4_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
return vld4_u8(ptr);
#else
simde_uint8x8_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8);
+ a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]),
simde_uint8x8_from_private(a_[2]), simde_uint8x8_from_private(a_[3]) } };
return s_;
@@ -186,9 +272,17 @@ simde_vld4_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
return vld4_u16(ptr);
#else
simde_uint16x4_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4);
+ a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]),
simde_uint16x4_from_private(a_[2]), simde_uint16x4_from_private(a_[3]) } };
return s_;
@@ -206,9 +300,17 @@ simde_vld4_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld4_u32(ptr);
#else
simde_uint32x2_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 2);
+ a_[0].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]),
simde_uint32x2_from_private(a_[2]), simde_uint32x2_from_private(a_[3]) } };
return s_;
@@ -226,19 +328,55 @@ simde_vld4_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
return vld4_u64(ptr);
#else
simde_uint64x1_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1);
+ a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]),
simde_uint64x1_from_private(a_[2]), simde_uint64x1_from_private(a_[3]) } };
return s_;
#endif
}
-#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
#undef vld4_u64
#define vld4_u64(a) simde_vld4_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x4_t
+simde_vld4q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld4q_f16(ptr);
+ #else
+ simde_float16x8_private a_[4];
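+    /* Same RVV segment-load pattern as simde_vld4_f16, but the q-form
+     * loads eight segments (vl = 8) and writes the 128-bit views. */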
+ #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128)
+    vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((const _Float16 *)&ptr[0], 8);
+ a_[0].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_float16x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]),
+ simde_float16x8_from_private(a_[2]), simde_float16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_f16
+ #define vld4q_f16(a) simde_vld4q_f16((a))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4x4_t
simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) {
@@ -246,9 +384,17 @@ simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) {
return vld4q_f32(ptr);
#else
simde_float32x4_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 4);
+ a_[0].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), simde_float32x4_from_private(a_[1]),
simde_float32x4_from_private(a_[2]), simde_float32x4_from_private(a_[3]) } };
return s_;
@@ -266,9 +412,17 @@ simde_vld4q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld4q_f64(ptr);
#else
simde_float64x2_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 2);
+ a_[0].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]),
simde_float64x2_from_private(a_[2]), simde_float64x2_from_private(a_[3]) } };
return s_;
@@ -286,9 +440,17 @@ simde_vld4q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
return vld4q_s8(ptr);
#else
simde_int8x16_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 16);
+ a_[0].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]),
simde_int8x16_from_private(a_[2]), simde_int8x16_from_private(a_[3]) } };
return s_;
@@ -306,9 +468,17 @@ simde_vld4q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
return vld4q_s16(ptr);
#else
simde_int16x8_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 8);
+ a_[0].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]),
simde_int16x8_from_private(a_[2]), simde_int16x8_from_private(a_[3]) } };
return s_;
@@ -326,9 +496,17 @@ simde_vld4q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
return vld4q_s32(ptr);
#else
simde_int32x4_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 4);
+ a_[0].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]),
simde_int32x4_from_private(a_[2]), simde_int32x4_from_private(a_[3]) } };
return s_;
@@ -346,9 +524,17 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld4q_s64(ptr);
#else
simde_int64x2_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 2);
+ a_[0].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]),
simde_int64x2_from_private(a_[2]), simde_int64x2_from_private(a_[3]) } };
return s_;
@@ -358,7 +544,6 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#undef vld4q_s64
#define vld4q_s64(a) simde_vld4q_s64((a))
#endif
-
SIMDE_FUNCTION_ATTRIBUTES
simde_uint8x16x4_t
simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
@@ -403,6 +588,20 @@ simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
simde_uint8x16_from_private(r_[2]),
simde_uint8x16_from_private(r_[3])}};
return s_;
+ #elif defined(SIMDE_RISCV_V_NATIVE)
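+    /* Unlike the other functions in this file, vld4q_u8 already has a
+     * wide-vector implementation above, so the RVV path is added as a
+     * further #elif branch ahead of the scalar fallback. */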
+ simde_uint8x16_private r_[4];
+ vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16);
+ r_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0);
+ r_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1);
+ r_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2);
+ r_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3);
+ simde_uint8x16x4_t r = { {
+ simde_uint8x16_from_private(r_[0]),
+ simde_uint8x16_from_private(r_[1]),
+ simde_uint8x16_from_private(r_[2]),
+ simde_uint8x16_from_private(r_[3])
+ } };
+ return r;
#else
simde_uint8x16_private a_[4];
for (size_t i = 0; i < (sizeof(simde_uint8x16_t) / sizeof(*ptr)) * 4 ; i++) {
@@ -425,9 +624,17 @@ simde_vld4q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
return vld4q_u16(ptr);
#else
simde_uint16x8_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8);
+ a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]),
simde_uint16x8_from_private(a_[2]), simde_uint16x8_from_private(a_[3]) } };
return s_;
@@ -445,9 +652,17 @@ simde_vld4q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
return vld4q_u32(ptr);
#else
simde_uint32x4_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 4);
+ a_[0].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]),
simde_uint32x4_from_private(a_[2]), simde_uint32x4_from_private(a_[3]) } };
return s_;
@@ -465,9 +680,17 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
return vld4q_u64(ptr);
#else
simde_uint64x2_private a_[4];
- for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) {
- a_[i % 4].values[i / 4] = ptr[i];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2);
+ a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]),
simde_uint64x2_from_private(a_[2]), simde_uint64x2_from_private(a_[3]) } };
return s_;
@@ -478,6 +701,214 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
#define vld4q_u64(a) simde_vld4q_u64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x4_t
+simde_vld4_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_p8(ptr);
+ #else
+ simde_poly8x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8);
+ a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_poly8x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]),
+ simde_poly8x8_from_private(a_[2]), simde_poly8x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_p8
+ #define vld4_p8(a) simde_vld4_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x4_t
+simde_vld4_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_p16(ptr);
+ #else
+ simde_poly16x4_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4);
+ a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_poly16x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]),
+ simde_poly16x4_from_private(a_[2]), simde_poly16x4_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_p16
+ #define vld4_p16(a) simde_vld4_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x4_t
+simde_vld4_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld4_p64(ptr);
+ #else
+ simde_poly64x1_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1);
+ a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0);
+ a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1);
+ a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2);
+ a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_poly64x1_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]),
+ simde_poly64x1_from_private(a_[2]), simde_poly64x1_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_p64
+ #define vld4_p64(a) simde_vld4_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x4_t
+simde_vld4q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4q_p8(ptr);
+ #else
+ simde_poly8x16_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16);
+ a_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_poly8x16_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]),
+ simde_poly8x16_from_private(a_[2]), simde_poly8x16_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_p8
+ #define vld4q_p8(a) simde_vld4q_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x4_t
+simde_vld4q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4q_p16(ptr);
+ #else
+ simde_poly16x8_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8);
+ a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_poly16x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]),
+ simde_poly16x8_from_private(a_[2]), simde_poly16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_p16
+ #define vld4q_p16(a) simde_vld4q_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x4_t
+simde_vld4q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_p64(ptr);
+ #else
+ simde_poly64x2_private a_[4];
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2);
+ a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0);
+ a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1);
+ a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2);
+ a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3);
+ #else
+ for (size_t i = 0; i < (sizeof(simde_poly64x2_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ #endif
+ simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]),
+ simde_poly64x2_from_private(a_[2]), simde_poly64x2_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_p64
+ #define vld4q_p64(a) simde_vld4q_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x4_t
+simde_vld4_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld4_bf16(ptr);
+ #else
+ simde_bfloat16x4_private a_[4];
+ for (size_t i = 0; i < (sizeof(simde_bfloat16x4_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ simde_bfloat16x4x4_t s_ = { { simde_bfloat16x4_from_private(a_[0]), simde_bfloat16x4_from_private(a_[1]),
+ simde_bfloat16x4_from_private(a_[2]), simde_bfloat16x4_from_private(a_[3]) } };
+    return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_bf16
+ #define vld4_bf16(a) simde_vld4_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x4_t
+simde_vld4q_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld4q_bf16(ptr);
+ #else
+ simde_bfloat16x8_private a_[4];
+ for (size_t i = 0; i < (sizeof(simde_bfloat16x8_t) / sizeof(*ptr)) * 4 ; i++) {
+ a_[i % 4].values[i / 4] = ptr[i];
+ }
+ simde_bfloat16x8x4_t s_ = { { simde_bfloat16x8_from_private(a_[0]), simde_bfloat16x8_from_private(a_[1]),
+ simde_bfloat16x8_from_private(a_[2]), simde_bfloat16x8_from_private(a_[3]) } };
+ return s_;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_bf16
+ #define vld4q_bf16(a) simde_vld4q_bf16((a))
+#endif
+
#endif /* !defined(SIMDE_BUG_INTEL_857088) */
SIMDE_END_DECLS_
diff --git a/lib/simd_wrapper/simde/arm/neon/ld4_dup.h b/lib/simd_wrapper/simde/arm/neon/ld4_dup.h
new file mode 100644
index 00000000000..c2100af147d
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/ld4_dup.h
@@ -0,0 +1,610 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_LD4_DUP_H)
+#define SIMDE_ARM_NEON_LD4_DUP_H
+
+#include "dup_n.h"
+#include "reinterpret.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
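+/* vld4_dup_* loads four consecutive elements and broadcasts element i
+ * across every lane of r.val[i]. The portable fallbacks below express
+ * this directly with the matching vdup_n_* intrinsic, e.g.
+ *
+ *   r.val[i] = simde_vdup_n_f32(ptr[i]);   // i = 0..3
+ */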
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x4_t
+simde_vld4_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld4_dup_f16(ptr);
+ #else
+ simde_float16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_f16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_f16
+ #define vld4_dup_f16(a) simde_vld4_dup_f16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2x4_t
+simde_vld4_dup_f32(simde_float32 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_f32(ptr);
+ #else
+ simde_float32x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_f32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_f32
+ #define vld4_dup_f32(a) simde_vld4_dup_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1x4_t
+simde_vld4_dup_f64(simde_float64 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4_dup_f64(ptr);
+ #else
+ simde_float64x1x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_f64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_f64
+ #define vld4_dup_f64(a) simde_vld4_dup_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x8x4_t
+simde_vld4_dup_s8(int8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_s8(ptr);
+ #else
+ simde_int8x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_s8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_s8
+ #define vld4_dup_s8(a) simde_vld4_dup_s8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4x4_t
+simde_vld4_dup_s16(int16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_s16(ptr);
+ #else
+ simde_int16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_s16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_s16
+ #define vld4_dup_s16(a) simde_vld4_dup_s16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2x4_t
+simde_vld4_dup_s32(int32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_s32(ptr);
+ #else
+ simde_int32x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_s32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_s32
+ #define vld4_dup_s32(a) simde_vld4_dup_s32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x1x4_t
+simde_vld4_dup_s64(int64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_s64(ptr);
+ #else
+ simde_int64x1x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_s64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_s64
+ #define vld4_dup_s64(a) simde_vld4_dup_s64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x8x4_t
+simde_vld4_dup_u8(uint8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_u8(ptr);
+ #else
+ simde_uint8x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_u8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_u8
+ #define vld4_dup_u8(a) simde_vld4_dup_u8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x4x4_t
+simde_vld4_dup_u16(uint16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_u16(ptr);
+ #else
+ simde_uint16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_u16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_u16
+ #define vld4_dup_u16(a) simde_vld4_dup_u16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x2x4_t
+simde_vld4_dup_u32(uint32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_u32(ptr);
+ #else
+ simde_uint32x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_u32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_u32
+ #define vld4_dup_u32(a) simde_vld4_dup_u32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x1x4_t
+simde_vld4_dup_u64(uint64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vld4_dup_u64(ptr);
+ #else
+ simde_uint64x1x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_u64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_u64
+ #define vld4_dup_u64(a) simde_vld4_dup_u64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x4_t
+simde_vld4q_dup_f16(simde_float16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vld4q_dup_f16(ptr);
+ #else
+ simde_float16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_f16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_f16
+ #define vld4q_dup_f16(a) simde_vld4q_dup_f16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4x4_t
+simde_vld4q_dup_f32(simde_float32 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_f32(ptr);
+ #else
+ simde_float32x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_f32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_f32
+ #define vld4q_dup_f32(a) simde_vld4q_dup_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2x4_t
+simde_vld4q_dup_f64(simde_float64 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_f64(ptr);
+ #else
+ simde_float64x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_f64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_f64
+ #define vld4q_dup_f64(a) simde_vld4q_dup_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int8x16x4_t
+simde_vld4q_dup_s8(int8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_s8(ptr);
+ #else
+ simde_int8x16x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_s8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_s8
+ #define vld4q_dup_s8(a) simde_vld4q_dup_s8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8x4_t
+simde_vld4q_dup_s16(int16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_s16(ptr);
+ #else
+ simde_int16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_s16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_s16
+ #define vld4q_dup_s16(a) simde_vld4q_dup_s16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4x4_t
+simde_vld4q_dup_s32(int32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_s32(ptr);
+ #else
+ simde_int32x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_s32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_s32
+ #define vld4q_dup_s32(a) simde_vld4q_dup_s32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2x4_t
+simde_vld4q_dup_s64(int64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_s64(ptr);
+ #else
+ simde_int64x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_s64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_s64
+ #define vld4q_dup_s64(a) simde_vld4q_dup_s64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16x4_t
+simde_vld4q_dup_u8(uint8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_u8(ptr);
+ #else
+ simde_uint8x16x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_u8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_u8
+ #define vld4q_dup_u8(a) simde_vld4q_dup_u8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8x4_t
+simde_vld4q_dup_u16(uint16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_u16(ptr);
+ #else
+ simde_uint16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_u16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_u16
+ #define vld4q_dup_u16(a) simde_vld4q_dup_u16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4x4_t
+simde_vld4q_dup_u32(uint32_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_u32(ptr);
+ #else
+ simde_uint32x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_u32(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_u32
+ #define vld4q_dup_u32(a) simde_vld4q_dup_u32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2x4_t
+simde_vld4q_dup_u64(uint64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_u64(ptr);
+ #else
+ simde_uint64x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_u64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_u64
+ #define vld4q_dup_u64(a) simde_vld4q_dup_u64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x4_t
+simde_vld4_dup_p8(simde_poly8_t const * ptr) {
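+  /* SIMDE_BUG_GCC_95399 marks GCC builds with a known issue (GCC bug
+   * 95399) in these polynomial intrinsics; affected compilers take the
+   * portable fallback instead of the native path. */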
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld4_dup_p8(ptr);
+ #else
+ simde_poly8x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_p8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_p8
+ #define vld4_dup_p8(a) simde_vld4_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x4_t
+simde_vld4_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld4_dup_p16(ptr);
+ #else
+ simde_poly16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_p16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_p16
+ #define vld4_dup_p16(a) simde_vld4_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x4_t
+simde_vld4_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE)
+ return vld4_dup_p64(ptr);
+ #else
+ simde_poly64x1x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_p64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_p64
+ #define vld4_dup_p64(a) simde_vld4_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x4_t
+simde_vld4q_dup_p8(simde_poly8_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld4q_dup_p8(ptr);
+ #else
+ simde_poly8x16x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_p8(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_p8
+ #define vld4q_dup_p8(a) simde_vld4q_dup_p8((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x4_t
+simde_vld4q_dup_p16(simde_poly16_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399)
+ return vld4q_dup_p16(ptr);
+ #else
+ simde_poly16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_p16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_p16
+ #define vld4q_dup_p16(a) simde_vld4q_dup_p16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x4_t
+simde_vld4q_dup_p64(simde_poly64_t const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vld4q_dup_p64(ptr);
+ #else
+ simde_poly64x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_p64(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_p64
+ #define vld4q_dup_p64(a) simde_vld4q_dup_p64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x4_t
+simde_vld4_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld4_dup_bf16(ptr);
+ #else
+ simde_bfloat16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdup_n_bf16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_dup_bf16
+ #define vld4_dup_bf16(a) simde_vld4_dup_bf16((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x4_t
+simde_vld4q_dup_bf16(simde_bfloat16 const * ptr) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ return vld4q_dup_bf16(ptr);
+ #else
+ simde_bfloat16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ r.val[i] = simde_vdupq_n_bf16(ptr[i]);
+ }
+ return r;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_dup_bf16
+ #define vld4q_dup_bf16(a) simde_vld4q_dup_bf16((a))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_LD4_DUP_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/ld4_lane.h b/lib/simd_wrapper/simde/arm/neon/ld4_lane.h
index c525755d2bf..ed8a7e4d2ad 100644
--- a/lib/simd_wrapper/simde/arm/neon/ld4_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/ld4_lane.h
@@ -23,6 +23,7 @@
* Copyright:
* 2021 Zhi An Ng (Copyright owned by Google, LLC)
* 2021 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
/* In older versions of clang, __builtin_neon_vld4_lane_v would
@@ -99,6 +100,7 @@ simde_vld4_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_int16x4x4_t
#define vld4_lane_s16(ptr, src, lane) simde_vld4_lane_s16((ptr), (src), (lane))
#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2x4_t
simde_vld4_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x2x4_t src, const int lane)
@@ -261,6 +263,33 @@ simde_vld4_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_
#define vld4_lane_u64(ptr, src, lane) simde_vld4_lane_u64((ptr), (src), (lane))
#endif
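+/* As the comment at the top of this file notes, older clang versions
+ * route vld4_lane through __builtin_neon_vld4_lane_v and emit a
+ * spurious vector-conversion warning, so for clang < 10 the native
+ * call is wrapped in SIMDE_DISABLE_DIAGNOSTIC_EXPR_. */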
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4x4_t
+simde_vld4_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float16x4_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0)
+ #define simde_vld4_lane_f16(ptr, src, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vld4_lane_f16(ptr, src, lane))
+ #else
+ #define simde_vld4_lane_f16(ptr, src, lane) vld4_lane_f16(ptr, src, lane)
+ #endif
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_lane_f16
+ #define vld4_lane_f16(ptr, src, lane) simde_vld4_lane_f16((ptr), (src), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2x4_t
simde_vld4_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x4_t src, const int lane)
@@ -531,6 +560,33 @@ simde_vld4q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x4
#define vld4q_lane_u64(ptr, src, lane) simde_vld4q_lane_u64((ptr), (src), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8x4_t
+simde_vld4q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x8x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_float16x8_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0)
+ #define simde_vld4q_lane_f16(ptr, src, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vld4q_lane_f16(ptr, src, lane))
+ #else
+ #define simde_vld4q_lane_f16(ptr, src, lane) vld4q_lane_f16(ptr, src, lane)
+ #endif
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_lane_f16
+ #define vld4q_lane_f16(ptr, src, lane) simde_vld4q_lane_f16((ptr), (src), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4x4_t
simde_vld4q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4x4_t src, const int lane)
@@ -585,6 +641,182 @@ simde_vld4q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_flo
#define vld4q_lane_f64(ptr, src, lane) simde_vld4q_lane_f64((ptr), (src), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8x4_t
+simde_vld4_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x8x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly8x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly8x8_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld4_lane_p8(ptr, src, lane) vld4_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_lane_p8
+ #define vld4_lane_p8(ptr, src, lane) simde_vld4_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x4x4_t
+simde_vld4_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_poly16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly16x4_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld4_lane_p16(ptr, src, lane) vld4_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4_lane_p16
+ #define vld4_lane_p16(ptr, src, lane) simde_vld4_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x1x4_t
+simde_vld4_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_poly64x1x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly64x1_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld4_lane_p64(ptr, src, lane) vld4_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_lane_p64
+ #define vld4_lane_p64(ptr, src, lane) simde_vld4_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16x4_t
+simde_vld4q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x16x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) {
+ simde_poly8x16x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly8x16_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld4q_lane_p8(ptr, src, lane) vld4q_lane_p8(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_lane_p8
+ #define vld4q_lane_p8(ptr, src, lane) simde_vld4q_lane_p8((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8x4_t
+simde_vld4q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x8x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_poly16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly16x8_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vld4q_lane_p16(ptr, src, lane) vld4q_lane_p16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_lane_p16
+ #define vld4q_lane_p16(ptr, src, lane) simde_vld4q_lane_p16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly64x2x4_t
+simde_vld4q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_poly64x2x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_poly64x2_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vld4q_lane_p64(ptr, src, lane) vld4q_lane_p64(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_lane_p64
+ #define vld4q_lane_p64(ptr, src, lane) simde_vld4q_lane_p64((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x4x4_t
+simde_vld4_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_bfloat16x4x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_bfloat16x4_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld4_lane_bf16(ptr, src, lane) vld4_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4_lane_bf16
+ #define vld4_lane_bf16(ptr, src, lane) simde_vld4_lane_bf16((ptr), (src), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_bfloat16x8x4_t
+simde_vld4q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x8x4_t src, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_bfloat16x8x4_t r;
+
+ for (size_t i = 0 ; i < 4 ; i++) {
+ simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]);
+ tmp_.values[lane] = ptr[i];
+ r.val[i] = simde_bfloat16x8_from_private(tmp_);
+ }
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16)
+ #define simde_vld4q_lane_bf16(ptr, src, lane) vld4q_lane_bf16(ptr, src, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vld4q_lane_bf16
+ #define vld4q_lane_bf16(ptr, src, lane) simde_vld4q_lane_bf16((ptr), (src), (lane))
+#endif
+
#endif /* !defined(SIMDE_BUG_INTEL_857088) */
SIMDE_END_DECLS_
diff --git a/lib/simd_wrapper/simde/arm/neon/max.h b/lib/simd_wrapper/simde/arm/neon/max.h
index 1e2b449e34b..04c38184aa3 100644
--- a/lib/simd_wrapper/simde/arm/neon/max.h
+++ b/lib/simd_wrapper/simde/arm/neon/max.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MAX_H)
@@ -36,6 +37,52 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmaxh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxh_f16(a, b);
+ #else
+ simde_float32_t r_;
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+ #if !defined(SIMDE_FAST_NANS)
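+      /* Both compares are false when either operand is NaN, so the result
+       * falls through to NaN, matching Arm's NaN-propagating vmax. */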
+ r_ = (a_ >= b_) ? a_ : ((a_ < b_) ? b_ : SIMDE_MATH_NANF);
+ #else
+ r_ = (a_ > b_) ? a_ : b_;
+ #endif
+ return simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxh_f16
+ #define vmaxh_f16(a, b) simde_vmaxh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmax_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmax_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmaxh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmax_f16
+ #define vmax_f16(a, b) simde_vmax_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmax_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -293,6 +340,30 @@ simde_x_vmax_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
#endif
}
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmaxh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxq_f16
+ #define vmaxq_f16(a, b) simde_vmaxq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/maxnm.h b/lib/simd_wrapper/simde/arm/neon/maxnm.h
index b9aceb02ce0..8101dd2ca3a 100644
--- a/lib/simd_wrapper/simde/arm/neon/maxnm.h
+++ b/lib/simd_wrapper/simde/arm/neon/maxnm.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MAXNM_H)
@@ -35,6 +36,84 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmaxnmh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxnmh_f16(a, b);
+ #else
+ #if defined(simde_math_fmaxf)
+ return simde_float16_from_float32(simde_math_fmaxf(simde_float16_to_float32(a), simde_float16_to_float32(b)));
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+ simde_float32_t r_;
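+    /* IEEE maxNum semantics: take the larger when ordered; when unordered,
+     * the operand that is a number wins (x == x is false only for NaN). */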
+ if (a_ > b_) {
+ r_ = a_;
+ } else if (a_ < b_) {
+ r_ = b_;
+ } else if (a_ == a_) {
+ r_ = a_;
+ } else {
+ r_ = b_;
+ }
+ return simde_float16_from_float32(r_);
+ #endif
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmh_f16
+ #define vmaxnmh_f16(a, b) simde_vmaxnmh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxnm_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmaxnmh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnm_f16
+ #define vmaxnm_f16(a, b) simde_vmaxnm_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxnmq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmaxnmh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmq_f16
+ #define vmaxnmq_f16(a, b) simde_vmaxnmq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/maxnmv.h b/lib/simd_wrapper/simde/arm/neon/maxnmv.h
new file mode 100644
index 00000000000..7f00628e19b
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/maxnmv.h
@@ -0,0 +1,172 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MAXNMV_H)
+#define SIMDE_ARM_NEON_MAXNMV_H
+
+#include "types.h"
+#include <float.h>
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vmaxnmv_f32(simde_float32x2_t a) {
+ simde_float32_t r;
+
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ r = vmaxnmv_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+
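+    /* -inf is the identity element for the max reduction. */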
+ r = -SIMDE_MATH_INFINITYF;
+ SIMDE_VECTORIZE_REDUCTION(max:r)
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ r = a_.values[i] > r ? a_.values[i] : r;
+ }
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmv_f32
+ #define vmaxnmv_f32(v) simde_vmaxnmv_f32(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vmaxnmvq_f32(simde_float32x4_t a) {
+ simde_float32_t r;
+
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ r = vmaxnmvq_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+
+ r = -SIMDE_MATH_INFINITYF;
+ SIMDE_VECTORIZE_REDUCTION(max:r)
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ r = a_.values[i] > r ? a_.values[i] : r;
+ }
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmvq_f32
+ #define vmaxnmvq_f32(v) simde_vmaxnmvq_f32(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vmaxnmvq_f64(simde_float64x2_t a) {
+ simde_float64_t r;
+
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ r = vmaxnmvq_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+
+ r = -SIMDE_MATH_INFINITY;
+ SIMDE_VECTORIZE_REDUCTION(max:r)
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ r = a_.values[i] > r ? a_.values[i] : r;
+ }
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmvq_f64
+ #define vmaxnmvq_f64(v) simde_vmaxnmvq_f64(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmaxnmv_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxnmv_f16(a);
+ #else
+ simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF);
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+
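+    /* With SIMDE_FAST_NANS the elements are assumed NaN-free, so a plain
+     * max reduction is valid; otherwise each unordered compare is handled
+     * explicitly in the loop body. */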
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(max:r_)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r_ = tmp_a > r_ ? tmp_a : r_;
+ #else
+ r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a));
+ #endif
+ }
+ return simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmv_f16
+ #define vmaxnmv_f16(v) simde_vmaxnmv_f16(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmaxnmvq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxnmvq_f16(a);
+ #else
+ simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF);
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(max:r_)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r_ = tmp_a > r_ ? tmp_a : r_;
+ #else
+ r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a));
+ #endif
+ }
+ return simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxnmvq_f16
+ #define vmaxnmvq_f16(v) simde_vmaxnmvq_f16(v)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MAXNMV_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/maxv.h b/lib/simd_wrapper/simde/arm/neon/maxv.h
index 37437b04d09..39c9e0cae5d 100644
--- a/lib/simd_wrapper/simde/arm/neon/maxv.h
+++ b/lib/simd_wrapper/simde/arm/neon/maxv.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MAXV_H)
@@ -34,6 +35,38 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmaxv_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxv_f16(a);
+ #else
+ simde_float32_t r;
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+
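+    /* Compare in float32: simde_float16 may be a storage-only type with no
+     * native comparison operators on this target. */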
+ r = simde_float16_to_float32(SIMDE_NINFINITYHF);
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(max:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t a32 = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r = a32 > r ? a32 : r;
+ #else
+ r = a32 > r ? a32 : (a32 <= r ? r : ((a32 == a32) ? r : a32));
+ #endif
+ }
+
+ return simde_float16_from_float32(r);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxv_f16
+ #define vmaxv_f16(v) simde_vmaxv_f16(v)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vmaxv_f32(simde_float32x2_t a) {
@@ -202,6 +235,38 @@ simde_vmaxv_u32(simde_uint32x2_t a) {
#define vmaxv_u32(v) simde_vmaxv_u32(v)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmaxvq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmaxvq_f16(a);
+ #else
+ simde_float32_t r;
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+
+ r = simde_float16_to_float32(SIMDE_NINFINITYHF);
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(max:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t a32 = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r = a32 > r ? a32 : r;
+ #else
+ r = a32 > r ? a32 : (a32 <= r ? r : ((a32 == a32) ? r : a32));
+ #endif
+ }
+
+ return simde_float16_from_float32(r);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmaxvq_f16
+ #define vmaxvq_f16(v) simde_vmaxvq_f16(v)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vmaxvq_f32(simde_float32x4_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/min.h b/lib/simd_wrapper/simde/arm/neon/min.h
index 08ea4d00355..469e65aa035 100644
--- a/lib/simd_wrapper/simde/arm/neon/min.h
+++ b/lib/simd_wrapper/simde/arm/neon/min.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MIN_H)
@@ -36,6 +37,52 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vminh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vminh_f16(a, b);
+ #else
+ simde_float32_t r_;
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+ #if !defined(SIMDE_FAST_NANS)
+ r_ = (a_ <= b_) ? a_ : ((a_ > b_) ? b_ : SIMDE_MATH_NANF);
+ #else
+ r_ = (a_ < b_) ? a_ : b_;
+ #endif
+ return simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminh_f16
+ #define vminh_f16(a, b) simde_vminh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmin_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmin_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vminh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmin_f16
+ #define vmin_f16(a, b) simde_vmin_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmin_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -159,14 +206,10 @@ simde_vmin_s16(simde_int16x4_t a, simde_int16x4_t b) {
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);
- #if defined(SIMDE_X86_MMX_NATIVE)
-    r_.m64 = _mm_sub_pi16(a_.m64, _mm_subs_pu16(a_.m64, b_.m64));
- #else
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i];
- }
- #endif
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i];
+ }
return simde_int16x4_from_private(r_);
#endif
@@ -325,6 +368,30 @@ simde_x_vmin_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
#endif
}
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vminq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vminq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vminh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vminq_f16
+ #define vminq_f16(a, b) simde_vminq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vminq_f32(simde_float32x4_t a, simde_float32x4_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/minnm.h b/lib/simd_wrapper/simde/arm/neon/minnm.h
index b68a28cb750..341a3150ca9 100644
--- a/lib/simd_wrapper/simde/arm/neon/minnm.h
+++ b/lib/simd_wrapper/simde/arm/neon/minnm.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MINNM_H)
@@ -35,6 +36,60 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vminnmh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16)
+ return vminnmh_f16(a, b);
+ #else
+ #if defined(simde_math_fminf)
+ return simde_float16_from_float32(simde_math_fminf(simde_float16_to_float32(a), simde_float16_to_float32(b)));
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+ simde_float32_t r_;
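+    /* IEEE minNum semantics: take the smaller when ordered; when unordered,
+     * the operand that is a number wins. */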
+ if (a_ < b_) {
+ r_ = a_;
+ } else if (a_ > b_) {
+ r_ = b_;
+ } else if (a_ == a_) {
+ r_ = a_;
+ } else {
+ r_ = b_;
+ }
+ return simde_float16_from_float32(r_);
+ #endif
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmh_f16
+ #define vminnmh_f16(a, b) simde_vminnmh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vminnm_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16)
+ return vminnm_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vminnmh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnm_f16
+ #define vminnm_f16(a, b) simde_vminnm_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vminnm_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -107,6 +162,30 @@ simde_vminnm_f64(simde_float64x1_t a, simde_float64x1_t b) {
#define vminnm_f64(a, b) simde_vminnm_f64((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16)
+ return vminnmq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vminnmh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmq_f16
+ #define vminnmq_f16(a, b) simde_vminnmq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/minnmv.h b/lib/simd_wrapper/simde/arm/neon/minnmv.h
new file mode 100644
index 00000000000..11e1b3438db
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/minnmv.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MINNMV_H)
+#define SIMDE_ARM_NEON_MINNMV_H
+
+#include "types.h"
+#include <float.h>
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vminnmv_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vminnmv_f16(a);
+ #else
+ simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF);
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r_)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r_ = tmp_a < r_ ? tmp_a : r_;
+ #else
+ r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a));
+ #endif
+ }
+ return simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmv_f16
+ #define vminnmv_f16(v) simde_vminnmv_f16(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vminnmv_f32(simde_float32x2_t a) {
+ simde_float32_t r;
+
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ r = vminnmv_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+
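+    /* +inf is the identity element for the min reduction. */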
+ r = SIMDE_MATH_INFINITYF;
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ #if defined(SIMDE_FAST_NANS)
+ r = a_.values[i] < r ? a_.values[i] : r;
+ #else
+ r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i]));
+ #endif
+ }
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmv_f32
+ #define vminnmv_f32(v) simde_vminnmv_f32(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vminnmvq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vminnmvq_f16(a);
+ #else
+ simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF);
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r_)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r_ = tmp_a < r_ ? tmp_a : r_;
+ #else
+ r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a));
+ #endif
+ }
+ return simde_float16_from_float32(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmvq_f16
+ #define vminnmvq_f16(v) simde_vminnmvq_f16(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vminnmvq_f32(simde_float32x4_t a) {
+ simde_float32_t r;
+
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ r = vminnmvq_f32(a);
+ #else
+ simde_float32x4_private a_ = simde_float32x4_to_private(a);
+
+ r = SIMDE_MATH_INFINITYF;
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ #if defined(SIMDE_FAST_NANS)
+ r = a_.values[i] < r ? a_.values[i] : r;
+ #else
+ r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i]));
+ #endif
+ }
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmvq_f32
+ #define vminnmvq_f32(v) simde_vminnmvq_f32(v)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vminnmvq_f64(simde_float64x2_t a) {
+ simde_float64_t r;
+
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ r = vminnmvq_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+
+ r = SIMDE_MATH_INFINITY;
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ #if defined(SIMDE_FAST_NANS)
+ r = a_.values[i] < r ? a_.values[i] : r;
+ #else
+ r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i]));
+ #endif
+ }
+ #endif
+
+ return r;
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminnmvq_f64
+ #define vminnmvq_f64(v) simde_vminnmvq_f64(v)
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MINNMV_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/minv.h b/lib/simd_wrapper/simde/arm/neon/minv.h
index 93028d74fe8..2c7b5e3f7af 100644
--- a/lib/simd_wrapper/simde/arm/neon/minv.h
+++ b/lib/simd_wrapper/simde/arm/neon/minv.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MINV_H)
@@ -34,6 +35,38 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vminv_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vminv_f16(a);
+ #else
+ simde_float32_t r;
+ simde_float16x4_private a_ = simde_float16x4_to_private(a);
+
+ r = simde_float16_to_float32(SIMDE_INFINITYHF);
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t a32 = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r = a32 < r ? a32 : r;
+ #else
+ r = a32 < r ? a32 : (a32 >= r ? r : ((a32 == a32) ? r : a32));
+ #endif
+ }
+
+ return simde_float16_from_float32(r);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminv_f16
+ #define vminv_f16(v) simde_vminv_f16(v)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vminv_f32(simde_float32x2_t a) {
@@ -210,6 +243,38 @@ simde_vminv_u32(simde_uint32x2_t a) {
#define vminv_u32(v) simde_vminv_u32(v)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vminvq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vminvq_f16(a);
+ #else
+ simde_float32_t r;
+ simde_float16x8_private a_ = simde_float16x8_to_private(a);
+
+ r = simde_float16_to_float32(SIMDE_INFINITYHF);
+ #if defined(SIMDE_FAST_NANS)
+ SIMDE_VECTORIZE_REDUCTION(min:r)
+ #else
+ SIMDE_VECTORIZE
+ #endif
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) {
+ simde_float32_t a32 = simde_float16_to_float32(a_.values[i]);
+ #if defined(SIMDE_FAST_NANS)
+ r = a32 < r ? a32 : r;
+ #else
+ r = a32 < r ? a32 : (a32 >= r ? r : ((a32 == a32) ? r : a32));
+ #endif
+ }
+
+ return simde_float16_from_float32(r);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vminvq_f16
+ #define vminvq_f16(v) simde_vminvq_f16(v)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vminvq_f32(simde_float32x4_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/mla_lane.h b/lib/simd_wrapper/simde/arm/neon/mla_lane.h
new file mode 100644
index 00000000000..ad383d473ec
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mla_lane.h
@@ -0,0 +1,241 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MLA_LANE_H)
+#define SIMDE_ARM_NEON_MLA_LANE_H
+
+#include "mla.h"
+#include "dup_lane.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
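+/* These are macros rather than functions so that `lane` stays an integer
+ * constant expression, as the native intrinsics require; the portable path
+ * broadcasts v[lane] with vdup*_lane* and reuses the vmla* implementation. */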
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmla_lane_f32(a, b, v, lane) vmla_lane_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmla_lane_f32(a, b, v, lane) simde_vmla_f32((a), (b), simde_vdup_lane_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmla_lane_f32
+ #define vmla_lane_f32(a, b, v, lane) simde_vmla_lane_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmla_laneq_f32(a, b, v, lane) vmla_laneq_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmla_laneq_f32(a, b, v, lane) simde_vmla_f32((a), (b), simde_vdup_laneq_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmla_laneq_f32
+ #define vmla_laneq_f32(a, b, v, lane) simde_vmla_laneq_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlaq_laneq_f32(a, b, v, lane) vmlaq_laneq_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_laneq_f32(a, b, v, lane) simde_vmlaq_f32((a), (b), simde_vdupq_laneq_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_laneq_f32
+ #define vmlaq_laneq_f32(a, b, v, lane) simde_vmlaq_laneq_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmla_lane_s16(a, b, v, lane) vmla_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmla_lane_s16(a, b, v, lane) simde_vmla_s16((a), (b), simde_vdup_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmla_lane_s16
+ #define vmla_lane_s16(a, b, v, lane) simde_vmla_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmla_laneq_s16(a, b, v, lane) vmla_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmla_laneq_s16(a, b, v, lane) simde_vmla_s16((a), (b), simde_vdup_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmla_laneq_s16
+ #define vmla_laneq_s16(a, b, v, lane) simde_vmla_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlaq_laneq_s16(a, b, v, lane) vmlaq_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_laneq_s16(a, b, v, lane) simde_vmlaq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_laneq_s16
+ #define vmlaq_laneq_s16(a, b, v, lane) simde_vmlaq_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmla_lane_s32(a, b, v, lane) vmla_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmla_lane_s32(a, b, v, lane) simde_vmla_s32((a), (b), simde_vdup_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmla_lane_s32
+ #define vmla_lane_s32(a, b, v, lane) simde_vmla_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmla_laneq_s32(a, b, v, lane) vmla_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmla_laneq_s32(a, b, v, lane) simde_vmla_s32((a), (b), simde_vdup_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmla_laneq_s32
+ #define vmla_laneq_s32(a, b, v, lane) simde_vmla_laneq_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlaq_laneq_s32(a, b, v, lane) vmlaq_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_laneq_s32(a, b, v, lane) simde_vmlaq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_laneq_s32
+ #define vmlaq_laneq_s32(a, b, v, lane) simde_vmlaq_laneq_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmla_lane_u16(a, b, v, lane) vmla_lane_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmla_lane_u16(a, b, v, lane) simde_vmla_u16((a), (b), simde_vdup_lane_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmla_lane_u16
+ #define vmla_lane_u16(a, b, v, lane) simde_vmla_lane_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmla_laneq_u16(a, b, v, lane) vmla_laneq_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmla_laneq_u16(a, b, v, lane) simde_vmla_u16((a), (b), simde_vdup_laneq_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmla_laneq_u16
+ #define vmla_laneq_u16(a, b, v, lane) simde_vmla_laneq_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlaq_laneq_u16(a, b, v, lane) vmlaq_laneq_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_laneq_u16(a, b, v, lane) simde_vmlaq_u16((a), (b), simde_vdupq_laneq_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_laneq_u16
+ #define vmlaq_laneq_u16(a, b, v, lane) simde_vmlaq_laneq_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmla_lane_u32(a, b, v, lane) vmla_lane_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmla_lane_u32(a, b, v, lane) simde_vmla_u32((a), (b), simde_vdup_lane_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmla_lane_u32
+ #define vmla_lane_u32(a, b, v, lane) simde_vmla_lane_u32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmla_laneq_u32(a, b, v, lane) vmla_laneq_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmla_laneq_u32(a, b, v, lane) simde_vmla_u32((a), (b), simde_vdup_laneq_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmla_laneq_u32
+ #define vmla_laneq_u32(a, b, v, lane) simde_vmla_laneq_u32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlaq_laneq_u32(a, b, v, lane) vmlaq_laneq_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_laneq_u32(a, b, v, lane) simde_vmlaq_u32((a), (b), simde_vdupq_laneq_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_laneq_u32
+ #define vmlaq_laneq_u32(a, b, v, lane) simde_vmlaq_laneq_u32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlaq_lane_f32(a, b, v, lane) vmlaq_lane_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_lane_f32(a, b, v, lane) simde_vmlaq_f32((a), (b), simde_vdupq_lane_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_lane_f32
+ #define vmlaq_lane_f32(a, b, v, lane) simde_vmlaq_lane_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlaq_lane_s16(a, b, v, lane) vmlaq_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_lane_s16(a, b, v, lane) simde_vmlaq_s16((a), (b), simde_vdupq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_lane_s16
+ #define vmlaq_lane_s16(a, b, v, lane) simde_vmlaq_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlaq_lane_s32(a, b, v, lane) vmlaq_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_lane_s32(a, b, v, lane) simde_vmlaq_s32((a), (b), simde_vdupq_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_lane_s32
+ #define vmlaq_lane_s32(a, b, v, lane) simde_vmlaq_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlaq_lane_u16(a, b, v, lane) vmlaq_lane_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_lane_u16(a, b, v, lane) simde_vmlaq_u16((a), (b), simde_vdupq_lane_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_lane_u16
+ #define vmlaq_lane_u16(a, b, v, lane) simde_vmlaq_lane_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlaq_lane_u32(a, b, v, lane) vmlaq_lane_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmlaq_lane_u32(a, b, v, lane) simde_vmlaq_u32((a), (b), simde_vdupq_lane_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlaq_lane_u32
+ #define vmlaq_lane_u32(a, b, v, lane) simde_vmlaq_lane_u32((a), (b), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MLA_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mlal_high_lane.h b/lib/simd_wrapper/simde/arm/neon/mlal_high_lane.h
new file mode 100644
index 00000000000..50018a95d85
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mlal_high_lane.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_LANE_H)
+#define SIMDE_ARM_NEON_MLAL_HIGH_LANE_H
+
+#include "movl_high.h"
+#include "mlal_high.h"
+#include "dup_n.h"
+#include "mla.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
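+/* Widening multiply-accumulate over the high halves: the portable path
+ * broadcasts v[lane] with vdupq_n_* and defers to simde_vmlal_high_*. */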
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmlal_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlal_high_s16(a, b, simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_lane_s16(a, b, v, lane) vmlal_high_lane_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_lane_s16
+ #define vmlal_high_lane_s16(a, b, v, lane) simde_vmlal_high_lane_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmlal_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vmlal_high_s16(a, b, simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_laneq_s16(a, b, v, lane) vmlal_high_laneq_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_laneq_s16
+ #define vmlal_high_laneq_s16(a, b, v, lane) simde_vmlal_high_laneq_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmlal_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return simde_vmlal_high_s32(a, b, simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_lane_s32(a, b, v, lane) vmlal_high_lane_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_lane_s32
+ #define vmlal_high_lane_s32(a, b, v, lane) simde_vmlal_high_lane_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmlal_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlal_high_s32(a, b, simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_laneq_s32(a, b, v, lane) vmlal_high_laneq_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_laneq_s32
+ #define vmlal_high_laneq_s32(a, b, v, lane) simde_vmlal_high_laneq_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmlal_high_lane_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlal_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_lane_u16(a, b, v, lane) vmlal_high_lane_u16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_lane_u16
+ #define vmlal_high_lane_u16(a, b, v, lane) simde_vmlal_high_lane_u16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmlal_high_laneq_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vmlal_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x8_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_laneq_u16(a, b, v, lane) vmlal_high_laneq_u16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_laneq_u16
+ #define vmlal_high_laneq_u16(a, b, v, lane) simde_vmlal_high_laneq_u16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmlal_high_lane_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return simde_vmlal_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x2_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_lane_u32(a, b, v, lane) vmlal_high_lane_u32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_lane_u32
+ #define vmlal_high_lane_u32(a, b, v, lane) simde_vmlal_high_lane_u32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmlal_high_laneq_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlal_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlal_high_laneq_u32(a, b, v, lane) vmlal_high_laneq_u32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlal_high_laneq_u32
+ #define vmlal_high_laneq_u32(a, b, v, lane) simde_vmlal_high_laneq_u32((a), (b), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MLAL_HIGH_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mls.h b/lib/simd_wrapper/simde/arm/neon/mls.h
index 83fb42fc734..c92547f7de0 100644
--- a/lib/simd_wrapper/simde/arm/neon/mls.h
+++ b/lib/simd_wrapper/simde/arm/neon/mls.h
@@ -151,18 +151,13 @@ simde_float32x4_t
simde_vmlsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vmlsq_f32(a, b, c);
- #elif \
- defined(SIMDE_X86_FMA_NATIVE)
+ #elif defined(SIMDE_X86_FMA_NATIVE)
simde_float32x4_private
r_,
a_ = simde_float32x4_to_private(a),
b_ = simde_float32x4_to_private(b),
c_ = simde_float32x4_to_private(c);
-
- #if defined(SIMDE_X86_FMA_NATIVE)
- r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128);
- #endif
-
+ r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128);
return simde_float32x4_from_private(r_);
#else
return simde_vsubq_f32(a, simde_vmulq_f32(b, c));
@@ -178,18 +173,13 @@ simde_float64x2_t
simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) {
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
return vmlsq_f64(a, b, c);
- #elif \
- defined(SIMDE_X86_FMA_NATIVE)
+ #elif defined(SIMDE_X86_FMA_NATIVE)
simde_float64x2_private
r_,
a_ = simde_float64x2_to_private(a),
b_ = simde_float64x2_to_private(b),
c_ = simde_float64x2_to_private(c);
-
- #if defined(SIMDE_X86_FMA_NATIVE)
- r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d);
- #endif
-
+ r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d);
return simde_float64x2_from_private(r_);
#else
return simde_vsubq_f64(a, simde_vmulq_f64(b, c));
diff --git a/lib/simd_wrapper/simde/arm/neon/mls_lane.h b/lib/simd_wrapper/simde/arm/neon/mls_lane.h
new file mode 100644
index 00000000000..35855a2b7c6
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mls_lane.h
@@ -0,0 +1,240 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MLS_LANE_H)
+#define SIMDE_ARM_NEON_MLS_LANE_H
+
+#include "mls.h"
+#include "dup_lane.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
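+/* Same macro scheme as mla_lane.h, with vmls* as the underlying operation. */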
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmls_lane_f32(a, b, v, lane) vmls_lane_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmls_lane_f32(a, b, v, lane) simde_vmls_f32((a), (b), simde_vdup_lane_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmls_lane_f32
+ #define vmls_lane_f32(a, b, v, lane) simde_vmls_lane_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmls_laneq_f32(a, b, v, lane) vmls_laneq_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmls_laneq_f32(a, b, v, lane) simde_vmls_f32((a), (b), simde_vdup_laneq_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmls_laneq_f32
+ #define vmls_laneq_f32(a, b, v, lane) simde_vmls_laneq_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsq_laneq_f32(a, b, v, lane) vmlsq_laneq_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_laneq_f32(a, b, v, lane) simde_vmlsq_f32((a), (b), simde_vdupq_laneq_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_laneq_f32
+ #define vmlsq_laneq_f32(a, b, v, lane) simde_vmlsq_laneq_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmls_lane_s16(a, b, v, lane) vmls_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmls_lane_s16(a, b, v, lane) simde_vmls_s16((a), (b), simde_vdup_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmls_lane_s16
+ #define vmls_lane_s16(a, b, v, lane) simde_vmls_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmls_laneq_s16(a, b, v, lane) vmls_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmls_laneq_s16(a, b, v, lane) simde_vmls_s16((a), (b), simde_vdup_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmls_laneq_s16
+ #define vmls_laneq_s16(a, b, v, lane) simde_vmls_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsq_laneq_s16(a, b, v, lane) vmlsq_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_laneq_s16(a, b, v, lane) simde_vmlsq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_laneq_s16
+ #define vmlsq_laneq_s16(a, b, v, lane) simde_vmlsq_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmls_lane_s32(a, b, v, lane) vmls_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmls_lane_s32(a, b, v, lane) simde_vmls_s32((a), (b), simde_vdup_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmls_lane_s32
+ #define vmls_lane_s32(a, b, v, lane) simde_vmls_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmls_laneq_s32(a, b, v, lane) vmls_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmls_laneq_s32(a, b, v, lane) simde_vmls_s32((a), (b), simde_vdup_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmls_laneq_s32
+ #define vmls_laneq_s32(a, b, v, lane) simde_vmls_laneq_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsq_laneq_s32(a, b, v, lane) vmlsq_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_laneq_s32(a, b, v, lane) simde_vmlsq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_laneq_s32
+ #define vmlsq_laneq_s32(a, b, v, lane) simde_vmlsq_laneq_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmls_lane_u16(a, b, v, lane) vmls_lane_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmls_lane_u16(a, b, v, lane) simde_vmls_u16((a), (b), simde_vdup_lane_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmls_lane_u16
+ #define vmls_lane_u16(a, b, v, lane) simde_vmls_lane_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmls_laneq_u16(a, b, v, lane) vmls_laneq_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmls_laneq_u16(a, b, v, lane) simde_vmls_u16((a), (b), simde_vdup_laneq_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmls_laneq_u16
+ #define vmls_laneq_u16(a, b, v, lane) simde_vmls_laneq_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsq_laneq_u16(a, b, v, lane) vmlsq_laneq_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_laneq_u16(a, b, v, lane) simde_vmlsq_u16((a), (b), simde_vdupq_laneq_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_laneq_u16
+ #define vmlsq_laneq_u16(a, b, v, lane) simde_vmlsq_laneq_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmls_lane_u32(a, b, v, lane) vmls_lane_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmls_lane_u32(a, b, v, lane) simde_vmls_u32((a), (b), simde_vdup_lane_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmls_lane_u32
+ #define vmls_lane_u32(a, b, v, lane) simde_vmls_lane_u32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmls_laneq_u32(a, b, v, lane) vmls_laneq_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmls_laneq_u32(a, b, v, lane) simde_vmls_u32((a), (b), simde_vdup_laneq_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmls_laneq_u32
+ #define vmls_laneq_u32(a, b, v, lane) simde_vmls_laneq_u32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsq_laneq_u32(a, b, v, lane) vmlsq_laneq_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_laneq_u32(a, b, v, lane) simde_vmlsq_u32((a), (b), simde_vdupq_laneq_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_laneq_u32
+ #define vmlsq_laneq_u32(a, b, v, lane) simde_vmlsq_laneq_u32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlsq_lane_f32(a, b, v, lane) vmlsq_lane_f32((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_lane_f32(a, b, v, lane) simde_vmlsq_f32((a), (b), simde_vdupq_lane_f32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_lane_f32
+ #define vmlsq_lane_f32(a, b, v, lane) simde_vmlsq_lane_f32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlsq_lane_s16(a, b, v, lane) vmlsq_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_lane_s16(a, b, v, lane) simde_vmlsq_s16((a), (b), simde_vdupq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_lane_s16
+ #define vmlsq_lane_s16(a, b, v, lane) simde_vmlsq_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlsq_lane_s32(a, b, v, lane) vmlsq_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_lane_s32(a, b, v, lane) simde_vmlsq_s32((a), (b), simde_vdupq_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_lane_s32
+ #define vmlsq_lane_s32(a, b, v, lane) simde_vmlsq_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlsq_lane_u16(a, b, v, lane) vmlsq_lane_u16((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_lane_u16(a, b, v, lane) simde_vmlsq_u16((a), (b), simde_vdupq_lane_u16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_lane_u16
+ #define vmlsq_lane_u16(a, b, v, lane) simde_vmlsq_lane_u16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vmlsq_lane_u32(a, b, v, lane) vmlsq_lane_u32((a), (b), (v), (lane))
+#else
+ #define simde_vmlsq_lane_u32(a, b, v, lane) simde_vmlsq_u32((a), (b), simde_vdupq_lane_u32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmlsq_lane_u32
+ #define vmlsq_lane_u32(a, b, v, lane) simde_vmlsq_lane_u32((a), (b), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MLS_LANE_H) */
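
On the portable path each of these lane macros reduces to an ordinary multiply-subtract against one broadcast element. A minimal scalar sketch (standalone C, hypothetical values) of what simde_vmls_lane_u16 computes:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vmls_lane_u16: r[i] = a[i] - b[i] * v[lane] */
static void mls_lane_u16(uint16_t r[4], const uint16_t a[4],
                         const uint16_t b[4], const uint16_t v[4], int lane) {
  for (int i = 0; i < 4; i++)
    r[i] = (uint16_t) (a[i] - b[i] * v[lane]);
}

int main(void) {
  uint16_t a[4] = {100, 200, 300, 400}, b[4] = {1, 2, 3, 4};
  uint16_t v[4] = {10, 20, 30, 40}, r[4];
  mls_lane_u16(r, a, b, v, 1);                     /* v[1] == 20 */
  printf("%u %u %u %u\n", r[0], r[1], r[2], r[3]); /* 80 160 240 320 */
  return 0;
}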
diff --git a/lib/simd_wrapper/simde/arm/neon/mlsl_high_lane.h b/lib/simd_wrapper/simde/arm/neon/mlsl_high_lane.h
new file mode 100644
index 00000000000..f45b7d98930
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mlsl_high_lane.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MLSL_HIGH_LANE_H)
+#define SIMDE_ARM_NEON_MLSL_HIGH_LANE_H
+
+#include "movl_high.h"
+#include "mlsl_high.h"
+#include "dup_n.h"
+#include "mls.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmlsl_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlsl_high_s16(a, b, simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_lane_s16(a, b, v, lane) vmlsl_high_lane_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_lane_s16
+ #define vmlsl_high_lane_s16(a, b, v, lane) simde_vmlsl_high_lane_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmlsl_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vmlsl_high_s16(a, b, simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_laneq_s16(a, b, v, lane) vmlsl_high_laneq_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_laneq_s16
+ #define vmlsl_high_laneq_s16(a, b, v, lane) simde_vmlsl_high_laneq_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmlsl_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return simde_vmlsl_high_s32(a, b, simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_lane_s32(a, b, v, lane) vmlsl_high_lane_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_lane_s32
+ #define vmlsl_high_lane_s32(a, b, v, lane) simde_vmlsl_high_lane_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmlsl_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlsl_high_s32(a, b, simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_laneq_s32(a, b, v, lane) vmlsl_high_laneq_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_laneq_s32
+ #define vmlsl_high_laneq_s32(a, b, v, lane) simde_vmlsl_high_laneq_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmlsl_high_lane_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlsl_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_lane_u16(a, b, v, lane) vmlsl_high_lane_u16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_lane_u16
+ #define vmlsl_high_lane_u16(a, b, v, lane) simde_vmlsl_high_lane_u16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmlsl_high_laneq_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vmlsl_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x8_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_laneq_u16(a, b, v, lane) vmlsl_high_laneq_u16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_laneq_u16
+ #define vmlsl_high_laneq_u16(a, b, v, lane) simde_vmlsl_high_laneq_u16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmlsl_high_lane_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return simde_vmlsl_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x2_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_lane_u32(a, b, v, lane) vmlsl_high_lane_u32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_lane_u32
+ #define vmlsl_high_lane_u32(a, b, v, lane) simde_vmlsl_high_lane_u32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmlsl_high_laneq_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmlsl_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x4_to_private(v).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmlsl_high_laneq_u32(a, b, v, lane) vmlsl_high_laneq_u32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmlsl_high_laneq_u32
+ #define vmlsl_high_laneq_u32(a, b, v, lane) simde_vmlsl_high_laneq_u32((a), (b), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MLSL_HIGH_LANE_H) */
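
The *_high_lane variants operate on the upper half of the widened operand. A scalar sketch (standalone C, hypothetical values) of simde_vmlsl_high_lane_s16's semantics:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vmlsl_high_lane_s16: widen the high half of b,
 * multiply by v[lane], subtract from a. */
static void mlsl_high_lane_s16(int32_t r[4], const int32_t a[4],
                               const int16_t b[8], const int16_t v[4], int lane) {
  for (int i = 0; i < 4; i++)
    r[i] = a[i] - (int32_t) b[4 + i] * v[lane];
}

int main(void) {
  int32_t a[4] = {1000, 1000, 1000, 1000}, r[4];
  int16_t b[8] = {0, 0, 0, 0, 1, 2, 3, 4};
  int16_t v[4] = {5, 6, 7, 8};
  mlsl_high_lane_s16(r, a, b, v, 0);               /* uses b[4..7] and v[0] == 5 */
  printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); /* 995 990 985 980 */
  return 0;
}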
diff --git a/lib/simd_wrapper/simde/arm/neon/mmlaq.h b/lib/simd_wrapper/simde/arm/neon/mmlaq.h
new file mode 100644
index 00000000000..a5685385305
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mmlaq.h
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MMLAQ_H)
+#define SIMDE_ARM_NEON_MMLAQ_H
+
+#include "types.h"
+#include "cgt.h"
+#include "bsl.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmmlaq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) {
+  // I8MM is an optional feature; see https://patchwork.ffmpeg.org/project/ffmpeg/patch/20230530123043.52940-2-martin@martin.st/
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8)
+ return vmmlaq_s32(r, a, b);
+ #else
+ simde_int8x16_private
+ a_ = simde_int8x16_to_private(a),
+ b_ = simde_int8x16_to_private(b);
+ simde_int32x4_private
+ r_ = simde_int32x4_to_private(r),
+ ret;
+
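+    /* Accumulator k is C[k/2][k%2]: row k/2 of a dotted with row k%2 of b. */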
+ for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) {
+ ret.values[k] = r_.values[k];
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) {
+ ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i];
+ }
+ }
+ return simde_int32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmmlaq_s32
+ #define vmmlaq_s32(r, a, b) simde_vmmlaq_s32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmmlaq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8)
+ return vmmlaq_u32(r, a, b);
+ #else
+ simde_uint8x16_private
+ a_ = simde_uint8x16_to_private(a),
+ b_ = simde_uint8x16_to_private(b);
+ simde_uint32x4_private
+ r_ = simde_uint32x4_to_private(r),
+ ret;
+
+ for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) {
+ ret.values[k] = r_.values[k];
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) {
+ ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i];
+ }
+ }
+ return simde_uint32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmmlaq_u32
+ #define vmmlaq_u32(r, a, b) simde_vmmlaq_u32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vusmmlaq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8)
+ return vusmmlaq_s32(r, a, b);
+ #else
+ simde_uint8x16_private
+ a_ = simde_uint8x16_to_private(a);
+ simde_int8x16_private
+ b_ = simde_int8x16_to_private(b);
+ simde_int32x4_private
+ r_ = simde_int32x4_to_private(r),
+ ret;
+
+ for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) {
+ ret.values[k] = r_.values[k];
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) {
+ ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i];
+ }
+ }
+ return simde_int32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vusmmlaq_s32
+ #define vusmmlaq_s32(r, a, b) simde_vusmmlaq_s32((r), (a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vbfmmlaq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) && \
+ defined(SIMDE_ARM_NEON_BF16)
+ return vbfmmlaq_f32(r, a, b);
+ #else
+ simde_bfloat16x8_private
+ a_ = simde_bfloat16x8_to_private(a),
+ b_ = simde_bfloat16x8_to_private(b);
+ simde_float32x4_private
+ r_ = simde_float32x4_to_private(r),
+ ret;
+
+ for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) {
+ ret.values[k] = r_.values[k];
+ for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) {
+ ret.values[k] += simde_bfloat16_to_float32(a_.values[(k/2)*4+i]) *
+ simde_bfloat16_to_float32(b_.values[(k%2)*4+i]);
+ }
+ }
+ return simde_float32x4_from_private(ret);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vbfmmlaq_f32
+ #define vbfmmlaq_f32(r, a, b) simde_vbfmmlaq_f32((r), (a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MMLAQ_H) */
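
The fallback loops above spell out the 2x2 matrix multiply-accumulate that SMMLA/UMMLA perform: accumulator k holds C[k/2][k%2], and both operands are read as row-major 2x8 blocks, so b effectively contributes transposed. A standalone scalar sketch (hypothetical values):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vmmlaq_s32 fallback: a 2x2 int32 accumulator updated
 * with a (2x8, row-major) times b transposed (b also 2x8, row-major):
 * C[r][c] += sum_i A[r][i] * B[c][i]. */
static void mmla_s32(int32_t c[4], const int8_t a[16], const int8_t b[16]) {
  for (int k = 0; k < 4; k++)
    for (int i = 0; i < 8; i++)
      c[k] += (int32_t) a[(k / 2) * 8 + i] * b[(k % 2) * 8 + i];
}

int main(void) {
  int8_t a[16], b[16];
  int32_t c[4] = {0, 0, 0, 0};
  for (int i = 0; i < 16; i++) { a[i] = (int8_t) i; b[i] = 1; }
  mmla_s32(c, a, b);
  /* Row 0 of a sums to 0+..+7 = 28, row 1 to 8+..+15 = 92. */
  printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]); /* 28 28 92 92 */
  return 0;
}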
diff --git a/lib/simd_wrapper/simde/arm/neon/mul.h b/lib/simd_wrapper/simde/arm/neon/mul.h
index 48de8a24031..590b0eae54a 100644
--- a/lib/simd_wrapper/simde/arm/neon/mul.h
+++ b/lib/simd_wrapper/simde/arm/neon/mul.h
@@ -23,6 +23,8 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Yung-Cheng Su (Copyright owned by NTHU pllab)
*/
#if !defined(SIMDE_ARM_NEON_MUL_H)
@@ -36,6 +38,49 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmulh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulh_f16(a, b);
+ #else
+ simde_float32_t a_ = simde_float16_to_float32(a);
+ simde_float32_t b_ = simde_float16_to_float32(b);
+
+ return simde_float16_from_float32(a_ * b_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulh_f16
+ #define vmulh_f16(a, b) simde_vmulh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmul_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmul_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]);
+ simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]);
+ r_.values[i] = simde_float16_from_float32(tmp_a_ * tmp_b_);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmul_f16
+ #define vmul_f16(a, b) simde_vmul_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -47,7 +92,9 @@ simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) {
a_ = simde_float32x2_to_private(a),
b_ = simde_float32x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vv_f32m1(a_.sv64, b_.sv64, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -75,7 +122,9 @@ simde_vmul_f64(simde_float64x1_t a, simde_float64x1_t b) {
a_ = simde_float64x1_to_private(a),
b_ = simde_float64x1_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vv_f64m1(a_.sv64, b_.sv64, 1);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -103,7 +152,9 @@ simde_vmul_s8(simde_int8x8_t a, simde_int8x8_t b) {
a_ = simde_int8x8_to_private(a),
b_ = simde_int8x8_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_i8m1(a_.sv64, b_.sv64, 8);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -133,6 +184,8 @@ simde_vmul_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_X86_MMX_NATIVE)
r_.m64 = _m_pmullw(a_.m64, b_.m64);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_i16m1(a_.sv64, b_.sv64, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values * b_.values;
#else
@@ -161,7 +214,9 @@ simde_vmul_s32(simde_int32x2_t a, simde_int32x2_t b) {
a_ = simde_int32x2_to_private(a),
b_ = simde_int32x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_i32m1(a_.sv64, b_.sv64, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -186,7 +241,9 @@ simde_x_vmul_s64(simde_int64x1_t a, simde_int64x1_t b) {
a_ = simde_int64x1_to_private(a),
b_ = simde_int64x1_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_i64m1(a_.sv64, b_.sv64, 1);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -209,7 +266,9 @@ simde_vmul_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
a_ = simde_uint8x8_to_private(a),
b_ = simde_uint8x8_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_u8m1(a_.sv64, b_.sv64, 8);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -237,7 +296,9 @@ simde_vmul_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
a_ = simde_uint16x4_to_private(a),
b_ = simde_uint16x4_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_u16m1(a_.sv64, b_.sv64, 4);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -265,7 +326,9 @@ simde_vmul_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
a_ = simde_uint32x2_to_private(a),
b_ = simde_uint32x2_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_u32m1(a_.sv64, b_.sv64, 2);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -290,7 +353,9 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
a_ = simde_uint64x1_to_private(a),
b_ = simde_uint64x1_to_private(b);
- #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vv_u64m1(a_.sv64, b_.sv64, 1);
+ #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
SIMDE_VECTORIZE
@@ -302,6 +367,32 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
return simde_uint64x1_from_private(r_);
}
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]);
+ simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]);
+ r_.values[i] = simde_float16_from_float32(tmp_a_ * tmp_b_);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulq_f16
+ #define vmulq_f16(a, b) simde_vmulq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) {
@@ -317,6 +408,8 @@ simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) {
r_.m128 = _mm_mul_ps(a_.m128, b_.m128);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f32x4_mul(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vv_f32m1(a_.sv128, b_.sv128, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
@@ -349,6 +442,8 @@ simde_vmulq_f64(simde_float64x2_t a, simde_float64x2_t b) {
r_.m128d = _mm_mul_pd(a_.m128d, b_.m128d);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_f64x2_mul(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vv_f64m1(a_.sv128, b_.sv128, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
@@ -400,6 +495,8 @@ simde_vmulq_s8(simde_int8x16_t a, simde_int8x16_t b) {
)
#endif
);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vv_i8m1(a_.sv128, b_.sv128, 16);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
@@ -430,6 +527,8 @@ simde_vmulq_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i = _mm_mullo_epi16(a_.m128i, b_.m128i);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vv_i16m1(a_.sv128, b_.sv128, 8);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
@@ -460,6 +559,8 @@ simde_vmulq_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i32x4_mul(a_.v128, b_.v128);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vv_i32m1(a_.sv128, b_.sv128, 4);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
@@ -489,6 +590,8 @@ simde_x_vmulq_s64(simde_int64x2_t a, simde_int64x2_t b) {
r_.v128 = wasm_i64x2_mul(a_.v128, b_.v128);
#elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE)
r_.m128i = _mm_mullo_epi64(a_.m128i, b_.m128i);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vv_i64m1(a_.sv128, b_.sv128, 2);
#elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
r_.values = a_.values * b_.values;
#else
@@ -506,6 +609,13 @@ simde_uint8x16_t
simde_vmulq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vmulq_u8(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint8x16_private
+ r_,
+ a_ = simde_uint8x16_to_private(a),
+ b_ = simde_uint8x16_to_private(b);
+ r_.sv128 = __riscv_vmul_vv_u8m1(a_.sv128, b_.sv128, 16);
+ return simde_uint8x16_from_private(r_);
#else
return
simde_vreinterpretq_u8_s8(
@@ -526,6 +636,13 @@ simde_uint16x8_t
simde_vmulq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vmulq_u16(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint16x8_private
+ r_,
+ a_ = simde_uint16x8_to_private(a),
+ b_ = simde_uint16x8_to_private(b);
+ r_.sv128 = __riscv_vmul_vv_u16m1(a_.sv128, b_.sv128, 8);
+ return simde_uint16x8_from_private(r_);
#else
return
simde_vreinterpretq_u16_s16(
@@ -546,6 +663,13 @@ simde_uint32x4_t
simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vmulq_u32(a, b);
+ #elif defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint32x4_private
+ r_,
+ a_ = simde_uint32x4_to_private(a),
+ b_ = simde_uint32x4_to_private(b);
+ r_.sv128 = __riscv_vmul_vv_u32m1(a_.sv128, b_.sv128, 4);
+ return simde_uint32x4_from_private(r_);
#else
return
simde_vreinterpretq_u32_s32(
@@ -564,14 +688,85 @@ simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
SIMDE_FUNCTION_ATTRIBUTES
simde_uint64x2_t
simde_x_vmulq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
- return
- simde_vreinterpretq_u64_s64(
- simde_x_vmulq_s64(
- simde_vreinterpretq_s64_u64(a),
- simde_vreinterpretq_s64_u64(b)
- )
- );
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ simde_uint64x2_private
+ r_,
+ a_ = simde_uint64x2_to_private(a),
+ b_ = simde_uint64x2_to_private(b);
+ r_.sv128 = __riscv_vmul_vv_u64m1(a_.sv128, b_.sv128, 2);
+ return simde_uint64x2_from_private(r_);
+ #else
+ return
+ simde_vreinterpretq_u64_s64(
+ simde_x_vmulq_s64(
+ simde_vreinterpretq_s64_u64(a),
+ simde_vreinterpretq_s64_u64(b)
+ )
+ );
+ #endif
+}
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vmul_p8(simde_poly8x8_t a, simde_poly8x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vmul_p8(a, b);
+ #else
+ simde_uint8x8_private
+ r_,
+ a_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(a)),
+ b_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(b));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]);
+ uint16_t result = 0;
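+      /* GF(2) multiply: XOR in b shifted by j for every set bit j of a. */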
+ for(uint16_t j = 0; j < 8; ++j) {
+ if (a_.values[i] & (1 << j)) {
+ result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j));
+ }
+ }
+ r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (result & (0xFF)));
+ }
+
+ return simde_vreinterpret_p8_u8(simde_uint8x8_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmul_p8
+ #define vmul_p8(a, b) simde_vmul_p8((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vmulq_p8(simde_poly8x16_t a, simde_poly8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vmulq_p8(a, b);
+ #else
+ simde_uint8x16_private
+ r_,
+ a_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(a)),
+ b_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(b));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]);
+ uint16_t result = 0;
+ for(uint16_t j = 0; j < 8; ++j) {
+ if (a_.values[i] & (1 << j)) {
+ result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j));
+ }
+ }
+ r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (result & (0xFF)));
+ }
+
+ return simde_vreinterpretq_p8_u8(simde_uint8x16_from_private(r_));
+ #endif
}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmulq_p8
+ #define vmulq_p8(a, b) simde_vmulq_p8((a), (b))
+#endif
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
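
vmul_p8/vmulq_p8 multiply polynomials over GF(2), so the fallback combines shifted copies of b with XOR rather than addition and keeps only the low 8 bits. A standalone scalar sketch of one lane:

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vmul_p8 fallback: carry-less (GF(2)) multiply of
 * two bytes, truncating the 15-bit product to its low 8 bits. */
static uint8_t pmul8(uint8_t a, uint8_t b) {
  uint16_t acc = 0;
  for (int j = 0; j < 8; j++)
    if (a & (1u << j))
      acc ^= (uint16_t) (b << j);   /* shift-and-XOR instead of add */
  return (uint8_t) acc;
}

int main(void) {
  /* 0x03 * 0x03 = (x+1)(x+1) = x^2 + 1 = 0x05 in GF(2)[x] */
  printf("0x%02x\n", pmul8(0x03, 0x03)); /* 0x05 */
  return 0;
}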
diff --git a/lib/simd_wrapper/simde/arm/neon/mul_lane.h b/lib/simd_wrapper/simde/arm/neon/mul_lane.h
index f7b1f2e5141..72c032eea62 100644
--- a/lib/simd_wrapper/simde/arm/neon/mul_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/mul_lane.h
@@ -22,17 +22,39 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ * 2023 Yung-Cheng Su
*/
#if !defined(SIMDE_ARM_NEON_MUL_LANE_H)
#define SIMDE_ARM_NEON_MUL_LANE_H
#include "types.h"
+#include "mul.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmulh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vmulh_f16(a, simde_float16x4_to_private(b).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vmulh_lane_f16(a, b, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vmulh_lane_f16(a, b, lane))
+ #else
+ #define simde_vmulh_lane_f16(a, b, lane) vmulh_lane_f16((a), (b), (lane))
+ #endif
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulh_lane_f16
+ #define vmulh_lane_f16(a, b, lane) simde_vmulh_lane_f16(a, b, lane)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float64_t
simde_vmuld_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane)
@@ -90,6 +112,25 @@ simde_vmuls_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane)
#define vmuls_lane_f32(a, b, lane) simde_vmuls_lane_f32(a, b, lane)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmulh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vmulh_f16(a, simde_float16x8_to_private(b).values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ #define simde_vmulh_laneq_f16(a, b, lane) \
+ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vmulh_laneq_f16(a, b, lane))
+ #else
+ #define simde_vmulh_laneq_f16(a, b, lane) vmulh_laneq_f16((a), (b), (lane))
+ #endif
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulh_laneq_f16
+ #define vmulh_laneq_f16(a, b, lane) simde_vmulh_laneq_f16(a, b, lane)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32_t
simde_vmuls_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane)
@@ -109,6 +150,30 @@ simde_vmuls_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane)
#define vmuls_laneq_f32(a, b, lane) simde_vmuls_laneq_f32(a, b, lane)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmul_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_float16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmul_lane_f16(a, b, lane) vmul_lane_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmul_lane_f16
+ #define vmul_lane_f16(a, b, lane) simde_vmul_lane_f16((a), (b), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane)
@@ -118,10 +183,14 @@ simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane)
a_ = simde_float32x2_to_private(a),
b_ = simde_float32x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float32x2_from_private(r_);
}
@@ -142,10 +211,14 @@ simde_vmul_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane)
a_ = simde_float64x1_to_private(a),
b_ = simde_float64x1_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float64x1_from_private(r_);
}
@@ -166,10 +239,14 @@ simde_vmul_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane)
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int16x4_from_private(r_);
}
@@ -190,10 +267,14 @@ simde_vmul_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane)
a_ = simde_int32x2_to_private(a),
b_ = simde_int32x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int32x2_from_private(r_);
}
@@ -214,10 +295,14 @@ simde_vmul_lane_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int lane)
a_ = simde_uint16x4_to_private(a),
b_ = simde_uint16x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint16x4_from_private(r_);
}
@@ -238,10 +323,14 @@ simde_vmul_lane_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int lane)
a_ = simde_uint32x2_to_private(a),
b_ = simde_uint32x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint32x2_from_private(r_);
}
@@ -263,10 +352,14 @@ simde_vmul_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane)
simde_int16x8_private
b_ = simde_int16x8_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int16x4_from_private(r_);
}
@@ -288,10 +381,14 @@ simde_vmul_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane)
simde_int32x4_private
b_ = simde_int32x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int32x2_from_private(r_);
}
@@ -313,10 +410,14 @@ simde_vmul_laneq_u16(simde_uint16x4_t a, simde_uint16x8_t b, const int lane)
simde_uint16x8_private
b_ = simde_uint16x8_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint16x4_from_private(r_);
}
@@ -338,10 +439,14 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane)
simde_uint32x4_private
b_ = simde_uint32x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint32x2_from_private(r_);
}
@@ -353,6 +458,30 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane)
#define vmul_laneq_u32(a, b, lane) simde_vmul_laneq_u32((a), (b), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a);
+ simde_float16x4_private b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_float16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulq_lane_f16(a, b, lane) vmulq_lane_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulq_lane_f16
+ #define vmulq_lane_f16(a, b, lane) simde_vmulq_lane_f16((a), (b), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane)
@@ -362,10 +491,14 @@ simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane)
a_ = simde_float32x4_to_private(a);
simde_float32x2_private b_ = simde_float32x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float32x4_from_private(r_);
}
@@ -386,10 +519,14 @@ simde_vmulq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane)
a_ = simde_float64x2_to_private(a);
simde_float64x1_private b_ = simde_float64x1_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float64x2_from_private(r_);
}
@@ -410,10 +547,14 @@ simde_vmulq_lane_s16(simde_int16x8_t a, simde_int16x4_t b, const int lane)
a_ = simde_int16x8_to_private(a);
simde_int16x4_private b_ = simde_int16x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int16x8_from_private(r_);
}
@@ -434,10 +575,14 @@ simde_vmulq_lane_s32(simde_int32x4_t a, simde_int32x2_t b, const int lane)
a_ = simde_int32x4_to_private(a);
simde_int32x2_private b_ = simde_int32x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int32x4_from_private(r_);
}
@@ -458,10 +603,14 @@ simde_vmulq_lane_u16(simde_uint16x8_t a, simde_uint16x4_t b, const int lane)
a_ = simde_uint16x8_to_private(a);
simde_uint16x4_private b_ = simde_uint16x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint16x8_from_private(r_);
}
@@ -482,10 +631,14 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, simde_uint32x2_t b, const int lane)
a_ = simde_uint32x4_to_private(a);
simde_uint32x2_private b_ = simde_uint32x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint32x4_from_private(r_);
}
@@ -497,6 +650,30 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, simde_uint32x2_t b, const int lane)
#define vmulq_lane_u32(a, b, lane) simde_vmulq_lane_u32((a), (b), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_float16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulq_laneq_f16(a, b, lane) vmulq_laneq_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulq_laneq_f16
+ #define vmulq_laneq_f16(a, b, lane) simde_vmulq_laneq_f16((a), (b), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane)
@@ -506,10 +683,14 @@ simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane)
a_ = simde_float32x4_to_private(a),
b_ = simde_float32x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float32x4_from_private(r_);
}
@@ -530,10 +711,14 @@ simde_vmulq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane)
a_ = simde_float64x2_to_private(a),
b_ = simde_float64x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float64x2_from_private(r_);
}
@@ -554,10 +739,14 @@ simde_vmulq_laneq_s16(simde_int16x8_t a, simde_int16x8_t b, const int lane)
a_ = simde_int16x8_to_private(a),
b_ = simde_int16x8_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int16x8_from_private(r_);
}
@@ -578,10 +767,14 @@ simde_vmulq_laneq_s32(simde_int32x4_t a, simde_int32x4_t b, const int lane)
a_ = simde_int32x4_to_private(a),
b_ = simde_int32x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_int32x4_from_private(r_);
}
@@ -602,10 +795,14 @@ simde_vmulq_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int lane)
a_ = simde_uint16x8_to_private(a),
b_ = simde_uint16x8_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint16x8_from_private(r_);
}
@@ -626,10 +823,14 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane)
a_ = simde_uint32x4_to_private(a),
b_ = simde_uint32x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_uint32x4_from_private(r_);
}
@@ -641,6 +842,30 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane)
#define vmulq_laneq_u32(a, b, lane) simde_vmulq_laneq_u32((a), (b), (lane))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmul_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a);
+ simde_float16x8_private b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_float16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmul_laneq_f16(a, b, lane) vmul_laneq_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmul_laneq_f16
+ #define vmul_laneq_f16(a, b, lane) simde_vmul_laneq_f16((a), (b), (lane))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane)
@@ -650,10 +875,14 @@ simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane)
a_ = simde_float32x2_to_private(a);
simde_float32x4_private b_ = simde_float32x4_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float32x2_from_private(r_);
}
@@ -674,10 +903,14 @@ simde_vmul_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane)
a_ = simde_float64x1_to_private(a);
simde_float64x2_private b_ = simde_float64x2_to_private(b);
- SIMDE_VECTORIZE
- for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
- r_.values[i] = a_.values[i] * b_.values[lane];
- }
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
return simde_float64x1_from_private(r_);
}
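
The RISC-V paths added throughout this file replace the per-element loop with a single vector-scalar multiply (__riscv_vmul_vx_* / __riscv_vfmul_vf_*) on the broadcast lane value. A small usage sketch of the lane API itself — the include path is an assumption and depends on where the vendored headers live:

#include <stdio.h>
#include "simde/arm/neon.h"   /* path assumed; adjust to the vendored location */

int main(void) {
  float av[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float bv[2] = {10.0f, 100.0f};
  simde_float32x4_t a = simde_vld1q_f32(av);
  simde_float32x2_t b = simde_vld1_f32(bv);
  simde_float32x4_t r = simde_vmulq_lane_f32(a, b, 1); /* every lane * b[1] */
  float out[4];
  simde_vst1q_f32(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 100 200 300 400 */
  return 0;
}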
diff --git a/lib/simd_wrapper/simde/arm/neon/mul_n.h b/lib/simd_wrapper/simde/arm/neon/mul_n.h
index 5c73ad2e7f8..53375427933 100644
--- a/lib/simd_wrapper/simde/arm/neon/mul_n.h
+++ b/lib/simd_wrapper/simde/arm/neon/mul_n.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MUL_N_H)
@@ -36,6 +37,20 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmul_n_f16(simde_float16x4_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmul_n_f16(a, b);
+ #else
+ return simde_vmul_f16(a, simde_vdup_n_f16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmul_n_f16
+ #define vmul_n_f16(a, b) simde_vmul_n_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vmul_n_f32(simde_float32x2_t a, simde_float32 b) {
@@ -120,6 +135,20 @@ simde_vmul_n_u32(simde_uint32x2_t a, uint32_t b) {
#define vmul_n_u32(a, b) simde_vmul_n_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulq_n_f16(simde_float16x8_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulq_n_f16(a, b);
+ #else
+ return simde_vmulq_f16(a, simde_vdupq_n_f16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulq_n_f16
+ #define vmulq_n_f16(a, b) simde_vmulq_n_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vmulq_n_f32(simde_float32x4_t a, simde_float32 b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/mull.h b/lib/simd_wrapper/simde/arm/neon/mull.h
index bfad62a2f3c..cd5c9112f49 100644
--- a/lib/simd_wrapper/simde/arm/neon/mull.h
+++ b/lib/simd_wrapper/simde/arm/neon/mull.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MULL_H)
@@ -230,6 +231,62 @@ simde_vmull_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#define vmull_u32(a, b) simde_vmull_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vmull_p8(simde_poly8x8_t a, simde_poly8x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vmull_p8(a, b);
+ #else
+ simde_uint8x8_private
+ a_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(a)),
+ b_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(b));
+ simde_uint16x8_private r_;
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]);
+ uint16_t result = 0;
+ for(size_t j = 0; j < 8; ++j) {
+ if (a_.values[i] & (1 << j)) {
+ result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j));
+ }
+ }
+ r_.values[i] = result;
+ }
+
+ return simde_vreinterpretq_p16_u16(simde_uint16x8_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmull_p8
+ #define vmull_p8(a, b) simde_vmull_p8((a), (b))
+#endif
+
+#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly128_t
+simde_vmull_p64(simde_poly64_t a, simde_poly64_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)
+ return vmull_p64(a, b);
+ #else
+ simde_poly128_t extend_op2 = HEDLEY_STATIC_CAST(simde_poly128_t, b);
+ simde_poly128_t result = 0;
+ SIMDE_VECTORIZE
+ for(size_t j = 0; j < 64; ++j) {
+ if (a & (1ull << j)) {
+ result = result ^ (extend_op2 << j);
+ }
+ }
+ return result;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_p64
+ #define vmull_p64(a, b) simde_vmull_p64((a), (b))
+#endif
+
+#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
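
vmull_p64 is the 64x64 -> 128-bit carry-less multiply (PMULL) used by GHASH and CRC kernels; the fallback above is only compiled when the toolchain has a 128-bit integer type. A standalone scalar sketch under that same assumption (unsigned __int128):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of the vmull_p64 fallback: 64x64 -> 128-bit carry-less
 * multiply, guarded like the header via the availability of __int128. */
static unsigned __int128 pmul64(uint64_t a, uint64_t b) {
  unsigned __int128 acc = 0, wide_b = b;
  for (int j = 0; j < 64; j++)
    if (a & (1ull << j))
      acc ^= wide_b << j;
  return acc;
}

int main(void) {
  unsigned __int128 r = pmul64(0x3, 0x3);    /* (x+1)^2 = x^2 + 1 */
  printf("0x%llx\n", (unsigned long long) r); /* 0x5 */
  return 0;
}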
diff --git a/lib/simd_wrapper/simde/arm/neon/mull_high.h b/lib/simd_wrapper/simde/arm/neon/mull_high.h
index 658d151f709..87e83369a96 100644
--- a/lib/simd_wrapper/simde/arm/neon/mull_high.h
+++ b/lib/simd_wrapper/simde/arm/neon/mull_high.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MULL_HIGH_H)
@@ -30,6 +31,7 @@
#include "types.h"
#include "mul.h"
#include "movl_high.h"
+#include "mull.h"
HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
@@ -119,6 +121,57 @@ simde_vmull_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
#define vmull_high_u32(a, b) simde_vmull_high_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly16x8_t
+simde_vmull_high_p8(simde_poly8x16_t a, simde_poly8x16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmull_high_p8(a, b);
+ #else
+ simde_uint8x16_private
+ a_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(a)),
+ b_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(b));
+ simde_uint16x8_private r_;
+
+ size_t high_offset = (sizeof(r_.values) / sizeof(r_.values[0]));
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i+high_offset]);
+ uint16_t result = 0;
+ for(size_t j = 0; j < 8; ++j) {
+ if (a_.values[i+high_offset] & (1 << j)) {
+ result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j));
+ }
+ }
+ r_.values[i] = result;
+ }
+
+ return simde_vreinterpretq_p16_u16(simde_uint16x8_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_p8
+ #define vmull_high_p8(a, b) simde_vmull_high_p8((a), (b))
+#endif
+
+#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE)
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly128_t
+simde_vmull_high_p64(simde_poly64x2_t a, simde_poly64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)
+ return vmull_high_p64(a, b);
+ #else
+ simde_poly64x2_private
+ a_ = simde_poly64x2_to_private(a),
+ b_ = simde_poly64x2_to_private(b);
+ return simde_vmull_p64(a_.values[1], b_.values[1]);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_p64
+ #define vmull_high_p64(a, b) simde_vmull_high_p64((a), (b))
+#endif
+#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/mull_high_lane.h b/lib/simd_wrapper/simde/arm/neon/mull_high_lane.h
new file mode 100644
index 00000000000..226dbf862dc
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mull_high_lane.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MULL_HIGH_LANE_H)
+#define SIMDE_ARM_NEON_MULL_HIGH_LANE_H
+
+#include "combine.h"
+#include "mull.h"
+#include "dup_n.h"
+#include "get_high.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
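+/* Each fallback below uses the same decomposition: widening-multiply the high
+ * half of a by lane `lane` of v broadcast to a vector, i.e.
+ * vmull(vget_high(a), vdup_n(v[lane])). */
+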
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmull_high_lane_s16(simde_int16x8_t a, simde_int16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int16x4_private
+ v_ = simde_int16x4_to_private(v);
+ return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_lane_s16(a, v, lane) vmull_high_lane_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_lane_s16
+ #define vmull_high_lane_s16(a, v, lane) simde_vmull_high_lane_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmull_high_laneq_s16(simde_int16x8_t a, simde_int16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int16x8_private
+ v_ = simde_int16x8_to_private(v);
+ return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_laneq_s16(a, v, lane) vmull_high_laneq_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_laneq_s16
+ #define vmull_high_laneq_s16(a, v, lane) simde_vmull_high_laneq_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmull_high_lane_s32(simde_int32x4_t a, simde_int32x2_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int32x2_private
+ v_ = simde_int32x2_to_private(v);
+ return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_lane_s32(a, v, lane) vmull_high_lane_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_lane_s32
+ #define vmull_high_lane_s32(a, v, lane) simde_vmull_high_lane_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmull_high_laneq_s32(simde_int32x4_t a, simde_int32x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int32x4_private
+ v_ = simde_int32x4_to_private(v);
+ return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_laneq_s32(a, v, lane) vmull_high_laneq_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_laneq_s32
+ #define vmull_high_laneq_s32(a, v, lane) simde_vmull_high_laneq_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmull_high_lane_u16(simde_uint16x8_t a, simde_uint16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_uint16x4_private
+ v_ = simde_uint16x4_to_private(v);
+ return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_lane_u16(a, v, lane) vmull_high_lane_u16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_lane_u16
+ #define vmull_high_lane_u16(a, v, lane) simde_vmull_high_lane_u16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmull_high_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_uint16x8_private
+ v_ = simde_uint16x8_to_private(v);
+ return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_laneq_u16(a, v, lane) vmull_high_laneq_u16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_laneq_u16
+ #define vmull_high_laneq_u16(a, v, lane) simde_vmull_high_laneq_u16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmull_high_lane_u32(simde_uint32x4_t a, simde_uint32x2_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_uint32x2_private
+ v_ = simde_uint32x2_to_private(v);
+ return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_lane_u32(a, v, lane) vmull_high_lane_u32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_lane_u32
+ #define vmull_high_lane_u32(a, v, lane) simde_vmull_high_lane_u32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmull_high_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_uint32x4_private
+ v_ = simde_uint32x4_to_private(v);
+ return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmull_high_laneq_u32(a, v, lane) vmull_high_laneq_u32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_laneq_u32
+ #define vmull_high_laneq_u32(a, v, lane) simde_vmull_high_laneq_u32((a), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MULL_HIGH_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mull_high_n.h b/lib/simd_wrapper/simde/arm/neon/mull_high_n.h
new file mode 100644
index 00000000000..d6a5b356f4a
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mull_high_n.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MULL_HIGH_N_H)
+#define SIMDE_ARM_NEON_MULL_HIGH_N_H
+
+#include "combine.h"
+#include "get_high.h"
+#include "dup_n.h"
+#include "mull.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vmull_high_n_s16(simde_int16x8_t a, int16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmull_high_n_s16(a, b);
+ #else
+ return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_n_s16
+ #define vmull_high_n_s16(a, b) simde_vmull_high_n_s16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vmull_high_n_s32(simde_int32x4_t a, int32_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmull_high_n_s32(a, b);
+ #else
+ return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_n_s32
+ #define vmull_high_n_s32(a, b) simde_vmull_high_n_s32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vmull_high_n_u16(simde_uint16x8_t a, uint16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmull_high_n_u16(a, b);
+ #else
+ return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_n_u16
+ #define vmull_high_n_u16(a, b) simde_vmull_high_n_u16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint64x2_t
+simde_vmull_high_n_u32(simde_uint32x4_t a, uint32_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmull_high_n_u32(a, b);
+ #else
+ return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmull_high_n_u32
+ #define vmull_high_n_u32(a, b) simde_vmull_high_n_u32((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MULL_HIGH_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mulx.h b/lib/simd_wrapper/simde/arm/neon/mulx.h
new file mode 100644
index 00000000000..a089125f64f
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mulx.h
@@ -0,0 +1,237 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MULX_H)
+#define SIMDE_ARM_NEON_MULX_H
+
+#include "types.h"
+
+#include "reinterpret.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
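+/* Note: Armv8 FMULX matches an ordinary multiply except for (+/-)0.0 * (+/-)Inf,
+ * which returns (+/-)2.0 instead of NaN. The portable fallbacks below use a
+ * plain multiply and do not reproduce that special case. */
+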
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmulxh_f16(simde_float16_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulxh_f16(a, b);
+ #else
+ return simde_float16_from_float32(
+ simde_float16_to_float32(a) *
+ simde_float16_to_float32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxh_f16
+ #define vmulxh_f16(a, b) simde_vmulxh_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vmulxs_f32(simde_float32_t a, simde_float32_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmulxs_f32(a, b);
+ #else
+ return a * b;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxs_f32
+ #define vmulxs_f32(a, b) simde_vmulxs_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vmulxd_f64(simde_float64_t a, simde_float64_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmulxd_f64(a, b);
+ #else
+ return a * b;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxd_f64
+ #define vmulxd_f64(a, b) simde_vmulxd_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmulx_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulx_f16(a, b);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmulxh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_f16
+ #define vmulx_f16(a, b) simde_vmulx_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vmulx_f32(simde_float32x2_t a, simde_float32x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmulx_f32(a, b);
+ #else
+ simde_float32x2_private
+ r_,
+ a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(b);
+
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ r_.values = a_.values * b_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[i];
+ }
+ #endif
+
+ return simde_float32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_f32
+ #define vmulx_f32(a, b) simde_vmulx_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vmulx_f64(simde_float64x1_t a, simde_float64x1_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmulx_f64(a, b);
+ #else
+ simde_float64x1_private
+ r_,
+ a_ = simde_float64x1_to_private(a),
+ b_ = simde_float64x1_to_private(b);
+
+ #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS)
+ r_.values = a_.values * b_.values;
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[i];
+ }
+ #endif
+
+ return simde_float64x1_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_f64
+ #define vmulx_f64(a, b) simde_vmulx_f64((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulxq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulxq_f16(a, b);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vmulxh_f16(a_.values[i], b_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_f16
+ #define vmulxq_f16(a, b) simde_vmulxq_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vmulxq_f32(simde_float32x4_t a, simde_float32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmulxq_f32(a, b);
+ #else
+ simde_float32x4_private
+ r_,
+ a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[i];
+ }
+
+ return simde_float32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_f32
+ #define vmulxq_f32(a, b) simde_vmulxq_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vmulxq_f64(simde_float64x2_t a, simde_float64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vmulxq_f64(a, b);
+ #else
+ simde_float64x2_private
+ r_,
+ a_ = simde_float64x2_to_private(a),
+ b_ = simde_float64x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[i];
+ }
+
+ return simde_float64x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_f64
+ #define vmulxq_f64(a, b) simde_vmulxq_f64((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MULX_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mulx_lane.h b/lib/simd_wrapper/simde/arm/neon/mulx_lane.h
new file mode 100644
index 00000000000..eed553651c1
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mulx_lane.h
@@ -0,0 +1,455 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MULX_LANE_H)
+#define SIMDE_ARM_NEON_MULX_LANE_H
+
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
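+/* As in mulx.h, the portable paths below use an ordinary multiply, which
+ * matches FMULX except for the (+/-)0.0 * (+/-)Inf special case. */
+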
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmulxh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_float16_from_float32(
+ simde_float16_to_float32(a) *
+ simde_float16_to_float32(simde_float16x4_to_private(b).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulxh_lane_f16(a, b, lane) vmulxh_lane_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxh_lane_f16
+ #define vmulxh_lane_f16(a, b, lane) simde_vmulxh_lane_f16((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vmulxs_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return a * simde_float32x2_to_private(b).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxs_lane_f32(a, b, lane) vmulxs_lane_f32((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxs_lane_f32
+ #define vmulxs_lane_f32(a, b, lane) simde_vmulxs_lane_f32((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vmulxd_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ return a * simde_float64x1_to_private(b).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxd_lane_f64(a, b, lane) vmulxd_lane_f64((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxd_lane_f64
+ #define vmulxd_lane_f64(a, b, lane) simde_vmulxd_lane_f64((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vmulxh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_float16_from_float32(
+ simde_float16_to_float32(a) *
+ simde_float16_to_float32(simde_float16x8_to_private(b).values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulxh_laneq_f16(a, b, lane) vmulxh_laneq_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxh_laneq_f16
+ #define vmulxh_laneq_f16(a, b, lane) simde_vmulxh_laneq_f16((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vmulxs_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return a * simde_float32x4_to_private(b).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxs_laneq_f32(a, b, lane) vmulxs_laneq_f32((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxs_laneq_f32
+ #define vmulxs_laneq_f32(a, b, lane) simde_vmulxs_laneq_f32((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vmulxd_laneq_f64(simde_float64_t a, simde_float64x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ return a * simde_float64x2_to_private(b).values[lane];
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxd_laneq_f64(a, b, lane) vmulxd_laneq_f64((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxd_laneq_f64
+ #define vmulxd_laneq_f64(a, b, lane) simde_vmulxd_laneq_f64((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmulx_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a),
+ b_ = simde_float16x4_to_private(b);
+ simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(
+ simde_float16_to_float32(a_.values[i]) * b_lane_);
+ }
+
+ return simde_float16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulx_lane_f16(a, b, lane) vmulx_lane_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_lane_f16
+ #define vmulx_lane_f16(a, b, lane) simde_vmulx_lane_f16((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vmulx_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float32x2_private
+ r_,
+ a_ = simde_float32x2_to_private(a),
+ b_ = simde_float32x2_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulx_lane_f32(a, b, lane) vmulx_lane_f32((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_lane_f32
+ #define vmulx_lane_f32(a, b, lane) simde_vmulx_lane_f32((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vmulx_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_float64x1_private
+ r_,
+ a_ = simde_float64x1_to_private(a),
+ b_ = simde_float64x1_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulx_lane_f64(a, b, lane) vmulx_lane_f64((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_lane_f64
+ #define vmulx_lane_f64(a, b, lane) simde_vmulx_lane_f64((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulxq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a);
+ simde_float16x4_private b_ = simde_float16x4_to_private(b);
+ simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(
+ simde_float16_to_float32(a_.values[i]) * b_lane_);
+ }
+
+ return simde_float16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulxq_lane_f16(a, b, lane) vmulxq_lane_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_lane_f16
+ #define vmulxq_lane_f16(a, b, lane) simde_vmulxq_lane_f16((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vmulxq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float32x4_private
+ r_,
+ a_ = simde_float32x4_to_private(a);
+ simde_float32x2_private b_ = simde_float32x2_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxq_lane_f32(a, b, lane) vmulxq_lane_f32((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_lane_f32
+ #define vmulxq_lane_f32(a, b, lane) simde_vmulxq_lane_f32((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vmulxq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) {
+ simde_float64x2_private
+ r_,
+ a_ = simde_float64x2_to_private(a);
+ simde_float64x1_private b_ = simde_float64x1_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxq_lane_f64(a, b, lane) vmulxq_lane_f64((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_lane_f64
+ #define vmulxq_lane_f64(a, b, lane) simde_vmulxq_lane_f64((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulxq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a),
+ b_ = simde_float16x8_to_private(b);
+ simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(
+ simde_float16_to_float32(a_.values[i]) * b_lane_);
+ }
+
+ return simde_float16x8_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulxq_laneq_f16(a, b, lane) vmulxq_laneq_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_laneq_f16
+ #define vmulxq_laneq_f16(a, b, lane) simde_vmulxq_laneq_f16((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vmulxq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x4_private
+ r_,
+ a_ = simde_float32x4_to_private(a),
+ b_ = simde_float32x4_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxq_laneq_f32(a, b, lane) vmulxq_laneq_f32((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_laneq_f32
+ #define vmulxq_laneq_f32(a, b, lane) simde_vmulxq_laneq_f32((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vmulxq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float64x2_private
+ r_,
+ a_ = simde_float64x2_to_private(a),
+ b_ = simde_float64x2_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulxq_laneq_f64(a, b, lane) vmulxq_laneq_f64((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_laneq_f64
+ #define vmulxq_laneq_f64(a, b, lane) simde_vmulxq_laneq_f64((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmulx_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a);
+ simde_float16x8_private b_ = simde_float16x8_to_private(b);
+ simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_float16_from_float32(
+ simde_float16_to_float32(a_.values[i]) * b_lane_);
+ }
+
+ return simde_float16x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ #define simde_vmulx_laneq_f16(a, b, lane) vmulx_laneq_f16((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_laneq_f16
+ #define vmulx_laneq_f16(a, b, lane) simde_vmulx_laneq_f16((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vmulx_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_float32x2_private
+ r_,
+ a_ = simde_float32x2_to_private(a);
+ simde_float32x4_private b_ = simde_float32x4_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float32x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulx_laneq_f32(a, b, lane) vmulx_laneq_f32((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_laneq_f32
+ #define vmulx_laneq_f32(a, b, lane) simde_vmulx_laneq_f32((a), (b), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x1_t
+simde_vmulx_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_float64x1_private
+ r_,
+ a_ = simde_float64x1_to_private(a);
+ simde_float64x2_private b_ = simde_float64x2_to_private(b);
+
+ #if defined(SIMDE_RISCV_V_NATIVE)
+ r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1);
+ #else
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = a_.values[i] * b_.values[lane];
+ }
+ #endif
+
+ return simde_float64x1_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vmulx_laneq_f64(a, b, lane) vmulx_laneq_f64((a), (b), (lane))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_laneq_f64
+ #define vmulx_laneq_f64(a, b, lane) simde_vmulx_laneq_f64((a), (b), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MULX_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mulx_n.h b/lib/simd_wrapper/simde/arm/neon/mulx_n.h
new file mode 100644
index 00000000000..be78a834d6a
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/mulx_n.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_MULX_N_H)
+#define SIMDE_ARM_NEON_MULX_N_H
+
+#include "types.h"
+#include "mul.h"
+#include "dup_n.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
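+/* vmulx_n_*: multiply every lane of a by the scalar b, implemented portably as
+ * vmul(a, vdup_n(b)). */
+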
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vmulx_n_f16(simde_float16x4_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulx_n_f16(a, b);
+ #else
+ return simde_vmul_f16(a, simde_vdup_n_f16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulx_n_f16
+ #define vmulx_n_f16(a, b) simde_vmulx_n_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vmulxq_n_f16(simde_float16x8_t a, simde_float16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vmulxq_n_f16(a, b);
+ #else
+ return simde_vmulq_f16(a, simde_vdupq_n_f16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vmulxq_n_f16
+ #define vmulxq_n_f16(a, b) simde_vmulxq_n_f16((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_MULX_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/mvn.h b/lib/simd_wrapper/simde/arm/neon/mvn.h
index 654455ec2bc..1cd35591d69 100644
--- a/lib/simd_wrapper/simde/arm/neon/mvn.h
+++ b/lib/simd_wrapper/simde/arm/neon/mvn.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Christopher Moore
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_MVN_H)
@@ -420,6 +421,52 @@ simde_vmvn_u32(simde_uint32x2_t a) {
#define vmvn_u32(a) simde_vmvn_u32(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x8_t
+simde_vmvn_p8(simde_poly8x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vmvn_p8(a);
+ #else
+ simde_poly8x8_private
+ r_,
+ a_ = simde_poly8x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = ~(a_.values[i]);
+ }
+
+ return simde_poly8x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmvn_p8
+ #define vmvn_p8(a) simde_vmvn_p8(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_poly8x16_t
+simde_vmvnq_p8(simde_poly8x16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vmvnq_p8(a);
+ #else
+ simde_poly8x16_private
+ r_,
+ a_ = simde_poly8x16_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = ~(a_.values[i]);
+ }
+
+ return simde_poly8x16_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vmvnq_p8
+ #define vmvnq_p8(a) simde_vmvnq_p8(a)
+#endif
+
SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP
diff --git a/lib/simd_wrapper/simde/arm/neon/neg.h b/lib/simd_wrapper/simde/arm/neon/neg.h
index 779238950a3..e6b2a8e480d 100644
--- a/lib/simd_wrapper/simde/arm/neon/neg.h
+++ b/lib/simd_wrapper/simde/arm/neon/neg.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_NEG_H)
@@ -47,6 +48,43 @@ simde_vnegd_s64(int64_t a) {
#define vnegd_s64(a) simde_vnegd_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16_t
+simde_vnegh_f16(simde_float16_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vnegh_f16(a);
+ #else
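+ /* Portable path: negate in float32 and convert back to float16. */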
+ return simde_float16_from_float32(-simde_float16_to_float32(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vnegh_f16
+ #define vnegh_f16(a) simde_vnegh_f16(a)
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vneg_f16(simde_float16x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vneg_f16(a);
+ #else
+ simde_float16x4_private
+ r_,
+ a_ = simde_float16x4_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vnegh_f16(a_.values[i]);
+ }
+
+ return simde_float16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vneg_f16
+ #define vneg_f16(a) simde_vneg_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vneg_f32(simde_float32x2_t a) {
@@ -209,6 +247,29 @@ simde_vneg_s64(simde_int64x1_t a) {
#define vneg_s64(a) simde_vneg_s64(a)
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vnegq_f16(simde_float16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vnegq_f16(a);
+ #else
+ simde_float16x8_private
+ r_,
+ a_ = simde_float16x8_to_private(a);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vnegh_f16(a_.values[i]);
+ }
+
+ return simde_float16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vnegq_f16
+ #define vnegq_f16(a) simde_vnegq_f16(a)
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vnegq_f32(simde_float32x4_t a) {
diff --git a/lib/simd_wrapper/simde/arm/neon/padd.h b/lib/simd_wrapper/simde/arm/neon/padd.h
index 6cfd99a2d71..5c34cbe8960 100644
--- a/lib/simd_wrapper/simde/arm/neon/padd.h
+++ b/lib/simd_wrapper/simde/arm/neon/padd.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020-2021 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_PADD_H)
@@ -96,6 +97,20 @@ simde_vpadds_f32(simde_float32x2_t a) {
#define vpadds_f32(a) simde_vpadds_f32((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vpadd_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) && defined(SIMDE_ARM_NEON_FP16)
+ return vpadd_f16(a, b);
+ #else
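+ /* Pairwise add decomposes into an elementwise add of the even-indexed
+ * (vuzp1) and odd-indexed (vuzp2) lanes of the concatenation of a and b. */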
+ return simde_vadd_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vpadd_f16
+ #define vpadd_f16(a, b) simde_vpadd_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vpadd_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -198,6 +213,20 @@ simde_vpadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#define vpadd_u32(a, b) simde_vpadd_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vpaddq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpaddq_f16(a, b);
+ #else
+ return simde_vaddq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpaddq_f16
+ #define vpaddq_f16(a, b) simde_vpaddq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vpaddq_f32(simde_float32x4_t a, simde_float32x4_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/paddl.h b/lib/simd_wrapper/simde/arm/neon/paddl.h
index 203fbad9fcd..3b36e0dcdef 100644
--- a/lib/simd_wrapper/simde/arm/neon/paddl.h
+++ b/lib/simd_wrapper/simde/arm/neon/paddl.h
@@ -286,7 +286,7 @@ simde_vpaddlq_u16(simde_uint16x8_t a) {
simde_uint32x4_private r_;
#if defined(SIMDE_X86_XOP_NATIVE)
- r_.sse_m128i = _mm_haddd_epu16(a_.sse_m128i);
+ r_.m128i = _mm_haddd_epu16(a_.m128i);
#elif defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i =
_mm_add_epi32(
diff --git a/lib/simd_wrapper/simde/arm/neon/pmax.h b/lib/simd_wrapper/simde/arm/neon/pmax.h
index ecf31a1a93a..d8de39d763e 100644
--- a/lib/simd_wrapper/simde/arm/neon/pmax.h
+++ b/lib/simd_wrapper/simde/arm/neon/pmax.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_PMAX_H)
@@ -67,6 +68,20 @@ simde_vpmaxqd_f64(simde_float64x2_t a) {
#define vpmaxqd_f64(a) simde_vpmaxqd_f64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vpmax_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpmax_f16(a, b);
+ #else
+ return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmax_f16
+ #define vpmax_f16(a, b) simde_vpmax_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vpmax_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -165,6 +180,20 @@ simde_vpmax_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#define vpmax_u32(a, b) simde_vpmax_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vpmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpmaxq_f16(a, b);
+ #else
+ return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxq_f16
+ #define vpmaxq_f16(a, b) simde_vpmaxq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vpmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/pmaxnm.h b/lib/simd_wrapper/simde/arm/neon/pmaxnm.h
new file mode 100644
index 00000000000..5fa519d5e78
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/pmaxnm.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_PMAXNM_H)
+#define SIMDE_ARM_NEON_PMAXNM_H
+
+#include "types.h"
+#include "max.h"
+#include "uzp1.h"
+#include "uzp2.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
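+/* On AArch64 the vpmaxnm* ops use IEEE maxNum semantics (a single NaN input
+ * yields the other, numeric, operand). The portable fallbacks below reuse
+ * vmax or a plain compare and do not add that extra NaN handling. */
+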
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vpmaxnms_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpmaxnms_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ return (a_.values[0] > a_.values[1]) ? a_.values[0] : a_.values[1];
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnms_f32
+ #define vpmaxnms_f32(a) simde_vpmaxnms_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vpmaxnmqd_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpmaxnmqd_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ return (a_.values[0] > a_.values[1]) ? a_.values[0] : a_.values[1];
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnmqd_f64
+ #define vpmaxnmqd_f64(a) simde_vpmaxnmqd_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vpmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpmaxnm_f16(a, b);
+ #else
+ return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnm_f16
+ #define vpmaxnm_f16(a, b) simde_vpmaxnm_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vpmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpmaxnm_f32(a, b);
+ #else
+ return simde_vmax_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnm_f32
+ #define vpmaxnm_f32(a, b) simde_vpmaxnm_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vpmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpmaxnmq_f16(a, b);
+ #else
+ return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnmq_f16
+ #define vpmaxnmq_f16(a, b) simde_vpmaxnmq_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vpmaxnmq_f32(simde_float32x4_t a, simde_float32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpmaxnmq_f32(a, b);
+ #else
+ return simde_vmaxq_f32(simde_vuzp1q_f32(a, b), simde_vuzp2q_f32(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnmq_f32
+ #define vpmaxnmq_f32(a, b) simde_vpmaxnmq_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vpmaxnmq_f64(simde_float64x2_t a, simde_float64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpmaxnmq_f64(a, b);
+ #else
+ return simde_vmaxq_f64(simde_vuzp1q_f64(a, b), simde_vuzp2q_f64(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmaxnmq_f64
+ #define vpmaxnmq_f64(a, b) simde_vpmaxnmq_f64((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_PMAXNM_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/pmin.h b/lib/simd_wrapper/simde/arm/neon/pmin.h
index eaf58e45503..2f76c63801f 100644
--- a/lib/simd_wrapper/simde/arm/neon/pmin.h
+++ b/lib/simd_wrapper/simde/arm/neon/pmin.h
@@ -22,6 +22,7 @@
*
* Copyright:
* 2020 Evan Nemerson
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_PMIN_H)
@@ -66,6 +67,20 @@ simde_vpminqd_f64(simde_float64x2_t a) {
#define vpminqd_f64(a) simde_vpminqd_f64((a))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vpmin_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpmin_f16(a, b);
+ #else
+ return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES)
+ #undef vpmin_f16
+ #define vpmin_f16(a, b) simde_vpmin_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x2_t
simde_vpmin_f32(simde_float32x2_t a, simde_float32x2_t b) {
@@ -164,6 +179,20 @@ simde_vpmin_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
#define vpmin_u32(a, b) simde_vpmin_u32((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vpminq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpminq_f16(a, b);
+ #else
+ return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminq_f16
+ #define vpminq_f16(a, b) simde_vpminq_f16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_float32x4_t
simde_vpminq_f32(simde_float32x4_t a, simde_float32x4_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/pminnm.h b/lib/simd_wrapper/simde/arm/neon/pminnm.h
new file mode 100644
index 00000000000..99de03555b0
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/pminnm.h
@@ -0,0 +1,142 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_PMINNM_H)
+#define SIMDE_ARM_NEON_PMINNM_H
+
+#include "types.h"
+#include "min.h"
+#include "uzp1.h"
+#include "uzp2.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
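+/* vpminnm* mirror pmaxnm.h: IEEE minNum semantics on AArch64, approximated
+ * here with vmin or a plain compare and no extra NaN handling. */
+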
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32_t
+simde_vpminnms_f32(simde_float32x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpminnms_f32(a);
+ #else
+ simde_float32x2_private a_ = simde_float32x2_to_private(a);
+ return (a_.values[0] < a_.values[1]) ? a_.values[0] : a_.values[1];
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnms_f32
+ #define vpminnms_f32(a) simde_vpminnms_f32((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64_t
+simde_vpminnmqd_f64(simde_float64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpminnmqd_f64(a);
+ #else
+ simde_float64x2_private a_ = simde_float64x2_to_private(a);
+ return (a_.values[0] < a_.values[1]) ? a_.values[0] : a_.values[1];
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnmqd_f64
+ #define vpminnmqd_f64(a) simde_vpminnmqd_f64((a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x4_t
+simde_vpminnm_f16(simde_float16x4_t a, simde_float16x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpminnm_f16(a, b);
+ #else
+ return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnm_f16
+ #define vpminnm_f16(a, b) simde_vpminnm_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x2_t
+simde_vpminnm_f32(simde_float32x2_t a, simde_float32x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpminnm_f32(a, b);
+ #else
+ return simde_vmin_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnm_f32
+ #define vpminnm_f32(a, b) simde_vpminnm_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float16x8_t
+simde_vpminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16)
+ return vpminnmq_f16(a, b);
+ #else
+ return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnmq_f16
+ #define vpminnmq_f16(a, b) simde_vpminnmq_f16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float32x4_t
+simde_vpminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpminnmq_f32(a, b);
+ #else
+ return simde_vminq_f32(simde_vuzp1q_f32(a, b), simde_vuzp2q_f32(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnmq_f32
+ #define vpminnmq_f32(a, b) simde_vpminnmq_f32((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_float64x2_t
+simde_vpminnmq_f64(simde_float64x2_t a, simde_float64x2_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vpminnmq_f64(a, b);
+ #else
+ return simde_vminq_f64(simde_vuzp1q_f64(a, b), simde_vuzp2q_f64(a, b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vpminnmq_f64
+ #define vpminnmq_f64(a, b) simde_vpminnmq_f64((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_PMINNM_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qabs.h b/lib/simd_wrapper/simde/arm/neon/qabs.h
index 6e956f1e144..9ad7d7c8300 100644
--- a/lib/simd_wrapper/simde/arm/neon/qabs.h
+++ b/lib/simd_wrapper/simde/arm/neon/qabs.h
@@ -162,7 +162,7 @@ simde_int8x16_t
simde_vqabsq_s8(simde_int8x16_t a) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vqabsq_s8(a);
- #elif defined(SIMDE_X86_SSE4_1_NATIVE)
+ #elif defined(SIMDE_X86_SSE2_NATIVE)
simde_int8x16_private
r_,
a_ = simde_int8x16_to_private(simde_vabsq_s8(a));
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal.h b/lib/simd_wrapper/simde/arm/neon/qdmlal.h
new file mode 100644
index 00000000000..fe96b0fc813
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlal.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLAL_H)
+#define SIMDE_ARM_NEON_QDMLAL_H
+
+#include "add.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "movl.h"
+#include "qadd.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
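+/* vqdmlal: widening saturating doubling multiply-accumulate,
+ * r = sat(a + sat(2 * widen(b) * widen(c))). */
+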
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlalh_s16(a, b, c);
+ #else
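+ /* Note: this portable path does not saturate; e.g. b == c == INT16_MIN
+ * overflows the doubled product, and the final + a can also wrap. */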
+ return HEDLEY_STATIC_CAST(int32_t, b) * HEDLEY_STATIC_CAST(int32_t, c) * 2 + a;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlalh_s16
+ #define vqdmlalh_s16(a, b, c) simde_vqdmlalh_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlals_s32(a, b, c);
+ #else
+ return HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c) * 2 + a;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlals_s32
+ #define vqdmlals_s32(a, b, c) simde_vqdmlals_s32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlal_s16(a, b, c);
+ #else
+ simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c));
+ return simde_vqaddq_s32(simde_vqaddq_s32(temp, temp), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_s16
+ #define vqdmlal_s16(a, b, c) simde_vqdmlal_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlal_s32(a, b, c);
+ #else
+ simde_int64x2_t r = simde_x_vmulq_s64(
+ simde_vmovl_s32(b),
+ simde_vmovl_s32(c));
+ return simde_vqaddq_s64(a, simde_vqaddq_s64(r, r));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_s32
+ #define vqdmlal_s32(a, b, c) simde_vqdmlal_s32((a), (b), (c))
+#endif
+
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_H) */
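
vqdmlal is Arm's saturating doubling multiply-accumulate long (SQDMLAL): each 16-bit pair is widened, multiplied, doubled with saturation, then accumulated with saturation. That is why the vector fallback doubles via simde_vqaddq_s32(temp, temp) rather than a shift: 2 * (-32768)^2 is the one product that overflows int32 and must clamp. A scalar sketch of one lane, covering that edge case:

    #include <stdint.h>
    #include <stdio.h>

    static int32_t sat_s32(int64_t x) {
        if (x > INT32_MAX) return INT32_MAX;
        if (x < INT32_MIN) return INT32_MIN;
        return (int32_t) x;
    }

    /* One lane of SQDMLAL: a + sat(2*b*c), both steps saturating. */
    static int32_t sqdmlal_lane(int32_t a, int16_t b, int16_t c) {
        int64_t prod2 = 2 * (int64_t) b * (int64_t) c;
        return sat_s32((int64_t) a + sat_s32(prod2));
    }

    int main(void) {
        printf("%d\n", sqdmlal_lane(100, 300, 400));           /* 240100 */
        printf("%d\n", sqdmlal_lane(0, INT16_MIN, INT16_MIN)); /* 2147483647 */
        return 0;
    }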
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_high.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_high.h
new file mode 100644
index 00000000000..016deb01191
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_high.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H)
+#define SIMDE_ARM_NEON_QDMLAL_HIGH_H
+
+#include "movl_high.h"
+#include "mla.h"
+#include "mul_n.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlal_high_s16(a, b, c);
+ #else
+ return simde_vaddq_s32(
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)), 2), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_s16
+ #define vqdmlal_high_s16(a, b, c) simde_vqdmlal_high_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlal_high_s32(a, b, c);
+ #else
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(c)));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vaddq_s64(a, simde_int64x2_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_s32
+ #define vqdmlal_high_s32(a, b, c) simde_vqdmlal_high_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H) */
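
The _high variants consume only the upper halves of their 128-bit narrow inputs: for simde_int16x8_t b = {b0..b7}, vqdmlal_high_s16 reads b4..b7, which is exactly what the simde_vmovl_high_s16 calls select. A short usage sketch, assuming SIMDe's umbrella header (lib/simd_wrapper/simde/arm/neon.h in this tree) is reachable as <simde/arm/neon.h>:

    #include <simde/arm/neon.h>
    #include <stdio.h>

    int main(void) {
        int16_t v[8] = {9, 9, 9, 9, 3, 3, 3, 3};   /* low half differs */
        simde_int16x8_t b = simde_vld1q_s16(v);
        simde_int32x4_t a = simde_vdupq_n_s32(1);
        simde_int32x4_t r = simde_vqdmlal_high_s16(a, b, b);
        /* Only lanes 4..7 (value 3) participate: 1 + 2*3*3 = 19. */
        printf("%d\n", simde_vgetq_lane_s32(r, 0));
        return 0;
    }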
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_high_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_lane.h
new file mode 100644
index 00000000000..b2d6a8b4283
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_lane.h
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H)
+#define SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H
+
+#include "movl_high.h"
+#include "add.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "dup_n.h"
+#include "mla.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlal_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vaddq_s32(
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b),
+ simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]))), 2), a);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlal_high_lane_s16(a, b, v, lane) vqdmlal_high_lane_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_lane_s16
+ #define vqdmlal_high_lane_s16(a, b, v, lane) simde_vqdmlal_high_lane_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlal_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vaddq_s32(
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b),
+ simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]))), 2), a);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlal_high_laneq_s16(a, b, v, lane) vqdmlal_high_laneq_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_laneq_s16
+ #define vqdmlal_high_laneq_s16(a, b, v, lane) simde_vqdmlal_high_laneq_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlal_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane]))));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vaddq_s64(a, simde_int64x2_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlal_high_lane_s32(a, b, v, lane) vqdmlal_high_lane_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_lane_s32
+ #define vqdmlal_high_lane_s32(a, b, v, lane) simde_vqdmlal_high_lane_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlal_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane]))));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vaddq_s64(a, simde_int64x2_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlal_high_laneq_s32(a, b, v, lane) vqdmlal_high_laneq_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_laneq_s32
+ #define vqdmlal_high_laneq_s32(a, b, v, lane) simde_vqdmlal_high_laneq_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H) */
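
This file also introduces the dispatch shape the remaining lane headers reuse: the portable function is defined first, and on an A64-native build a same-named function-like macro then shadows it, so the literal lane index is forwarded textually and reaches the intrinsic as the immediate it requires (SIMDE_REQUIRE_CONSTANT_RANGE states the same constraint on the portable path). Schematically, with vfoo as a placeholder name:

    /* portable definition first */
    simde_int32x4_t simde_vfoo_lane(simde_int32x4_t a, simde_int16x8_t b,
                                    simde_int16x4_t v, const int lane);
    /* then, only when the real intrinsic exists, shadow it so the
     * compile-time lane constant passes through unevaluated */
    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      #define simde_vfoo_lane(a, b, v, lane) vfoo_lane(a, b, v, lane)
    #endif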
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_high_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_n.h
new file mode 100644
index 00000000000..205cafbcc16
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_n.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H)
+#define SIMDE_ARM_NEON_QDMLAL_HIGH_N_H
+
+#include "movl_high.h"
+#include "dup_n.h"
+#include "add.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlal_high_n_s16(a, b, c);
+ #else
+ return simde_vaddq_s32(
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b),
+ simde_vmovl_high_s16(simde_vdupq_n_s16(c))), 2), a);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_n_s16
+ #define vqdmlal_high_n_s16(a, b, c) simde_vqdmlal_high_n_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlal_high_n_s32(a, b, c);
+ #else
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(simde_vdupq_n_s32(c))));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vaddq_s64(a, simde_int64x2_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_high_n_s32
+ #define vqdmlal_high_n_s32(a, b, c) simde_vqdmlal_high_n_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_lane.h
new file mode 100644
index 00000000000..14a663cd66d
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_lane.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLAL_LANE_H)
+#define SIMDE_ARM_NEON_QDMLAL_LANE_H
+
+#include "qdmlal.h"
+#include "dup_lane.h"
+#include "get_lane.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmlal_lane_s16(a, b, v, lane) vqdmlal_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlal_lane_s16(a, b, v, lane) simde_vqdmlal_s16((a), (b), simde_vdup_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_lane_s16
+ #define vqdmlal_lane_s16(a, b, c, lane) simde_vqdmlal_lane_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmlal_lane_s32(a, b, v, lane) vqdmlal_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlal_lane_s32(a, b, v, lane) simde_vqdmlal_s32((a), (b), simde_vdup_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_lane_s32
+ #define vqdmlal_lane_s32(a, b, c, lane) simde_vqdmlal_lane_s32((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlal_laneq_s16(a, b, v, lane) vqdmlal_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlal_laneq_s16(a, b, v, lane) simde_vqdmlal_s16((a), (b), simde_vdup_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_laneq_s16
+ #define vqdmlal_laneq_s16(a, b, c, lane) simde_vqdmlal_laneq_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlal_laneq_s32(a, b, v, lane) vqdmlal_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlal_laneq_s32(a, b, v, lane) simde_vqdmlal_s32((a), (b), simde_vdup_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_laneq_s32
+ #define vqdmlal_laneq_s32(a, b, c, lane) simde_vqdmlal_laneq_s32((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlalh_lane_s16(a, b, v, lane) vqdmlalh_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlalh_lane_s16(a, b, v, lane) simde_vqdmlalh_s16((a), (b), simde_vget_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlalh_lane_s16
+ #define vqdmlalh_lane_s16(a, b, c, lane) simde_vqdmlalh_lane_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlalh_laneq_s16(a, b, v, lane) vqdmlalh_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlalh_laneq_s16(a, b, v, lane) simde_vqdmlalh_s16((a), (b), simde_vgetq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlalh_laneq_s16
+ #define vqdmlalh_laneq_s16(a, b, c, lane) simde_vqdmlalh_laneq_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlals_lane_s32(a, b, v, lane) vqdmlals_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlals_lane_s32(a, b, v, lane) simde_vqdmlals_s32((a), (b), simde_vget_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlals_lane_s32
+ #define vqdmlals_lane_s32(a, b, c, lane) simde_vqdmlals_lane_s32((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlals_laneq_s32(a, b, v, lane) vqdmlals_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlals_laneq_s32(a, b, v, lane) simde_vqdmlals_s32((a), (b), simde_vgetq_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlals_laneq_s32
+ #define vqdmlals_laneq_s32(a, b, c, lane) simde_vqdmlals_laneq_s32((a), (b), (c), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_LANE_H) */
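
Everything in this header is a macro rather than a function for the same immediate-operand reason: only a macro can pass the literal lane straight through to the native intrinsic. On a portable build the macro instead broadcasts the chosen lane and falls back to the plain form, e.g. simde_vqdmlal_lane_s16(a, b, v, 1) becomes simde_vqdmlal_s16(a, b, simde_vdup_lane_s16(v, 1)). A runnable check of that behaviour, under the same include-path assumption as above:

    #include <simde/arm/neon.h>
    #include <stdio.h>

    int main(void) {
        int16_t vv[4] = {10, 20, 30, 40};
        simde_int16x4_t v = simde_vld1_s16(vv);
        simde_int16x4_t b = simde_vdup_n_s16(5);
        simde_int32x4_t a = simde_vdupq_n_s32(0);
        /* lane 1 (20) is splatted, so every lane is 0 + 2*5*20 = 200 */
        simde_int32x4_t r = simde_vqdmlal_lane_s16(a, b, v, 1);
        printf("%d %d\n", simde_vgetq_lane_s32(r, 0),
                          simde_vgetq_lane_s32(r, 3));
        return 0;
    }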
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_n.h
new file mode 100644
index 00000000000..0a5c69ea376
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_n.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLAL_N_H)
+#define SIMDE_ARM_NEON_QDMLAL_N_H
+
+#include "dup_n.h"
+#include "qdmlal.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlal_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlal_n_s16(a, b, c);
+ #else
+ return simde_vqdmlal_s16(a, b, simde_vdup_n_s16(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_n_s16
+ #define vqdmlal_n_s16(a, b, c) simde_vqdmlal_n_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlal_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlal_n_s32(a, b, c);
+ #else
+ return simde_vqdmlal_s32(a, b, simde_vdup_n_s32(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlal_n_s32
+ #define vqdmlal_n_s32(a, b, c) simde_vqdmlal_n_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl.h
new file mode 100644
index 00000000000..68e17ca05e0
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLSL_H)
+#define SIMDE_ARM_NEON_QDMLSL_H
+
+#include "sub.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "movl.h"
+#include "qadd.h"
+#include "qsub.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlslh_s16(a, b, c);
+ #else
+ return a - HEDLEY_STATIC_CAST(int32_t, b) * HEDLEY_STATIC_CAST(int32_t, c) * 2;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlslh_s16
+ #define vqdmlslh_s16(a, b, c) simde_vqdmlslh_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlsls_s32(a, b, c);
+ #else
+ return a - HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c) * 2;
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsls_s32
+ #define vqdmlsls_s32(a, b, c) simde_vqdmlsls_s32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlsl_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlsl_s16(a, b, c);
+ #else
+ simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c));
+ return simde_vqsubq_s32(a, simde_vqaddq_s32(temp, temp));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_s16
+ #define vqdmlsl_s16(a, b, c) simde_vqdmlsl_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlsl_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlsl_s32(a, b, c);
+ #else
+ simde_int64x2_t r = simde_x_vmulq_s64(
+ simde_vmovl_s32(b),
+ simde_vmovl_s32(c));
+ return simde_vqsubq_s64(a, simde_vqaddq_s64(r, r));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_s32
+ #define vqdmlsl_s32(a, b, c) simde_vqdmlsl_s32((a), (b), (c))
+#endif
+
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_H) */
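
vqdmlsl mirrors vqdmlal exactly, with the final saturating add replaced by a saturating subtract: the doubled product is still formed with simde_vqaddq_s32/s64 (so the 2*(-32768)^2 case clamps), then taken away from the accumulator via simde_vqsubq. In the scalar model from the vqdmlal sketch this is a one-operator change:

    /* One lane of SQDMLSL: r = sat_s32((int64_t) a - sat_s32(prod2));
     * e.g. a = 0, b = c = INT16_MIN gives 0 - sat(2^31) = -2147483647. */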
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_high.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high.h
new file mode 100644
index 00000000000..18a6f47fe5a
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_H)
+#define SIMDE_ARM_NEON_QDMLSL_HIGH_H
+
+#include "movl_high.h"
+#include "sub.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlsl_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlsl_high_s16(a, b, c);
+ #else
+ return simde_vsubq_s32(a, simde_vmulq_n_s32(simde_vmulq_s32(simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)), 2));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_s16
+ #define vqdmlsl_high_s16(a, b, c) simde_vqdmlsl_high_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlsl_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlsl_high_s32(a, b, c);
+ #else
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(c)));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vsubq_s64(a, simde_int64x2_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_s32
+ #define vqdmlsl_high_s32(a, b, c) simde_vqdmlsl_high_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_lane.h
new file mode 100644
index 00000000000..877c72a2aa7
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_lane.h
@@ -0,0 +1,124 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H)
+#define SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H
+
+#include "movl_high.h"
+#include "sub.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "dup_n.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlsl_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ return simde_vsubq_s32(a,
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b),
+ simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]))), 2));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsl_high_lane_s16(a, b, v, lane) vqdmlsl_high_lane_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_lane_s16
+ #define vqdmlsl_high_lane_s16(a, b, v, lane) simde_vqdmlsl_high_lane_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlsl_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ return simde_vsubq_s32(a,
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b),
+ simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]))), 2));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsl_high_laneq_s16(a, b, v, lane) vqdmlsl_high_laneq_s16(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_laneq_s16
+ #define vqdmlsl_high_laneq_s16(a, b, v, lane) simde_vqdmlsl_high_laneq_s16((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlsl_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane]))));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vsubq_s64(a, simde_int64x2_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsl_high_lane_s32(a, b, v, lane) vqdmlsl_high_lane_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_lane_s32
+ #define vqdmlsl_high_lane_s32(a, b, v, lane) simde_vqdmlsl_high_lane_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlsl_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane]))));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vsubq_s64(a, simde_int64x2_from_private(r_));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsl_high_laneq_s32(a, b, v, lane) vqdmlsl_high_laneq_s32(a, b, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_laneq_s32
+ #define vqdmlsl_high_laneq_s32(a, b, v, lane) simde_vqdmlsl_high_laneq_s32((a), (b), (v), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_n.h
new file mode 100644
index 00000000000..9db3d7e045f
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_n.h
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_N_H)
+#define SIMDE_ARM_NEON_QDMLSL_HIGH_N_H
+
+#include "movl_high.h"
+#include "dup_n.h"
+#include "sub.h"
+#include "mul.h"
+#include "mul_n.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlsl_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlsl_high_n_s16(a, b, c);
+ #else
+ return simde_vsubq_s32(a,
+ simde_vmulq_n_s32(
+ simde_vmulq_s32(
+ simde_vmovl_high_s16(b),
+ simde_vmovl_high_s16(simde_vdupq_n_s16(c))), 2));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_n_s16
+ #define vqdmlsl_high_n_s16(a, b, c) simde_vqdmlsl_high_n_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlsl_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmlsl_high_n_s32(a, b, c);
+ #else
+ simde_int64x2_private r_ = simde_int64x2_to_private(
+ simde_x_vmulq_s64(
+ simde_vmovl_high_s32(b),
+ simde_vmovl_high_s32(simde_vdupq_n_s32(c))));
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2);
+ }
+
+ return simde_vsubq_s64(a, simde_int64x2_from_private(r_));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_high_n_s32
+ #define vqdmlsl_high_n_s32(a, b, c) simde_vqdmlsl_high_n_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_lane.h
new file mode 100644
index 00000000000..d93677da0dc
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_lane.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLSL_LANE_H)
+#define SIMDE_ARM_NEON_QDMLSL_LANE_H
+
+#include "qdmlsl.h"
+#include "dup_lane.h"
+#include "get_lane.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmlsl_lane_s16(a, b, v, lane) vqdmlsl_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlsl_lane_s16(a, b, v, lane) simde_vqdmlsl_s16((a), (b), simde_vdup_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_lane_s16
+ #define vqdmlsl_lane_s16(a, b, c, lane) simde_vqdmlsl_lane_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmlsl_lane_s32(a, b, v, lane) vqdmlsl_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlsl_lane_s32(a, b, v, lane) simde_vqdmlsl_s32((a), (b), simde_vdup_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_lane_s32
+ #define vqdmlsl_lane_s32(a, b, c, lane) simde_vqdmlsl_lane_s32((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsl_laneq_s16(a, b, v, lane) vqdmlsl_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlsl_laneq_s16(a, b, v, lane) simde_vqdmlsl_s16((a), (b), simde_vdup_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_laneq_s16
+ #define vqdmlsl_laneq_s16(a, b, c, lane) simde_vqdmlsl_laneq_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsl_laneq_s32(a, b, v, lane) vqdmlsl_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlsl_laneq_s32(a, b, v, lane) simde_vqdmlsl_s32((a), (b), simde_vdup_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_laneq_s32
+ #define vqdmlsl_laneq_s32(a, b, c, lane) simde_vqdmlsl_laneq_s32((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlslh_lane_s16(a, b, v, lane) vqdmlslh_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlslh_lane_s16(a, b, v, lane) simde_vqdmlslh_s16((a), (b), simde_vget_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlslh_lane_s16
+ #define vqdmlslh_lane_s16(a, b, c, lane) simde_vqdmlslh_lane_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlslh_laneq_s16(a, b, v, lane) vqdmlslh_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlslh_laneq_s16(a, b, v, lane) simde_vqdmlslh_s16((a), (b), simde_vgetq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlslh_laneq_s16
+ #define vqdmlslh_laneq_s16(a, b, c, lane) simde_vqdmlslh_laneq_s16((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsls_lane_s32(a, b, v, lane) vqdmlsls_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlsls_lane_s32(a, b, v, lane) simde_vqdmlsls_s32((a), (b), simde_vget_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsls_lane_s32
+ #define vqdmlsls_lane_s32(a, b, c, lane) simde_vqdmlsls_lane_s32((a), (b), (c), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmlsls_laneq_s32(a, b, v, lane) vqdmlsls_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqdmlsls_laneq_s32(a, b, v, lane) simde_vqdmlsls_s32((a), (b), simde_vgetq_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsls_laneq_s32
+ #define vqdmlsls_laneq_s32(a, b, c, lane) simde_vqdmlsls_laneq_s32((a), (b), (c), (lane))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_n.h
new file mode 100644
index 00000000000..5707f4c47b1
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_n.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMLSL_N_H)
+#define SIMDE_ARM_NEON_QDMLSL_N_H
+
+#include "dup_n.h"
+#include "qdmlsl.h"
+#include "types.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmlsl_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlsl_n_s16(a, b, c);
+ #else
+ return simde_vqdmlsl_s16(a, b, simde_vdup_n_s16(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_n_s16
+ #define vqdmlsl_n_s16(a, b, c) simde_vqdmlsl_n_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmlsl_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmlsl_n_s32(a, b, c);
+ #else
+ return simde_vqdmlsl_s32(a, b, simde_vdup_n_s32(c));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmlsl_n_s32
+ #define vqdmlsl_n_s32(a, b, c) simde_vqdmlsl_n_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmulh.h b/lib/simd_wrapper/simde/arm/neon/qdmulh.h
index d42e393ad7f..29d1078cb02 100644
--- a/lib/simd_wrapper/simde/arm/neon/qdmulh.h
+++ b/lib/simd_wrapper/simde/arm/neon/qdmulh.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_QDMULH_H)
@@ -63,7 +64,7 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) {
#else
simde_int16x4_private r_;
- #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
+ #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH))
simde_int16x8_private tmp_ =
simde_int16x8_to_private(
simde_vreinterpretq_s16_s32(
@@ -89,6 +90,21 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) {
#define vqdmulh_s16(a, b) simde_vqdmulh_s16((a), (b))
#endif
+SIMDE_FUNCTION_ATTRIBUTES
+int16_t
+simde_vqdmulhh_s16(int16_t a, int16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmulhh_s16(a, b);
+ #else
+ int32_t tmp = simde_vqdmullh_s16(a, b);
+ return HEDLEY_STATIC_CAST(int16_t, tmp >> 16);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmulhh_s16
+ #define vqdmulhh_s16(a, b) simde_vqdmulhh_s16((a), (b))
+#endif
+
SIMDE_FUNCTION_ATTRIBUTES
simde_int32x2_t
simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) {
@@ -97,7 +113,7 @@ simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) {
#else
simde_int32x2_private r_;
- #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector)
+ #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH))
simde_int32x4_private tmp_ =
simde_int32x4_to_private(
simde_vreinterpretq_s32_s64(
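
Two things change in qdmulh.h: the __builtin_shufflevector fast path is now additionally gated off for GCC >= 12.1 on s390x (SIMDE_ARCH_ZARCH), presumably sidestepping a problem with that builtin on that target, and a scalar simde_vqdmulhh_s16 helper is added. The helper leans on an identity: SQDMULL of two int16 values yields a 32-bit result whose top 16 bits are exactly the SQDMULH answer, so an arithmetic shift by 16 finishes the job. A scalar check:

    #include <stdint.h>
    #include <stdio.h>

    /* sat(2*a*b) for int16 inputs; only a = b = INT16_MIN can clamp. */
    static int32_t sqdmull_h(int16_t a, int16_t b) {
        int64_t p2 = 2 * (int64_t) a * (int64_t) b;
        return (p2 > INT32_MAX) ? INT32_MAX : (int32_t) p2;
    }

    int main(void) {
        printf("%d\n", sqdmull_h(30000, 30000) >> 16);         /* 27465 */
        printf("%d\n", sqdmull_h(INT16_MIN, INT16_MIN) >> 16); /* 32767 */
        return 0;
    }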
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h
index 3120eb7ad70..32cd22dead3 100644
--- a/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h
+++ b/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h
@@ -23,6 +23,7 @@
* Copyright:
* 2021 Evan Nemerson
* 2021 Zhi An Ng (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
#if !defined(SIMDE_ARM_NEON_QDMULH_LANE_H)
@@ -37,6 +38,17 @@ HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmulhh_lane_s16(a, v, lane) vqdmulhh_lane_s16((a), (v), (lane))
+#else
+ #define simde_vqdmulhh_lane_s16(a, v, lane) \
+ simde_vqdmulhh_s16((a), simde_vget_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmulhh_lane_s16
+ #define vqdmulhh_lane_s16(a, v, lane) simde_vqdmulhh_lane_s16((a), (v), (lane))
+#endif
+
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
#define simde_vqdmulh_lane_s16(a, v, lane) vqdmulh_lane_s16((a), (v), (lane))
#else
@@ -81,6 +93,17 @@ SIMDE_BEGIN_DECLS_
#define vqdmulhq_lane_s32(a, v, lane) simde_vqdmulhq_lane_s32((a), (v), (lane))
#endif
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmulhh_laneq_s16(a, v, lane) vqdmulhh_laneq_s16((a), (v), (lane))
+#else
+ #define simde_vqdmulhh_laneq_s16(a, v, lane) \
+ simde_vqdmulhh_s16((a), simde_vgetq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmulhh_laneq_s16
+ #define vqdmulhh_laneq_s16(a, v, lane) simde_vqdmulhh_laneq_s16((a), (v), (lane))
+#endif
+
#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
#define simde_vqdmulh_laneq_s16(a, v, lane) vqdmulh_laneq_s16((a), (v), (lane))
#else
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull.h b/lib/simd_wrapper/simde/arm/neon/qdmull.h
index 88bf50bcbb8..871257f6188 100644
--- a/lib/simd_wrapper/simde/arm/neon/qdmull.h
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull.h
@@ -23,6 +23,7 @@
* Copyright:
* 2020 Evan Nemerson
* 2020 Sean Maher (Copyright owned by Google, LLC)
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
*/
/* Implementation notes (seanptmaher):
@@ -67,8 +68,8 @@ simde_vqdmulls_s32(int32_t a, int32_t b) {
#endif
}
#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
- #undef vqdmulls_s16
- #define vqdmulls_s16(a, b) simde_vqdmulls_s16((a), (b))
+ #undef vqdmulls_s32
+ #define vqdmulls_s32(a, b) simde_vqdmulls_s32((a), (b))
#endif
SIMDE_FUNCTION_ATTRIBUTES
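
The qdmull.h hunk is a pure alias fix: the native-alias block under simde_vqdmulls_s32 previously undefined and redefined vqdmulls_s16, a name that does not exist in NEON, so builds with native aliases enabled never mapped the real vqdmulls_s32 spelling onto the portable function. After the rename the pair lines up:

    /* non-ARM build with native aliases enabled */
    int64_t r = vqdmulls_s32(a, b);  /* -> simde_vqdmulls_s32(a, b) */
    /* saturates only at a = b = INT32_MIN, where 2*2^62 = 2^63
     * exceeds INT64_MAX and clamps to 9223372036854775807 */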
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_high.h b/lib/simd_wrapper/simde/arm/neon/qdmull_high.h
new file mode 100644
index 00000000000..2c6b26912b2
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull_high.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_H)
+#define SIMDE_ARM_NEON_QDMULL_HIGH_H
+
+#include "combine.h"
+#include "get_high.h"
+#include "qdmull.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_high_s16(simde_int16x8_t a, simde_int16x8_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmull_high_s16(a, b);
+ #else
+ return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vget_high_s16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_s16
+ #define vqdmull_high_s16(a, b) simde_vqdmull_high_s16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_high_s32(simde_int32x4_t a, simde_int32x4_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmull_high_s32(a, b);
+ #else
+ return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_s32
+ #define vqdmull_high_s32(a, b) simde_vqdmull_high_s32((a), (b))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_H) */
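
As elsewhere in this batch, the vqdmull_high_* fallback is pure composition, so the two spellings below are interchangeable; the second also works on SIMDe trees that predate the _high headers:

    r = simde_vqdmull_high_s32(a, b);
    r = simde_vqdmull_s32(simde_vget_high_s32(a), simde_vget_high_s32(b));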
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_high_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmull_high_lane.h
new file mode 100644
index 00000000000..f8326b2bf48
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull_high_lane.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H)
+#define SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H
+
+#include "combine.h"
+#include "qdmull.h"
+#include "dup_n.h"
+#include "get_high.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_high_lane_s16(simde_int16x8_t a, simde_int16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int16x4_private
+ v_ = simde_int16x4_to_private(v);
+ return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_high_lane_s16(a, v, lane) vqdmull_high_lane_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_lane_s16
+ #define vqdmull_high_lane_s16(a, v, lane) simde_vqdmull_high_lane_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_high_laneq_s16(simde_int16x8_t a, simde_int16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int16x8_private
+ v_ = simde_int16x8_to_private(v);
+ return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_high_laneq_s16(a, v, lane) vqdmull_high_laneq_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_laneq_s16
+ #define vqdmull_high_laneq_s16(a, v, lane) simde_vqdmull_high_laneq_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_high_lane_s32(simde_int32x4_t a, simde_int32x2_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int32x2_private
+ v_ = simde_int32x2_to_private(v);
+ return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_high_lane_s32(a, v, lane) vqdmull_high_lane_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_lane_s32
+ #define vqdmull_high_lane_s32(a, v, lane) simde_vqdmull_high_lane_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_high_laneq_s32(simde_int32x4_t a, simde_int32x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int32x4_private
+ v_ = simde_int32x4_to_private(v);
+ return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane]));
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_high_laneq_s32(a, v, lane) vqdmull_high_laneq_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_laneq_s32
+ #define vqdmull_high_laneq_s32(a, v, lane) simde_vqdmull_high_laneq_s32((a), (v), (lane))
+#endif
+
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_high_n.h b/lib/simd_wrapper/simde/arm/neon/qdmull_high_n.h
new file mode 100644
index 00000000000..aef31240f69
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull_high_n.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_N_H)
+#define SIMDE_ARM_NEON_QDMULL_HIGH_N_H
+
+#include "combine.h"
+#include "get_high.h"
+#include "dup_n.h"
+#include "qdmull.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_high_n_s16(simde_int16x8_t a, int16_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmull_high_n_s16(a, b);
+ #else
+ return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_n_s16
+ #define vqdmull_high_n_s16(a, b) simde_vqdmull_high_n_s16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_high_n_s32(simde_int32x4_t a, int32_t b) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqdmull_high_n_s32(a, b);
+ #else
+ return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_high_n_s32
+ #define vqdmull_high_n_s32(a, b) simde_vqdmull_high_n_s32((a), (b))
+#endif
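+
+/* Usage sketch (illustrative only): the *_high_n* forms broadcast the scalar
+ * `b` before the widening multiply.  A hypothetical caller:
+ *
+ *   simde_int16x8_t a = simde_vdupq_n_s16(5);
+ *   simde_int32x4_t r = simde_vqdmull_high_n_s16(a, 7);
+ *   // each lane of r is 2 * 5 * 7 = 70
+ */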
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmull_lane.h
new file mode 100644
index 00000000000..a7bf68cbdc1
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull_lane.h
@@ -0,0 +1,206 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMULL_LANE_H)
+#define SIMDE_ARM_NEON_QDMULL_LANE_H
+
+#include "combine.h"
+#include "qdmull.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vqdmullh_lane_s16(int16_t a, simde_int16x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int16x4_private
+ v_ = simde_int16x4_to_private(v);
+
+ return simde_vqdmullh_s16(a, v_.values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmullh_lane_s16(a, v, lane) vqdmullh_lane_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmullh_lane_s16
+ #define vqdmullh_lane_s16(a, v, lane) simde_vqdmullh_lane_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vqdmullh_laneq_s16(int16_t a, simde_int16x8_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int16x8_private
+ v_ = simde_int16x8_to_private(v);
+
+ return simde_vqdmullh_s16(a, v_.values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmullh_laneq_s16(a, v, lane) vqdmullh_laneq_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmullh_laneq_s16
+ #define vqdmullh_laneq_s16(a, v, lane) simde_vqdmullh_laneq_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vqdmulls_lane_s32(int32_t a, simde_int32x2_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int32x2_private
+ v_ = simde_int32x2_to_private(v);
+
+ return simde_vqdmulls_s32(a, v_.values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmulls_lane_s32(a, v, lane) vqdmulls_lane_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmulls_lane_s32
+ #define vqdmulls_lane_s32(a, v, lane) simde_vqdmulls_lane_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int64_t
+simde_vqdmulls_laneq_s32(int32_t a, simde_int32x4_t v, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int32x4_private
+ v_ = simde_int32x4_to_private(v);
+
+ return simde_vqdmulls_s32(a, v_.values[lane]);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmulls_laneq_s32(a, v, lane) vqdmulls_laneq_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmulls_laneq_s32
+ #define vqdmulls_laneq_s32(a, v, lane) simde_vqdmulls_laneq_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int32x4_private r_;
+ simde_int16x4_private
+ a_ = simde_int16x4_to_private(a),
+ b_ = simde_int16x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmull_lane_s16(a, v, lane) vqdmull_lane_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_lane_s16
+ #define vqdmull_lane_s16(a, v, lane) simde_vqdmull_lane_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int32x4_private r_;
+ simde_int16x4_private
+ a_ = simde_int16x4_to_private(a);
+ simde_int16x8_private
+ b_ = simde_int16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_laneq_s16(a, v, lane) vqdmull_laneq_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_laneq_s16
+ #define vqdmull_laneq_s16(a, v, lane) simde_vqdmull_laneq_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int64x2_private r_;
+ simde_int32x2_private
+ a_ = simde_int32x2_to_private(a),
+ b_ = simde_int32x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmull_lane_s32(a, v, lane) vqdmull_lane_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_lane_s32
+ #define vqdmull_lane_s32(a, v, lane) simde_vqdmull_lane_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int64x2_private r_;
+ simde_int32x2_private
+ a_ = simde_int32x2_to_private(a);
+ simde_int32x4_private
+ b_ = simde_int32x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_laneq_s32(a, v, lane) vqdmull_laneq_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_laneq_s32
+ #define vqdmull_laneq_s32(a, v, lane) simde_vqdmull_laneq_s32((a), (v), (lane))
+#endif
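+
+/* Usage sketch (illustrative only): for the s16 variants, saturation only
+ * triggers in the single corner case INT16_MIN * INT16_MIN, whose doubled
+ * product 2^31 does not fit in int32_t:
+ *
+ *   simde_int16x4_t v = simde_vdup_n_s16(INT16_MIN);
+ *   int32_t r = simde_vqdmullh_lane_s16(INT16_MIN, v, 0);
+ *   // r == INT32_MAX (saturated)
+ */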
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMULL_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_n.h b/lib/simd_wrapper/simde/arm/neon/qdmull_n.h
new file mode 100644
index 00000000000..691802637b9
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull_n.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QDMULL_N_H)
+#define SIMDE_ARM_NEON_QDMULL_N_H
+
+#include "combine.h"
+#include "dup_n.h"
+#include "qdmull.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_n_s16(simde_int16x4_t a, int16_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmull_n_s16(a, b);
+ #else
+ return simde_vqdmull_s16(a, simde_vdup_n_s16(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_n_s16
+ #define vqdmull_n_s16(a, b) simde_vqdmull_n_s16((a), (b))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_n_s32(simde_int32x2_t a, int32_t b) {
+ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ return vqdmull_n_s32(a, b);
+ #else
+ return simde_vqdmull_s32(a, simde_vdup_n_s32(b));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_n_s32
+ #define vqdmull_n_s32(a, b) simde_vqdmull_n_s32((a), (b))
+#endif
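+
+/* Usage sketch (illustrative only): simde_vqdmull_n_s16(a, b) is equivalent
+ * to simde_vqdmull_s16(a, simde_vdup_n_s16(b)), as the fallback above shows:
+ *
+ *   simde_int16x4_t a = simde_vdup_n_s16(-20);
+ *   simde_int32x4_t r = simde_vqdmull_n_s16(a, 50);
+ *   // each lane of r is 2 * (-20) * 50 = -2000
+ */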
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMULL_N_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qmovun_high.h b/lib/simd_wrapper/simde/arm/neon/qmovun_high.h
new file mode 100644
index 00000000000..edb3e17a499
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qmovun_high.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QMOVUN_HIGH_H)
+#define SIMDE_ARM_NEON_QMOVUN_HIGH_H
+
+#include "types.h"
+
+#include "combine.h"
+#include "qmovun.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint8x16_t
+simde_vqmovun_high_s16(simde_uint8x8_t r, simde_int16x8_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqmovun_high_s16(r, a);
+ #else
+ return simde_vcombine_u8(r, simde_vqmovun_s16(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqmovun_high_s16
+ #define vqmovun_high_s16(r, a) simde_vqmovun_high_s16((r), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint16x8_t
+simde_vqmovun_high_s32(simde_uint16x4_t r, simde_int32x4_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqmovun_high_s32(r, a);
+ #else
+ return simde_vcombine_u16(r, simde_vqmovun_s32(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqmovun_high_s32
+ #define vqmovun_high_s32(r, a) simde_vqmovun_high_s32((r), (a))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_uint32x4_t
+simde_vqmovun_high_s64(simde_uint32x2_t r, simde_int64x2_t a) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ return vqmovun_high_s64(r, a);
+ #else
+ return simde_vcombine_u32(r, simde_vqmovun_s64(a));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqmovun_high_s64
+ #define vqmovun_high_s64(r, a) simde_vqmovun_high_s64((r), (a))
+#endif
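+
+/* Usage sketch (illustrative only): vqmovun_high_* narrows the signed input
+ * to an unsigned type with saturation and places it in the upper half of the
+ * result, keeping `r` as the lower half:
+ *
+ *   simde_uint8x8_t lo = simde_vdup_n_u8(1);
+ *   simde_int16x8_t a  = simde_vdupq_n_s16(-5);
+ *   simde_uint8x16_t r = simde_vqmovun_high_s16(lo, a);
+ *   // lanes 0..7 of r are 1; lanes 8..15 are 0 (-5 saturates to 0)
+ */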
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QMOVUN_HIGH_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlah.h b/lib/simd_wrapper/simde/arm/neon/qrdmlah.h
new file mode 100644
index 00000000000..9442101e312
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qrdmlah.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QRDMLAH_H)
+#define SIMDE_ARM_NEON_QRDMLAH_H
+
+#include "types.h"
+#include "qmovn.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+SIMDE_FUNCTION_ATTRIBUTES
+int16_t
+simde_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)
+ return SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vqrdmlahh_s16(a, b, c));
+ #else
+ return vqrdmlahh_s16(a, b, c);
+ #endif
+ #else
+ int64_t r = (((1 << 15) + (HEDLEY_STATIC_CAST(int64_t, a) << 16) + ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)))) << 1)) >> 16);
+ return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahh_s16
+ #define vqrdmlahh_s16(a, b, c) simde_vqrdmlahh_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+int32_t
+simde_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ return vqrdmlahs_s32(a, b, c);
+ #else
+    /* Compute sat32(round(a + 2*b*c * 2^-32)) in 64-bit arithmetic. */
+    int64_t round_const = (HEDLEY_STATIC_CAST(int64_t, 1) << 31);
+    int64_t a_ = (HEDLEY_STATIC_CAST(int64_t, a) << 32);
+    int64_t sum = round_const + a_;
+    int64_t mul = (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c));
+    int64_t mul2 = mul << 1;
+    /* Saturate if doubling the product overflowed int64_t. */
+    if (mul2 >> 1 != mul) {
+      return (mul > 0) ? INT32_MAX : INT32_MIN;
+    }
+    /* Check for overflow before adding, so the signed addition below cannot
+     * invoke undefined behaviour. */
+    if (sum > 0 && INT64_MAX - sum < mul2) return INT32_MAX;
+    if (sum < 0 && INT64_MIN - sum > mul2) return INT32_MIN;
+    int64_t sum2 = sum + mul2;
+    return HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff));
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahs_s32
+ #define vqrdmlahs_s32(a, b, c) simde_vqrdmlahs_s32((a), (b), (c))
+#endif
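+
+/* Worked example (illustrative only): vqrdmlah computes
+ * sat(a + round(2 * b * c / 2^n)), n being the element width in bits:
+ *
+ *   int16_t r = simde_vqrdmlahh_s16(1, 16384, 16384);
+ *   // 2 * 16384 * 16384 / 2^16 = 8192, so r == 1 + 8192 == 8193
+ */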
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x4_t
+simde_vqrdmlah_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ return vqrdmlah_s16(a, b, c);
+ #else
+ simde_int16x4_private
+ r_,
+ a_ = simde_int16x4_to_private(a),
+ b_ = simde_int16x4_to_private(b),
+ c_ = simde_int16x4_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqrdmlahh_s16(a_.values[i], b_.values[i], c_.values[i]);
+ }
+
+ return simde_int16x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlah_s16
+ #define vqrdmlah_s16(a, b, c) simde_vqrdmlah_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x2_t
+simde_vqrdmlah_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ return vqrdmlah_s32(a, b, c);
+ #else
+ simde_int32x2_private
+ r_,
+ a_ = simde_int32x2_to_private(a),
+ b_ = simde_int32x2_to_private(b),
+ c_ = simde_int32x2_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqrdmlahs_s32(a_.values[i], b_.values[i], c_.values[i]);
+ }
+
+ return simde_int32x2_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlah_s32
+ #define vqrdmlah_s32(a, b, c) simde_vqrdmlah_s32((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int16x8_t
+simde_vqrdmlahq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ return vqrdmlahq_s16(a, b, c);
+ #else
+ simde_int16x8_private
+ r_,
+ a_ = simde_int16x8_to_private(a),
+ b_ = simde_int16x8_to_private(b),
+ c_ = simde_int16x8_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqrdmlahh_s16(a_.values[i], b_.values[i], c_.values[i]);
+ }
+
+ return simde_int16x8_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahq_s16
+ #define vqrdmlahq_s16(a, b, c) simde_vqrdmlahq_s16((a), (b), (c))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqrdmlahq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) {
+ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ return vqrdmlahq_s32(a, b, c);
+ #else
+ simde_int32x4_private
+ r_,
+ a_ = simde_int32x4_to_private(a),
+ b_ = simde_int32x4_to_private(b),
+ c_ = simde_int32x4_to_private(c);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqrdmlahs_s32(a_.values[i], b_.values[i], c_.values[i]);
+ }
+
+ return simde_int32x4_from_private(r_);
+ #endif
+}
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahq_s32
+ #define vqrdmlahq_s32(a, b, c) simde_vqrdmlahq_s32((a), (b), (c))
+#endif
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QRDMLAH_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlah_lane.h b/lib/simd_wrapper/simde/arm/neon/qrdmlah_lane.h
new file mode 100644
index 00000000000..4f18bbb5fb1
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qrdmlah_lane.h
@@ -0,0 +1,162 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung (Copyright owned by Andes Technology)
+ */
+
+#if !defined(SIMDE_ARM_NEON_QRDMLAH_LANE_H)
+#define SIMDE_ARM_NEON_QRDMLAH_LANE_H
+
+#include "types.h"
+#include "qrdmlah.h"
+#include "dup_lane.h"
+#include "get_lane.h"
+
+HEDLEY_DIAGNOSTIC_PUSH
+SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
+SIMDE_BEGIN_DECLS_
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahh_lane_s16(a, b, v, lane) vqrdmlahh_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vget_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahh_lane_s16
+ #define vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) vqrdmlahh_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vgetq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahh_laneq_s16
+ #define vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahs_lane_s32(a, b, v, lane) vqrdmlahs_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vget_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahs_lane_s32
+ #define vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) vqrdmlahs_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vgetq_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahs_laneq_s32
+ #define vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_laneq_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlah_lane_s16(a, b, v, lane) vqrdmlah_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlah_lane_s16
+ #define vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlah_lane_s32(a, b, v, lane) vqrdmlah_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlah_lane_s32
+ #define vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahq_lane_s16(a, b, v, lane) vqrdmlahq_lane_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_lane_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahq_lane_s16
+ #define vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_lane_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahq_lane_s32(a, b, v, lane) vqrdmlahq_lane_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_lane_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahq_lane_s32
+ #define vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_lane_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlah_laneq_s16(a, b, v, lane) vqrdmlah_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlah_laneq_s16
+ #define vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlah_laneq_s32(a, b, v, lane) vqrdmlah_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlah_laneq_s32
+ #define vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_laneq_s32((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) vqrdmlahq_laneq_s16((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahq_laneq_s16
+ #define vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_laneq_s16((a), (b), (v), (lane))
+#endif
+
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX)
+ #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) vqrdmlahq_laneq_s32((a), (b), (v), (lane))
+#else
+ #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane)))
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqrdmlahq_laneq_s32
+ #define vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_laneq_s32((a), (b), (v), (lane))
+#endif
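+
+/* Usage sketch (illustrative only): the lane forms select the multiplier
+ * from a vector lane before the rounding doubling multiply-accumulate:
+ *
+ *   simde_int16x4_t v = simde_vdup_n_s16(16384);
+ *   int16_t r = simde_vqrdmlahh_lane_s16(1, 16384, v, 3);
+ *   // same as simde_vqrdmlahh_s16(1, 16384, 16384) == 8193
+ */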
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QRDMLAH_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlsh.h b/lib/simd_wrapper/simde/arm/neon/qrdmlsh.h
new file mode 100644
index 00000000000..eb0be8e7c87
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qrdmlsh.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Copyright:
+ * 2023 Yi-Yen Chung