From e28e1b61a999f93f7419bd1a0e859e3c9660bc3d Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 19:53:19 -0700
Subject: [PATCH 1/9] Add build option for ARM NCHWc kernels

---
 .github/workflows/android.yml           |  4 ++--
 cmake/CMakeLists.txt                    |  1 +
 cmake/onnxruntime_mlas.cmake            | 24 ++++++++++++++-----
 onnxruntime/core/mlas/lib/mlasi.h       |  4 +++-
 onnxruntime/core/mlas/lib/platform.cpp  |  2 ++
 onnxruntime/core/mlas/lib/sconv.h       |  6 ++++-
 .../core/mlas/lib/sconv_kernel_neon.cpp |  4 ++++
 onnxruntime/core/mlas/lib/snchwc.cpp    | 18 +++++++-------
 .../core/mlas/lib/spool_kernel_neon.cpp |  4 ++++
 tools/ci_build/build.py                 |  3 +++
 tools/ci_build/build_args.py            |  8 +++++++
 11 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
index 9e1a491d154cf..7f7ff74959d52 100644
--- a/.github/workflows/android.yml
+++ b/.github/workflows/android.yml
@@ -71,8 +71,8 @@ jobs:
         run: |
           set -e -x
           BINARY_SIZE_THRESHOLD_ARGS=""
-          echo "Binary size threshold in bytes: 1722565"
-          BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1722565"
+          echo "Binary size threshold in bytes: 1436672"
+          BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"

           # Ensure ANDROID_NDK_HOME is available and get its real path
           if [ -z "$ANDROID_NDK_HOME" ]; then
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bcab5e9e6fa1b..a5ad7ff7310f2 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -88,6 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
+option(onnxruntime_ARM_USE_NCHWC "Build with ARM NCHWc kernels in MLAS" OFF)
 option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)

 # iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 66c654e4a29e7..33e6c604d299a 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -109,8 +109,6 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
     ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
     ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
-    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
-    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
   )

   set(mlas_platform_preprocess_srcs
@@ -134,7 +132,11 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
   )

-  if (onnxruntime_USE_KLEIDIAI)
+  if (onnxruntime_ARM_USE_NCHWC)
+    setup_arm_nchwc()
+  endif()
+
+  if (onnxruntime_USE_KLEIDIAI)
     setup_kleidiai()
   endif()
 else()
@@ -289,6 +291,14 @@ function(setup_kleidiai)
   endif()
 endfunction()

+function (setup_arm_nchwc)
+  target_sources(onnxruntime_mlas PRIVATE
+    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
+    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
+  )
+  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NCHWC)
+endfunction ()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     file(GLOB_RECURSE mlas_platform_srcs
@@ -433,8 +443,6 @@ else()
       ${MLAS_SRC_DIR}/eltwise_kernel_neon.h
       ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
       ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
-      ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
-      ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
     )

     # Conditionally add the SVE implementation if compiler supports it
@@ -445,7 +453,11 @@ else()
       target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
     endif()

-  if (onnxruntime_USE_KLEIDIAI)
+  if (onnxruntime_ARM_USE_NCHWC)
+    setup_arm_nchwc()
+  endif()
+
+  if (onnxruntime_USE_KLEIDIAI)
     setup_kleidiai()
   endif()
   set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 2e93584095343..05f14549ce955 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
 #endif
-#if defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC)
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
 MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,12 +1347,14 @@ struct MLAS_PLATFORM {
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
+#if defined (MLAS_USE_ARM_NCHWC)
     MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
     MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
     MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
     MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel;
     MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount];
     uint32_t NchwcBlockSize;
+#endif
 #endif

     const MLAS_SYMM_QGEMM_DISPATCH* SymmQgemmDispatch{nullptr};
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 81067095401e7..5388221aec4fa 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -560,6 +560,7 @@ Return Value:
     this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
     this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

+#if defined(MLAS_USE_ARM_NCHWC)
    this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
    this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
    this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
@@ -568,6 +569,7 @@ Return Value:
    this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelNeon;
    this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelNeon;
    this->NchwcBlockSize = MLAS_NEON_NCHWC_BLOCK_SIZE;
+#endif

    //
    // Check if the processor supports ASIMD dot product instructions.
diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv.h
index 94e657638975a..a9e59f6038d4e 100644
--- a/onnxruntime/core/mlas/lib/sconv.h
+++ b/onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,11 @@ Module Name:

 // Define the convolution kernel flags.
 //
+#if defined(MLAS_USE_ARM_NCHWC)
+
 #define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
 #define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
 #define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004
-#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
\ No newline at end of file
+#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+
+#endif
diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
index 3ecad66a32886..f396b08a103c3 100644
--- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

 --*/

+#if defined(MLAS_USE_ARM_NCHWC)
+
 #include "mlasi.h"
 #include "sconv.h"

@@ -518,3 +520,5 @@ void
         }
     }
 }
+
+#endif
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
index 2fc27d6d4ad7f..52aa9eb07311c 100644
--- a/onnxruntime/core/mlas/lib/snchwc.cpp
+++ b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:

 --*/
 {
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
     return GetMlasPlatform().NchwcBlockSize;
 #else
     return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
         const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
 #else
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
 #else
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

 struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
 {
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
     static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
 #endif
@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
         const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
         const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
 #else
         MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
     }
 };

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))

 MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
 {
@@ -1621,7 +1621,7 @@ Return Value:
     }
 }

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))

 //
 // Convolution and pooling kernel stubs for architectures that do not yet have
diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
index 8cca036d54c3a..5159dca63d2e0 100644
--- a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

 --*/

+#if defined(MLAS_USE_ARM_NCHWC)
+
 #include "mlasi.h"

 constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE;
@@ -287,3 +289,5 @@ void
         false // ExcludePad = false
     );
 }
+
+#endif
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 4438ddba014d0..b62935c992f1d 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,6 +896,9 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

+    if not args.enable_arm_nchwc:
+        cmake_args += ["-Donnxruntime_ARM_USE_NCHWC=ON"]
+
     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 996d46974716e..cb557781292d8 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -629,8 +629,16 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
         help="Enable CUDA kernel profiling (requires CUPTI in PATH).",
     )

+    # --- CPU ---
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
+
+    # The following enables building ORT with NCHWc ARM kernels.
+    # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
+    # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
+    # ARM platforms.
+    # Once the gap is closed for smaller thread counts, it can be turned on by default.
+    cpu_group.add_argument("--enable_arm_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")

     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")

From 45f850cdd0bf184703617aa9eac5ac1bbe42bbe1 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 19:58:08 -0700
Subject: [PATCH 2/9] Fix inverted --enable_arm_nchwc check

---
 tools/ci_build/build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index b62935c992f1d..d0e9959058deb 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,7 +896,7 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

-    if not args.enable_arm_nchwc:
+    if args.enable_arm_nchwc:
         cmake_args += ["-Donnxruntime_ARM_USE_NCHWC=ON"]

     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]

From ac6cafb80d9be4d5c8645131a38092a9236d4f2b Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 20:09:20 -0700
Subject: [PATCH 3/9] Add message in cmake

---
 cmake/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index a5ad7ff7310f2..8d95d043cac08 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -664,6 +664,10 @@ else()
   endif()
 endif()

+if (onnxruntime_ARM_USE_NCHWC)
+  message(STATUS "Building MLAS with ARM NCHWc kernels")
+endif()
+
 if(onnxruntime_USE_SVE)
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
     check_cxx_compiler_flag("-march=armv8.2-a+sve" HAS_ARM64_SVE)

From 513234a915e79d9efe12d1f44bad0f5728c4e7ab Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 20:51:04 -0700
Subject: [PATCH 4/9] Enable NCHWc ARM kernels on macOS only

---
 .github/workflows/macos-ci-build-and-test-workflow.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index c7c35fb234013..61370e098a598 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,6 +62,7 @@ jobs:
           --build_objc
           --build_java
           --build_wheel
+          ${{ matrix.target == 'arm64' && '--enable_arm_nchwc' || '' }}
           ${{ inputs.use_webgpu && '--use_webgpu' || '' }}
           ${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
           ${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}

From 68932dad4c7b21ebbf253e8ec02a36aae1a4c48c Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Fri, 26 Sep 2025 17:41:10 -0700
Subject: [PATCH 5/9] Update tools/ci_build/build_args.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 tools/ci_build/build_args.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index cb557781292d8..518a969ef2a31 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -632,7 +632,6 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # --- CPU ---
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
-
     # The following enables building ORT with NCHWc ARM kernels.
     # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting

From 3029cb4e3313aa68a2000948fc17c5501f7a543a Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Fri, 26 Sep 2025 17:41:17 -0700
Subject: [PATCH 6/9] Update tools/ci_build/build_args.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 tools/ci_build/build_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 518a969ef2a31..dd429f4e663cb 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -636,7 +636,7 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
-    # Once the gap is closed for smaller thread counts, it can be turned on by default. 
+    # Once the gap is closed for smaller thread counts, it can be turned on by default.
     cpu_group.add_argument("--enable_arm_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")

     # --- DNNL (formerly MKL-DNN / oneDNN) ---

From 98f5d834eedd1505c535181830ee752a3453192f Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Fri, 26 Sep 2025 21:33:12 -0700
Subject: [PATCH 7/9] Reflect Neon in ARM NCHWc option names

---
 .../macos-ci-build-and-test-workflow.yml |  2 +-
 cmake/CMakeLists.txt                     |  6 +++---
 cmake/onnxruntime_mlas.cmake             | 13 +++++++------
 onnxruntime/core/mlas/lib/mlasi.h        |  4 ++--
 onnxruntime/core/mlas/lib/platform.cpp   |  2 +-
 onnxruntime/core/mlas/lib/sconv.h        |  2 +-
 .../core/mlas/lib/sconv_kernel_neon.cpp  |  8 ++++----
 onnxruntime/core/mlas/lib/snchwc.cpp     | 18 +++++++++---------
 .../core/mlas/lib/spool_kernel_neon.cpp  |  2 +-
 tools/ci_build/build.py                  |  4 ++--
 tools/ci_build/build_args.py             |  4 ++--
 11 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index 61370e098a598..329584c68d7d1 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,7 +62,7 @@ jobs:
           --build_objc
           --build_java
           --build_wheel
-          ${{ matrix.target == 'arm64' && '--enable_arm_nchwc' || '' }}
+          ${{ matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' }}
           ${{ inputs.use_webgpu && '--use_webgpu' || '' }}
           ${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
           ${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8d95d043cac08..a92a0cddefd55 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -88,7 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
-option(onnxruntime_ARM_USE_NCHWC "Build with ARM NCHWc kernels in MLAS" OFF)
+option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)
 option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)

 # iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
@@ -664,8 +664,8 @@ else()
   endif()
 endif()

-if (onnxruntime_ARM_USE_NCHWC)
-  message(STATUS "Building MLAS with ARM NCHWc kernels")
+if (onnxruntime_USE_ARM_NEON_NCHWC)
+  message(STATUS "Building MLAS with ARM Neon NCHWc kernels")
 endif()

 if(onnxruntime_USE_SVE)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 33e6c604d299a..3b7c6a95ba98f 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -132,8 +132,8 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
   )

-  if (onnxruntime_ARM_USE_NCHWC)
-    setup_arm_nchwc()
+  if (onnxruntime_USE_ARM_NEON_NCHWC)
+    setup_arm_neon_nchwc()
   endif()

   if (onnxruntime_USE_KLEIDIAI)
@@ -291,12 +291,13 @@ function(setup_kleidiai)
   endif()
 endfunction()

-function (setup_arm_nchwc)
+function (setup_arm_neon_nchwc)
   target_sources(onnxruntime_mlas PRIVATE
+    ${MLAS_SRC_DIR}/sconv.h
     ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
     ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
   )
-  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NCHWC)
+  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
 endfunction ()
@@ -453,8 +454,8 @@ else()
       target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
     endif()

-  if (onnxruntime_ARM_USE_NCHWC)
-    setup_arm_nchwc()
+  if (onnxruntime_USE_ARM_NEON_NCHWC)
+    setup_arm_neon_nchwc()
   endif()

   if (onnxruntime_USE_KLEIDIAI)
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 05f14549ce955..8ed6352e7baa7 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
 #endif
-#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
 MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,7 +1347,7 @@ struct MLAS_PLATFORM {
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
-#if defined (MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
     MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
     MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
     MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 5388221aec4fa..46fa150395d75 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -560,7 +560,7 @@ Return Value:
     this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
     this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
    this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
    this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
    this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv.h
index a9e59f6038d4e..12ccff2b7ea33 100644
--- a/onnxruntime/core/mlas/lib/sconv.h
+++ b/onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,7 @@ Module Name:

 // Define the convolution kernel flags.
 //
-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)

 #define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
 #define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
index f396b08a103c3..4c5f50adb929c 100644
--- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,7 +14,7 @@ Module Name:

 --*/

-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)

 #include "mlasi.h"
 #include "sconv.h"
@@ -60,7 +60,7 @@ void
     const size_t InputWidthElements = InputWidth / sizeof(float);
     const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

-    (void)InputStride;
+    MLAS_UNREFERENCED_PARAMETER(InputStride);

     const size_t TotalOutputCount = OutputCountLeftPad + OutputCount + OutputCountRightPad;
@@ -102,7 +102,7 @@ void
             const float* input_base = Input + output_idx * StrideWidthElements +
                 kh * DilatedInputWidthElements + kw * DilationWidthElements;

-            if (IsNchwcFormat) {
+            if constexpr (IsNchwcFormat) {
                 for (size_t filterBlock = 0; filterBlock < BlockSize; filterBlock++) {
                     const float* input_element = input_base + filterBlock;
                     const float* input_row_start = InputBase + kh * DilatedInputWidthElements;
@@ -345,7 +345,7 @@ void
     const size_t InputStrideElements = InputStride / sizeof(float);
     const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

-    (void)InputStrideElements;
+    MLAS_UNREFERENCED_PARAMETER(InputStrideElements);

     const size_t InputWidthElements = InputWidth / sizeof(float);
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
index 52aa9eb07311c..6f3423a792509 100644
--- a/onnxruntime/core/mlas/lib/snchwc.cpp
+++ b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:

 --*/
 {
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
     return GetMlasPlatform().NchwcBlockSize;
 #else
     return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
         const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
 #else
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
 #else
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

 struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
 {
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
     static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
 #endif
@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
         const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
         const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
 #else
         MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
     }
 };

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

 MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
 {
@@ -1621,7 +1621,7 @@ Return Value:
     }
 }

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

 //
 // Convolution and pooling kernel stubs for architectures that do not yet have
diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
index 5159dca63d2e0..588362584791b 100644
--- a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,7 +14,7 @@ Module Name:

 --*/

-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)

 #include "mlasi.h"
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d0e9959058deb..0513379139464 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,8 +896,8 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

-    if args.enable_arm_nchwc:
-        cmake_args += ["-Donnxruntime_ARM_USE_NCHWC=ON"]
+    if args.enable_arm_neon_nchwc:
+        cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]

     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index cb557781292d8..7f19c14c6cbd1 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -633,12 +633,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
-    # The following enables building ORT with NCHWc ARM kernels.
+    # The following enables building ORT with NCHWc Neon ARM kernels.
     # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
     # Once the gap is closed for smaller thread counts, it can be turned on by default.
-    cpu_group.add_argument("--enable_arm_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")
+    cpu_group.add_argument("--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")

     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")

From 401bf94f24fe13443994a73c424afd11678095af Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Sun, 28 Sep 2025 13:20:22 -0700
Subject: [PATCH 8/9] Update tools/ci_build/build_args.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 tools/ci_build/build_args.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 8efc9be18da21..f6f2e69f82827 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -637,7 +637,9 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
     # Once the gap is closed for smaller thread counts, it can be turned on by default.
-    cpu_group.add_argument("--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")
+    cpu_group.add_argument(
+        "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels."
+    )

     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")

From 7a59bd6c0180b9e38750b909ba42bb6da74420b6 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Mon, 29 Sep 2025 10:49:47 -0700
Subject: [PATCH 9/9] Add comment about NCHWc ARM kernel performance

Added a comment regarding the performance of NCHWc ARM kernels and their
default state.
---
 tools/ci_build/build_args.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index f6f2e69f82827..8c04f8dd46016 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -637,6 +637,7 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
     # Once the gap is closed for smaller thread counts, it can be turned on by default.
+    # See https://github.com/microsoft/onnxruntime/pull/25580#issuecomment-3335056846 for benchmarking details.
     cpu_group.add_argument(
         "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels."
     )
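
Editor's note (not part of the patch series): a quick way to exercise the option this
series adds. This is a minimal sketch; the --config and --parallel flags are illustrative
choices, while --enable_arm_neon_nchwc and onnxruntime_USE_ARM_NEON_NCHWC are exactly the
names introduced above (the option stays OFF by default, per the comment added to
build_args.py).

    # Via the repo's build script; build.py maps the flag to the CMake option.
    ./build.sh --config Release --parallel --enable_arm_neon_nchwc

    # Or, when configuring CMake directly, pass the option explicitly:
    #   cmake ... -Donnxruntime_USE_ARM_NEON_NCHWC=ON

On an ARM64 build with the option enabled, the block-size getter patched in snchwc.cpp
returns GetMlasPlatform().NchwcBlockSize (set to MLAS_NEON_NCHWC_BLOCK_SIZE in
platform.cpp) instead of 1, which is what routes convolution and pooling through the new
Neon NCHWc kernels.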