diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
index 9e1a491d154cf..7f7ff74959d52 100644
--- a/.github/workflows/android.yml
+++ b/.github/workflows/android.yml
@@ -71,8 +71,8 @@ jobs:
       run: |
         set -e -x
         BINARY_SIZE_THRESHOLD_ARGS=""
-        echo "Binary size threshold in bytes: 1722565"
-        BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1722565"
+        echo "Binary size threshold in bytes: 1436672"
+        BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"
 
         # Ensure ANDROID_NDK_HOME is available and get its real path
         if [ -z "$ANDROID_NDK_HOME" ]; then
diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index c7c35fb234013..329584c68d7d1 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,6 +62,7 @@ jobs:
           --build_objc
           --build_java
           --build_wheel
+          ${{ matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' }}
           ${{ inputs.use_webgpu && '--use_webgpu' || '' }}
           ${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
           ${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bcab5e9e6fa1b..a92a0cddefd55 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -88,6 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
+option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)
 option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
 
 # iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
@@ -663,6 +664,10 @@ else()
   endif()
 endif()
 
+if (onnxruntime_USE_ARM_NEON_NCHWC)
+  message(STATUS "Building MLAS with ARM Neon NCHWc kernels")
+endif()
+
 if(onnxruntime_USE_SVE)
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
     check_cxx_compiler_flag("-march=armv8.2-a+sve" HAS_ARM64_SVE)
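A note on what the new option gates: NCHWc ("blocked NCHW") stores channels in fixed-size blocks with the block dimension innermost, so a single Neon vector load fetches `BlockSize` adjacent channels of one pixel. The lowered Android binary-size threshold above is consistent with these kernels no longer being compiled into default builds. Below is a minimal indexing sketch; the block size of 8 is only an illustrative value, the real one being `MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE`, exposed at runtime through `MlasNchwcGetBlockSize()`.

```cpp
#include <cstddef>

// Illustrative block size; the real value comes from MlasNchwcGetBlockSize().
constexpr std::size_t BlockSize = 8;

// Index of element (n, c, h, w) in a blocked NCHWc tensor laid out as
// [N][Channels/BlockSize][H][W][BlockSize], with channels padded up to a
// multiple of BlockSize.
std::size_t NchwcIndex(std::size_t n, std::size_t c, std::size_t h, std::size_t w,
                       std::size_t Channels, std::size_t Height, std::size_t Width) {
    const std::size_t ChannelBlocks = (Channels + BlockSize - 1) / BlockSize;
    const std::size_t cb = c / BlockSize;  // which channel block
    const std::size_t ci = c % BlockSize;  // position inside the block
    return (((n * ChannelBlocks + cb) * Height + h) * Width + w) * BlockSize + ci;
}
```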
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 66c654e4a29e7..3b7c6a95ba98f 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -109,8 +109,6 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
     ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
     ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
-    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
-    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
   )
 
   set(mlas_platform_preprocess_srcs
@@ -134,7 +132,11 @@
     ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
   )
 
-  if (onnxruntime_USE_KLEIDIAI)
+  if (onnxruntime_USE_ARM_NEON_NCHWC)
+    setup_arm_neon_nchwc()
+  endif()
+
+  if (onnxruntime_USE_KLEIDIAI)
     setup_kleidiai()
   endif()
 else()
@@ -289,6 +291,15 @@ function(setup_kleidiai)
   endif()
 endfunction()
 
+function(setup_arm_neon_nchwc)
+  target_sources(onnxruntime_mlas PRIVATE
+    ${MLAS_SRC_DIR}/sconv.h
+    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
+    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
+  )
+  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
+endfunction()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     file(GLOB_RECURSE mlas_platform_srcs
@@ -433,8 +444,6 @@ else()
       ${MLAS_SRC_DIR}/eltwise_kernel_neon.h
       ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
-      ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
-      ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
    )
 
    # Conditionally add the SVE implementation if compiler supports it
@@ -445,7 +454,11 @@ else()
      target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
    endif()
 
-    if (onnxruntime_USE_KLEIDIAI)
+    if (onnxruntime_USE_ARM_NEON_NCHWC)
+      setup_arm_neon_nchwc()
+    endif()
+
+    if (onnxruntime_USE_KLEIDIAI)
      setup_kleidiai()
    endif()
    set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 2e93584095343..8ed6352e7baa7 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
 #endif
-#if defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
 MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,12 +1347,14 @@ struct MLAS_PLATFORM {
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
     MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
     MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
     MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
     MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel;
     MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount];
     uint32_t NchwcBlockSize;
+#endif
 #endif
 
     const MLAS_SYMM_QGEMM_DISPATCH* SymmQgemmDispatch{nullptr};
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 81067095401e7..46fa150395d75 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -560,6 +560,7 @@ Return Value:
     this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
     this->EltwiseDispatch = &MlasEltwiseDispatchNeon;
 
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
     this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
     this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
     this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
@@ -568,6 +569,7 @@ Return Value:
     this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelNeon;
     this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelNeon;
     this->NchwcBlockSize = MLAS_NEON_NCHWC_BLOCK_SIZE;
+#endif
 
     //
     // Check if the processor supports ASIMD dot product instructions.
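The header and platform changes above follow one pattern: the NCHWc kernel pointers and `NchwcBlockSize` are members of `MLAS_PLATFORM` only when `MLAS_USE_ARM_NEON_NCHWC` is defined, and the ARM64 initialization path fills them in under the same guard. A simplified sketch of the idiom (names and values abbreviated; not the actual MLAS declarations):

```cpp
#include <cstddef>
#include <cstdint>

// Stand-in for the real kernel typedefs in mlasi.h.
typedef void (CONV_FLOAT_KERNEL)(const float* Input, float* Output, std::size_t Count);

void ConvNchwcKernelNeonStub(const float*, float*, std::size_t) { /* Neon NCHWc path */ }

struct PlatformSketch {
#if defined(MLAS_USE_ARM_NEON_NCHWC)
    // These members exist only in gated builds: ungated builds never reference
    // them, so the NCHWc code paths (and their symbols) drop out entirely.
    CONV_FLOAT_KERNEL* ConvNchwcFloatKernel = ConvNchwcKernelNeonStub;
    std::uint32_t NchwcBlockSize = 8;  // illustrative value
#endif
};
```

Because the members themselves disappear in ungated builds, the dispatch sites need no runtime check; the preprocessor guarantees the pointers exist exactly when the code that reads them is compiled.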
diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv.h
index 94e657638975a..12ccff2b7ea33 100644
--- a/onnxruntime/core/mlas/lib/sconv.h
+++ b/onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,11 @@ Module Name:
 // Define the convolution kernel flags.
 //
 
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
+
 #define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
 #define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
 #define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004
-#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
\ No newline at end of file
+#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+
+#endif
diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
index 3ecad66a32886..4c5f50adb929c 100644
--- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:
 
 --*/
 
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
+
 #include "mlasi.h"
 #include "sconv.h"
 
@@ -58,7 +60,7 @@ void
     const size_t InputWidthElements = InputWidth / sizeof(float);
     const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);
 
-    (void)InputStride;
+    MLAS_UNREFERENCED_PARAMETER(InputStride);
 
     const size_t TotalOutputCount = OutputCountLeftPad + OutputCount + OutputCountRightPad;
 
@@ -100,7 +102,7 @@ void
                 const float* input_base = Input + output_idx * StrideWidthElements +
                     kh * DilatedInputWidthElements + kw * DilationWidthElements;
 
-                if (IsNchwcFormat) {
+                if constexpr (IsNchwcFormat) {
                     for (size_t filterBlock = 0; filterBlock < BlockSize; filterBlock++) {
                         const float* input_element = input_base + filterBlock;
                         const float* input_row_start = InputBase + kh * DilatedInputWidthElements;
@@ -343,7 +345,7 @@ void
     const size_t InputStrideElements = InputStride / sizeof(float);
     const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);
 
-    (void)InputStrideElements;
+    MLAS_UNREFERENCED_PARAMETER(InputStrideElements);
 
     const size_t InputWidthElements = InputWidth / sizeof(float);
 
@@ -518,3 +520,5 @@ void
         }
     }
 }
+
+#endif
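Two small cleanups ride along in this file: `(void)x;` becomes the MLAS-standard `MLAS_UNREFERENCED_PARAMETER(x)`, and the runtime `if (IsNchwcFormat)` becomes `if constexpr`, which is only legal because `IsNchwcFormat` is a compile-time constant (a template parameter, as the change implies). A self-contained sketch of what `if constexpr` buys here; the function and its layout math are illustrative, not the kernel's actual loop:

```cpp
#include <cstddef>

// IsNchwcFormat mirrors the template parameter in sconv_kernel_neon.cpp.
template <bool IsNchwcFormat>
void CopyRow(const float* input, float* output, std::size_t count, std::size_t blockSize) {
    for (std::size_t i = 0; i < count; ++i) {
        if constexpr (IsNchwcFormat) {
            // Blocked layout: consecutive pixels are blockSize floats apart.
            output[i] = input[i * blockSize];
        } else {
            // Plain NCHW: pixels are contiguous.
            output[i] = input[i];
        }
    }
}

// Each instantiation compiles only its own branch; with a plain runtime `if`,
// both branches stay in the object code and the test repeats per iteration
// unless the optimizer hoists it.
template void CopyRow<true>(const float*, float*, std::size_t, std::size_t);
template void CopyRow<false>(const float*, float*, std::size_t, std::size_t);
```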
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
index 2fc27d6d4ad7f..6f3423a792509 100644
--- a/onnxruntime/core/mlas/lib/snchwc.cpp
+++ b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:
 
 --*/
 {
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
     return GetMlasPlatform().NchwcBlockSize;
 #else
     return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t BlockedOutputWidth = BlockSize * OutputWidth;
 
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
 #endif
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t BlockedOutputWidth = BlockSize * OutputWidth;
 
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
 #endif
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
         const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);
 
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
 #else
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
         const size_t BlockedOutputWidth = BlockSize * OutputWidth;
 
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
 #else
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
 #endif
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
 
 struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
 {
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
     static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
 #endif
 
@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
         const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
         const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;
 
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
 #else
         MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
     }
 };
 
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
 
 MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
 {
@@ -1621,7 +1621,7 @@ Return Value:
     }
 }
 
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
 
 //
 // Convolution and pooling kernel stubs for architectures that do not yet have
diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
index 8cca036d54c3a..588362584791b 100644
--- a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:
 
 --*/
 
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
+
 #include "mlasi.h"
 
 constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE;
@@ -287,3 +289,5 @@ void
         false  // ExcludePad = false
     );
 }
+
+#endif
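The common thread in `snchwc.cpp`: every dispatch site now treats ARM64 as an NCHWc-capable target only when the build flag is also set; otherwise `MlasNchwcGetBlockSize()` returns 1 and the portable stubs apply. Callers can therefore probe support at runtime through the block size. A sketch, with the declaration abbreviated from `mlas.h` and the surrounding helper hypothetical:

```cpp
#include <cstddef>

// Real MLAS API; its gated implementation is the first hunk above.
std::size_t MlasNchwcGetBlockSize();

bool NchwcPathAvailable() {
    // A block size of 1 is the portable fallback, meaning no NCHWc kernels
    // were compiled into this build; anything larger means tensors can be
    // re-blocked and routed through the NCHWc convolution/pooling paths.
    return MlasNchwcGetBlockSize() > 1;
}
```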
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 4438ddba014d0..0513379139464 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,6 +896,9 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]
 
+    if args.enable_arm_neon_nchwc:
+        cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]
+
     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]
 
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 996d46974716e..8c04f8dd46016 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -629,8 +629,18 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
         help="Enable CUDA kernel profiling (requires CUPTI in PATH).",
     )
 
+    # --- CPU ---
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
+    # The following enables building ORT with the ARM Neon NCHWc kernels.
+    # At the time of writing, it is OFF by default because these kernels underperform the regular NCHW
+    # kernels at smaller thread counts, although they deliver a non-negligible speed-up at higher thread
+    # counts on supported ARM platforms.
+    # Once the gap at smaller thread counts is closed, this can be turned on by default.
+    # See https://github.com/microsoft/onnxruntime/pull/25580#issuecomment-3335056846 for benchmarking details.
+    cpu_group.add_argument(
+        "--enable_arm_neon_nchwc", action="store_true", help="Enable building with ARM Neon NCHWc kernels."
+    )
 
     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")
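Usage note: the flag can be enabled either through the build script (`./build.sh --enable_arm_neon_nchwc ...`, with `build.sh`/`build.bat` forwarding arguments to `build.py`) or by passing `-Donnxruntime_USE_ARM_NEON_NCHWC=ON` to CMake directly; the macOS arm64 CI job above exercises the build-script path.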