4 changes: 2 additions & 2 deletions .github/workflows/android.yml
@@ -71,8 +71,8 @@ jobs:
run: |
set -e -x
BINARY_SIZE_THRESHOLD_ARGS=""
echo "Binary size threshold in bytes: 1722565"
BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1722565"
echo "Binary size threshold in bytes: 1436672"
BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"

# Ensure ANDROID_NDK_HOME is available and get its real path
if [ -z "$ANDROID_NDK_HOME" ]; then
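The lower threshold (1722565 → 1436672 bytes) is consistent with the new kernels being compiled out of the default build: with sconv_kernel_neon.cpp and spool_kernel_neon.cpp now gated behind an OFF-by-default option (see the CMake changes below), the Android binary presumably shrinks, and the size check is tightened to match.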
1 change: 1 addition & 0 deletions .github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,6 +62,7 @@ jobs:
--build_objc
--build_java
--build_wheel
+ ${{ matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' }}
${{ inputs.use_webgpu && '--use_webgpu' || '' }}
${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}
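The matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' expression is the standard GitHub Actions stand-in for a ternary: it yields the flag when the condition holds and the empty string otherwise, so only the arm64 matrix entry opts into the new kernels.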
5 changes: 5 additions & 0 deletions cmake/CMakeLists.txt
@@ -88,6 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
+ option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)

option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
# iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
@@ -663,6 +664,10 @@ else()
endif()
endif()

+ if (onnxruntime_USE_ARM_NEON_NCHWC)
+ message(STATUS "Building MLAS with ARM Neon NCHWc kernels")
+ endif()

if(onnxruntime_USE_SVE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
check_cxx_compiler_flag("-march=armv8.2-a+sve" HAS_ARM64_SVE)
25 changes: 19 additions & 6 deletions cmake/onnxruntime_mlas.cmake
@@ -109,8 +109,6 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
- ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
- ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
)

set(mlas_platform_preprocess_srcs
@@ -134,7 +132,11 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
)

- if (onnxruntime_USE_KLEIDIAI)
+ if (onnxruntime_USE_ARM_NEON_NCHWC)
+ setup_arm_neon_nchwc()
+ endif()
+
+ if (onnxruntime_USE_KLEIDIAI)
setup_kleidiai()
endif()
else()
@@ -289,6 +291,15 @@ function(setup_kleidiai)
endif()
endfunction()

+ function (setup_arm_neon_nchwc)
+ target_sources(onnxruntime_mlas PRIVATE
+ ${MLAS_SRC_DIR}/sconv.h
+ ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
+ )
+ target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
+ endfunction ()

if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
file(GLOB_RECURSE mlas_platform_srcs
@@ -433,8 +444,6 @@ else()
${MLAS_SRC_DIR}/eltwise_kernel_neon.h
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
- ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
- ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
)

# Conditionally add the SVE implementation if compiler supports it
@@ -445,7 +454,11 @@
target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
endif()

- if (onnxruntime_USE_KLEIDIAI)
+ if (onnxruntime_USE_ARM_NEON_NCHWC)
+ setup_arm_neon_nchwc()
+ endif()
+
+ if (onnxruntime_USE_KLEIDIAI)
setup_kleidiai()
endif()
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
4 changes: 3 additions & 1 deletion onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
#endif
- #if defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,12 +1347,14 @@ struct MLAS_PLATFORM {
const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
+ #if defined(MLAS_USE_ARM_NEON_NCHWC)
MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel;
MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount];
uint32_t NchwcBlockSize;
+ #endif
#endif
const MLAS_SYMM_QGEMM_DISPATCH* SymmQgemmDispatch{nullptr};

2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/lib/platform.cpp
@@ -560,6 +560,7 @@ Return Value:
this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)
this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
@@ -568,6 +569,7 @@
this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelNeon;
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelNeon;
this->NchwcBlockSize = MLAS_NEON_NCHWC_BLOCK_SIZE;
+ #endif

//
// Check if the processor supports ASIMD dot product instructions.
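For readers new to MLAS internals: the members guarded above are plain function pointers on the process-wide platform singleton, bound once during startup so the hot path pays only an indirect call with no per-call feature checks. A minimal sketch of that pattern with illustrative names and an assumed block size of 8 — only the #if guard mirrors the diff, nothing here is the actual MLAS declaration:

#include <cstddef>
#include <cstdint>

using PoolFloatKernelFn = void (*)(const float* Input, float* Output, std::size_t Count);

void NeonAveragePoolSketch(const float*, float*, std::size_t) { /* NEON body elided */ }
void ScalarAveragePoolSketch(const float*, float*, std::size_t) { /* portable fallback */ }

struct PlatformSketch {
    PoolFloatKernelFn PoolFloatKernel = ScalarAveragePoolSketch;
    std::uint32_t NchwcBlockSize = 1;  // snchwc.cpp treats block size 1 as "no NCHWc reordering"
};

PlatformSketch& GetPlatformSketch() {
    static PlatformSketch p = [] {
        PlatformSketch q;
#if defined(MLAS_USE_ARM_NEON_NCHWC)
        q.PoolFloatKernel = NeonAveragePoolSketch;  // mirrors the assignments in platform.cpp above
        q.NchwcBlockSize = 8;                       // stand-in for MLAS_NEON_NCHWC_BLOCK_SIZE
#endif
        return q;
    }();
    return p;
}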
6 changes: 5 additions & 1 deletion onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,11 @@ Module Name:
// Define the convolution kernel flags.
//

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)
+
#define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
#define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
#define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004
- #define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+ #define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+
+ #endif
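Since the flag values are distinct powers of two, callers OR them into a single word and kernels test each bit independently. A small sketch under that reading; the helper and its boolean parameters are hypothetical, and only the MLAS_CONV_KERNEL_FLAG_* macros come from this header:

// Illustrative helper; only the MLAS_CONV_KERNEL_FLAG_* macros come from sconv.h.
unsigned BuildConvKernelFlags(bool Accumulate, bool AddBias, bool FuseRelu) {
    unsigned KernelFlags = 0;
    if (Accumulate) KernelFlags |= MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT;
    if (AddBias)    KernelFlags |= MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION;
    if (FuseRelu)   KernelFlags |= MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION;
    return KernelFlags;  // a kernel later tests each option with (KernelFlags & FLAG) != 0
}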
10 changes: 7 additions & 3 deletions onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

--*/

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)

#include "mlasi.h"
#include "sconv.h"

@@ -58,7 +60,7 @@ void
const size_t InputWidthElements = InputWidth / sizeof(float);
const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

- (void)InputStride;
+ MLAS_UNREFERENCED_PARAMETER(InputStride);

const size_t TotalOutputCount = OutputCountLeftPad + OutputCount + OutputCountRightPad;

@@ -100,7 +102,7 @@ void
const float* input_base = Input + output_idx * StrideWidthElements +
kh * DilatedInputWidthElements + kw * DilationWidthElements;

- if (IsNchwcFormat) {
+ if constexpr (IsNchwcFormat) {
for (size_t filterBlock = 0; filterBlock < BlockSize; filterBlock++) {
const float* input_element = input_base + filterBlock;
const float* input_row_start = InputBase + kh * DilatedInputWidthElements;
@@ -343,7 +345,7 @@ void
const size_t InputStrideElements = InputStride / sizeof(float);
const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

- (void)InputStrideElements;
+ MLAS_UNREFERENCED_PARAMETER(InputStrideElements);

const size_t InputWidthElements = InputWidth / sizeof(float);

@@ -518,3 +520,5 @@ void
}
}
}

+ #endif
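Two small polish changes in this file: MLAS_UNREFERENCED_PARAMETER replaces the bare (void)x; casts with the library's own macro, and the IsNchwcFormat test becomes if constexpr, turning it into a compile-time branch so each template instantiation compiles only the path it actually uses. A reduced sketch of the if constexpr effect — the template and names here are illustrative, not the actual kernel:

#include <cstddef>

template <bool IsNchwcFormat>
float SumBlockSketch(const float* input, std::size_t blockSize) {
    float sum = 0.0f;
    if constexpr (IsNchwcFormat) {
        // Compiled only into SumBlockSketch<true>: NCHWc walks one interleaved block.
        for (std::size_t i = 0; i < blockSize; ++i) sum += input[i];
    } else {
        // Compiled only into SumBlockSketch<false>: NCHW reads a single channel value.
        sum = input[0];
    }
    return sum;
}

template float SumBlockSketch<true>(const float*, std::size_t);
template float SumBlockSketch<false>(const float*, std::size_t);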
18 changes: 9 additions & 9 deletions onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:

--*/
{
- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
return GetMlasPlatform().NchwcBlockSize;
#else
return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

const size_t BlockedOutputWidth = BlockSize * OutputWidth;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
#else
MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

const size_t BlockedOutputWidth = BlockSize * OutputWidth;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
#else
MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
#else
MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

const size_t BlockedOutputWidth = BlockSize * OutputWidth;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
#else
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
{
- #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+ #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
#endif

@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
#else
MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
}
};

- #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+ #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
{
@@ -1621,7 +1621,7 @@ Return Value:
}
}

- #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+ #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

//
// Convolution and pooling kernel stubs for architectures that do not yet have
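Background for the NchwcBlockSize plumbing above: NCHWc interleaves channels in blocks of the platform's block size, making the block member the innermost, contiguous dimension. A sketch of the index arithmetic, assuming a block size of 8 in the worked comment (the real value is whatever MLAS_NEON_NCHWC_BLOCK_SIZE expands to, and this helper is illustrative, not MLAS code):

#include <cstddef>

// NCHWc index: channels are grouped into blocks of BlockSize and the block
// member becomes the innermost (contiguous) dimension, i.e. the layout is
// [N][Channels/BlockSize][H][W][BlockSize].
std::size_t NchwcIndex(std::size_t n, std::size_t ch, std::size_t h, std::size_t w,
                       std::size_t Channels, std::size_t H, std::size_t W,
                       std::size_t BlockSize) {
    const std::size_t channelBlocks = (Channels + BlockSize - 1) / BlockSize;
    const std::size_t block = ch / BlockSize;   // which channel block
    const std::size_t lane  = ch % BlockSize;   // position within the block
    return (((n * channelBlocks + block) * H + h) * W + w) * BlockSize + lane;
}
// e.g. with BlockSize = 8, channel 10 is lane 2 of block 1, adjacent in memory
// to channels 8..15 at the same (h, w) -- which is what lets the kernels load
// one full channel block with a handful of NEON vector loads.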
4 changes: 4 additions & 0 deletions onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

--*/

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)

#include "mlasi.h"

constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE;
@@ -287,3 +289,5 @@ void
false // ExcludePad = false
);
}

+ #endif
3 changes: 3 additions & 0 deletions tools/ci_build/build.py
@@ -896,6 +896,9 @@ def generate_build_tree(
):
cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

+ if args.enable_arm_neon_nchwc:
+ cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]

if not args.no_sve:
cmake_args += ["-Donnxruntime_USE_SVE=ON"]

10 changes: 10 additions & 0 deletions tools/ci_build/build_args.py
@@ -629,8 +629,18 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
help="Enable CUDA kernel profiling (requires CUPTI in PATH).",
)

# --- CPU ---
cpu_group = parser.add_argument_group("CPU Execution Provider")
cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
+ # The following enables building ORT with NCHWc Neon ARM kernels.
+ # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
+ # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
+ # ARM platforms.
+ # Once the gap is closed for smaller thread counts, it can be turned on by default.
+ # See https://github.com/microsoft/onnxruntime/pull/25580#issuecomment-3335056846 for benchmarking details.
+ cpu_group.add_argument(
+ "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels."
+ )
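With the flag wired through generate_build_tree above, an opt-in build would look something like ./build.sh --config Release --parallel --enable_arm_neon_nchwc (the surrounding arguments are illustrative); without the flag, ARM64 builds keep the portable NCHW path as before.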

# --- DNNL (formerly MKL-DNN / oneDNN) ---
dnnl_group = parser.add_argument_group("DNNL Execution Provider")