From e28e1b61a999f93f7419bd1a0e859e3c9660bc3d Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 19:53:19 -0700
Subject: [PATCH 1/9] Add build option for ARM NCHWc kernels

---
 .github/workflows/android.yml           |  4 ++--
 cmake/CMakeLists.txt                    |  1 +
 cmake/onnxruntime_mlas.cmake            | 24 ++++++++++++++-----
 onnxruntime/core/mlas/lib/mlasi.h       |  4 +++-
 onnxruntime/core/mlas/lib/platform.cpp  |  2 ++
 onnxruntime/core/mlas/lib/sconv.h       |  6 ++++-
 .../core/mlas/lib/sconv_kernel_neon.cpp |  4 ++++
 onnxruntime/core/mlas/lib/snchwc.cpp    | 18 +++++++-------
 .../core/mlas/lib/spool_kernel_neon.cpp |  4 ++++
 tools/ci_build/build.py                 |  3 +++
 tools/ci_build/build_args.py            |  8 +++++++
 11 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/.github/workflows/android.yml b/.github/workflows/android.yml
index 9e1a491d154cf..7f7ff74959d52 100644
--- a/.github/workflows/android.yml
+++ b/.github/workflows/android.yml
@@ -71,8 +71,8 @@ jobs:
         run: |
           set -e -x
           BINARY_SIZE_THRESHOLD_ARGS=""
-          echo "Binary size threshold in bytes: 1722565"
-          BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1722565"
+          echo "Binary size threshold in bytes: 1436672"
+          BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"

           # Ensure ANDROID_NDK_HOME is available and get its real path
           if [ -z "$ANDROID_NDK_HOME" ]; then
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bcab5e9e6fa1b..a5ad7ff7310f2 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -88,6 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
+option(onnxruntime_ARM_USE_NCHWC "Build with ARM NCHWc kernels in MLAS" OFF)
 option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)

 # iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 66c654e4a29e7..33e6c604d299a 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -109,8 +109,6 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
     ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
     ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
-    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
-    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
   )

   set(mlas_platform_preprocess_srcs
@@ -134,7 +132,11 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
   )

-  if (onnxruntime_USE_KLEIDIAI)
+  if (onnxruntime_ARM_USE_NCHWC)
+    setup_arm_nchwc()
+  endif()
+
+  if (onnxruntime_USE_KLEIDIAI)
     setup_kleidiai()
   endif()
 else()
@@ -289,6 +291,14 @@ function(setup_kleidiai)
   endif()
 endfunction()

+function (setup_arm_nchwc)
+  target_sources(onnxruntime_mlas PRIVATE
+    ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
+    ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
+  )
+  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NCHWC)
+endfunction ()
+
 if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
   if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
     file(GLOB_RECURSE mlas_platform_srcs
@@ -433,8 +443,6 @@ else()
       ${MLAS_SRC_DIR}/eltwise_kernel_neon.h
       ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
       ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
-      ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
-      ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
     )

     # Conditionally add the SVE implementation if compiler supports it
@@ -445,7 +453,11 @@ else()
       target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
     endif()

-  if (onnxruntime_USE_KLEIDIAI)
+  if (onnxruntime_ARM_USE_NCHWC)
+    setup_arm_nchwc()
+  endif()
+
+  if (onnxruntime_USE_KLEIDIAI)
     setup_kleidiai()
   endif()
   set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 2e93584095343..05f14549ce955 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
 #endif
-#if defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC)
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
 MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,12 +1347,14 @@ struct MLAS_PLATFORM {
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
+#if defined (MLAS_USE_ARM_NCHWC)
     MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
     MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
     MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
     MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel;
     MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount];
     uint32_t NchwcBlockSize;
+#endif
 #endif

     const MLAS_SYMM_QGEMM_DISPATCH* SymmQgemmDispatch{nullptr};
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 81067095401e7..5388221aec4fa 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -560,6 +560,7 @@ Return Value:
     this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
     this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

+#if defined(MLAS_USE_ARM_NCHWC)
    this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
    this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
    this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
@@ -568,6 +569,7 @@ Return Value:
    this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelNeon;
    this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelNeon;
    this->NchwcBlockSize = MLAS_NEON_NCHWC_BLOCK_SIZE;
+#endif

    //
    // Check if the processor supports ASIMD dot product instructions.
diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv.h
index 94e657638975a..a9e59f6038d4e 100644
--- a/onnxruntime/core/mlas/lib/sconv.h
+++ b/onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,11 @@ Module Name:

 // Define the convolution kernel flags.
 //
+#if defined(MLAS_USE_ARM_NCHWC)
+
 #define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
 #define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
 #define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004
-#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
\ No newline at end of file
+#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+
+#endif
diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
index 3ecad66a32886..f396b08a103c3 100644
--- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

 --*/

+#if defined(MLAS_USE_ARM_NCHWC)
+
 #include "mlasi.h"
 #include "sconv.h"

@@ -518,3 +520,5 @@ void
         }
     }
 }
+
+#endif
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
index 2fc27d6d4ad7f..52aa9eb07311c 100644
--- a/onnxruntime/core/mlas/lib/snchwc.cpp
+++ b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:

 --*/
 {
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
     return GetMlasPlatform().NchwcBlockSize;
 #else
     return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
         const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
 #else
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
 #else
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

 struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
 {
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
     static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
 #endif
@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
         const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
         const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
         MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
 #else
         MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
     }
 };

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))

 MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
 {
@@ -1621,7 +1621,7 @@ Return Value:
     }
 }

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))

 //
 // Convolution and pooling kernel stubs for architectures that do not yet have
diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
index 8cca036d54c3a..5159dca63d2e0 100644
--- a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

 --*/

+#if defined(MLAS_USE_ARM_NCHWC)
+
 #include "mlasi.h"

 constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE;
@@ -287,3 +289,5 @@ void
         false // ExcludePad = false
     );
 }
+
+#endif
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 4438ddba014d0..b62935c992f1d 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,6 +896,9 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

+    if not args.enable_arm_nchwc:
+        cmake_args += ["-Donnxruntime_ARM_USE_NCHWC=ON"]
+
     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 996d46974716e..cb557781292d8 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -629,8 +629,16 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
         help="Enable CUDA kernel profiling (requires CUPTI in PATH).",
     )

+    # --- CPU ---
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
+
+    # The following enables building ORT with NCHWc ARM kernels.
+    # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
+    # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
+    # ARM platforms.
+    # Once the gap is closed for smaller thread counts, it can be turned on by default.
+    cpu_group.add_argument("--enable_arm_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")

     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")

From 45f850cdd0bf184703617aa9eac5ac1bbe42bbe1 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 19:58:08 -0700
Subject: [PATCH 2/9] Fix inverted --enable_arm_nchwc check

---
 tools/ci_build/build.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index b62935c992f1d..d0e9959058deb 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,7 +896,7 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

-    if not args.enable_arm_nchwc:
+    if args.enable_arm_nchwc:
         cmake_args += ["-Donnxruntime_ARM_USE_NCHWC=ON"]

     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]

From ac6cafb80d9be4d5c8645131a38092a9236d4f2b Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 20:09:20 -0700
Subject: [PATCH 3/9] Add message in cmake

---
 cmake/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index a5ad7ff7310f2..8d95d043cac08 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -664,6 +664,10 @@ else()
   endif()
 endif()

+if (onnxruntime_ARM_USE_NCHWC)
+  message(STATUS "Building MLAS with ARM NCHWc kernels")
+endif()
+
 if(onnxruntime_USE_SVE)
   if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
     check_cxx_compiler_flag("-march=armv8.2-a+sve" HAS_ARM64_SVE)

From 513234a915e79d9efe12d1f44bad0f5728c4e7ab Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 25 Sep 2025 20:51:04 -0700
Subject: [PATCH 4/9] Enable NCHWc ARM kernels on macOS only

---
 .github/workflows/macos-ci-build-and-test-workflow.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index c7c35fb234013..61370e098a598 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,6 +62,7 @@ jobs:
           --build_objc
           --build_java
           --build_wheel
+          ${{ matrix.target == 'arm64' && '--enable_arm_nchwc' || '' }}
           ${{ inputs.use_webgpu && '--use_webgpu' || '' }}
           ${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
           ${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}

From 68932dad4c7b21ebbf253e8ec02a36aae1a4c48c Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Fri, 26 Sep 2025 17:41:10 -0700
Subject: [PATCH 5/9] Update tools/ci_build/build_args.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 tools/ci_build/build_args.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index cb557781292d8..518a969ef2a31 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -632,7 +632,6 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # --- CPU ---
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
-
     # The following enables building ORT with NCHWc ARM kernels.
     # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting

From 3029cb4e3313aa68a2000948fc17c5501f7a543a Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Fri, 26 Sep 2025 17:41:17 -0700
Subject: [PATCH 6/9] Update tools/ci_build/build_args.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 tools/ci_build/build_args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 518a969ef2a31..dd429f4e663cb 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -636,7 +636,7 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
-    # Once the gap is closed for smaller thread counts, it can be turned on by default. 
+    # Once the gap is closed for smaller thread counts, it can be turned on by default.
     cpu_group.add_argument("--enable_arm_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")

     # --- DNNL (formerly MKL-DNN / oneDNN) ---

From 98f5d834eedd1505c535181830ee752a3453192f Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Fri, 26 Sep 2025 21:33:12 -0700
Subject: [PATCH 7/9] Reflect Neon in ARM NCHWc option names

---
 .../macos-ci-build-and-test-workflow.yml |  2 +-
 cmake/CMakeLists.txt                     |  6 +++---
 cmake/onnxruntime_mlas.cmake             | 13 +++++++------
 onnxruntime/core/mlas/lib/mlasi.h        |  4 ++--
 onnxruntime/core/mlas/lib/platform.cpp   |  2 +-
 onnxruntime/core/mlas/lib/sconv.h        |  2 +-
 .../core/mlas/lib/sconv_kernel_neon.cpp  |  8 ++++----
 onnxruntime/core/mlas/lib/snchwc.cpp     | 18 +++++++++---------
 .../core/mlas/lib/spool_kernel_neon.cpp  |  2 +-
 tools/ci_build/build.py                  |  4 ++--
 tools/ci_build/build_args.py             |  4 ++--
 11 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/.github/workflows/macos-ci-build-and-test-workflow.yml b/.github/workflows/macos-ci-build-and-test-workflow.yml
index 61370e098a598..329584c68d7d1 100644
--- a/.github/workflows/macos-ci-build-and-test-workflow.yml
+++ b/.github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,7 +62,7 @@ jobs:
           --build_objc
           --build_java
           --build_wheel
-          ${{ matrix.target == 'arm64' && '--enable_arm_nchwc' || '' }}
+          ${{ matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' }}
           ${{ inputs.use_webgpu && '--use_webgpu' || '' }}
           ${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
           ${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8d95d043cac08..a92a0cddefd55 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -88,7 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
-option(onnxruntime_ARM_USE_NCHWC "Build with ARM NCHWc kernels in MLAS" OFF)
+option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)
 option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)

 # iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
@@ -664,8 +664,8 @@ else()
   endif()
 endif()

-if (onnxruntime_ARM_USE_NCHWC)
-  message(STATUS "Building MLAS with ARM NCHWc kernels")
+if (onnxruntime_USE_ARM_NEON_NCHWC)
+  message(STATUS "Building MLAS with ARM Neon NCHWc kernels")
 endif()

 if(onnxruntime_USE_SVE)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 33e6c604d299a..3b7c6a95ba98f 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -132,8 +132,8 @@ function(setup_mlas_source_for_windows)
     ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
   )

-  if (onnxruntime_ARM_USE_NCHWC)
-    setup_arm_nchwc()
+  if (onnxruntime_USE_ARM_NEON_NCHWC)
+    setup_arm_neon_nchwc()
   endif()

   if (onnxruntime_USE_KLEIDIAI)
@@ -291,12 +291,13 @@ function(setup_kleidiai)
   endif()
 endfunction()

-function (setup_arm_nchwc)
+function (setup_arm_neon_nchwc)
   target_sources(onnxruntime_mlas PRIVATE
+    ${MLAS_SRC_DIR}/sconv.h
     ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
     ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
   )
-  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NCHWC)
+  target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
 endfunction ()
@@ -453,8 +454,8 @@ else()
       target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
     endif()

-  if (onnxruntime_ARM_USE_NCHWC)
-    setup_arm_nchwc()
+  if (onnxruntime_USE_ARM_NEON_NCHWC)
+    setup_arm_neon_nchwc()
   endif()

   if (onnxruntime_USE_KLEIDIAI)
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 05f14549ce955..8ed6352e7baa7 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
 MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
 #endif
-#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
 MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
 MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,7 +1347,7 @@ struct MLAS_PLATFORM {
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
     const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
-#if defined (MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
     MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
     MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
     MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 5388221aec4fa..46fa150395d75 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -560,7 +560,7 @@ Return Value:
     this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
     this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)
    this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
    this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
    this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv.h
index a9e59f6038d4e..12ccff2b7ea33 100644
--- a/onnxruntime/core/mlas/lib/sconv.h
+++ b/onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,7 @@ Module Name:

 // Define the convolution kernel flags.
 //
-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)

 #define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
 #define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
index f396b08a103c3..4c5f50adb929c 100644
--- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,7 +14,7 @@ Module Name:

 --*/

-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)

 #include "mlasi.h"
 #include "sconv.h"
@@ -60,7 +60,7 @@ void
     const size_t InputWidthElements = InputWidth / sizeof(float);
     const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

-    (void)InputStride;
+    MLAS_UNREFERENCED_PARAMETER(InputStride);

     const size_t TotalOutputCount = OutputCountLeftPad + OutputCount + OutputCountRightPad;
@@ -102,7 +102,7 @@ void
             const float* input_base = Input + output_idx * StrideWidthElements +
                 kh * DilatedInputWidthElements + kw * DilationWidthElements;

-            if (IsNchwcFormat) {
+            if constexpr (IsNchwcFormat) {
                 for (size_t filterBlock = 0; filterBlock < BlockSize; filterBlock++) {
                     const float* input_element = input_base + filterBlock;
                     const float* input_row_start = InputBase + kh * DilatedInputWidthElements;
@@ -345,7 +345,7 @@ void
     const size_t InputStrideElements = InputStride / sizeof(float);
     const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

-    (void)InputStrideElements;
+    MLAS_UNREFERENCED_PARAMETER(InputStrideElements);

     const size_t InputWidthElements = InputWidth / sizeof(float);
diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp
index 52aa9eb07311c..6f3423a792509 100644
--- a/onnxruntime/core/mlas/lib/snchwc.cpp
+++ b/onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:

 --*/
 {
-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
     return GetMlasPlatform().NchwcBlockSize;
 #else
     return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
 #else
         MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
         const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
         const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
 #else
         MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

         const size_t BlockedOutputWidth = BlockSize * OutputWidth;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
 #else
         MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

 struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
 {
-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
     static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
 #endif
@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
         const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
         const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

-#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
         MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
 #else
         MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
     }
 };

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

 MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
 {
@@ -1621,7 +1621,7 @@ Return Value:
     }
 }

-#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NCHWC))
+#if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

 //
 // Convolution and pooling kernel stubs for architectures that do not yet have
diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
index 5159dca63d2e0..588362584791b 100644
--- a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,7 +14,7 @@ Module Name:

 --*/

-#if defined(MLAS_USE_ARM_NCHWC)
+#if defined(MLAS_USE_ARM_NEON_NCHWC)

 #include "mlasi.h"
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d0e9959058deb..0513379139464 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,8 +896,8 @@ def generate_build_tree(
     ):
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

-    if args.enable_arm_nchwc:
-        cmake_args += ["-Donnxruntime_ARM_USE_NCHWC=ON"]
+    if args.enable_arm_neon_nchwc:
+        cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]

     if not args.no_sve:
         cmake_args += ["-Donnxruntime_USE_SVE=ON"]
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index cb557781292d8..7f19c14c6cbd1 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -633,12 +633,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     cpu_group = parser.add_argument_group("CPU Execution Provider")
     cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
-    # The following enables building ORT with NCHWc ARM kernels.
+    # The following enables building ORT with NCHWc Neon ARM kernels.
     # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
     # Once the gap is closed for smaller thread counts, it can be turned on by default.
-    cpu_group.add_argument("--enable_arm_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")
+    cpu_group.add_argument("--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")

     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")

From 401bf94f24fe13443994a73c424afd11678095af Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Sun, 28 Sep 2025 13:20:22 -0700
Subject: [PATCH 8/9] Update tools/ci_build/build_args.py

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 tools/ci_build/build_args.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 8efc9be18da21..f6f2e69f82827 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -637,7 +637,9 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
     # Once the gap is closed for smaller thread counts, it can be turned on by default.
-    cpu_group.add_argument("--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels.")
+    cpu_group.add_argument(
+        "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels."
+    )

     # --- DNNL (formerly MKL-DNN / oneDNN) ---
     dnnl_group = parser.add_argument_group("DNNL Execution Provider")

From 7a59bd6c0180b9e38750b909ba42bb6da74420b6 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Mon, 29 Sep 2025 10:49:47 -0700
Subject: [PATCH 9/9] Add comment about NCHWc ARM kernel performance

Added a comment regarding the performance of NCHWc ARM kernels and their
default state.
---
 tools/ci_build/build_args.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index f6f2e69f82827..8c04f8dd46016 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -637,6 +637,7 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
     # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
     # ARM platforms.
     # Once the gap is closed for smaller thread counts, it can be turned on by default.
+    # See https://github.com/microsoft/onnxruntime/pull/25580#issuecomment-3335056846 for benchmarking details.
     cpu_group.add_argument(
         "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels."
     )
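
Editor's note (not part of the patch series): a quick way to exercise the option this
series adds. This is a minimal sketch; the --config and --parallel flags are illustrative
choices, while --enable_arm_neon_nchwc and onnxruntime_USE_ARM_NEON_NCHWC are exactly the
names introduced above (the option stays OFF by default, per the comment added to
build_args.py).

    # Via the repo's build script; build.py maps the flag to the CMake option.
    ./build.sh --config Release --parallel --enable_arm_neon_nchwc

    # Or, when configuring CMake directly, pass the option explicitly:
    #   cmake ... -Donnxruntime_USE_ARM_NEON_NCHWC=ON

On an ARM64 build with the option enabled, the block-size getter patched in snchwc.cpp
returns GetMlasPlatform().NchwcBlockSize (set to MLAS_NEON_NCHWC_BLOCK_SIZE in
platform.cpp) instead of 1, which is what routes convolution and pooling through the new
Neon NCHWc kernels.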