Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
cc30a14
Rewire ORT to support a NEON version of NCHWc Conv
Rohanjames1997 Jun 11, 2025
f190c0d
Remove reference to assembly file
Rohanjames1997 Jun 17, 2025
632870b
Add a NEON kernel for Pointwise Convolution
Rohanjames1997 Jun 20, 2025
159570a
Add a NEON kernel for Depthwise
Rohanjames1997 Jun 26, 2025
52f09bf
Remove placeholder implementations
Rohanjames1997 Jun 26, 2025
b505bd6
Add placeholder kernel for MlasConvNchwcFloatKernelNeon
Rohanjames1997 Jul 8, 2025
790cc7e
Fix MlasConvNchwcFloatKernelNeon
Rohanjames1997 Jul 8, 2025
906393a
Use MLAS intrinsics for MlasConvNchwcFloatKernelNeon
Rohanjames1997 Jul 8, 2025
4d322e6
Add MlasConvNchwFloatKernelNeon
Rohanjames1997 Jul 9, 2025
cb06a1a
Add placeholder NCHWc Pool
Rohanjames1997 Jul 10, 2025
00caa4c
Vanilla C++ implementation
Rohanjames1997 Jul 10, 2025
4cead5e
Intrinsics for Pooling
Rohanjames1997 Jul 10, 2025
abd5491
Refactored to share code
Rohanjames1997 Jul 10, 2025
74e0e3b
Format file & delete unused header
Rohanjames1997 Jul 11, 2025
16be947
Minor modifications to pass more tests
Rohanjames1997 Jul 11, 2025
f7d971d
Remove unnecessary code & formatting changes
Rohanjames1997 Jul 15, 2025
0ff394c
Refactor to share some code
Rohanjames1997 Jul 15, 2025
bd2b6c4
Change block size to 16
Rohanjames1997 Jul 17, 2025
2b78377
Update pooling algorithm for block size 16
Rohanjames1997 Jul 17, 2025
ee9b943
Remove comment
Rohanjames1997 Jul 29, 2025
23425e8
Add correct header and refactor kernels to share code.
Rohanjames1997 Sep 9, 2025
7000e9f
Address Copilot comments
Rohanjames1997 Sep 9, 2025
c5c3f05
Extend kernels to Windows & Apple
Rohanjames1997 Sep 9, 2025
619d87c
Merge remote-tracking branch 'upstream/main' into nchwc_conv_pool
Rohanjames1997 Sep 9, 2025
506bf05
Hardcode BlockSize to 16 and add it to the header.
Rohanjames1997 Sep 10, 2025
fb5fb50
Increase android build size to 10% higher than the CI-reported size o…
Rohanjames1997 Sep 12, 2025
fb99f7d
Centralize MLAS_NEON_NCHWC_BLOCK_SIZE
Rohanjames1997 Sep 12, 2025
aa21aca
Merge branch 'microsoft:main' into nchwc_conv_pool
Rohanjames1997 Sep 12, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/android.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ jobs:
run: |
set -e -x
BINARY_SIZE_THRESHOLD_ARGS=""
echo "Binary size threshold in bytes: 1436672"
BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"
echo "Binary size threshold in bytes: 1722565"
BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1722565"

# Ensure ANDROID_NDK_HOME is available and get its real path
if [ -z "$ANDROID_NDK_HOME" ]; then
Expand Down
4 changes: 4 additions & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
${MLAS_SRC_DIR}/spool_kernel_neon.cpp
)

set(mlas_platform_preprocess_srcs
Expand Down Expand Up @@ -431,6 +433,8 @@ else()
${MLAS_SRC_DIR}/eltwise_kernel_neon.h
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
${MLAS_SRC_DIR}/spool_kernel_neon.cpp
)
if (onnxruntime_USE_KLEIDIAI)
setup_kleidiai()
Expand Down
16 changes: 16 additions & 0 deletions onnxruntime/core/mlas/lib/mlasi.h
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,15 @@ extern "C" {
#if defined(__aarch64__) && defined(__linux__)
MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
#endif
#if defined(MLAS_TARGET_ARM64)
MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
MLAS_CONV_POINTWISE_FLOAT_KERNEL MlasConvPointwiseFloatKernelNeon;
MLAS_POOL_FLOAT_KERNEL MlasPoolMaximumFloatKernelNeon;
MLAS_POOL_FLOAT_KERNEL MlasPoolAverageExcludePadFloatKernelNeon;
MLAS_POOL_FLOAT_KERNEL MlasPoolAverageIncludePadFloatKernelNeon;
#endif
MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelZero;
MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelAdd;
Expand Down Expand Up @@ -1335,6 +1344,12 @@ struct MLAS_PLATFORM {
const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel;
MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount];
uint32_t NchwcBlockSize;
#endif
const MLAS_SYMM_QGEMM_DISPATCH* SymmQgemmDispatch{nullptr};

Expand Down Expand Up @@ -1395,6 +1410,7 @@ struct MLAS_PLATFORM {
int32_t MaximumThreadCount;
#elif defined(MLAS_TARGET_ARM64)
static constexpr int32_t MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT * 4;
static constexpr size_t MLAS_NEON_NCHWC_BLOCK_SIZE = 16;
#else
static constexpr int32_t MaximumThreadCount = MLAS_MAXIMUM_THREAD_COUNT;
#endif
Expand Down
9 changes: 9 additions & 0 deletions onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,15 @@ Return Value:
this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
this->ConvPointwiseFloatKernel = MlasConvPointwiseFloatKernelNeon;
this->PoolFloatKernel[MlasMaximumPooling] = MlasPoolMaximumFloatKernelNeon;
this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelNeon;
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelNeon;
this->NchwcBlockSize = MLAS_NEON_NCHWC_BLOCK_SIZE;

//
// Check if the processor supports ASIMD dot product instructions.
//
Expand Down
25 changes: 25 additions & 0 deletions onnxruntime/core/mlas/lib/sconv.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*++

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

Module Name:

sconv.h

Abstract:

This module defines convolution kernel flags for configuring convolution
operations including output accumulation, bias addition, and activations.

--*/

#pragma once

//
// Define the convolution kernel flags.
//
// Each flag occupies a distinct bit, so any combination of them can be OR-ed
// together into a single flags value and tested individually with a bitwise
// AND.
//
// NOTE(review): the per-flag descriptions below are inferred from the flag
// names and this header's abstract; confirm the exact semantics against the
// kernels that consume them.
//

// Output accumulation: presumably the kernel adds into the existing output
// instead of overwriting it.
#define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001

// Bias addition: presumably the kernel adds the bias to the output.
#define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002

// ReLU activation: presumably the kernel applies ReLU to the output.
#define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004

// Other activation: presumably an activation other than ReLU is requested.
#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
Loading
Loading