4 changes: 2 additions & 2 deletions .github/workflows/android.yml
@@ -71,8 +71,8 @@ jobs:
run: |
set -e -x
BINARY_SIZE_THRESHOLD_ARGS=""
echo "Binary size threshold in bytes: 1722565"
BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1722565"
echo "Binary size threshold in bytes: 1436672"
BINARY_SIZE_THRESHOLD_ARGS="--threshold_size_in_bytes 1436672"

# Ensure ANDROID_NDK_HOME is available and get its real path
if [ -z "$ANDROID_NDK_HOME" ]; then
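The lower threshold (1722565 → 1436672 bytes) is consistent with the new kernels being compiled out of the default build: with sconv_kernel_neon.cpp and spool_kernel_neon.cpp now gated behind an OFF-by-default option (see the CMake changes below), the Android binary presumably shrinks, and the size check is tightened to match.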
1 change: 1 addition & 0 deletions .github/workflows/macos-ci-build-and-test-workflow.yml
@@ -62,6 +62,7 @@ jobs:
--build_objc
--build_java
--build_wheel
+ ${{ matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' }}
${{ inputs.use_webgpu && '--use_webgpu' || '' }}
${{ inputs.use_xnnpack && '--use_xnnpack' || '' }}
${{ inputs.use_coreml && '--use_coreml --skip_onnx_tests' || '' }}
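The matrix.target == 'arm64' && '--enable_arm_neon_nchwc' || '' expression is the standard GitHub Actions stand-in for a ternary: it yields the flag when the condition holds and the empty string otherwise, so only the arm64 matrix entry opts into the new kernels.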
5 changes: 5 additions & 0 deletions cmake/CMakeLists.txt
@@ -88,6 +88,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
+ option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)

option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
# iOS simulator build explicitly builds targets with USE_KLEIDIAI=ON so attempting to force override if so
@@ -663,6 +664,10 @@ else()
endif()
endif()

+ if (onnxruntime_USE_ARM_NEON_NCHWC)
+ message(STATUS "Building MLAS with ARM Neon NCHWc kernels")
+ endif()

if(onnxruntime_USE_SVE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
check_cxx_compiler_flag("-march=armv8.2-a+sve" HAS_ARM64_SVE)
25 changes: 19 additions & 6 deletions cmake/onnxruntime_mlas.cmake
@@ -109,8 +109,6 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
- ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
- ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
)

set(mlas_platform_preprocess_srcs
@@ -134,7 +132,11 @@ function(setup_mlas_source_for_windows)
${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
)

- if (onnxruntime_USE_KLEIDIAI)
+ if (onnxruntime_USE_ARM_NEON_NCHWC)
+ setup_arm_neon_nchwc()
+ endif()
+
+ if (onnxruntime_USE_KLEIDIAI)
setup_kleidiai()
endif()
else()
@@ -289,6 +291,15 @@ function(setup_kleidiai)
endif()
endfunction()

+ function (setup_arm_neon_nchwc)
+ target_sources(onnxruntime_mlas PRIVATE
+ ${MLAS_SRC_DIR}/sconv.h
+ ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
+ ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
+ )
+ target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_ARM_NEON_NCHWC)
+ endfunction ()

if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
file(GLOB_RECURSE mlas_platform_srcs
@@ -433,8 +444,6 @@ else()
${MLAS_SRC_DIR}/eltwise_kernel_neon.h
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp
- ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp
- ${MLAS_SRC_DIR}/spool_kernel_neon.cpp
)

# Conditionally add the SVE implementation if compiler supports it
@@ -445,7 +454,11 @@
target_compile_definitions(onnxruntime_mlas PRIVATE MLAS_USE_SVE)
endif()

- if (onnxruntime_USE_KLEIDIAI)
+ if (onnxruntime_USE_ARM_NEON_NCHWC)
+ setup_arm_neon_nchwc()
+ endif()
+
+ if (onnxruntime_USE_KLEIDIAI)
setup_kleidiai()
endif()
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
4 changes: 3 additions & 1 deletion onnxruntime/core/mlas/lib/mlasi.h
@@ -953,7 +953,7 @@ extern "C" {
MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelZero;
MLAS_SBGEMM_FLOAT_KERNEL MlasSbgemmKernelAdd;
#endif
- #if defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC)
MLAS_CONV_FLOAT_KERNEL MlasConvNchwFloatKernelNeon;
MLAS_CONV_FLOAT_KERNEL MlasConvNchwcFloatKernelNeon;
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL MlasConvDepthwiseFloatKernelNeon;
@@ -1347,12 +1347,14 @@ struct MLAS_PLATFORM {
const MLAS_GEMM_QUANT_DISPATCH* GemmU8U8Dispatch;
const MLAS_GEMM_QUANT_DISPATCH* GemmU8S8Dispatch;
const MLAS_GEMM_QUANT_DISPATCH* GemmS8S8Dispatch;
+ #if defined(MLAS_USE_ARM_NEON_NCHWC)
MLAS_CONV_FLOAT_KERNEL* ConvNchwFloatKernel;
MLAS_CONV_FLOAT_KERNEL* ConvNchwcFloatKernel;
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* ConvDepthwiseFloatKernel;
MLAS_CONV_POINTWISE_FLOAT_KERNEL* ConvPointwiseFloatKernel;
MLAS_POOL_FLOAT_KERNEL* PoolFloatKernel[MlasPoolingKindCount];
uint32_t NchwcBlockSize;
+ #endif
#endif
const MLAS_SYMM_QGEMM_DISPATCH* SymmQgemmDispatch{nullptr};

2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/lib/platform.cpp
@@ -560,6 +560,7 @@ Return Value:
this->SoftmaxDispatch = &MlasSoftmaxDispatchNeon;
this->EltwiseDispatch = &MlasEltwiseDispatchNeon;

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)
this->ConvNchwFloatKernel = MlasConvNchwFloatKernelNeon;
this->ConvNchwcFloatKernel = MlasConvNchwcFloatKernelNeon;
this->ConvDepthwiseFloatKernel = MlasConvDepthwiseFloatKernelNeon;
@@ -568,6 +569,7 @@
this->PoolFloatKernel[MlasAveragePoolingExcludePad] = MlasPoolAverageExcludePadFloatKernelNeon;
this->PoolFloatKernel[MlasAveragePoolingIncludePad] = MlasPoolAverageIncludePadFloatKernelNeon;
this->NchwcBlockSize = MLAS_NEON_NCHWC_BLOCK_SIZE;
+ #endif

//
// Check if the processor supports ASIMD dot product instructions.
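For readers new to MLAS internals: the members guarded above are plain function pointers on the process-wide platform singleton, bound once during startup so the hot path pays only an indirect call with no per-call feature checks. A minimal sketch of that pattern with illustrative names and an assumed block size of 8 — only the #if guard mirrors the diff, nothing here is the actual MLAS declaration:

#include <cstddef>
#include <cstdint>

using PoolFloatKernelFn = void (*)(const float* Input, float* Output, std::size_t Count);

void NeonAveragePoolSketch(const float*, float*, std::size_t) { /* NEON body elided */ }
void ScalarAveragePoolSketch(const float*, float*, std::size_t) { /* portable fallback */ }

struct PlatformSketch {
    PoolFloatKernelFn PoolFloatKernel = ScalarAveragePoolSketch;
    std::uint32_t NchwcBlockSize = 1;  // snchwc.cpp treats block size 1 as "no NCHWc reordering"
};

PlatformSketch& GetPlatformSketch() {
    static PlatformSketch p = [] {
        PlatformSketch q;
#if defined(MLAS_USE_ARM_NEON_NCHWC)
        q.PoolFloatKernel = NeonAveragePoolSketch;  // mirrors the assignments in platform.cpp above
        q.NchwcBlockSize = 8;                       // stand-in for MLAS_NEON_NCHWC_BLOCK_SIZE
#endif
        return q;
    }();
    return p;
}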
6 changes: 5 additions & 1 deletion onnxruntime/core/mlas/lib/sconv.h
@@ -19,7 +19,11 @@ Module Name:
// Define the convolution kernel flags.
//

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)
+
#define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT 0x00000001
#define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION 0x00000002
#define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION 0x00000004
- #define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+ #define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION 0x00000008
+
+ #endif
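Since the flag values are distinct powers of two, callers OR them into a single word and kernels test each bit independently. A small sketch under that reading; the helper and its boolean parameters are hypothetical, and only the MLAS_CONV_KERNEL_FLAG_* macros come from this header:

// Illustrative helper; only the MLAS_CONV_KERNEL_FLAG_* macros come from sconv.h.
unsigned BuildConvKernelFlags(bool Accumulate, bool AddBias, bool FuseRelu) {
    unsigned KernelFlags = 0;
    if (Accumulate) KernelFlags |= MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT;
    if (AddBias)    KernelFlags |= MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION;
    if (FuseRelu)   KernelFlags |= MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION;
    return KernelFlags;  // a kernel later tests each option with (KernelFlags & FLAG) != 0
}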
10 changes: 7 additions & 3 deletions onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

--*/

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)

#include "mlasi.h"
#include "sconv.h"

@@ -58,7 +60,7 @@ void
const size_t InputWidthElements = InputWidth / sizeof(float);
const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

- (void)InputStride;
+ MLAS_UNREFERENCED_PARAMETER(InputStride);

const size_t TotalOutputCount = OutputCountLeftPad + OutputCount + OutputCountRightPad;

@@ -100,7 +102,7 @@ void
const float* input_base = Input + output_idx * StrideWidthElements +
kh * DilatedInputWidthElements + kw * DilationWidthElements;

- if (IsNchwcFormat) {
+ if constexpr (IsNchwcFormat) {
for (size_t filterBlock = 0; filterBlock < BlockSize; filterBlock++) {
const float* input_element = input_base + filterBlock;
const float* input_row_start = InputBase + kh * DilatedInputWidthElements;
@@ -343,7 +345,7 @@ void
const size_t InputStrideElements = InputStride / sizeof(float);
const size_t DilatedInputWidthElements = DilatedInputWidth / sizeof(float);

- (void)InputStrideElements;
+ MLAS_UNREFERENCED_PARAMETER(InputStrideElements);

const size_t InputWidthElements = InputWidth / sizeof(float);

@@ -518,3 +520,5 @@ void
}
}
}

+ #endif
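Two small polish changes in this file: MLAS_UNREFERENCED_PARAMETER replaces the bare (void)x; casts with the library's own macro, and the IsNchwcFormat test becomes if constexpr, turning it into a compile-time branch so each template instantiation compiles only the path it actually uses. A reduced sketch of the if constexpr effect — the template and names here are illustrative, not the actual kernel:

#include <cstddef>

template <bool IsNchwcFormat>
float SumBlockSketch(const float* input, std::size_t blockSize) {
    float sum = 0.0f;
    if constexpr (IsNchwcFormat) {
        // Compiled only into SumBlockSketch<true>: NCHWc walks one interleaved block.
        for (std::size_t i = 0; i < blockSize; ++i) sum += input[i];
    } else {
        // Compiled only into SumBlockSketch<false>: NCHW reads a single channel value.
        sum = input[0];
    }
    return sum;
}

template float SumBlockSketch<true>(const float*, std::size_t);
template float SumBlockSketch<false>(const float*, std::size_t);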
18 changes: 9 additions & 9 deletions onnxruntime/core/mlas/lib/snchwc.cpp
@@ -101,7 +101,7 @@ Return Value:

--*/
{
- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
return GetMlasPlatform().NchwcBlockSize;
#else
return 1;
@@ -674,7 +674,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

const size_t BlockedOutputWidth = BlockSize * OutputWidth;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwcFloatKernel;
#else
MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
@@ -784,7 +784,7 @@ struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

const size_t BlockedOutputWidth = BlockSize * OutputWidth;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvNchwFloatKernel;
#else
MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
@@ -879,7 +879,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvPointwiseFloatKernel;
#else
MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
@@ -1016,7 +1016,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

const size_t BlockedOutputWidth = BlockSize * OutputWidth;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = GetMlasPlatform().ConvDepthwiseFloatKernel;
#else
MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
@@ -1093,7 +1093,7 @@ struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
{
- #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+ #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
static MLAS_POOL_FLOAT_KERNEL* const PoolKernels[];
#endif

@@ -1131,7 +1131,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

- #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_ARM64)
+ #if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || (defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))
MLAS_POOL_FLOAT_KERNEL* Kernel = GetMlasPlatform().PoolFloatKernel[WorkBlock->PoolingKind];
#else
MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
@@ -1197,7 +1197,7 @@ struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
}
};

- #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+ #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

MLAS_POOL_FLOAT_KERNEL* const MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
{
@@ -1621,7 +1621,7 @@ Return Value:
}
}

- #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !defined(MLAS_TARGET_ARM64)
+ #if !defined(MLAS_TARGET_AMD64) && !defined(MLAS_TARGET_LARCH64) && !(defined(MLAS_TARGET_ARM64) && defined(MLAS_USE_ARM_NEON_NCHWC))

//
// Convolution and pooling kernel stubs for architectures that do not yet have
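Background for the NchwcBlockSize plumbing above: NCHWc interleaves channels in blocks of the platform's block size, making the block member the innermost, contiguous dimension. A sketch of the index arithmetic, assuming a block size of 8 in the worked comment (the real value is whatever MLAS_NEON_NCHWC_BLOCK_SIZE expands to, and this helper is illustrative, not MLAS code):

#include <cstddef>

// NCHWc index: channels are grouped into blocks of BlockSize and the block
// member becomes the innermost (contiguous) dimension, i.e. the layout is
// [N][Channels/BlockSize][H][W][BlockSize].
std::size_t NchwcIndex(std::size_t n, std::size_t ch, std::size_t h, std::size_t w,
                       std::size_t Channels, std::size_t H, std::size_t W,
                       std::size_t BlockSize) {
    const std::size_t channelBlocks = (Channels + BlockSize - 1) / BlockSize;
    const std::size_t block = ch / BlockSize;   // which channel block
    const std::size_t lane  = ch % BlockSize;   // position within the block
    return (((n * channelBlocks + block) * H + h) * W + w) * BlockSize + lane;
}
// e.g. with BlockSize = 8, channel 10 is lane 2 of block 1, adjacent in memory
// to channels 8..15 at the same (h, w) -- which is what lets the kernels load
// one full channel block with a handful of NEON vector loads.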
4 changes: 4 additions & 0 deletions onnxruntime/core/mlas/lib/spool_kernel_neon.cpp
@@ -14,6 +14,8 @@ Module Name:

--*/

+ #if defined(MLAS_USE_ARM_NEON_NCHWC)

#include "mlasi.h"

constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE;
@@ -287,3 +289,5 @@ void
false // ExcludePad = false
);
}

+ #endif
3 changes: 3 additions & 0 deletions tools/ci_build/build.py
@@ -896,6 +896,9 @@ def generate_build_tree(
):
cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]

+ if args.enable_arm_neon_nchwc:
+ cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]

if not args.no_sve:
cmake_args += ["-Donnxruntime_USE_SVE=ON"]

10 changes: 10 additions & 0 deletions tools/ci_build/build_args.py
@@ -629,8 +629,18 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
help="Enable CUDA kernel profiling (requires CUPTI in PATH).",
)

# --- CPU ---
cpu_group = parser.add_argument_group("CPU Execution Provider")
cpu_group.add_argument("--no_sve", action="store_true", help="Disable building with SVE support.")
+ # The following enables building ORT with NCHWc Neon ARM kernels.
+ # At the time of writing, it is turned OFF by default because its performance relative to "regular" NCHW kernels
+ # is not good at smaller thread counts. But its speed-up is non-negligible with higher thread counts on supporting
+ # ARM platforms.
+ # Once the gap is closed for smaller thread counts, it can be turned on by default.
+ # See https://github.com/microsoft/onnxruntime/pull/25580#issuecomment-3335056846 for benchmarking details.
+ cpu_group.add_argument(
+ "--enable_arm_neon_nchwc", action="store_true", help="Enables building with NCHWc ARM kernels."
+ )
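With the flag wired through generate_build_tree above, an opt-in build would look something like ./build.sh --config Release --parallel --enable_arm_neon_nchwc (the surrounding arguments are illustrative); without the flag, ARM64 builds keep the portable NCHW path as before.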

# --- DNNL (formerly MKL-DNN / oneDNN) ---
dnnl_group = parser.add_argument_group("DNNL Execution Provider")