Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
option(onnxruntime_USE_RVV "Build with RISC-V Vector support in MLAS" OFF)
option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)

option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
Expand Down
44 changes: 43 additions & 1 deletion cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ else()
set(X86 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
set(X86_64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
set(RISCV64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
set(LOONGARCH64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x$")
Expand Down Expand Up @@ -903,6 +905,46 @@ endif()
set(MLAS_SOURCE_IS_NOT_SET 0)
endif()
endif()
if(RISCV64 AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS
"${MLAS_SRC_DIR}/scalar/*.cpp")

if(onnxruntime_USE_RVV)
set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv -mabi=lp64d")
check_cxx_source_compiles("
#include <stddef.h>
#include <riscv_vector.h>
int main() {
size_t vl = __riscv_vsetvl_e32m1(4);
return static_cast<int>(vl == 0);
}"
HAS_RISCV64_RVV
)
unset(CMAKE_REQUIRED_FLAGS)
Comment thread
hariharans29 marked this conversation as resolved.
Outdated

if(HAS_RISCV64_RVV)
list(APPEND mlas_platform_srcs
${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp
${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp
${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp
)
set_source_files_properties(
${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp
${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp
${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp
PROPERTIES COMPILE_FLAGS "-march=rv64gcv -mabi=lp64d")
list(APPEND mlas_private_compile_definitions MLAS_USE_RVV=1)
else()
message(
WARNING
"onnxruntime_USE_RVV was requested, but the compiler does not support rv64gcv RVV intrinsics. Falling back to scalar MLAS kernels.")
endif()
endif()

if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
set(MLAS_SOURCE_IS_NOT_SET 0)
endif()
endif()
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs
"${MLAS_SRC_DIR}/scalar/*.cpp")
Expand Down Expand Up @@ -997,4 +1039,4 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD)
endif()
endif()

endif()
endif()
28 changes: 28 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1400,6 +1400,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

SET(MLAS_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench)
file(GLOB_RECURSE MLAS_BENCH_SOURCE_FILES "${MLAS_BENCH_DIR}/*.cpp" "${MLAS_BENCH_DIR}/*.h")
list(FILTER MLAS_BENCH_SOURCE_FILES EXCLUDE REGEX "${MLAS_BENCH_DIR}/riscv64/.*")
onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES} ${ONNXRUNTIME_ROOT}/core/framework/error_code.cc)
target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
Expand All @@ -1418,6 +1419,33 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo)
endif()
set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest")

endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
set(MLAS_RISCV64_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench/riscv64)

onnxruntime_add_executable(
onnxruntime_mlas_sgemm_riscv_bench
${MLAS_RISCV64_BENCH_DIR}/sgemm_riscv_bench.cpp)
target_include_directories(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(
onnxruntime_mlas_sgemm_riscv_bench
PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
target_compile_definitions(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${mlas_private_compile_definitions})
set_target_properties(onnxruntime_mlas_sgemm_riscv_bench PROPERTIES FOLDER "ONNXRuntimeTest")

onnxruntime_add_executable(
onnxruntime_mlas_softmax_riscv_compare
${MLAS_RISCV64_BENCH_DIR}/softmax_rvv_compare.cpp)
target_include_directories(
onnxruntime_mlas_softmax_riscv_compare
PRIVATE ${ONNXRUNTIME_ROOT} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(
onnxruntime_mlas_softmax_riscv_compare
PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
target_compile_definitions(onnxruntime_mlas_softmax_riscv_compare PRIVATE ${mlas_private_compile_definitions})
set_target_properties(onnxruntime_mlas_softmax_riscv_compare PROPERTIES FOLDER "ONNXRuntimeTest")
endif()

if(WIN32)
Expand Down
3 changes: 3 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ Module Name:
#if defined(__s390x__)
#define MLAS_TARGET_S390X
#endif
#if defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
#define MLAS_TARGET_RISCV64
#endif

#if defined(__VSX__)
#define MLAS_TARGET_POWER
Expand Down
8 changes: 4 additions & 4 deletions onnxruntime/core/mlas/lib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ Return Value:
//
float Maximum;

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D);
#else
Maximum = MlasReduceMaximumF32Kernel(Input, D);
Expand All @@ -894,7 +894,7 @@ Return Value:
float* Temp = LogSoftmax ? nullptr : Output;
float Accumulation;

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
Accumulation = GetMlasPlatform().ComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum);
#else
Accumulation = MlasComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum);
Expand All @@ -910,7 +910,7 @@ Return Value:
//
float Parameters[] = {NegativeMaximum, std::log(Accumulation)};

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters);
#else

Expand All @@ -922,7 +922,7 @@ Return Value:
//
float Parameters[] = {1.0f / Accumulation};

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
#else
MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
Expand Down
41 changes: 38 additions & 3 deletions onnxruntime/core/mlas/lib/mlasi.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE);
//

#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || \
defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X)
defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X) || \
defined(MLAS_TARGET_RISCV64)

typedef
size_t
Expand Down Expand Up @@ -1018,6 +1019,34 @@ extern "C" {
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelLasx;
MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelLasx;
MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelLasx;
#elif defined(MLAS_TARGET_RISCV64)
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelRvv;
void MlasSgemmCopyPackBRvv(
float* D,
const float* B,
size_t ldb,
size_t CountX,
size_t CountY);
Comment thread
hariharans29 marked this conversation as resolved.
size_t MLASCALL MlasSgemmKernelZero(
const float* A,
const float* B,
float* C,
size_t CountK,
size_t CountM,
size_t CountN,
size_t lda,
size_t ldc,
float alpha);
size_t MLASCALL MlasSgemmKernelAdd(
const float* A,
const float* B,
float* C,
size_t CountK,
size_t CountM,
size_t CountN,
size_t lda,
size_t ldc,
float alpha);
#else
MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero;
MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd;
Expand Down Expand Up @@ -1167,6 +1196,12 @@ extern "C" {

MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32Kernel;
MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32Kernel;
#if defined(MLAS_TARGET_RISCV64)
MLAS_COMPUTE_SUMEXP_FLOAT_KERNEL MlasComputeSumExpF32KernelRvv;
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelRvv;
MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelRvv;
MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelRvv;
#endif
#if defined(MLAS_TARGET_AMD64)
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx;
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx512F;
Expand Down Expand Up @@ -1442,7 +1477,7 @@ struct MLAS_PLATFORM {
#endif


#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X)
#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) || defined(MLAS_TARGET_RISCV64)
MLAS_GEMM_FLOAT_KERNEL* GemmFloatKernel;
#endif
#if defined(MLAS_TARGET_LARCH64)
Expand Down Expand Up @@ -1507,7 +1542,7 @@ struct MLAS_PLATFORM {
MLAS_QUANTIZE_LINEAR_U4_KERNEL* QuantizeLinearU4Kernel;
#endif

#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64)
#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_RISCV64)
MLAS_COMPUTE_UNARY_FLOAT_KERNEL* ErfKernelRoutine;
MLAS_COMPUTE_UNARY_FLOAT_KERNEL* LogisticKernelRoutine;
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL* ReduceMaximumF32Kernel;
Expand Down
37 changes: 37 additions & 0 deletions onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ Module Name:
#include "kleidiai/mlasi_kleidiai.h"
#endif

#include <cstdlib>
#include <thread>
#include <mutex>

Expand All @@ -49,6 +50,14 @@ Module Name:
#include <sys/auxv.h>
#endif

#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) && defined(__linux__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#ifndef COMPAT_HWCAP_ISA_V
#define COMPAT_HWCAP_ISA_V (1UL << ('V' - 'A'))
#endif
#endif

#if defined(MLAS_TARGET_ARM64)
#if defined(_WIN32)

Expand Down Expand Up @@ -265,6 +274,34 @@ Return Value:
this->CastF16ToF32Kernel = nullptr;
this->CastF32ToF16Kernel = nullptr;

#if defined(MLAS_TARGET_RISCV64)
this->GemmFloatKernel = nullptr;
this->ErfKernelRoutine = MlasErfKernel;
this->LogisticKernelRoutine = MlasLogisticKernel;
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32Kernel;
this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel;
this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel;

#if defined(MLAS_USE_RVV)
bool has_rvv = true;
#if defined(__linux__)
has_rvv = (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V) != 0;
#endif
const char* force_scalar = std::getenv("ORT_MLAS_RISCV_FORCE_SCALAR");
if (force_scalar != nullptr && force_scalar[0] != '\0' && force_scalar[0] != '0') {
has_rvv = false;
}
Comment thread
hariharans29 marked this conversation as resolved.
Outdated
if (has_rvv) {
this->GemmFloatKernel = MlasGemmFloatKernelRvv;
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelRvv;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelRvv;
this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32KernelRvv;
this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelRvv;
}
#endif
#endif

#if defined(MLAS_TARGET_AMD64_IX86)

//
Expand Down
Loading
Loading