Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
option(onnxruntime_USE_RVV "Build with RISC-V Vector support in MLAS" OFF)
option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)

option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
Expand Down
46 changes: 45 additions & 1 deletion cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,8 @@ else()
set(X86 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
set(X86_64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
set(RISCV64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
set(LOONGARCH64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x$")
Expand Down Expand Up @@ -903,6 +905,48 @@ endif()
set(MLAS_SOURCE_IS_NOT_SET 0)
endif()
endif()
if(RISCV64 AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS
"${MLAS_SRC_DIR}/scalar/*.cpp")

if(onnxruntime_USE_RVV)
set(OLD_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS} -march=rv64gcv -mabi=lp64d")
check_cxx_source_compiles("
#include <stddef.h>
#include <riscv_vector.h>
int main() {
size_t vl = __riscv_vsetvl_e32m1(4);
return static_cast<int>(vl == 0);
}"
HAS_RISCV64_RVV
)
set(CMAKE_REQUIRED_FLAGS "${OLD_CMAKE_REQUIRED_FLAGS}")
unset(OLD_CMAKE_REQUIRED_FLAGS)

if(HAS_RISCV64_RVV)
list(APPEND mlas_platform_srcs
${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp
${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp
${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp
)
set_source_files_properties(
${MLAS_SRC_DIR}/riscv64/sgemm_pack_b_rvv.cpp
${MLAS_SRC_DIR}/riscv64/sgemm_kernel_rvv.cpp
${MLAS_SRC_DIR}/riscv64/softmax_kernel_rvv.cpp
PROPERTIES COMPILE_FLAGS "-march=rv64gcv -mabi=lp64d")
list(APPEND mlas_private_compile_definitions MLAS_USE_RVV=1)
else()
message(
WARNING
"onnxruntime_USE_RVV was requested, but the compiler does not support rv64gcv RVV intrinsics. Falling back to scalar MLAS kernels.")
endif()
endif()

if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
set(MLAS_SOURCE_IS_NOT_SET 0)
endif()
endif()
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
file(GLOB_RECURSE mlas_platform_srcs
"${MLAS_SRC_DIR}/scalar/*.cpp")
Expand Down Expand Up @@ -997,4 +1041,4 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD)
endif()
endif()

endif()
endif()
28 changes: 28 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1400,6 +1400,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

SET(MLAS_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench)
file(GLOB_RECURSE MLAS_BENCH_SOURCE_FILES "${MLAS_BENCH_DIR}/*.cpp" "${MLAS_BENCH_DIR}/*.h")
list(FILTER MLAS_BENCH_SOURCE_FILES EXCLUDE REGEX "${MLAS_BENCH_DIR}/riscv64/.*")
onnxruntime_add_executable(onnxruntime_mlas_benchmark ${MLAS_BENCH_SOURCE_FILES} ${ONNXRUNTIME_ROOT}/core/framework/error_code.cc)
target_include_directories(onnxruntime_mlas_benchmark PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE benchmark::benchmark onnxruntime_util ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
Expand All @@ -1418,6 +1419,33 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo)
endif()
set_target_properties(onnxruntime_mlas_benchmark PROPERTIES FOLDER "ONNXRuntimeTest")

endif()

if(CMAKE_SYSTEM_PROCESSOR MATCHES "^riscv64.*")
set(MLAS_RISCV64_BENCH_DIR ${TEST_SRC_DIR}/mlas/bench/riscv64)

onnxruntime_add_executable(
onnxruntime_mlas_sgemm_riscv_bench
${MLAS_RISCV64_BENCH_DIR}/sgemm_riscv_bench.cpp)
target_include_directories(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(
onnxruntime_mlas_sgemm_riscv_bench
PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
target_compile_definitions(onnxruntime_mlas_sgemm_riscv_bench PRIVATE ${mlas_private_compile_definitions})
set_target_properties(onnxruntime_mlas_sgemm_riscv_bench PROPERTIES FOLDER "ONNXRuntimeTest")

onnxruntime_add_executable(
onnxruntime_mlas_softmax_riscv_compare
${MLAS_RISCV64_BENCH_DIR}/softmax_rvv_compare.cpp)
target_include_directories(
onnxruntime_mlas_softmax_riscv_compare
PRIVATE ${ONNXRUNTIME_ROOT} ${ONNXRUNTIME_ROOT}/core/mlas/inc)
target_link_libraries(
onnxruntime_mlas_softmax_riscv_compare
PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common ${CMAKE_DL_LIBS})
target_compile_definitions(onnxruntime_mlas_softmax_riscv_compare PRIVATE ${mlas_private_compile_definitions})
set_target_properties(onnxruntime_mlas_softmax_riscv_compare PROPERTIES FOLDER "ONNXRuntimeTest")
endif()

if(WIN32)
Expand Down
3 changes: 3 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ Module Name:
#if defined(__s390x__)
#define MLAS_TARGET_S390X
#endif
#if defined(__riscv) && defined(__riscv_xlen) && (__riscv_xlen == 64)
#define MLAS_TARGET_RISCV64
#endif

#if defined(__VSX__)
#define MLAS_TARGET_POWER
Expand Down
8 changes: 4 additions & 4 deletions onnxruntime/core/mlas/lib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,7 @@ Return Value:
//
float Maximum;

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
Maximum = GetMlasPlatform().ReduceMaximumF32Kernel(Input, D);
#else
Maximum = MlasReduceMaximumF32Kernel(Input, D);
Expand All @@ -894,7 +894,7 @@ Return Value:
float* Temp = LogSoftmax ? nullptr : Output;
float Accumulation;

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
Accumulation = GetMlasPlatform().ComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum);
#else
Accumulation = MlasComputeSumExpF32Kernel(Input, Temp, D, &NegativeMaximum);
Expand All @@ -910,7 +910,7 @@ Return Value:
//
float Parameters[] = {NegativeMaximum, std::log(Accumulation)};

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
GetMlasPlatform().ComputeLogSoftmaxOutputF32Kernel(Input, Output, D, Parameters);
#else

Expand All @@ -922,7 +922,7 @@ Return Value:
//
float Parameters[] = {1.0f / Accumulation};

#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE)
#if defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_LARCH64) || defined(MLAS_USE_SVE) || defined(MLAS_TARGET_RISCV64)
GetMlasPlatform().ComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
#else
MlasComputeSoftmaxOutputF32Kernel(Output, D, Parameters);
Expand Down
43 changes: 40 additions & 3 deletions onnxruntime/core/mlas/lib/mlasi.h
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE);
//

#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || \
defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X)
defined(MLAS_TARGET_LARCH64) || defined(MLAS_TARGET_S390X) || \
defined(MLAS_TARGET_RISCV64)

typedef
size_t
Expand Down Expand Up @@ -1018,6 +1019,36 @@ extern "C" {
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelLasx;
MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelLasx;
MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelLasx;
#elif defined(MLAS_TARGET_RISCV64)
#if defined(MLAS_USE_RVV)
MLAS_GEMM_FLOAT_KERNEL MlasGemmFloatKernelRvv;
void MlasSgemmCopyPackBRvv(
float* D,
const float* B,
size_t ldb,
size_t CountX,
size_t CountY);
Comment thread
hariharans29 marked this conversation as resolved.
#endif
size_t MLASCALL MlasSgemmKernelZero(
const float* A,
const float* B,
float* C,
size_t CountK,
size_t CountM,
size_t CountN,
size_t lda,
size_t ldc,
float alpha);
size_t MLASCALL MlasSgemmKernelAdd(
const float* A,
const float* B,
float* C,
size_t CountK,
size_t CountM,
size_t CountN,
size_t lda,
size_t ldc,
float alpha);
#else
MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero;
MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd;
Expand Down Expand Up @@ -1167,6 +1198,12 @@ extern "C" {

MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32Kernel;
MLAS_REDUCE_MINIMUM_MAXIMUM_FLOAT_KERNEL MlasReduceMinimumMaximumF32Kernel;
#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV)
MLAS_COMPUTE_SUMEXP_FLOAT_KERNEL MlasComputeSumExpF32KernelRvv;
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelRvv;
MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeSoftmaxOutputF32KernelRvv;
MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL MlasComputeLogSoftmaxOutputF32KernelRvv;
#endif
#if defined(MLAS_TARGET_AMD64)
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx;
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL MlasReduceMaximumF32KernelAvx512F;
Expand Down Expand Up @@ -1442,7 +1479,7 @@ struct MLAS_PLATFORM {
#endif


#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X)
#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_S390X) || defined(MLAS_TARGET_RISCV64)
MLAS_GEMM_FLOAT_KERNEL* GemmFloatKernel;
#endif
#if defined(MLAS_TARGET_LARCH64)
Expand Down Expand Up @@ -1507,7 +1544,7 @@ struct MLAS_PLATFORM {
MLAS_QUANTIZE_LINEAR_U4_KERNEL* QuantizeLinearU4Kernel;
#endif

#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64)
#if defined(MLAS_USE_SVE) || defined(MLAS_TARGET_AMD64) || defined(MLAS_TARGET_RISCV64)
MLAS_COMPUTE_UNARY_FLOAT_KERNEL* ErfKernelRoutine;
MLAS_COMPUTE_UNARY_FLOAT_KERNEL* LogisticKernelRoutine;
MLAS_REDUCE_MAXIMUM_FLOAT_KERNEL* ReduceMaximumF32Kernel;
Expand Down
79 changes: 78 additions & 1 deletion onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@ Module Name:
#include "kleidiai/mlasi_kleidiai.h"
#endif

#include <thread>
#include <cctype>
#include <cstdlib>
#include <mutex>
#include <thread>

#if defined(MLAS_TARGET_POWER)
#if defined(__linux__)
Expand All @@ -49,6 +51,54 @@ Module Name:
#include <sys/auxv.h>
#endif

#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV) && defined(__linux__)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#ifndef COMPAT_HWCAP_ISA_V
#define COMPAT_HWCAP_ISA_V (1UL << ('V' - 'A'))
#endif
#endif

#if defined(MLAS_TARGET_RISCV64) && defined(MLAS_USE_RVV)
namespace {

bool
MlasStringEqualsIgnoreCase(
const char* value,
const char* expected
)
{
while (*value != '\0' && *expected != '\0') {
const auto lhs = static_cast<unsigned char>(*value);
const auto rhs = static_cast<unsigned char>(*expected);
if (std::tolower(lhs) != std::tolower(rhs)) {
return false;
}
++value;
++expected;
}

return *value == '\0' && *expected == '\0';
}

bool
MlasShouldForceScalarRiscv(
const char* value
)
{
if (value == nullptr || value[0] == '\0') {
return false;
}

return MlasStringEqualsIgnoreCase(value, "1") ||
MlasStringEqualsIgnoreCase(value, "true") ||
MlasStringEqualsIgnoreCase(value, "on") ||
MlasStringEqualsIgnoreCase(value, "yes");
}

} // namespace
#endif

#if defined(MLAS_TARGET_ARM64)
#if defined(_WIN32)

Expand Down Expand Up @@ -265,6 +315,33 @@ Return Value:
this->CastF16ToF32Kernel = nullptr;
this->CastF32ToF16Kernel = nullptr;

#if defined(MLAS_TARGET_RISCV64)
this->GemmFloatKernel = nullptr;
this->ErfKernelRoutine = MlasErfKernel;
this->LogisticKernelRoutine = MlasLogisticKernel;
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32Kernel;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32Kernel;
this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel;
this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel;

#if defined(MLAS_USE_RVV)
bool has_rvv = true;
#if defined(__linux__)
has_rvv = (getauxval(AT_HWCAP) & COMPAT_HWCAP_ISA_V) != 0;
#endif
if (MlasShouldForceScalarRiscv(std::getenv("ORT_MLAS_RISCV_FORCE_SCALAR"))) {
has_rvv = false;
}
if (has_rvv) {
this->GemmFloatKernel = MlasGemmFloatKernelRvv;
this->ReduceMaximumF32Kernel = MlasReduceMaximumF32KernelRvv;
this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelRvv;
this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32KernelRvv;
this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32KernelRvv;
}
#endif
#endif

#if defined(MLAS_TARGET_AMD64_IX86)

//
Expand Down
Loading
Loading