Skip to content
Merged
Show file tree
Hide file tree
Changes from 47 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
83946f3
Cuda Plug EP Core
tianleiwu Mar 20, 2026
1aab300
Add doc
tianleiwu Mar 21, 2026
f97bbe4
ops
tianleiwu Mar 23, 2026
267bb06
remove cuda graph
tianleiwu Mar 23, 2026
e61f27a
review feedback
tianleiwu Mar 23, 2026
44cf955
refactoring
tianleiwu Mar 23, 2026
eb329a4
Copilot feedback
tianleiwu Mar 23, 2026
59fda40
fix test
tianleiwu Mar 24, 2026
87edf0e
add more ops
tianleiwu Mar 24, 2026
5dbba29
add fused conv
tianleiwu Mar 24, 2026
ede1493
Add group norm and qordered ops.
tianleiwu Mar 24, 2026
f4b1881
add ort stream adapter
tianleiwu Mar 24, 2026
2869301
remove duplicated link cudnn; use adapter in llm attention
tianleiwu Mar 24, 2026
dbefbf1
Merge remote-tracking branch 'origin/main' into tlwu/20260320/cuda_pl…
tianleiwu Mar 24, 2026
0bb6422
onnx attention op
tianleiwu Mar 24, 2026
7e17d0a
Merge remote-tracking branch 'origin/main' into tlwu/20260320/cuda_pl…
tianleiwu Mar 24, 2026
ced3fdb
redesign CudaKernelAdapterRuntimeConfig map
tianleiwu Mar 24, 2026
6307a15
refactor ConstantOfShape and other feedbacks
tianleiwu Mar 25, 2026
b3fdf25
update doc; fix test
tianleiwu Mar 25, 2026
8bb2a98
Merge remote-tracking branch 'origin/main' into tlwu/20260320/cuda_pl…
tianleiwu Mar 25, 2026
876c3b6
comments
tianleiwu Mar 25, 2026
45ed1c1
refine
tianleiwu Mar 25, 2026
b35226c
fix Windows build
tianleiwu Mar 25, 2026
c8341c3
refine CleanupDeferredCPUBuffers etc.
tianleiwu Mar 25, 2026
3c7e3e0
CUDA Plugin EP: Test Coverage & Bug Fixes (#27817)
tianleiwu Mar 26, 2026
34a5416
fill controlflow opset gap
tianleiwu Mar 26, 2026
5ac5b9f
ep config
tianleiwu Mar 26, 2026
0c311f9
Add fuse_conv_bias and sdpa_kernel option
tianleiwu Mar 26, 2026
6175224
RestoreDeviceIfKnown and GetCublasHandleOrDefault
tianleiwu Mar 26, 2026
d7f5205
update import style
tianleiwu Mar 26, 2026
827afa7
update tests
tianleiwu Mar 27, 2026
ef4dcc0
review feedback
tianleiwu Mar 27, 2026
5328c53
refactoring
tianleiwu Mar 27, 2026
8da28f5
update doc about arena and resource accounting
tianleiwu Mar 27, 2026
0e105fe
update doc for OpSchema API
tianleiwu Mar 27, 2026
18e4a2e
refine design of config storage
tianleiwu Mar 27, 2026
102b6b2
fx build
tianleiwu Mar 27, 2026
94e6d7a
refine
tianleiwu Mar 27, 2026
ab0d23e
add droput, identity, crop, synamicslice and fft ops
tianleiwu Mar 28, 2026
d201f35
doc: quick start
tianleiwu Mar 28, 2026
8ffb3c2
add script for parity report
tianleiwu Mar 28, 2026
481b895
Add test cases
tianleiwu Mar 29, 2026
c14fb1e
MemcpyToHost and MemcpyFromHosst
tianleiwu Mar 29, 2026
2b994c9
lintrunner
tianleiwu Mar 29, 2026
47f7674
add second gpu test
tianleiwu Mar 29, 2026
5db09d7
Merge remote-tracking branch 'origin/main' into tlwu/20260320/cuda_pl…
tianleiwu Mar 30, 2026
8471fec
Fix dropout and webgpu plugin test failures
tianleiwu Mar 30, 2026
20b9609
review feedback
tianleiwu Mar 30, 2026
d5ff49f
update design doc
tianleiwu Mar 30, 2026
6377dfd
address remaining feedbacks
tianleiwu Mar 30, 2026
3f7ec37
refine code and doc
tianleiwu Mar 30, 2026
a7fb8f1
fix webgpu
tianleiwu Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)

cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF)
cmake_dependent_option(onnxruntime_BUILD_CUDA_EP_AS_PLUGIN "Build CUDA EP as a separate plugin shared library" OFF "onnxruntime_USE_CUDA" OFF)
option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
Expand Down Expand Up @@ -1439,6 +1440,9 @@ if (Git_FOUND)
if (onnxruntime_USE_FP8_KV_CACHE)
string(APPEND ORT_BUILD_INFO "fp8-kv-cache=1, ")
endif()
if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
string(APPEND ORT_BUILD_INFO "cuda-plugin-ep=1, ")
endif()
if (onnxruntime_DUMP_TENSOR)
string(APPEND ORT_BUILD_INFO "dump-tensor=1, ")
endif()
Expand Down Expand Up @@ -1771,6 +1775,11 @@ endif()
foreach(onnxruntime_cmake_file ${ONNXRUNTIME_CMAKE_FILES})
include(${onnxruntime_cmake_file}.cmake)
endforeach()

# CUDA EP Plugin build (independent shared library)
if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
include(onnxruntime_providers_cuda_plugin.cmake)
endif()
if (UNIX)
option(BUILD_PKGCONFIG_FILES "Build and install pkg-config files" ON)
else()
Expand Down
14 changes: 14 additions & 0 deletions cmake/external/cuda_configuration.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,20 @@ macro(setup_cuda_architectures)
set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

unset(ORT_HAS_SM80_OR_LATER)
foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_ORIG)
if(CUDA_ARCH MATCHES "^([0-9]+)")
if(CMAKE_MATCH_1 GREATER_EQUAL 80)
set(ORT_HAS_SM80_OR_LATER ON)
break()
endif()
endif()
endforeach()

if(ORT_HAS_SM80_OR_LATER)
add_definitions("-DHAS_SM80_OR_LATER")
endif()

set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
Expand Down
5 changes: 5 additions & 0 deletions cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
"${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
)
endif()
# Exclude plugin directory if it was picked up by GLOB_RECURSE
list(FILTER onnxruntime_providers_cuda_cc_srcs EXCLUDE REGEX "core/providers/cuda/plugin/.*")

# Remove pch files
list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs
"${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h"
Expand All @@ -43,6 +46,8 @@
"${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
)
endif()
# Exclude plugin directory if it was picked up by GLOB_RECURSE
list(FILTER onnxruntime_providers_cuda_cu_srcs EXCLUDE REGEX "core/providers/cuda/plugin/.*")
source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})

Expand Down
287 changes: 287 additions & 0 deletions cmake/onnxruntime_providers_cuda_plugin.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# Build the CUDA Execution Provider as a plugin shared library.
# This file is included from the main CMakeLists.txt when onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON.

message(STATUS "Building CUDA EP as plugin shared library")



set(CUDA_PLUGIN_EP_DIR "${ONNXRUNTIME_ROOT}/core/providers/cuda/plugin")

# --- Collect standard CUDA EP sources ---
file(GLOB_RECURSE CUDA_EP_CC_SRCS CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
)

file(GLOB_RECURSE CUDA_EP_CU_SRCS CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu"
)

# --- Collect contrib ops sources ---
file(GLOB_RECURSE CUDA_CONTRIB_OPS_CC_SRCS CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/*.cc"
)

file(GLOB_RECURSE CUDA_CONTRIB_OPS_CU_SRCS CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/contrib_ops/cuda/*.cu"
)

list(APPEND CUDA_PLUGIN_EP_CC_SRCS
${CUDA_EP_CC_SRCS}
${CUDA_CONTRIB_OPS_CC_SRCS}
)

list(APPEND CUDA_PLUGIN_EP_CU_SRCS
${CUDA_EP_CU_SRCS}
${CUDA_CONTRIB_OPS_CU_SRCS}
)

list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/aten_ops/.*")
Comment thread
tianleiwu marked this conversation as resolved.
list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/collective/.*")

list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/aten_ops/.*")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/collective/.*")

# Exclude files that include cuda_execution_provider.h (directly or transitively),
# which conflicts with the adapter shim CUDAExecutionProvider class.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_execution_provider\\.cc$")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_provider_factory\\.cc$")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_provider_interface\\.cc$")

# Exclude the framework controlflow/ subdirectory — these inherit from CPU base
# classes (If, Loop, Scan). The plugin has its own control flow wrappers in
# plugin/cuda_controlflow_plugin.cc that delegate to OrtEpApi.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/core/providers/cuda/controlflow/.*")

# Exclude the entire tunable/ subdirectory — it depends on the real CudaTuningContext
# and CUDAExecutionProvider which are not available in the plugin build.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tunable/.*")

# Exclude real EP infrastructure files (replaced by plugin/ equivalents).
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_stream_handle\\.cc$")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_execution_provider_info\\.cc$")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_graph\\.cc$")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_mempool_arena\\.cc$")

# Exclude cuda_common.cc — its HalfGemmOptions definitions conflict with the
# adapter's inline shim. Utility functions are replaced or not needed.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_common\\.cc$")

# Exclude cuda_nhwc_kernels.cc and cuda_contrib_kernels.cc — these files contain
# explicit BuildKernelCreateInfo<> registration tables that reference ALL kernel
# classes (including those in excluded source files like space_depth_ops.cc,
# controlflow/, transformers/, etc.), causing undefined symbols at link time.
# With PluginKernelCollector, individual kernel files self-register via macro
# overrides, so these centralized tables are not needed in the plugin build.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_nhwc_kernels\\.cc$")
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_contrib_kernels\\.cc$")

# Exclude sequence_op.cc — uses TensorSeq (incomplete type in plugin build).
# identity_op.cc is now included: TensorSeq code path is guarded by
# BUILD_CUDA_EP_AS_PLUGIN and opset 14+ registrations use Tensor-only types.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/sequence_op\\.cc$")

# Permanently excluded — pure CPU ops, handled by GetCpuPreferredNodes.
# size.cc registers onnxruntime::Size (CPU op) whose Compute() body lives
# in the CPU provider and is not linked into the plugin.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/size\\.cc$")

# Permanently excluded — pure CPU ops, handled by GetCpuPreferredNodes.
# shape_op.cc inherits from onnxruntime::OpKernel (framework)
# which cannot convert to ep::adapter::OpKernel in the plugin build.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/shape_op\\.cc$")

# Exclude contrib llm/ for now. The core CUDA llm kernels are adapter-safe, but
# contrib llm kernels still need their own plugin pass.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/llm/.*")
list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/llm/.*")

# Exclude contrib training ops (shrunken_gather depends on provider_api.h in header).
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/tensor/shrunken_gather\\.cc$")


# Exclude contrib transformers/ (beam search, greedy search, sampling). Those need subgraph inference.
list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/transformers/.*")
list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/transformers/.*")

# Create shared library target using the ORT helper function for plugins
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
${CUDA_PLUGIN_EP_CC_SRCS}
${CUDA_PLUGIN_EP_CU_SRCS}
)
# Keep the plugin CUDA target aligned with the repo-wide C++20 baseline.
# Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin
# build, as absl::compare expects standard ordering support in this configuration.
set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
CUDA_STANDARD 20
CUDA_STANDARD_REQUIRED ON
)

# Suppress -Werror=maybe-uninitialized for local variables written by
# adapter OpKernelInfo::GetAttr<> (GCC falsely warns about variables that are
# initialized inside GetAttr’s output parameter path).
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:GNU>>:-Wno-maybe-uninitialized>
)
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
# Flash-attention, XQA, MoE, and other pure CUDA kernel .cu files must NOT
# receive the ORT-framework force-include (it conflicts with cute::Tensor etc.).
# cuda_plugin_kernels.cu already #include "cuda_kernel_adapter.h" directly.
# Op-registration .cc files do not include it directly, so they need it here.
#
# IMPORTANT: The CXX force-include order matters — adapters.h MUST precede
# cuda_kernel_adapter.h because the adapter establishes type aliases that the
# kernel adapter header depends on.
#
# Force NVCC onto C++20 explicitly. With the VS generator the CUDA standard
# property alone still leaves `-std=c++17` in AdditionalOptions.
# Suppress NVCC cudafe warnings:
# 550 - variable set but never used (in adapter headers)
# 2810 - [[nodiscard]] false positive on Status assignments in op_kernel.h / kernel_registry.h
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--std c++20>"
"$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr;-Xcudafe;--diag_suppress=550>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=2810>"
"$<$<COMPILE_LANGUAGE:CXX>:-include;${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
"$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
)

if (MSVC)
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4834>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4127>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4211>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /Zc:__cplusplus>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>"
)

target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:/wd4127>"
)
endif()

# Mirror the core CUDA provider's CUDA 12.8+ NVCC workarounds so the plugin
# target handles stricter cudafe diagnostics consistently.
if (DEFINED onnxruntime_NVCC_THREADS)
set(onnxruntime_plugin_nvcc_threads "${onnxruntime_NVCC_THREADS}")
else()
set(onnxruntime_plugin_nvcc_threads "1")
endif()
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">"
"$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>"
)

if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>"
"$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>"
)

if (MSVC)
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4505>"
)
endif()
endif()

include(cudnn_frontend)
include(cutlass)

# --- Find cuDNN (may be at a custom path via onnxruntime_CUDNN_HOME) ---
set(_CUDNN_SEARCH_PATHS "")
if(onnxruntime_CUDNN_HOME)
list(APPEND _CUDNN_SEARCH_PATHS "${onnxruntime_CUDNN_HOME}")
endif()
if(DEFINED ENV{CUDNN_HOME})
list(APPEND _CUDNN_SEARCH_PATHS "$ENV{CUDNN_HOME}")
endif()

set(CUDA_PLUGIN_CUDNN_INCLUDE_DIR ${CUDNN_INCLUDE_DIR})
set(CUDA_PLUGIN_CUDNN_LIBRARY ${cudnn_LIBRARY})

if(NOT CUDA_PLUGIN_CUDNN_INCLUDE_DIR OR NOT CUDA_PLUGIN_CUDNN_LIBRARY)
message(FATAL_ERROR "cuDNN not found (from main ORT search) for CUDA Plugin EP.")
endif()

message(STATUS "CUDA Plugin EP: cuDNN include: ${CUDA_PLUGIN_CUDNN_INCLUDE_DIR}")
message(STATUS "CUDA Plugin EP: cuDNN library: ${CUDA_PLUGIN_CUDNN_LIBRARY}")

# Include directories — only public ORT headers + CUDA toolkit + cuDNN + internal headers for adapter
target_include_directories(onnxruntime_providers_cuda_plugin PRIVATE
${REPO_ROOT}/include
${REPO_ROOT}/include/onnxruntime/core/session
${ONNXRUNTIME_ROOT}
${CUDAToolkit_INCLUDE_DIRS}
${CUDA_PLUGIN_CUDNN_INCLUDE_DIR}
${Eigen3_SOURCE_DIR}
${cutlass_SOURCE_DIR}/include
${cutlass_SOURCE_DIR}/examples
${cutlass_SOURCE_DIR}/tools/util/include
)

onnxruntime_add_include_to_target(
onnxruntime_providers_cuda_plugin
onnxruntime_common
onnx
onnx_proto
${PROTOBUF_LIB}
flatbuffers::flatbuffers
)

# Link libraries
target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE
CUDA::cudart
CUDA::cublas
CUDA::cublasLt
CUDA::cufft
CUDNN::cudnn_all
cudnn_frontend
Boost::mp11
safeint_interface
onnxruntime_framework
onnxruntime_graph
onnxruntime_mlas
onnxruntime_flatbuffers
onnxruntime_common
cpuinfo::cpuinfo
onnx
onnx_proto
${PROTOBUF_LIB}
)

# Symbol visibility — only export CreateEpFactories and ReleaseEpFactory
target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ORT_API_MANUAL_INIT BUILD_CUDA_EP_AS_PLUGIN ORT_USE_EP_API_ADAPTERS=1 ONNX_ML=1 ONNX_NAMESPACE=onnx ONNX_USE_LITE_PROTO=1)

if (onnxruntime_USE_CUDA_NHWC_OPS)
target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_NHWC_OPS)
endif()

if(WIN32)
# Windows: use .def file for symbol exports
set(CUDA_PLUGIN_DEF_FILE ${CUDA_PLUGIN_EP_DIR}/cuda_plugin_ep_symbols.def)
if(EXISTS ${CUDA_PLUGIN_DEF_FILE})
target_sources(onnxruntime_providers_cuda_plugin PRIVATE ${CUDA_PLUGIN_DEF_FILE})
endif()
else()
# Linux/macOS: hide all symbols by default, explicitly export via __attribute__((visibility("default")))
set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
C_VISIBILITY_PRESET hidden
CXX_VISIBILITY_PRESET hidden
)
endif()



# Set output name
set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
OUTPUT_NAME "onnxruntime_providers_cuda_plugin"
)

# Install
install(TARGETS onnxruntime_providers_cuda_plugin
LIBRARY DESTINATION lib
RUNTIME DESTINATION bin
)
13 changes: 13 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1090,6 +1090,10 @@ target_include_directories(onnxruntime_test_all PRIVATE ${ONNXRUNTIME_ROOT}/core

onnxruntime_apply_test_target_workarounds(onnxruntime_test_all)

if (onnxruntime_USE_CUDA AND onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
target_compile_definitions(onnxruntime_test_all PRIVATE ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP=1)
endif()

if (MSVC)
# The warning means the type of two integral values around a binary operator is narrow than their result.
# If we promote the two input values first, it could be more tolerant to integer overflow.
Expand Down Expand Up @@ -1264,6 +1268,10 @@ block()
onnxruntime_apply_test_target_workarounds(onnxruntime_provider_test)
onnxruntime_set_plugin_ep_test_environment(onnxruntime_provider_test)

if (onnxruntime_USE_CUDA AND onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
target_compile_definitions(onnxruntime_provider_test PRIVATE ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP=1)
endif()

# Expose QNN SDK headers to unit tests via an interface target
if(onnxruntime_USE_QNN)
add_library(qnn_sdk_headers_include INTERFACE)
Expand Down Expand Up @@ -1474,6 +1482,11 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
endif()
else()
target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common absl::flags absl::flags_parse ${onnx_test_libs})
# When onnxruntime_BUILD_SHARED_LIB is OFF (the plugin build path), perf test was missing CUDA include directories and CUDA::cudart linkage.
if (onnxruntime_USE_CUDA OR onnxruntime_USE_NV OR onnxruntime_USE_TENSORRT)
target_include_directories(onnxruntime_perf_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(onnxruntime_perf_test PRIVATE CUDA::cudart)
endif()
endif()
set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")

Expand Down
Loading
Loading