microsoft · tianleiwu · Mar 31, 2026 · Mar 20, 2026 · Mar 21, 2026 · Mar 23, 2026
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -84,6 +84,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
 
 cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF)
+cmake_dependent_option(onnxruntime_BUILD_CUDA_EP_AS_PLUGIN "Build CUDA EP as a separate plugin shared library" OFF "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
 option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
 option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
@@ -1439,6 +1440,9 @@ if (Git_FOUND)
   if (onnxruntime_USE_FP8_KV_CACHE)
     string(APPEND ORT_BUILD_INFO "fp8-kv-cache=1, ")
   endif()
+  if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+    string(APPEND ORT_BUILD_INFO "cuda-plugin-ep=1, ")
+  endif()
   if (onnxruntime_DUMP_TENSOR)
     string(APPEND ORT_BUILD_INFO "dump-tensor=1, ")
   endif()
@@ -1771,6 +1775,11 @@ endif()
 foreach(onnxruntime_cmake_file ${ONNXRUNTIME_CMAKE_FILES})
   include(${onnxruntime_cmake_file}.cmake)
 endforeach()
+
+# CUDA EP Plugin build (independent shared library)
+if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+  include(onnxruntime_providers_cuda_plugin.cmake)
+endif()
 if (UNIX)
   option(BUILD_PKGCONFIG_FILES "Build and install pkg-config files" ON)
 else()

diff --git a/cmake/external/cuda_configuration.cmake b/cmake/external/cuda_configuration.cmake
@@ -161,6 +161,20 @@ macro(setup_cuda_architectures)
   set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
   message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")
 
+  # Define HAS_SM80_OR_LATER when any requested architecture is SM80 or newer.
+  unset(ORT_HAS_SM80_OR_LATER)
+  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES_ORIG)
+    if(CUDA_ARCH MATCHES "^([0-9]+)")  # leading digits only; any suffix is ignored
+      if(CMAKE_MATCH_1 GREATER_EQUAL 80)
+        set(ORT_HAS_SM80_OR_LATER ON)
+        break()
+      endif()
+    endif()
+  endforeach()
+  if(ORT_HAS_SM80_OR_LATER)
+    add_compile_definitions(HAS_SM80_OR_LATER)
+  endif()
+
   set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "110" "120")
   foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
     if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -20,6 +20,9 @@
       "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
     )
   endif()
+  # Exclude plugin directory if it was picked up by GLOB_RECURSE
+  list(FILTER onnxruntime_providers_cuda_cc_srcs EXCLUDE REGEX "core/providers/cuda/plugin/.*")
+
   # Remove pch files
   list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs
     "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h"
@@ -43,6 +46,8 @@
         "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
         )
   endif()
+  # Exclude plugin directory if it was picked up by GLOB_RECURSE
+  list(FILTER onnxruntime_providers_cuda_cu_srcs EXCLUDE REGEX "core/providers/cuda/plugin/.*")
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
   set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
 

diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
@@ -0,0 +1,287 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# Build the CUDA Execution Provider as a plugin shared library.
+# This file is included from the main CMakeLists.txt when onnxruntime_BUILD_CUDA_EP_AS_PLUGIN=ON.
+
+message(STATUS "Building CUDA EP as plugin shared library")
+
+
+
+set(CUDA_PLUGIN_EP_DIR "${ONNXRUNTIME_ROOT}/core/providers/cuda/plugin")
+
+# --- Collect standard CUDA EP sources ---
+file(GLOB_RECURSE CUDA_EP_CC_SRCS CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
+)
+
+file(GLOB_RECURSE CUDA_EP_CU_SRCS CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cu"
+)
+
+# --- Collect contrib ops sources ---
+file(GLOB_RECURSE CUDA_CONTRIB_OPS_CC_SRCS CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/*.cc"
+)
+
+file(GLOB_RECURSE CUDA_CONTRIB_OPS_CU_SRCS CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/*.cu"
+)
+
+# Seed both lists with set() rather than list(APPEND) so values left in the including scope cannot leak in.
+set(CUDA_PLUGIN_EP_CC_SRCS
+     ${CUDA_EP_CC_SRCS}
+     ${CUDA_CONTRIB_OPS_CC_SRCS}
+)
+set(CUDA_PLUGIN_EP_CU_SRCS
+     ${CUDA_EP_CU_SRCS}
+     ${CUDA_CONTRIB_OPS_CU_SRCS}
+)
+
+list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/aten_ops/.*")
+list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/collective/.*")
+
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/aten_ops/.*")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX "onnxruntime/contrib_ops/cuda/collective/.*")
+
+# Exclude files that include cuda_execution_provider.h (directly or transitively),
+# which conflicts with the adapter shim CUDAExecutionProvider class.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_execution_provider\\.cc$")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_provider_factory\\.cc$")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_provider_interface\\.cc$")
+
+# Exclude the framework controlflow/ subdirectory — these inherit from CPU base
+# classes (If, Loop, Scan). The plugin has its own control flow wrappers in
+# plugin/cuda_controlflow_plugin.cc that delegate to OrtEpApi.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/core/providers/cuda/controlflow/.*")
+
+# Exclude the entire tunable/ subdirectory — it depends on the real CudaTuningContext
+# and CUDAExecutionProvider which are not available in the plugin build.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tunable/.*")
+
+# Exclude real EP infrastructure files (replaced by plugin/ equivalents).
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_stream_handle\\.cc$")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_execution_provider_info\\.cc$")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_graph\\.cc$")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_mempool_arena\\.cc$")
+
+# Exclude cuda_common.cc — its HalfGemmOptions definitions conflict with the
+# adapter's inline shim. Utility functions are replaced or not needed.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_common\\.cc$")
+
+# Exclude cuda_nhwc_kernels.cc and cuda_contrib_kernels.cc — these files contain
+# explicit BuildKernelCreateInfo<> registration tables that reference ALL kernel
+# classes (including those in excluded source files like space_depth_ops.cc,
+# controlflow/, transformers/, etc.), causing undefined symbols at link time.
+# With PluginKernelCollector, individual kernel files self-register via macro
+# overrides, so these centralized tables are not needed in the plugin build.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_nhwc_kernels\\.cc$")
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/cuda_contrib_kernels\\.cc$")
+
+# Exclude sequence_op.cc — uses TensorSeq (incomplete type in plugin build).
+# identity_op.cc is now included: TensorSeq code path is guarded by
+# BUILD_CUDA_EP_AS_PLUGIN and opset 14+ registrations use Tensor-only types.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/sequence_op\\.cc$")
+
+# Permanently excluded — pure CPU ops, handled by GetCpuPreferredNodes.
+# size.cc registers onnxruntime::Size (CPU op) whose Compute() body lives
+# in the CPU provider and is not linked into the plugin.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/size\\.cc$")
+
+# Permanently excluded — pure CPU ops, handled by GetCpuPreferredNodes.
+# shape_op.cc inherits from onnxruntime::OpKernel (framework)
+# which cannot convert to ep::adapter::OpKernel in the plugin build.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/tensor/shape_op\\.cc$")
+
+# Exclude contrib llm/ for now. The core CUDA llm kernels are adapter-safe, but
+# contrib llm kernels still need their own plugin pass.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/llm/.*")
+list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/llm/.*")
+
+# Exclude contrib training ops (shrunken_gather depends on provider_api.h in header).
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/tensor/shrunken_gather\\.cc$")
+
+
+# Exclude contrib transformers/ (beam search, greedy search, sampling). Those need subgraph inference.
+list(FILTER CUDA_PLUGIN_EP_CC_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/transformers/.*")
+list(FILTER CUDA_PLUGIN_EP_CU_SRCS EXCLUDE REGEX ".*/contrib_ops/cuda/transformers/.*")
+
+# Create shared library target using the ORT helper function for plugins
+onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
+    ${CUDA_PLUGIN_EP_CC_SRCS}
+    ${CUDA_PLUGIN_EP_CU_SRCS}
+)
+# Keep the plugin CUDA target aligned with the repo-wide C++20 baseline.
+# Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin
+# build, as absl::compare expects standard ordering support in this configuration.
+set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+)
+
+# Suppress -Werror=maybe-uninitialized for local variables written by
+# adapter OpKernelInfo::GetAttr<> (GCC falsely warns about variables that are
+# initialized inside GetAttr's output parameter path).
+target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+    $<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:GNU>>:-Wno-maybe-uninitialized>
+)
+target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+    # Pure CUDA kernel .cu files (flash-attention, XQA, MoE, ...) must NOT receive
+    # the ORT-framework force-include (it conflicts with cute::Tensor etc.).
+    # cuda_plugin_kernels.cu already #include "cuda_kernel_adapter.h" directly.
+    # Op-registration .cc files do not include it directly, so they need it here.
+    #
+    # IMPORTANT: the CXX force-include order matters — adapters.h MUST precede
+    # cuda_kernel_adapter.h (it relies on the adapter's type aliases). Flag/argument pairs
+    # use SHELL: so CMake's option de-duplication cannot split `-include` / `-Xcudafe` from its argument.
+    #
+    # Force NVCC onto C++20 explicitly. With the VS generator the CUDA standard
+    # property alone still leaves `-std=c++17` in AdditionalOptions.
+    # Suppress NVCC cudafe warnings:
+    #   550  - variable set but never used (in adapter headers)
+    #   2810 - [[nodiscard]] false positive on Status assignments in op_kernel.h / kernel_registry.h
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--std c++20>"
+    "$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr;SHELL:-Xcudafe --diag_suppress=550>"
+    "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=2810>"
+    "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include \"${REPO_ROOT}/include/onnxruntime/ep/adapters.h\">"
+    "$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include \"${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h\">"
+)
+
+if (MSVC)
+    target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4834>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4127>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4211>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /Zc:__cplusplus>"
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /bigobj>"
+    )
+
+    target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+        "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>"
+    )
+endif()
+
+# Mirror the core CUDA provider's CUDA 12.8+ NVCC workarounds so the plugin
+# target handles stricter cudafe diagnostics consistently.
+if (DEFINED onnxruntime_NVCC_THREADS)
+    set(onnxruntime_plugin_nvcc_threads "${onnxruntime_NVCC_THREADS}")
+else()
+    set(onnxruntime_plugin_nvcc_threads "1")
+endif()
+target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+        "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_plugin_nvcc_threads}\">"
+        "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=177>"
+)
+
+if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+    target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+            "$<$<COMPILE_LANGUAGE:CUDA>:--static-global-template-stub=false>"
+            "$<$<COMPILE_LANGUAGE:CUDA>:--diag-suppress=221>"
+    )
+
+    if (MSVC)
+        target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
+                "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /wd4505>"
+        )
+    endif()
+endif()
+
+include(cudnn_frontend)
+include(cutlass)
+
+# --- Locate cuDNN ---
+# The plugin does not run its own search; it reuses CUDNN_INCLUDE_DIR and
+# cudnn_LIBRARY, which must already be set when this file is included.
+# NOTE(review): a custom location (onnxruntime_CUDNN_HOME / CUDNN_HOME) is
+# presumably honoured by the main ORT cuDNN search that sets them — confirm.
+set(CUDA_PLUGIN_CUDNN_INCLUDE_DIR "${CUDNN_INCLUDE_DIR}")
+set(CUDA_PLUGIN_CUDNN_LIBRARY "${cudnn_LIBRARY}")
+
+# Fail at configure time and name the missing input, rather than failing later at compile or link.
+if(NOT CUDA_PLUGIN_CUDNN_INCLUDE_DIR)
+  message(FATAL_ERROR "cuDNN not found (from main ORT search) for CUDA Plugin EP: CUDNN_INCLUDE_DIR is empty or NOTFOUND.")
+endif()
+if(NOT CUDA_PLUGIN_CUDNN_LIBRARY)
+  message(FATAL_ERROR "cuDNN not found (from main ORT search) for CUDA Plugin EP: cudnn_LIBRARY is empty or NOTFOUND.")
+endif()
+
+message(STATUS "CUDA Plugin EP: cuDNN include: ${CUDA_PLUGIN_CUDNN_INCLUDE_DIR}")
+message(STATUS "CUDA Plugin EP: cuDNN library: ${CUDA_PLUGIN_CUDNN_LIBRARY}")
+
+# Include directories — only public ORT headers + CUDA toolkit + cuDNN + internal headers for adapter
+target_include_directories(onnxruntime_providers_cuda_plugin PRIVATE
+    ${REPO_ROOT}/include
+    ${REPO_ROOT}/include/onnxruntime/core/session
+    ${ONNXRUNTIME_ROOT}
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${CUDA_PLUGIN_CUDNN_INCLUDE_DIR}
+    ${Eigen3_SOURCE_DIR}
+    ${cutlass_SOURCE_DIR}/include
+    ${cutlass_SOURCE_DIR}/examples
+    ${cutlass_SOURCE_DIR}/tools/util/include
+)
+
+onnxruntime_add_include_to_target(
+    onnxruntime_providers_cuda_plugin
+    onnxruntime_common
+    onnx
+    onnx_proto
+    ${PROTOBUF_LIB}
+    flatbuffers::flatbuffers
+)
+
+# Link libraries
+target_link_libraries(onnxruntime_providers_cuda_plugin PRIVATE
+    CUDA::cudart
+    CUDA::cublas
+    CUDA::cublasLt
+    CUDA::cufft
+    CUDNN::cudnn_all
+    cudnn_frontend
+    Boost::mp11
+    safeint_interface
+    onnxruntime_framework
+    onnxruntime_graph
+    onnxruntime_mlas
+    onnxruntime_flatbuffers
+    onnxruntime_common
+    cpuinfo::cpuinfo
+    onnx
+    onnx_proto
+    ${PROTOBUF_LIB}
+)
+
+# Plugin-build compile definitions (symbol visibility is configured separately in the WIN32/else block below)
+target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ORT_API_MANUAL_INIT BUILD_CUDA_EP_AS_PLUGIN ORT_USE_EP_API_ADAPTERS=1 ONNX_ML=1 ONNX_NAMESPACE=onnx ONNX_USE_LITE_PROTO=1)
+
+if (onnxruntime_USE_CUDA_NHWC_OPS)
+    target_compile_definitions(onnxruntime_providers_cuda_plugin PRIVATE ENABLE_CUDA_NHWC_OPS)
+endif()
+
+if(WIN32)
+  # Windows: export symbols through the .def file; when it is absent no export list is added.
+  set(CUDA_PLUGIN_DEF_FILE "${CUDA_PLUGIN_EP_DIR}/cuda_plugin_ep_symbols.def")
+  if(EXISTS "${CUDA_PLUGIN_DEF_FILE}")
+    target_sources(onnxruntime_providers_cuda_plugin PRIVATE "${CUDA_PLUGIN_DEF_FILE}")
+  endif()
+else()
+  # Linux/macOS: hide symbols by default (C, C++ and CUDA host code); export explicitly via __attribute__((visibility("default")))
+  set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
+      C_VISIBILITY_PRESET hidden
+      CXX_VISIBILITY_PRESET hidden
+      CUDA_VISIBILITY_PRESET hidden)
+endif()
+
+
+
+# Set output name
+set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
+    OUTPUT_NAME "onnxruntime_providers_cuda_plugin"
+)
+
+# Install
+install(TARGETS onnxruntime_providers_cuda_plugin
+    LIBRARY DESTINATION lib
+    RUNTIME DESTINATION bin
+)
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -1090,6 +1090,10 @@ target_include_directories(onnxruntime_test_all PRIVATE ${ONNXRUNTIME_ROOT}/core
 
 onnxruntime_apply_test_target_workarounds(onnxruntime_test_all)
 
+if (onnxruntime_USE_CUDA AND onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+  target_compile_definitions(onnxruntime_test_all PRIVATE ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP=1)
+endif()
+
 if (MSVC)
   # The warning means the type of two integral values around a binary operator is narrow than their result.
   # If we promote the two input values first, it could be more tolerant to integer overflow.
@@ -1264,6 +1268,10 @@ block()
   onnxruntime_apply_test_target_workarounds(onnxruntime_provider_test)
   onnxruntime_set_plugin_ep_test_environment(onnxruntime_provider_test)
 
+  if (onnxruntime_USE_CUDA AND onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+    target_compile_definitions(onnxruntime_provider_test PRIVATE ORT_UNIT_TEST_HAS_CUDA_PLUGIN_EP=1)
+  endif()
+
   # Expose QNN SDK headers to unit tests via an interface target
   if(onnxruntime_USE_QNN)
     add_library(qnn_sdk_headers_include INTERFACE)
@@ -1474,6 +1482,11 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
       endif()
     else()
       target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common absl::flags absl::flags_parse ${onnx_test_libs})
+      # With onnxruntime_BUILD_SHARED_LIB=OFF (the plugin build path) the perf test needs the CUDA include directories and CUDA::cudart added explicitly.
+      if (onnxruntime_USE_CUDA OR onnxruntime_USE_NV OR onnxruntime_USE_TENSORRT)
+        target_include_directories(onnxruntime_perf_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        target_link_libraries(onnxruntime_perf_test PRIVATE CUDA::cudart)
+      endif()
     endif()
     set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")