microsoft · tianleiwu · Mar 31, 2026 · Mar 20, 2026 · Mar 21, 2026 · Mar 23, 2026
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -84,6 +84,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)
 
 cmake_dependent_option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" ON "onnxruntime_USE_CUDA" OFF)
+cmake_dependent_option(onnxruntime_BUILD_CUDA_EP_AS_PLUGIN "Build CUDA EP as a separate plugin shared library" OFF "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF)
 option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
 option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
@@ -1431,6 +1432,9 @@ if (Git_FOUND)
   if (onnxruntime_USE_FP8_KV_CACHE)
     string(APPEND ORT_BUILD_INFO "fp8-kv-cache=1, ")
   endif()
+  if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+    string(APPEND ORT_BUILD_INFO "cuda-plugin-ep=1, ")
+  endif()
   if (onnxruntime_DUMP_TENSOR)
     string(APPEND ORT_BUILD_INFO "dump-tensor=1, ")
   endif()
@@ -1763,6 +1767,11 @@ endif()
 foreach(onnxruntime_cmake_file ${ONNXRUNTIME_CMAKE_FILES})
   include(${onnxruntime_cmake_file}.cmake)
 endforeach()
+
+# CUDA EP Plugin build (independent shared library)
+if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
+  include(onnxruntime_providers_cuda_plugin.cmake)
+endif()
 if (UNIX)
   option(BUILD_PKGCONFIG_FILES "Build and install pkg-config files" ON)
 else()

diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -20,6 +20,9 @@
       "${ONNXRUNTIME_ROOT}/core/providers/cuda/*.cc"
     )
   endif()
+  # Exclude plugin directory if it was picked up by GLOB_RECURSE
+  list(FILTER onnxruntime_providers_cuda_cc_srcs EXCLUDE REGEX "core/providers/cuda/plugin/.*")
+
   # Remove pch files
   list(REMOVE_ITEM onnxruntime_providers_cuda_cc_srcs
     "${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_pch.h"
@@ -43,6 +46,8 @@
         "${ONNXRUNTIME_ROOT}/core/providers/cuda/math/unary_elementwise_ops_impl.cu"
         )
   endif()
+  # Exclude plugin directory if it was picked up by GLOB_RECURSE
+  list(FILTER onnxruntime_providers_cuda_cu_srcs EXCLUDE REGEX "core/providers/cuda/plugin/.*")
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
   set(onnxruntime_providers_cuda_src ${onnxruntime_providers_cuda_cc_srcs} ${onnxruntime_providers_cuda_shared_srcs} ${onnxruntime_providers_cuda_cu_srcs})
 

diff --git a/cmake/onnxruntime_providers_cuda_plugin.cmake b/cmake/onnxruntime_providers_cuda_plugin.cmake
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -1461,6 +1461,11 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
       endif()
     else()
       target_link_libraries(onnxruntime_perf_test PRIVATE onnx_test_runner_common absl::flags absl::flags_parse ${onnx_test_libs})
+      # When onnxruntime_BUILD_SHARED_LIB is OFF (the plugin build path), the perf test needs the CUDA include directories and CUDA::cudart added explicitly.
+      if (onnxruntime_USE_CUDA OR onnxruntime_USE_NV OR onnxruntime_USE_TENSORRT)
+        target_include_directories(onnxruntime_perf_test PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        target_link_libraries(onnxruntime_perf_test PRIVATE CUDA::cudart)
+      endif()
     endif()
     set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest")
 

diff --git a/docs/cuda_plugin_ep/cuda_ops_for_plugin_ep.md b/docs/cuda_plugin_ep/cuda_ops_for_plugin_ep.md
diff --git a/docs/cuda_plugin_ep/cuda_plugin_ep_design.md b/docs/cuda_plugin_ep/cuda_plugin_ep_design.md
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
@@ -43,6 +43,16 @@
   // Strive not to allocate Tensor with new/delete as it is a shallow class and using it by value is just fine.
   // Use InitOrtValue() methods to allocate for OrtValue.
 
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+  /// Static factory kept for plugin EP kernels that still call Tensor::Create().
+  /// The main tree deprecated these in favor of constructors, but dynamically-linked
+  /// plugin code relies on the static method.
+  static std::unique_ptr<Tensor> Create(MLDataType elt_type, const TensorShape& shape,
+                                        std::shared_ptr<IAllocator> allocator) {
+    return std::make_unique<Tensor>(elt_type, shape, std::move(allocator));
+  }
+#endif
+
   Tensor() = default;  // to allow creating vector<Tensor> to support seq(tensor)
 
   /**

diff --git a/include/onnxruntime/ep/adapter/kernel_def_builder.h b/include/onnxruntime/ep/adapter/kernel_def_builder.h
@@ -130,6 +130,8 @@ struct KernelDefBuilder {
     return *this;
   }
 
+  // ExecQueueId is intentionally a no-op. The plugin EP manages stream
+  // assignment externally; the queue id hint is not needed.
   KernelDefBuilder& ExecQueueId(int /*queue_id*/) { return *this; }
 
   Ort::KernelDef Build() { return builder_.Build(); }

diff --git a/include/onnxruntime/ep/adapter/node.h b/include/onnxruntime/ep/adapter/node.h
@@ -26,6 +26,11 @@
     return kernel_info_.GetOperatorType();
   }
 
+  /** Gets the Node's domain. */
+  std::string Domain() const {
+    return kernel_info_.GetOperatorDomain();
+  }
+
   /** Gets the since version of the operator. */
   int SinceVersion() const noexcept {
     return kernel_info_.GetOperatorSinceVersion();

diff --git a/include/onnxruntime/ep/adapter/op_kernel.h b/include/onnxruntime/ep/adapter/op_kernel.h
@@ -35,7 +35,7 @@ struct OpKernel {
   explicit OpKernel(const OpKernelInfo& info) : op_kernel_info_{info} {}
   virtual ~OpKernel() {}
 
-  Node Node() const {
+  adapter::Node Node() const {
     return op_kernel_info_.node();
   }
   const OpKernelInfo& Info() const {
@@ -93,6 +93,13 @@ struct OpKernelContext {
     input_tensors_[index] = CreateTensorFromApiValue(const_cast<OrtValue*>(static_cast<const OrtValue*>(input)));
     return &input_tensors_[index];
   }
+  template <typename T,
+            typename = std::enable_if_t<std::is_same_v<T, Tensor>>>
+  const T& RequiredInput(int index) const {
+    auto* input = Input<T>(index);
+    ORT_ENFORCE(input != nullptr, "Required input ", index, " is null");
+    return *input;
+  }
   Tensor* Output(int index, const TensorShape& shape) {
     if (index < 0 || static_cast<size_t>(index) >= output_tensors_.size()) {
       return nullptr;
@@ -109,6 +116,11 @@ struct OpKernelContext {
     output_tensors_[index] = CreateTensorFromApiValue(output);
     return &output_tensors_[index];
   }
+  Tensor& RequiredOutput(int index, const TensorShape& shape) {
+    auto* output = Output(index, shape);
+    ORT_ENFORCE(output != nullptr, "Required output ", index, " is null");
+    return *output;
+  }
   Tensor* Output(int index, const std::vector<int64_t>& shape) {
     return Output(index, TensorShape{shape});
   }
@@ -131,7 +143,6 @@ struct OpKernelContext {
     // TODO(fs-eire): Implement GetUseDeterministicCompute().
     return false;
   }
-
   void* GetGPUComputeStream() const {
     return context_.GetGPUComputeStream();
   }
@@ -146,7 +157,7 @@ struct OpKernelContext {
 };
 
 /// <summary>
-/// A bridge class between `onnxruntime::ep::adapter::OpKernel` and `::OrtKernelImpl`.
+/// A bridge class between `onnxruntime::ep::adapter::OpKernel` and `onnxruntime::OrtKernelImpl`.
 /// </summary>
 struct KernelImpl : OrtKernelImpl {
   explicit KernelImpl(std::unique_ptr<OpKernel> impl)

diff --git a/include/onnxruntime/ep/api.h b/include/onnxruntime/ep/api.h
@@ -15,6 +15,8 @@ namespace onnxruntime {
 namespace ep {
 
 struct ApiPtrs {
+  ApiPtrs(const OrtApi& ort_, const OrtEpApi& ep_, const OrtModelEditorApi& model_editor_)
+      : ort(ort_), ep(ep_), model_editor(model_editor_) {}
   const OrtApi& ort;
   const OrtEpApi& ep;
   const OrtModelEditorApi& model_editor;

diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.cu b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.cu
@@ -974,7 +974,7 @@ Status ComputeSoftmaxWithRawMask(Stream* ort_stream,
 
   if (use_persistent_softmax) {
     return onnxruntime::cuda::dispatch_warpwise_softmax_forward<T, T, float, false>(
-        ort_stream,
+        stream,
         output,
         persistent_softmax_workspace,
         total_sequence_length,

diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_masked_self_attention.cc
@@ -134,7 +134,7 @@ Status DecoderMaskedSelfAttention<T1, T2>::ComputeInternal(OpKernelContext* cont
   int m = batch_size * sequence_length;
   int n = (parameters.hidden_size + parameters.hidden_size + parameters.v_hidden_size);
   int k = parameters.input_hidden_size;
-  gemm_buffer = GetScratchBuffer<T1>(static_cast<size_t>(m) * n, context->GetComputeStream());
+  gemm_buffer = GetScratchBuffer<T1>(static_cast<size_t>(m) * n, GetComputeStream(context));
 
   CudaT one = ToCudaType<T1>::FromFloat(1.0f);
   CudaT zero = ToCudaType<T1>::FromFloat(0.0f);

diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc
@@ -30,6 +30,40 @@ REGISTER_KERNEL_TYPED(double)
 
 using namespace ONNX_NAMESPACE;
 
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+// Validates FastGelu inputs for the plugin build: input 0 must be present with rank >= 1, and the
+// optional bias (input 1), when present, must be 1-D with length equal to the last dimension of input 0.
+static Status CheckInputsForPlugin(const OpKernelContext* context) {
+  const Tensor* input = context->Input<Tensor>(0);
+  const Tensor* bias = context->Input<Tensor>(1);
+
+  // Input<Tensor>() yields a pointer; a missing required input would otherwise be dereferenced below.
+  if (nullptr == input) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 0 is required");
+  }
+
+  const auto& input_dims = input->Shape().GetDims();
+  if (input_dims.size() < 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Input 0 is expected to have 1 or more dimensions, got ", input_dims.size());
+  }
+
+  if (nullptr != bias) {
+    const auto& bias_dims = bias->Shape().GetDims();
+    if (bias_dims.size() != 1) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Input 1 is expected to have 1 dimensions, got ", bias_dims.size());
+    }
+    if (bias_dims[0] != input_dims[input_dims.size() - 1]) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Input 1 dimension 0 should have same length as the last dimension of input 0");
+    }
+  }
+
+  return Status::OK();
+}
+#endif
+
 template <typename T>
 FastGelu<T>::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info) {
   const TransformerOptions* options = TransformerOptions::GetInstance();
@@ -38,7 +72,11 @@ FastGelu<T>::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel
 
 template <typename T>
 Status FastGelu<T>::ComputeInternal(OpKernelContext* context) const {
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+  ORT_RETURN_IF_ERROR(CheckInputsForPlugin(context));
+#else
   ORT_RETURN_IF_ERROR(bias_gelu_helper::CheckInputs(context));
+#endif
 
   const Tensor* input = context->Input<Tensor>(0);
   const Tensor* bias = context->Input<Tensor>(1);

diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc
@@ -142,6 +142,14 @@ GroupQueryAttention<T, U>::GroupQueryAttention(const OpKernelInfo& info)
 // 11. head_sink        (Tensor) - Attention sink for GPT-OSS
 template <typename T, typename U>
 Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) const {
+  // Stream access: void* for GetScratchBuffer, Stream* for QkvToContext.
+#ifdef BUILD_CUDA_EP_AS_PLUGIN
+  onnxruntime::PluginStreamShim plugin_stream_shim(GetComputeStream(context));
+  auto* ort_stream = static_cast<onnxruntime::Stream*>(&plugin_stream_shim);
+#else
+  auto* ort_stream = context->GetComputeStream();
+#endif
+
   const Tensor* query = context->Input<Tensor>(0);
   const Tensor* key = context->Input<Tensor>(1);
   const Tensor* value = context->Input<Tensor>(2);
@@ -259,8 +267,8 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
       parameters.batch_size, parameters.kv_num_heads, parameters.seqlen_present_kv_cache, dense_head_size};
 
   TensorShape present_shape(present_dims);
-  Tensor* present_key_tensor = context->Output(1, present_shape);
-  Tensor* present_value_tensor = context->Output(2, present_shape);
+  Tensor* present_key_output = context->Output(1, present_shape);    // present_key
+  Tensor* present_value_output = context->Output(2, present_shape);  // present_value
 
   IAllocatorUniquePtr<void> k_buffer;
   IAllocatorUniquePtr<void> v_buffer;
@@ -288,8 +296,8 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
   data.past_key = (past_key == nullptr) ? nullptr : reinterpret_cast<const CudaU*>(past_key->Data<U>());
   data.past_value = (past_value == nullptr) ? nullptr : reinterpret_cast<const CudaU*>(past_value->Data<U>());
 
-  data.present_key = reinterpret_cast<CudaU*>(present_key_tensor->MutableData<U>());
-  data.present_value = reinterpret_cast<CudaU*>(present_value_tensor->MutableData<U>());
+  data.present_key = reinterpret_cast<CudaU*>(present_key_output->MutableData<U>());
+  data.present_value = reinterpret_cast<CudaU*>(present_value_output->MutableData<U>());
 
   // Compute past_present_share_buffer early since it's needed for flash attention path selection.
   // This compares the final pointer values after quantization handling.
@@ -370,7 +378,7 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
         xqa_total_bytes += q_bytes + k_bytes;
       }
 
-      xqa_scratch_buffer = this->GetScratchBuffer<void>(xqa_total_bytes, context->GetComputeStream());
+      xqa_scratch_buffer = this->GetScratchBuffer<void>(xqa_total_bytes, GetComputeStream(context));
       data.xqa_buffer = xqa_scratch_buffer.get();
       data.xqa_buffer_bytes = xqa_internal_bytes;
 
@@ -413,11 +421,11 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
       out_accum_bytes = onnxruntime::flash::get_out_accum_size(num_splits, parameters.batch_size, parameters.num_heads, parameters.sequence_length, round_multiple(parameters.head_size, 32));
     }
 
-    softmax_lse_buffer = GetScratchBuffer<void>(softmax_lse_bytes, context->GetComputeStream());
-    softmax_lse_accum_buffer = GetScratchBuffer<void>(softmax_lse_accum_bytes, context->GetComputeStream());
-    out_accum_buffer = GetScratchBuffer<void>(out_accum_bytes, context->GetComputeStream());
+    softmax_lse_buffer = GetScratchBuffer<void>(softmax_lse_bytes, GetComputeStream(context));
+    softmax_lse_accum_buffer = GetScratchBuffer<void>(softmax_lse_accum_bytes, GetComputeStream(context));
+    out_accum_buffer = GetScratchBuffer<void>(out_accum_bytes, GetComputeStream(context));
 
-    auto cuda_stream = static_cast<cudaStream_t>(context->GetComputeStream()->GetHandle());
+    auto cuda_stream = Stream(context);
     if (softmax_lse_accum_bytes > 0) {
       // Initialize to 0 is fine because Flash kernel will write -inf to it if needed.
       // However, the standard Flash kernel often doesn't zero it globally.
@@ -442,8 +450,8 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
   } else {
     // Compute sequence length buffers (past_seq_lens and total_seq_lens).
     // Allocate buffer for both: first half is past_seq_lens, second half is total_seq_lens.
-    seq_lens_buffer = GetScratchBuffer<int>(3 * parameters.batch_size, context->GetComputeStream());
-    auto cuda_stream = static_cast<cudaStream_t>(context->GetComputeStream()->GetHandle());
+    seq_lens_buffer = GetScratchBuffer<int>(3 * parameters.batch_size, GetComputeStream(context));
+    auto cuda_stream = Stream(context);
     data.past_seq_lens = seq_lens_buffer.get();
     data.total_seq_lens = seq_lens_buffer.get() + parameters.batch_size;
     data.padded_seq_lens = data.total_seq_lens + parameters.batch_size;
@@ -480,9 +488,9 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
                                    ? (sizeof(float) * parameters.batch_size * parameters.sequence_length * parameters.num_heads * parameters.head_size)
                                    : 0;
 
-    k_buffer = GetScratchBuffer<void>(kv_buffer_bytes, context->GetComputeStream());
-    v_buffer = GetScratchBuffer<void>(kv_buffer_bytes, context->GetComputeStream());
-    fmha_buffer = GetScratchBuffer<void>(fmha_buffer_bytes, context->GetComputeStream());
+    k_buffer = GetScratchBuffer<void>(kv_buffer_bytes, GetComputeStream(context));
+    v_buffer = GetScratchBuffer<void>(kv_buffer_bytes, GetComputeStream(context));
+    fmha_buffer = GetScratchBuffer<void>(fmha_buffer_bytes, GetComputeStream(context));
 
     data.k = reinterpret_cast<CudaT*>(k_buffer.get());
     data.v = reinterpret_cast<CudaT*>(v_buffer.get());
@@ -501,7 +509,7 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
       data.use_memory_efficient_attention);
 
   if (buffer_req.qkv_buffer_bytes > 0) {
-    unpacked_qkv_buffer = GetScratchBuffer<void>(buffer_req.qkv_buffer_bytes, context->GetComputeStream());
+    unpacked_qkv_buffer = GetScratchBuffer<void>(buffer_req.qkv_buffer_bytes, GetComputeStream(context));
     data.qkv_buffer = reinterpret_cast<CudaT*>(unpacked_qkv_buffer.get());
   }
 
@@ -556,7 +564,7 @@ Status GroupQueryAttention<T, U>::ComputeInternal(OpKernelContext* context) cons
   cublasHandle_t cublas = GetCublasHandle(context);
 
   ORT_RETURN_IF_ERROR((QkvToContext<CudaT, CudaU>(
-      device_prop, cublas, context->GetComputeStream(), parameters, data)));
+      device_prop, cublas, ort_stream, parameters, data)));
   return Status::OK();
 }
 

diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc
@@ -286,7 +286,7 @@ Status PackedAttention<T>::ComputeInternal(OpKernelContext* context) const {
   int m = parameters.token_count;
   int n = parameters.hidden_size + parameters.hidden_size + parameters.v_hidden_size;
   int k = parameters.input_hidden_size;
-  gemm_buffer = this->template GetScratchBuffer<T>(static_cast<size_t>(m) * n, context->GetComputeStream());
+  gemm_buffer = this->template GetScratchBuffer<T>(static_cast<size_t>(m) * n, this->GetComputeStream(context));
 
   cublasHandle_t cublas = this->GetCublasHandle(context);
 
@@ -310,7 +310,7 @@ Status PackedAttention<T>::ComputeInternal(OpKernelContext* context) const {
                                                    false,
                                                    use_memory_efficient_attention,
                                                    no_qkv_workspace);
-  auto work_space = this->template GetScratchBuffer<void>(workSpaceSize, context->GetComputeStream());
+  auto work_space = this->template GetScratchBuffer<void>(workSpaceSize, this->GetComputeStream(context));
 
   typedef typename ToCudaType<T>::MappedType CudaT;
   PackedAttentionData<CudaT> data;

diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc
@@ -267,7 +267,7 @@ Status PackedMultiHeadAttention<T>::ComputeInternal(OpKernelContext* context) co
                                                    use_flash_attention,
                                                    use_memory_efficient_attention,
                                                    no_qkv_workspace);
-  auto work_space = this->template GetScratchBuffer<void>(workSpaceSize, context->GetComputeStream());
+  auto work_space = this->template GetScratchBuffer<void>(workSpaceSize, this->GetComputeStream(context));
 
   PackedMultiHeadAttentionData<CudaT> data;
   data.query = reinterpret_cast<const CudaT*>(query->Data<T>());