Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
6519034
Cuda Arena migration plan
yuslepukhin Apr 1, 2026
26fcaae
Update the design
yuslepukhin Apr 1, 2026
9dad919
Clarify IArena inheritance
yuslepukhin Apr 1, 2026
0027c19
Address review comments
yuslepukhin Apr 1, 2026
ad48120
Clarify Environment::CreateAndRegisterAllocatorV2()
yuslepukhin Apr 1, 2026
93850d9
Address review comments
yuslepukhin Apr 1, 2026
318edae
Re-design for an in-plugin arena using examples as a base
yuslepukhin Apr 2, 2026
b6973b6
Address review comments
yuslepukhin Apr 2, 2026
6748f7d
Re-work inheritance of Cuda Arena allocators
yuslepukhin Apr 2, 2026
2bcd8d3
Adjust CudaMempoolOrtAllocator
yuslepukhin Apr 2, 2026
4730e8d
Address review comments
yuslepukhin Apr 2, 2026
d335e7b
Address comments
yuslepukhin Apr 2, 2026
71c3ec5
Implement Phase I
yuslepukhin Apr 2, 2026
32f1fbc
lintrunner
yuslepukhin Apr 2, 2026
a19d9d3
Address review comments and make this build and test run. Phase I
yuslepukhin Apr 3, 2026
7b3bb5f
Address review comments
yuslepukhin Apr 3, 2026
a71b93a
Address comments
yuslepukhin Apr 4, 2026
1ea0d94
Address comments
yuslepukhin Apr 4, 2026
8f850a3
Address review comments
yuslepukhin Apr 4, 2026
27c3bc4
Integrate CudaMempoolAllocator
yuslepukhin Apr 4, 2026
9c60d8a
Merge branch 'main' into yuslepukhin/cuda_arena_ep
yuslepukhin Apr 6, 2026
2cde673
Address review comments
yuslepukhin Apr 6, 2026
8f81a39
Address review comments, add public Reserve API, improve test coverage
yuslepukhin Apr 6, 2026
552d0e6
address comments
yuslepukhin Apr 6, 2026
700eb6c
Address review issues
yuslepukhin Apr 6, 2026
5a73a66
Add Shrink API
yuslepukhin Apr 6, 2026
c60b59b
Address review comments
yuslepukhin Apr 6, 2026
9961b56
Address review comments
yuslepukhin Apr 7, 2026
121d53b
Merge branch 'main' into yuslepukhin/cuda_arena_ep
yuslepukhin Apr 7, 2026
982eb6a
Add ArenaAllocator wrapper for Shrink and ReleaseStreamBuffers
yuslepukhin Apr 7, 2026
540962d
Address review comments
yuslepukhin Apr 7, 2026
6151008
Update docs
yuslepukhin Apr 7, 2026
9aebc8c
address review comments
yuslepukhin Apr 7, 2026
1c612cc
Address most recent comments
yuslepukhin Apr 7, 2026
da13dd5
Address compile issues. Add test.
yuslepukhin Apr 7, 2026
4eb4238
Merge branch 'main' into yuslepukhin/cuda_arena_ep
yuslepukhin Apr 8, 2026
e0204a8
Address review comments
yuslepukhin Apr 8, 2026
65769d5
Build error
yuslepukhin Apr 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 17 additions & 15 deletions cmake/onnxruntime_providers_cuda_plugin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_plugin
${CUDA_PLUGIN_EP_CC_SRCS}
${CUDA_PLUGIN_EP_CU_SRCS}
)

# Mirror directory structure in the Visual Studio solution tree under "onnxruntime".
source_group(TREE ${ONNXRUNTIME_ROOT} PREFIX "onnxruntime" FILES ${CUDA_EP_CC_SRCS} ${CUDA_EP_CU_SRCS})
source_group(TREE ${ONNXRUNTIME_ROOT} PREFIX "onnxruntime" FILES ${CUDA_CONTRIB_OPS_CC_SRCS} ${CUDA_CONTRIB_OPS_CU_SRCS})
# Keep the plugin CUDA target aligned with the repo-wide C++20 baseline.
# Forcing CUDA C++17 here breaks newer protobuf/absl headers used by the plugin
# build, as absl::compare expects standard ordering support in this configuration.
Expand Down Expand Up @@ -143,22 +147,14 @@ target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--std c++20>"
"$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr;-Xcudafe;--diag_suppress=550>"
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcudafe --diag_suppress=2810>"
# Force-include adapters.h and cuda_kernel_adapter.h for CXX sources.
# GCC/Clang use -include, MSVC uses /FI.
"$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:-include;${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
"$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<NOT:$<CXX_COMPILER_ID:MSVC>>>:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
"$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/FI${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
"$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CXX_COMPILER_ID:MSVC>>:/FI${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
)

# Force-include adapter headers for CXX files.
# MSVC uses /FI; GCC/Clang use -include.
if (MSVC)
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:SHELL:/FI \"${REPO_ROOT}/include/onnxruntime/ep/adapters.h\">"
"$<$<COMPILE_LANGUAGE:CXX>:SHELL:/FI \"${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h\">"
)
else()
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include ${REPO_ROOT}/include/onnxruntime/ep/adapters.h>"
"$<$<COMPILE_LANGUAGE:CXX>:SHELL:-include ${CUDA_PLUGIN_EP_DIR}/cuda_kernel_adapter.h>"
)
endif()

if (MSVC)
target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
"$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-Xcompiler /permissive>"
Expand All @@ -170,6 +166,11 @@ if (MSVC)
)

target_compile_options(onnxruntime_providers_cuda_plugin PRIVATE
# /permissive is required for CUTLASS cute headers (cute::stride.hpp, cute::Layout etc.)
"$<$<COMPILE_LANGUAGE:CXX>:/permissive>"
Comment thread
yuslepukhin marked this conversation as resolved.
# /permissive disables C++ alternative tokens (or, and, not, etc.).
# Force-include iso646.h to restore them as macros.
"$<$<COMPILE_LANGUAGE:CXX>:/FIiso646.h>"
"$<$<COMPILE_LANGUAGE:CXX>:/wd4127>"
)
endif()
Expand Down Expand Up @@ -287,9 +288,10 @@ endif()



# Set output name
# Set output name and solution folder
set_target_properties(onnxruntime_providers_cuda_plugin PROPERTIES
OUTPUT_NAME "onnxruntime_providers_cuda_plugin"
FOLDER "ONNXRuntime"
)

# Install
Expand Down
7 changes: 7 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -509,6 +509,13 @@ if (onnxruntime_USE_CUDA AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_R
)
list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_src})

if (onnxruntime_BUILD_CUDA_EP_AS_PLUGIN)
# When the CUDA EP is built as a plugin, also compile the plugin-specific
# provider tests so they run as part of the main provider test suite.
file(GLOB onnxruntime_test_providers_cuda_plugin_src CONFIGURE_DEPENDS
"${TEST_SRC_DIR}/providers/cuda/plugin/*.cc"
)
list(APPEND onnxruntime_test_providers_src ${onnxruntime_test_providers_cuda_plugin_src})
endif()

if (onnxruntime_USE_CUDA_NHWC_OPS AND CUDNN_MAJOR_VERSION GREATER 8)
file(GLOB onnxruntime_test_providers_cuda_nhwc_src CONFIGURE_DEPENDS
"${TEST_SRC_DIR}/providers/cuda/nhwc/*.cc"
Expand Down
757 changes: 757 additions & 0 deletions docs/cuda_plugin_ep/arena_allocator_migration_design.md

Large diffs are not rendered by default.

118 changes: 50 additions & 68 deletions docs/cuda_plugin_ep/cuda_plugin_ep_design.md

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions include/onnxruntime/core/framework/allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,11 @@ class IAllocator {
*stats = {};
}

// Returns a pointer to this allocator as an IArena if it is one, nullptr otherwise.
// Used by SafeArenaCast to avoid dependency on RTTI.
virtual class IArena* AsArena() { return nullptr; }
virtual const class IArena* AsArena() const { return nullptr; }

static bool CalcMemSizeForArray(size_t nmemb, size_t size, size_t* out) noexcept {
return CalcMemSizeForArrayWithAlignment(nmemb, size, 0, out);
}
Expand Down Expand Up @@ -364,6 +369,8 @@ class IArena : public IAllocator {
virtual Status Shrink() = 0;
// Only implemented when IsStreamAware() returns true
virtual void ReleaseStreamBuffers(Stream* /*stream*/) {}
IArena* AsArena() override { return this; }
const IArena* AsArena() const override { return this; }
static IArena* SafeArenaCast(IAllocator* allocator);
};

Expand Down
16 changes: 16 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,22 @@ typedef struct OrtAllocator {
* \since 1.23
*/
void*(ORT_API_CALL* AllocOnStream)(struct OrtAllocator* this_, size_t size, OrtSyncStream* stream);

/** \brief Release unused memory held by the allocator back to the system.
*
* For arena-based allocators, this frees allocation regions that are completely unused.
* For mempool-based allocators, this trims the pool to a configured minimum.
* For non-arena allocators this is a no-op.
*
* \param[in] this_ OrtAllocator instance
*
* \return nullptr on success, or an OrtStatus* on failure.
*
* \note Implementation of this function is optional and Shrink may be set to a nullptr.
* Callers must check for nullptr before invoking.
* \since 1.25
*/
ORT_API2_STATUS(Shrink, _In_ struct OrtAllocator* this_);
Comment thread
yuslepukhin marked this conversation as resolved.
} OrtAllocator;

typedef void(ORT_API_CALL* OrtLoggingFunction)(
Expand Down
7 changes: 7 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -1048,6 +1048,7 @@ struct AllocatorImpl : Base<T> {
using B::B;

void* Alloc(size_t size);
void* Reserve(size_t size);
MemoryAllocation GetAllocation(size_t size);
void Free(void* p);
ConstMemoryInfo GetInfo() const;
Expand All @@ -1057,6 +1058,12 @@ struct AllocatorImpl : Base<T> {
* \return A pointer to a KeyValuePairs object that will be filled with the allocator statistics.
*/
KeyValuePairs GetStats() const;

/** \brief Release unused memory held by the allocator.
*
* Calls the optional Shrink function pointer if available; does nothing otherwise.
*/
void Shrink();
};
} // namespace detail

Expand Down
22 changes: 22 additions & 0 deletions include/onnxruntime/core/session/onnxruntime_cxx_inline.h
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,19 @@ inline void* AllocatorImpl<T>::Alloc(size_t size) {
return out;
}

/// \brief Reserve memory through the underlying OrtAllocator.
///
/// Dispatches to the allocator's optional Reserve callback when available,
/// otherwise degrades to a plain Alloc call.
template <typename T>
inline void* AllocatorImpl<T>::Reserve(size_t size) {
  auto* ort_allocator = this->p_;
  // The Reserve slot was introduced with API version 18. On an allocator
  // built against an older header that field may be uninitialized memory,
  // so the version must be checked before the pointer is even read.
  const bool has_reserve = ort_allocator->version >= 18 && ort_allocator->Reserve != nullptr;
  if (!has_reserve) {
    // Mirror the ORT-core adapter behavior (IAllocatorImplWrappingOrtAllocator,
    // IArenaImplWrappingOrtAllocator): treat Reserve as an ordinary allocation.
    return ort_allocator->Alloc(ort_allocator, size);
  }
  return ort_allocator->Reserve(ort_allocator, size);
}

template <typename T>
inline MemoryAllocation AllocatorImpl<T>::GetAllocation(size_t size) {
void* out;
Expand All @@ -250,6 +263,15 @@ inline KeyValuePairs AllocatorImpl<T>::GetStats() const {
ThrowOnError(GetApi().AllocatorGetStats(this->p_, &out));
return KeyValuePairs(out);
}

/// \brief Ask the allocator to release unused memory back to the system.
///
/// Silently does nothing when the underlying OrtAllocator predates the
/// Shrink callback or did not implement it; throws if the callback
/// reports a failure.
template <typename T>
inline void AllocatorImpl<T>::Shrink() {
  auto* ort_allocator = this->p_;
  // Shrink joined OrtAllocator at API version 25. For older allocators the
  // field may be uninitialized, so never dereference it without the version gate.
  if (ort_allocator->version < 25 || ort_allocator->Shrink == nullptr) {
    return;
  }
  ThrowOnError(ort_allocator->Shrink(ort_allocator));
}
} // namespace detail

inline AllocatorWithDefaultOptions::AllocatorWithDefaultOptions() {
Expand Down
6 changes: 5 additions & 1 deletion onnxruntime/contrib_ops/cuda/tensor/dynamic_time_warping.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {

using namespace onnxruntime::cuda;
#ifndef BUILD_CUDA_EP_AS_PLUGIN
using onnxruntime::OpKernelContext;
using onnxruntime::OpKernelInfo;
#endif
using onnxruntime::cuda::CudaKernel;
class DynamicTimeWarping final : public CudaKernel {
public:
DynamicTimeWarping(const OpKernelInfo& info) : CudaKernel(info) {}
Expand Down
6 changes: 5 additions & 1 deletion onnxruntime/contrib_ops/cuda/tensor/unfold.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {

using namespace onnxruntime::cuda;
#ifndef BUILD_CUDA_EP_AS_PLUGIN
using onnxruntime::OpKernelContext;
using onnxruntime::OpKernelInfo;
#endif
using onnxruntime::cuda::CudaKernel;
class UnfoldTensor final : public CudaKernel {
public:
UnfoldTensor(const OpKernelInfo& info) : CudaKernel(info) {
Expand Down
7 changes: 1 addition & 6 deletions onnxruntime/core/framework/allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,7 @@ void* AllocateBufferWithOptions(IAllocator& alloc, size_t size, bool use_reserve
}

// Returns the IArena interface of `allocator` when it is an arena, nullptr otherwise.
// Delegates to the virtual AsArena() hook instead of dynamic_cast so the check
// also works in builds compiled without RTTI (the old ORT_NO_RTTI path performed
// an unchecked static_cast, which was unsafe for non-arena allocators).
// NOTE: the scraped diff left both the removed RTTI-based branch and the new
// AsArena() call in this span; this is the reconstructed post-merge function.
IArena* IArena::SafeArenaCast(IAllocator* allocator) {
  return allocator ? allocator->AsArena() : nullptr;
}

} // namespace onnxruntime
Expand Down
13 changes: 4 additions & 9 deletions onnxruntime/core/framework/device_stream_collection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,10 @@ class DeviceStreamCollectionImpl {
void ReleaseSingleStreamBuffers(Stream* stream) {
if (!stream) return;
for (const auto& it : allocators_) {
if (it.second->Info().device == stream->GetDevice() &&
it.second->Info().alloc_type == OrtArenaAllocator) {
if (it.second->IsStreamAware()) {
// Previously we only had one StreamAwareBFCArena. We need to guard
// against multiple allocators now.
auto* arena_alloc = IArena::SafeArenaCast(it.second.get());
if (arena_alloc) {
arena_alloc->ReleaseStreamBuffers(stream);
}
if (it.second->Info().device == stream->GetDevice()) {
auto* arena = it.second->AsArena();
if (arena && arena->IsStreamAware()) {
arena->ReleaseStreamBuffers(stream);
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_call.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@

#include "core/providers/shared_library/provider_api.h"
#include "shared_inc/cuda_call.h"
#ifdef BUILD_CUDA_EP_AS_PLUGIN
#include "ep/adapters.h"
#include "plugin/provider_api_shims.h"
#else
#include <core/platform/env.h>
#endif

#ifdef _WIN32
Comment thread
yuslepukhin marked this conversation as resolved.
#else // POSIX
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/cuda/cuda_provider_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ struct CudaOrtAllocator : OrtAllocator {
Reserve = AllocImpl; // no special behavior for Reserve so use AllocImpl
GetStats = nullptr; // GetStatsImpl. The CUDA allocators don't have stats currently so we can skip.
AllocOnStream = nullptr; // TODO. Plugin EP arena to provide this.
Shrink = nullptr;

const OrtEpApi& ep_api = *api.GetEpApi();
const OrtMemoryDevice* mem_device = ep_api.MemoryInfo_GetMemoryDevice(mem_info);
Expand Down
4 changes: 4 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_stream_handle.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ DeferredCpuAllocator::DeferredCpuAllocator(CudaStream& cuda_stream) : cuda_strea
auto self = reinterpret_cast<const DeferredCpuAllocator*>(this_);
return &self->cuda_stream_.GetCpuAllocator()->Info();
};
OrtAllocator::Reserve = nullptr;
OrtAllocator::GetStats = nullptr;
OrtAllocator::AllocOnStream = nullptr;
OrtAllocator::Shrink = nullptr;
}

struct CudaNotification : public synchronize::Notification {
Expand Down
5 changes: 5 additions & 0 deletions onnxruntime/core/providers/cuda/cudnn_fe_call.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@

#include "core/providers/cuda/shared_inc/cudnn_fe_call.h"
#include "core/providers/shared_library/provider_api.h"
#ifdef BUILD_CUDA_EP_AS_PLUGIN
#include "ep/adapters.h"
#include "plugin/provider_api_shims.h"
#else
#include <core/platform/env.h>
#endif
Comment thread
yuslepukhin marked this conversation as resolved.
#if !defined(__CUDACC__) && !defined(USE_CUDA_MINIMAL)
#include <cudnn_frontend.h>
#endif
Expand Down
51 changes: 51 additions & 0 deletions onnxruntime/core/providers/cuda/plugin/cuda_allocator_plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@

#include "cuda_plugin_utils.h"

#include <algorithm>
#include <sstream>
#include <string>
#include <type_traits>

namespace onnxruntime {
namespace cuda_plugin {

Expand All @@ -35,6 +40,52 @@ class CudaAllocatorBase : public OrtAllocator {
const OrtMemoryInfo* memory_info_;
};

// CudaAllocatorBase derives from OrtAllocator via single non-virtual inheritance.
// This guarantees OrtAllocator sits at offset 0 in the derived layout, so
// static_cast between OrtAllocator* and CudaAllocatorBase* is safe.
static_assert(!std::is_polymorphic_v<CudaAllocatorBase>,
"CudaAllocatorBase must not be polymorphic (no virtual functions) "
"to ensure OrtAllocator is at offset 0.");

/// Counters maintained by arena-style allocators: allocation traffic,
/// arena growth/shrink events, and byte-usage high-water marks.
struct AllocatorStats {
  int64_t num_allocs = 0;
  int64_t num_reserves = 0;
  int64_t num_arena_extensions = 0;
  int64_t num_arena_shrinkages = 0;
  int64_t bytes_in_use = 0;
  int64_t total_allocated_bytes = 0;
  int64_t max_bytes_in_use = 0;
  int64_t max_alloc_size = 0;
  int64_t bytes_limit = 0;

  // Visit every stat as a (label, value) pair in the canonical reporting
  // order shared by ToKeyValuePairs() and DebugString().
  template <typename Visitor>
  void VisitFields(Visitor&& visit) const {
    visit("Limit", bytes_limit);
    visit("InUse", bytes_in_use);
    visit("TotalAllocated", total_allocated_bytes);
    visit("MaxInUse", max_bytes_in_use);
    visit("NumAllocs", num_allocs);
    visit("NumReserves", num_reserves);
    visit("NumArenaExtensions", num_arena_extensions);
    visit("NumArenaShrinkages", num_arena_shrinkages);
    visit("MaxAllocSize", max_alloc_size);
  }

  /// Export the stats into an OrtKeyValuePairs container via the C API.
  void ToKeyValuePairs(const OrtApi& api, OrtKeyValuePairs* kvps) const {
    VisitFields([&](const char* label, int64_t value) {
      api.AddKeyValuePair(kvps, label, std::to_string(value).c_str());
    });
  }

  /// Human-readable dump, one "<Label>: <value>" stat per line.
  std::string DebugString() const {
    std::ostringstream stream;
    VisitFields([&](const char* label, int64_t value) {
      stream << label << ": " << value << "\n";
    });
    return stream.str();
  }
};

/// CUDA device memory allocator using cudaMalloc/cudaFree.
/// Lifetime is managed by the EP factory (ReleaseAllocatorImpl), not by a Release callback.
class CudaDeviceAllocator final : public CudaAllocatorBase {
Expand Down
Loading
Loading