7 changes: 4 additions & 3 deletions docs/cuda_plugin_ep/cuda_plugin_ep_design.md
@@ -213,7 +213,7 @@ The primary approach moves pure-computation helpers from CPU `.cc` files to head
- `roialign.h` — `CheckROIAlignValidInput`, `RoiAlignBase` constructor (templatized on info type)
- `upsamplebase.h` — `UpsampleBase::AdjustOutputSizeAsPolicy`
- `crop.h` — `CropBase` constructor (templatized on info type)
- `space_depth_ops.h` — `SpaceDepthBase` constructor (templatized on info type)
- `space_depth_ops.h` — `SpaceDepthBase` constructor plus shared `ReadBlocksize`, `ReadIsDCR`, and dimension-validation helpers (templatized on info/context type where needed)
- `clip.h` — Clip min/max attribute handling (removed `Clip_6Base` CPU dependency)
- `cuda_common_type_helpers.h` — CUDA type conversion and handle error string helpers (moved from `cuda_common.cc`)

@@ -249,7 +249,8 @@ This allows the base class constructor to work with both the framework `OpKernel
Some CPU base classes have heavy dependencies (protobuf, `UnpackTensor`) that make inlining impractical:

- **`ConstantOfShapeBase`** — depends on `TensorProto` and `UnpackTensor`. The plugin path in `constant_of_shape.h` stays self-contained: it reuses `ConstantOfShapeCore` but fetches the `value` attribute through the ORT C++ API instead of depending on the full CPU base implementation.
- **`UpsampleBase`** — partially addressed: `AdjustOutputSizeAsPolicy` moved to header (#27628). Still depends on `InputDefs()` and `OpKernelInfo::GetAllocator()` which are not in the adapter.

`UpsampleBase` no longer belongs in this category: the adapter now exposes `OpKernelInfo::GetAllocator(OrtMemType)`, and the remaining shape-rank query already has an adapter-safe fallback when `Node::InputDefs()` is unavailable. That lets the CUDA `Upsample` antialias path reuse the same persistent device lookup-table initialization in both bundled and plugin builds instead of keeping a plugin-only scratch-buffer fallback.
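
As a rough illustration, the sketch below shows how an adapter-built kernel might obtain that allocator; the placeholder buffer size and the raw `Alloc`/`Free` usage are assumptions for illustration, not code from this branch.

```cpp
// Minimal sketch, assuming the adapter's OpKernelInfo is in scope as `info`.
// GetAllocator(OrtMemTypeDefault) returns the same AllocatorPtr the framework would
// hand a bundled kernel, so a persistent buffer can outlive a single Compute call.
AllocatorPtr alloc = info.GetAllocator(OrtMemTypeDefault);
ORT_ENFORCE(alloc != nullptr, "Failed to get device allocator from OpKernelInfo.");

const size_t lut_bytes = 1024;             // placeholder size for the antialias lookup table
void* lut_data = alloc->Alloc(lut_bytes);  // keep as a member; release with alloc->Free(lut_data)
```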

---

@@ -603,7 +604,7 @@ The branch still contains a small set of plugin guards in both infrastructure an
- `generator/constant_of_shape.h` still needs a plugin-specific path because `ConstantOfShapeBase` depends on framework-only tensor-attribute helpers.
- Tunable kernels such as `math/matmul.cc` still gate framework-only registration paths.
- `tensor/identity_op.h` guards the `TensorSeq` code path and `context->InputType()` call with `#ifndef BUILD_CUDA_EP_AS_PLUGIN` — the plugin build handles only the `Tensor` path. `identity_op.cc` uses conditional macros (`IDENTITY_V_TYPES` / `IDENTITY_V_TYPES_IRv9`) so opset 14+ registrations use `AllFixedSizeTensorTypes()` in the plugin build. Additionally, old Dropout opset 7–9 and 10–11 kernel registrations were moved from `identity_op.cc` to `nn/dropout.cc` so that each op's registrations live in that op's own source file.
- A few tensor kernels (`pad.cc`, `tile.cc`, `unsqueeze.cc`, `upsample.*`, `space_depth_ops.h`, `scatter_nd.*`) still contain localized plugin guards where adapter and framework paths have not fully converged.
- A few tensor kernels (`pad.cc`, `tile.cc`, `unsqueeze.cc`) still contain localized plugin guards where adapter and framework paths have not fully converged. Recent cleanup removed the plugin-only branches from `upsample.*`, `space_depth_ops.h`, and `scatter_nd.*` by moving reusable logic into shared adapter-safe helpers and by adding allocator access to `ep::adapter::OpKernelInfo`.
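
For illustration only, a sketch of how a plugin-side kernel can now lean on those shared helpers; `PluginDepthToSpace` is a hypothetical class, and the assumption is that `ep::adapter::OpKernelInfo` exposes the `GetAttr` calls the templatized helpers rely on.

```cpp
// Sketch: a plugin-build DepthToSpace kernel reusing the shared attribute helpers
// instead of carrying its own plugin-only parsing branch.
class PluginDepthToSpace : private SpaceDepthBase {
 public:
  explicit PluginDepthToSpace(const ep::adapter::OpKernelInfo& info)
      : SpaceDepthBase(info),                               // reads the "blocksize" attribute
        is_dcr_(space_depth_internal::ReadIsDCR(info)) {}   // reads "mode", defaults to DCR

 private:
  bool is_dcr_;
};
```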

The broad trend remains positive: most operator-level plugin conditionals were removed by moving reusable CPU/helper logic into shared headers and by centralizing stream bridging in `CudaKernel` helpers.
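
The `ScatterND` shape check is one example of that movement: once `ValidateShapes` lives in `scatter_nd_internal`, any build flavor can call it without pulling in the CPU `OpKernel` class. The wrapper below is hypothetical and only illustrates the call site.

```cpp
// Illustrative only: the helper depends solely on TensorShape and Status, so it
// compiles the same way in bundled, shared-provider, and plugin builds.
Status CheckScatterNDShapes(const TensorShape& data_shape,
                            const TensorShape& indices_shape,
                            const TensorShape& updates_shape) {
  return scatter_nd_internal::ValidateShapes(data_shape, indices_shape, updates_shape);
}
```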

8 changes: 8 additions & 0 deletions include/onnxruntime/ep/adapter/op_kernel_info.h
@@ -73,6 +73,14 @@ struct OpKernelInfo {
const DataTransferManager& GetDataTransferManager() const noexcept {
return (static_cast<const Ep*>(cache_->ort_ep_))->GetDataTransferManager();
}

// Delegates to the core OpKernelInfo::GetAllocator so the adapter returns
// exactly the same allocator the framework would provide for each OrtMemType.
AllocatorPtr GetAllocator(OrtMemType mem_type) const {
const auto* core_kernel_info = reinterpret_cast<const ::onnxruntime::OpKernelInfo*>(cache_->kernel_info_);
return core_kernel_info->GetAllocator(mem_type);
}

Node node() const noexcept {
return Node{cache_->kernel_info_};
}
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cpu/cpu_provider_shared.cc
@@ -89,7 +89,7 @@ struct ProviderHostCPUImpl : ProviderHostCPU {
// From cpu/tensor/scatter_nd.h (direct)
Status ScatterNDBase__ValidateShapes(const TensorShape& input_shape,
const TensorShape& indice_shape,
const TensorShape& update_shape) override { return ScatterND::ValidateShapes(input_shape, indice_shape, update_shape); }
const TensorShape& update_shape) override { return scatter_nd_internal::ValidateShapes(input_shape, indice_shape, update_shape); }
// From cpu/tensor/padbase.h (direct)
Status PadBase__HandleDimValueZero(const Mode& mode, const TensorShape& input_shape, const TensorShape& output_shape) override { return PadBase::HandleDimValueZero(mode, input_shape, output_shape); }

84 changes: 47 additions & 37 deletions onnxruntime/core/providers/cpu/tensor/scatter_nd.h
@@ -5,7 +5,7 @@

#include "core/common/narrow.h"

#ifndef SHARED_PROVIDER
#if !defined(SHARED_PROVIDER) && !defined(BUILD_CUDA_EP_AS_PLUGIN)
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
#endif
@@ -15,6 +15,51 @@ namespace concurrency {
class ThreadPool;
}

namespace scatter_nd_internal {

inline Status ValidateShapes(const TensorShape& input_shape,
const TensorShape& indice_shape,
const TensorShape& update_shape) {
auto input_rank = input_shape.NumDimensions();
auto indice_rank = indice_shape.NumDimensions();
auto update_rank = update_shape.NumDimensions();

if (input_rank == 0 || indice_rank == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"input tensor and indices tensor must have rank larger than 0. ",
"input shape: ", input_shape, ", indices shape: ", indice_shape);
}

auto last_indice_dimension = indice_shape[indice_rank - 1];
if (last_indice_dimension > static_cast<int64_t>(input_rank)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"last dimension of indices must not be larger than rank of input tensor");
}

bool is_update_shape_invalid = [&]() {
if (update_rank != (input_rank + indice_rank - 1 - static_cast<ptrdiff_t>(last_indice_dimension))) {
return true;
}
if (indice_shape.Slice(0, indice_rank - 1) != update_shape.Slice(0, indice_rank - 1)) {
return true;
}
if (input_shape.Slice(onnxruntime::narrow<size_t>(last_indice_dimension)) != update_shape.Slice(indice_rank - 1)) {
return true;
}
return false;
}();

if (is_update_shape_invalid) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"updates tensor should have shape equal to indices.shape[:-1] + data.shape[indices.shape[-1]:]. ",
"updates shape: ", update_shape, ", indices shape: ", indice_shape, ", data shape: ", input_shape);
}

return Status::OK();
}

} // namespace scatter_nd_internal

class ScatterND final : public OpKernel {
public:
enum class Reduction : int {
@@ -51,42 +96,7 @@ class ScatterND final : public OpKernel {
static inline Status ValidateShapes(const TensorShape& input_shape,
const TensorShape& indice_shape,
const TensorShape& update_shape) {
auto input_rank = input_shape.NumDimensions();
auto indice_rank = indice_shape.NumDimensions();
auto update_rank = update_shape.NumDimensions();

if (input_rank == 0 || indice_rank == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"input tensor and indices tensor must has rank larger than 0. ",
"input shape: ", input_shape, ", indices shape: ", indice_shape);
}

auto last_indice_dimension = indice_shape[indice_rank - 1];
if (last_indice_dimension > static_cast<int64_t>(input_rank)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"last dimension of indices must not be larger than rank of input tensor");
}

bool is_update_shape_invalid = [&]() {
if (update_rank != (input_rank + indice_rank - 1 - static_cast<ptrdiff_t>(last_indice_dimension))) {
return true;
}
if (indice_shape.Slice(0, indice_rank - 1) != update_shape.Slice(0, indice_rank - 1)) {
return true;
}
if (input_shape.Slice(onnxruntime::narrow<size_t>(last_indice_dimension)) != update_shape.Slice(indice_rank - 1)) {
return true;
}
return false;
}();

if (is_update_shape_invalid) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"updates tensor should have shape equal to indices.shape[:-1] + data.shape[indices.shape[-1]:]. ",
"updates shape: ", update_shape, ", indices shape: ", indice_shape, ", data shape: ", input_shape);
}

return Status::OK();
return scatter_nd_internal::ValidateShapes(input_shape, indice_shape, update_shape);
}
#endif // SHARED_PROVIDER

150 changes: 93 additions & 57 deletions onnxruntime/core/providers/cpu/tensor/space_depth_ops.h
@@ -3,72 +3,116 @@

#pragma once

#include <string>

#if !defined(SHARED_PROVIDER) && !defined(BUILD_CUDA_EP_AS_PLUGIN)
#include "core/framework/op_kernel.h"
#endif

namespace onnxruntime {

class SpaceDepthBase {
protected:
template <typename KernelInfoType>
explicit SpaceDepthBase(const KernelInfoType& info) {
ORT_ENFORCE(info.template GetAttr<int64_t>("blocksize", &blocksize_).IsOK(),
"Attribute blocksize is not set.");
namespace space_depth_internal {

template <typename KernelInfoType>
inline int64_t ReadBlocksize(const KernelInfoType& info) {
int64_t blocksize = 0;
ORT_ENFORCE(info.template GetAttr<int64_t>("blocksize", &blocksize).IsOK(),
"Attribute blocksize is not set.");
return blocksize;
}

template <typename KernelInfoType>
inline bool ReadIsDCR(const KernelInfoType& info) {
bool is_dcr = true;
std::string mode;
// If mode doesn't exist, then it is the default "DCR" mode
// (or) it is an opset < 11 model for which the only mode is "DCR" mode.
if (info.GetAttr("mode", &mode).IsOK()) {
if (mode == "CRD") {
is_dcr = false;
} else if (mode != "DCR") {
ORT_THROW("DepthToSpace op: only 'DCR' and 'CRD' modes are supported");
}
}

template <bool IsNHWC = false>
Status InputValidationsAndOutputDimsCalc(const Tensor& input,
int64_t& batch,
int64_t& input_depth, int64_t& input_height, int64_t& input_width,
int64_t& output_depth, int64_t& output_height, int64_t& output_width,
bool is_space_to_depth) const {
const TensorShape& input_shape = input.Shape();
return is_dcr;
}

template <bool IsNHWC = false>
inline Status InputValidationsAndOutputDimsCalc(int64_t blocksize,
const Tensor& input,
int64_t& batch,
int64_t& input_depth, int64_t& input_height, int64_t& input_width,
int64_t& output_depth, int64_t& output_height, int64_t& output_width,
bool is_space_to_depth) {
const TensorShape& input_shape = input.Shape();

if (input_shape.NumDimensions() != 4) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "SpaceDepth ops require a 4-D input. Provided rank: ",
input_shape.NumDimensions());
}

if (input_shape.NumDimensions() != 4) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "SpaceDepth ops require a 4-D input. Provided rank: ",
input_shape.NumDimensions());
batch = input_shape[0];
if constexpr (IsNHWC) {
input_depth = input_shape[3];
input_height = input_shape[1];
input_width = input_shape[2];
} else {
input_depth = input_shape[1];
input_height = input_shape[2];
input_width = input_shape[3];
}

if (is_space_to_depth) { // SpaceToDepth op
if ((input_height % blocksize) != 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "SpaceToDepth requires input height to be a multiple of block_size");
}

batch = input_shape[0];
if constexpr (IsNHWC) {
input_depth = input_shape[3];
input_height = input_shape[1];
input_width = input_shape[2];
} else {
input_depth = input_shape[1];
input_height = input_shape[2];
input_width = input_shape[3];
if ((input_width % blocksize) != 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "SpaceToDepth requires input width to be a multiple of block_size");
}

if (is_space_to_depth) { // SpaceToDepth op
if ((input_height % this->blocksize_) != 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "SpaceToDepth requires input height to be a multiple of block_size");
}
output_depth = input_depth * blocksize * blocksize;
output_height = input_height / blocksize;
output_width = input_width / blocksize;

if ((input_width % this->blocksize_) != 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "SpaceToDepth requires input width to be a multiple of block_size");
}
} else { // DepthToSpace op
if ((input_depth % (blocksize * blocksize) != 0)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"DepthToSpace requires input depth to be a multiple of (block_size * block_size)");
}

output_depth = input_depth * blocksize_ * blocksize_;
output_height = input_height / blocksize_;
output_width = input_width / blocksize_;
output_depth = input_depth / blocksize / blocksize;
output_height = input_height * blocksize;
output_width = input_width * blocksize;
}

} else { // DepthToSpace op
if ((input_depth % (blocksize_ * blocksize_) != 0)) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"DepthToSpace requires input depth to be a multiple of (block_size * block_size)");
}
return Status::OK();
}

output_depth = input_depth / blocksize_ / blocksize_;
output_height = input_height * blocksize_;
output_width = input_width * blocksize_;
}
} // namespace space_depth_internal

class SpaceDepthBase {
protected:
template <typename KernelInfoType>
explicit SpaceDepthBase(const KernelInfoType& info) : blocksize_(space_depth_internal::ReadBlocksize(info)) {}

return Status::OK();
template <bool IsNHWC = false>
Status InputValidationsAndOutputDimsCalc(const Tensor& input,
int64_t& batch,
int64_t& input_depth, int64_t& input_height, int64_t& input_width,
int64_t& output_depth, int64_t& output_height, int64_t& output_width,
bool is_space_to_depth) const {
return space_depth_internal::InputValidationsAndOutputDimsCalc<IsNHWC>(
blocksize_, input, batch, input_depth, input_height, input_width,
output_depth, output_height, output_width, is_space_to_depth);
}

int64_t blocksize_;
};

#if !defined(SHARED_PROVIDER) && !defined(BUILD_CUDA_EP_AS_PLUGIN)

class SpaceToDepth final : public OpKernel, SpaceDepthBase {
public:
explicit SpaceToDepth(const OpKernelInfo& info) : OpKernel(info), SpaceDepthBase(info) {
@@ -79,23 +79,15 @@ class SpaceToDepth final : public OpKernel, SpaceDepthBase {

class DepthToSpace final : public OpKernel, SpaceDepthBase {
public:
explicit DepthToSpace(const OpKernelInfo& info) : OpKernel(info), SpaceDepthBase(info) {
std::string mode;
// if mode doesn't exist, then it is the default "DCR" mode
// (or) it is an opset < 11 model for which the only mode is "DCR" mode
if (info.GetAttr("mode", &mode).IsOK()) {
if (mode == "CRD")
is_dcr_ = false;

else if (mode != "DCR")
ORT_THROW("DepthToSpace op: only 'DCR' and 'CRD' modes are supported");
}
}
explicit DepthToSpace(const OpKernelInfo& info)
: OpKernel(info), SpaceDepthBase(info), is_dcr_(space_depth_internal::ReadIsDCR(info)) {}

Status Compute(OpKernelContext* context) const override;

private:
bool is_dcr_ = true;
};

#endif // !defined(SHARED_PROVIDER) && !defined(BUILD_CUDA_EP_AS_PLUGIN)

} // namespace onnxruntime
5 changes: 2 additions & 3 deletions onnxruntime/core/providers/cuda/cudnn_common.h
@@ -149,11 +149,10 @@ struct Consts<BFloat16> {

inline double ClampCudnnBatchNormEpsilon(double epsilon) {
if (epsilon < CUDNN_BN_MIN_EPSILON) {
#ifndef BUILD_CUDA_EP_AS_PLUGIN
if (CUDNN_BN_MIN_EPSILON - epsilon > FLT_EPSILON)
if (CUDNN_BN_MIN_EPSILON - epsilon > FLT_EPSILON) {
LOGS_DEFAULT(WARNING) << "Provided epsilon is smaller than CUDNN_BN_MIN_EPSILON. "
<< "Setting it to CUDNN_BN_MIN_EPSILON";
#endif
}
return CUDNN_BN_MIN_EPSILON;
}
return epsilon;
8 changes: 0 additions & 8 deletions onnxruntime/core/providers/cuda/nn/conv.cc
@@ -237,12 +237,8 @@ Status Conv<T, Layout>::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShap
CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_operation_graph(handle));
CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->create_execution_plans({heur_mode}));
} catch (const std::exception& ex) {
#ifndef BUILD_CUDA_EP_AS_PLUGIN
std::string message = MakeString("Failed to initialize CUDNN Frontend: ", ex.what(),
" with the cudnn frontend json:\n", s_.cudnn_fe_graph->print());
#else
std::string message = MakeString("Failed to initialize CUDNN Frontend: ", ex.what());
#endif
return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, message);
}

@@ -253,12 +249,8 @@ Status Conv<T, Layout>::CreateCudnnFeExecutionPlan(const onnxruntime::TensorShap
CUDNN_FE_CALL_THROW(s_.cudnn_fe_graph->build_plans(handle));
} catch (const std::exception& ex) {
if (!fuse_bias && !fuse_act && use_tf32) {
#ifndef BUILD_CUDA_EP_AS_PLUGIN
std::string message = MakeString("OP not supported by CUDNN Frontend: ", ex.what(),
" with the cudnn frontend json:\n", s_.cudnn_fe_graph->print());
#else
std::string message = MakeString("OP not supported by CUDNN Frontend: ", ex.what());
#endif
return Status(common::StatusCategory::ONNXRUNTIME, common::StatusCode::EP_FAIL, message);
}
