10 changes: 7 additions & 3 deletions include/onnxruntime/core/framework/allocator.h
@@ -265,19 +265,23 @@
return CalcMemSizeForArrayWithAlignment(nmemb, size, alignment, out);
}

using AllocatorPtr = std::shared_ptr<IAllocator>;

[cpplint] include/onnxruntime/core/framework/allocator.h:268: Add #include <memory> for shared_ptr<> [build/include_what_you_use] [4]
using AllocatorMap = std::map<OrtDevice, AllocatorPtr>;

class CPUAllocator : public IAllocator {
public:
explicit CPUAllocator(const OrtMemoryInfo& memory_info) : IAllocator(memory_info) {}

// Creates a function-local static and returns a shared pointer to it.
// Reuse this wherever a standalone CPUAllocator instance is needed.
static AllocatorPtr DefaultInstance();

CPUAllocator() : IAllocator(OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator)) {}

void* Alloc(size_t size) override;
void Free(void* p) override;
};

using AllocatorPtr = std::shared_ptr<IAllocator>;
using AllocatorMap = std::map<OrtDevice, AllocatorPtr>;

void* AllocatorDefaultAlloc(size_t size);
void AllocatorDefaultFree(void* p);
void* AllocatorDefaultAllocAligned(size_t size, size_t alignment);
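A hypothetical call site for the shared default instance declared above (sketch only; the buffer size and variable names are illustrative):

```cpp
// Sketch: reuse the process-wide default CPU allocator instead of creating
// a fresh one with std::make_shared<CPUAllocator>() at every call site.
AllocatorPtr cpu_allocator = CPUAllocator::DefaultInstance();
void* p = cpu_allocator->Alloc(256);  // aligned CPU allocation
cpu_allocator->Free(p);
```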
6 changes: 3 additions & 3 deletions include/onnxruntime/core/framework/ort_value.h
@@ -18,7 +18,7 @@
class TensorSeq;
} // namespace onnxruntime

#endif
#endif // SHARED_PROVIDER

/**
Represents both tensors and non-tensors.
@@ -37,8 +37,8 @@
type_ = type;
}

void Init(void* pData, onnxruntime::MLDataType type, const std::function<void(void*)>& deleter) {
data_.reset(pData, deleter);
void Init(void* pData, onnxruntime::MLDataType type, std::function<void(void*)> deleter) {
data_.reset(pData, std::move(deleter));

[cpplint] include/onnxruntime/core/framework/ort_value.h:41: Add #include <utility> for move [build/include_what_you_use] [4]
type_ = type;
}
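Taking the `std::function` by value and moving it into the `shared_ptr` lets callers hand over an rvalue deleter without copying its captured state. A minimal self-contained sketch of the same signature change (`Holder` is an illustrative stand-in, not an ORT type):

```cpp
#include <functional>
#include <memory>
#include <utility>

// Stand-in for the Init pattern above: accept the deleter by value and move
// it into the shared_ptr, so an rvalue argument costs a move, not a copy.
class Holder {
 public:
  void Init(void* p, std::function<void(void*)> deleter) {
    data_.reset(p, std::move(deleter));
  }

 private:
  std::shared_ptr<void> data_;
};

int main() {
  Holder h;
  // The lambda is an rvalue, so its state is moved rather than copied.
  h.Init(::operator new(16), [](void* p) { ::operator delete(p); });
}
```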

39 changes: 32 additions & 7 deletions include/onnxruntime/core/graph/graph.h
@@ -567,6 +567,13 @@ class Node {
friend class Graph;
Node(NodeIndex index, Graph& graph) : index_(index), graph_(&graph), can_be_saved_(true) {}

protected:
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// internal only method to allow selected classes to directly alter the input/output definitions and arg counts
// made protected to facilitate testing
Definitions& MutableDefinitions() noexcept;
#endif

private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node);

@@ -588,9 +595,6 @@ class Node {
#endif

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// internal only method to allow selected classes to directly alter the input/output definitions and arg counts
Definitions& MutableDefinitions() noexcept;

// internal only method to allow selected classes to directly alter the links between nodes.
Relationships& MutableRelationships() noexcept;

@@ -721,11 +725,12 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi

/** Replaces the initializer tensor with the same name as the given initializer tensor.
The replacement initializer tensor must have the same type and shape as the existing initializer tensor.
The new_initializer is expected to be either small or to reference external data stored in an OrtValue.

Note: This currently has linear time complexity. There is room for improvement but it would likely require changes to
how initializer tensors are stored and tracked.
*/
common::Status ReplaceInitializedTensor(ONNX_NAMESPACE::TensorProto new_initializer);
common::Status ReplaceInitializedTensor(const ONNX_NAMESPACE::TensorProto& new_initializer, const OrtValue& ort_value);

#if !defined(DISABLE_EXTERNAL_INITIALIZERS)
/** This function takes externally provided data for initializers with external data
@@ -745,6 +750,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
/** Add an initializer tensor to the Graph. */
void AddInitializedTensor(const ONNX_NAMESPACE::TensorProto& tensor_proto);

/// <summary>
/// Adds an initializer to the Graph. This method takes a tensor proto whose external
/// data pointer refers to ort_value. For small tensors (smaller than
/// utils::kSmallTensorExternalDataThreshold) the data is still contained within
/// tensor_proto; in that case the OrtValue is left unallocated and is not added
/// to ortvalue_initializers_.
/// </summary>
/// <param name="tensor_proto">tensor proto with external data pointing to OrtValue.</param>
/// <param name="ort_value_initializer">value that contains the initializer tensor. This may
/// be unallocated for small tensors.</param>
Status AddInitializedOrtValue(const ONNX_NAMESPACE::TensorProto& tensor_proto,
const OrtValue& ort_value_initializer);
#endif
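A hedged sketch of the intended call pattern, pieced together from the doc comment above (not compilable outside the repo; the OrtValue creation step is elided, and nothing here beyond AddInitializedOrtValue itself is taken from this diff):

```cpp
// Sketch only: describe the initializer with a TensorProto whose data
// location is EXTERNAL, keep the payload in an OrtValue, and register both.
// Small tensors instead keep their bytes inline in the TensorProto.
ONNX_NAMESPACE::TensorProto tensor_proto;
tensor_proto.set_name("my_weight");
tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
tensor_proto.add_dims(1024);
tensor_proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL);

OrtValue weight_value;  // assumed to already own the 1024 floats
ORT_RETURN_IF_ERROR(graph.AddInitializedOrtValue(tensor_proto, weight_value));
```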

/** Remove the initializer tensor with the provided name from the Graph. */
@@ -769,7 +786,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi

/** Populate `value` if an externally allocated OrtValue exists for an initializer with the given name.
If `check_outer_scope` is true and this is a subgraph, outer scope graphs are searched as well.
*/
bool GetOrtValueInitializer(const std::string& name, OrtValue& value) const;
bool GetOrtValueInitializer(const std::string& name, OrtValue& value, bool check_outer_scope = false) const;

/** Gets all the initializer tensors in this Graph. */
const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return name_to_initial_tensor_; }
@@ -1645,8 +1662,16 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
// so they can be used to resolve outer scope dependencies when running BuildConnections for the subgraphs.
common::Status SetOuterScopeNodeArgs(const std::unordered_set<std::string>& outer_scope_node_args);

// Implementation for initializer replacement
Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer, bool is_external);
/// <summary>
/// Replaces an existing initializer with new_initializer.
/// </summary>
/// <param name="new_initializer">replacement initializer</param>
/// <param name="ort_value">ort_value with data, may be empty</param>
/// <param name="must_replace_external">True when replacing an initializer that has external data
/// with a caller-provided OrtValue; in that case the original initializer is required to have
/// external data.</param>
/// <returns>Status indicating success or failure.</returns>
Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer,
OrtValue ort_value, bool must_replace_external);

template <typename StringRange> // range-initializer returning std::string
std::vector<NodeArg*> CreateNodeArgs(const StringRange& names,
6 changes: 2 additions & 4 deletions include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -57,8 +57,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
const IExecutionProvider& execution_provider /*required by constant folding*/,
const logging::Logger& logger,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD)

@@ -89,8 +88,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalB
const IExecutionProvider& cpu_execution_provider,
const logging::Logger& logger,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

@@ -215,7 +215,7 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
} else {
size_t total_size = static_cast<size_t>(sequence_length) * static_cast<size_t>(batch_beam_size);
size_t total_size_bytes = total_size * sizeof(int);
AllocatorPtr buffer_allocator = std::make_shared<onnxruntime::CPUAllocator>();
AllocatorPtr buffer_allocator = CPUAllocator::DefaultInstance();
// TODO: no need for an extra buffer. Copy directly to input_ids_data instead, like the user_cuda case above.
auto seq_copy = IAllocator::MakeUniquePtr<int>(buffer_allocator, total_size_bytes, false, stream);
int* seq_copy_ptr = seq_copy.get();
@@ -167,7 +167,7 @@ Status WhisperDecoderSubgraph::CreateInitialFeeds(
Tensor::InitOrtValue(DataTypeImpl::GetType<int32_t>(), input_ids_shape, allocator, input_ids);
int32_t* input_ids_data = input_ids.GetMutable<Tensor>()->MutableData<int32_t>();

AllocatorPtr buffer_allocator = std::make_shared<onnxruntime::CPUAllocator>();
AllocatorPtr buffer_allocator = CPUAllocator::DefaultInstance();
size_t total_size = static_cast<size_t>(static_cast<long long>(cur_len) * batch_beam_size * sizeof(int));
auto seq_copy = IAllocator::MakeUniquePtr<int>(buffer_allocator, total_size, false, stream);
int* seq_copy_ptr = seq_copy.get();
2 changes: 1 addition & 1 deletion onnxruntime/core/dlpack/dlpack_converter.cc
@@ -257,7 +257,7 @@
deleter(p);
};

ort_value.Init(p_tensor.release(), DataTypeImpl::GetType<Tensor>(), deleter);
ort_value.Init(p_tensor.release(), DataTypeImpl::GetType<Tensor>(), std::move(deleter));

[cpplint] onnxruntime/core/dlpack/dlpack_converter.cc:260: Add #include <utility> for move [build/include_what_you_use] [4]
return ort_value;
}

5 changes: 5 additions & 0 deletions onnxruntime/core/framework/allocator.cc
@@ -99,6 +99,11 @@
return AllocatorDefaultAllocAligned(size, alignment);
}

AllocatorPtr CPUAllocator::DefaultInstance() {
static AllocatorPtr instance = std::make_shared<CPUAllocator>();

[cpplint] onnxruntime/core/framework/allocator.cc:103: Add #include <memory> for make_shared<> [build/include_what_you_use] [4]
return instance;
}

void* CPUAllocator::Alloc(size_t size) {
const auto alignment = std::max(Info().device.GetAlignment(), MlasGetPreferredBufferAlignment());
return AllocatorDefaultAllocAligned(size, alignment);
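For background, the function-local static above is a Meyers-style singleton: since C++11, its initialization is guaranteed to run exactly once even under concurrent first calls ("magic statics"). A self-contained sketch of the pattern (names illustrative, not ORT's):

```cpp
#include <cassert>
#include <memory>

struct Allocator {};  // illustrative stand-in for IAllocator

using AllocatorPtr = std::shared_ptr<Allocator>;

// Same shape as CPUAllocator::DefaultInstance(): the static is initialized
// thread-safely on first use and shared by every subsequent caller.
AllocatorPtr DefaultInstance() {
  static AllocatorPtr instance = std::make_shared<Allocator>();
  return instance;
}

int main() {
  // Every call returns a handle to the same underlying object.
  assert(DefaultInstance().get() == DefaultInstance().get());
}
```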
10 changes: 10 additions & 0 deletions onnxruntime/core/framework/endian_utils.cc
@@ -48,6 +48,16 @@ void SwapByteOrderCopy(size_t element_size_in_bytes,
}
}

void SwapByteOrderInplace(size_t element_size_in_bytes, gsl::span<std::byte> bytes) {
ORT_ENFORCE(element_size_in_bytes > 0, "Expecting a positive element size");
ORT_ENFORCE(bytes.size_bytes() % element_size_in_bytes == 0, "Expecting the span size to be a multiple of the element size");
if (element_size_in_bytes > 1) {
for (size_t offset = 0, lim = bytes.size_bytes(); offset < lim; offset += element_size_in_bytes) {
std::reverse(bytes.begin() + offset, bytes.begin() + offset + element_size_in_bytes);
}
}
}
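A standalone demonstration of what the in-place swap does to 4-byte elements (plain C++, independent of ORT):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  uint32_t values[2] = {0x11223344u, 0xAABBCCDDu};
  auto* bytes = reinterpret_cast<std::byte*>(values);
  const size_t element_size = sizeof(uint32_t);
  // Same per-element std::reverse loop as SwapByteOrderInplace above.
  for (size_t offset = 0; offset < sizeof(values); offset += element_size) {
    std::reverse(bytes + offset, bytes + offset + element_size);
  }
  std::cout << std::hex << values[0] << ' ' << values[1] << '\n';
  // Prints: 44332211 ddccbbaa
}
```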

namespace detail {

Status CopyLittleEndian(size_t element_size_in_bytes,
15 changes: 15 additions & 0 deletions onnxruntime/core/framework/endian_utils.h
@@ -31,6 +31,21 @@ void SwapByteOrderCopy(size_t element_size_in_bytes,
gsl::span<const unsigned char> source_bytes,
gsl::span<unsigned char> destination_bytes);

/**
 * Swaps the byte order of the elements in the given byte span in place.
 *
 * This is a low-level function - please be sure to pass in valid arguments.
 * In particular:
 * - bytes should have a size that is a multiple of element_size_in_bytes.
 * - element_size_in_bytes should be greater than zero.
 *
 * @param element_size_in_bytes The size of an individual element, in bytes.
 * @param bytes The byte span whose elements are swapped in place.
 */
void SwapByteOrderInplace(size_t element_size_in_bytes,
gsl::span<std::byte> bytes);

namespace detail {

/**
7 changes: 4 additions & 3 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -17,6 +17,7 @@
#include "core/framework/resource_accountant.h"
#include "core/graph/function.h"
#include "core/graph/function_utils.h"
#include "core/graph/graph_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
@@ -902,9 +903,9 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}

// handle initializers
for (const auto& initialized_tensor : graph.GetAllInitializedTensors()) {
if (ep_graph.GetNodeArg(initialized_tensor.first) != nullptr) {
ep_graph.AddInitializedTensor(*initialized_tensor.second);
for (const auto& [name, _] : graph.GetAllInitializedTensors()) {
if (ep_graph.GetNodeArg(name) != nullptr) {
graph_utils::MakeInitializerCopyIfNotExist(graph, ep_graph, name);
}
}

12 changes: 4 additions & 8 deletions onnxruntime/core/framework/session_state.cc
@@ -300,17 +300,13 @@
return p_seq_exec_plan_->allocation_plan;
}

Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d,
Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& ort_value,
bool constant, bool sparse) {
auto p = initialized_tensors_.insert({ort_value_index, ort_value});
if (!p.second)
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "duplicated ort_value index:", ort_value_index,
". Do you have duplicated calls to SessionState::AddInitializedTensor function?");

if (d != nullptr && d->f != nullptr) {
deleter_for_initialized_tensors_.insert_or_assign(ort_value_index, *d);
}

if (constant) {
constant_initialized_tensors_.insert({ort_value_index, ort_value});
}
@@ -1620,16 +1616,16 @@
Env::Default(), graph_location, *graph_viewer_,
GetAllocator(OrtDevice()),
ort_value_name_idx_map_, initializer_allocation_order, *tensor_allocator,
[this, remove_initializers](const std::string& name, int idx, const OrtValue& value, const OrtCallback& d,
[this, remove_initializers](const std::string& name, int idx, const OrtValue& value,

[cpplint] onnxruntime/core/framework/session_state.cc:1619: Add #include <string> for string [build/include_what_you_use] [4]
bool constant, bool sparse) -> Status {
ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse));
ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, constant, sparse));
if (remove_initializers) {
graph_.RemoveInitializedTensor(name);
}
return Status::OK();
},
logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options,
memory_profile_func, name_to_buffered_tensor_, graph_.GetPrepacked()));
memory_profile_func, graph_.GetPrepacked()));

#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
// Record Weight allocation info on device
18 changes: 1 addition & 17 deletions onnxruntime/core/framework/session_state.h
@@ -18,7 +18,6 @@
#include "core/common/logging/logging.h"
#include "core/common/profiler.h"
#include "core/framework/allocation_planner.h"
#include "core/framework/callback.h"
#include "core/framework/data_transfer_manager.h"
#include "core/framework/external_data_loader_manager.h"
#include "core/framework/execution_providers.h"
@@ -102,9 +101,6 @@ class SessionState {
AllocatorMap* parent_allocators = nullptr);

~SessionState() {
for (auto& kvp : deleter_for_initialized_tensors_) {
kvp.second.f(kvp.second.param);
}
}

// Graph viewer. CreateGraphInfo must have been called previously.
@@ -143,12 +139,11 @@ class SessionState {
/**
* Adds an initialized tensor (weight) so that it can be used by the
* execution frame to setup the appropriate OrtValue vectors.
* This function will take a shallow copy of d if d is not NULL.
* If 'constant' is true the tensor value cannot be overridden by an input at runtime.
* If 'sparse' is true the tensor value represents a densified weight that was initially stored in the model
* as sparse tensor.
*/
Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d, bool constant, bool sparse);
Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, bool constant, bool sparse);

/**
* Gets the map of ort_value_index to initialized tensors (weights) so that it can be used by the
@@ -310,10 +305,6 @@ class SessionState {
const InlinedHashSet<NodeIndex>* GetToBeExecutedRange(gsl::span<int const> fetch_mlvalue_idxs) const;
#endif

std::unordered_map<std::string, std::unique_ptr<Tensor>>* GetMutableBufferedTensors() {
return &name_to_buffered_tensor_;
}

Status FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
const KernelRegistryManager& kernel_registry_manager,
bool remove_initializers = true,
@@ -509,7 +500,6 @@ class SessionState {

// This data structure is for uninitializing string tensors, munmap'ing memory
// regions, and closing file descriptors
InlinedHashMap<int, OrtCallback> deleter_for_initialized_tensors_;
InlinedVector<BufferUniquePtr> weights_buffers_;
std::optional<SequentialExecutionPlan> p_seq_exec_plan_;

@@ -607,12 +597,6 @@
// flag to indicate whether current session using any EP that create device stream dynamically.
bool has_device_stream_enabled_ep_ = false;
#endif

// Holds the tensors which provide memory buffers for TensorProtos
// Use case: in optimizer, transform a TensorProto to a new TensorProto whose memory buffer is
// allocated by CPU instead of by protobuf's arena. Arena-style memory allocators do not fully release
// an instance's memory, which may result in large memory consumption, a tradeoff made for speed.
std::unordered_map<std::string, std::unique_ptr<Tensor>> name_to_buffered_tensor_;
};

} // namespace onnxruntime