10 changes: 7 additions & 3 deletions include/onnxruntime/core/framework/allocator.h
@@ -265,19 +265,23 @@
return CalcMemSizeForArrayWithAlignment(nmemb, size, alignment, out);
}

using AllocatorPtr = std::shared_ptr<IAllocator>;

[cpplint] include/onnxruntime/core/framework/allocator.h:268: Add #include <memory> for shared_ptr<> [build/include_what_you_use] [4]
using AllocatorMap = std::map<OrtDevice, AllocatorPtr>;

class CPUAllocator : public IAllocator {
public:
explicit CPUAllocator(const OrtMemoryInfo& memory_info) : IAllocator(memory_info) {}

// Creates a function-local static and returns a shared pointer to it.
// Reuse this wherever a standalone CPUAllocator instance is needed.
static AllocatorPtr DefaultInstance();

CPUAllocator() : IAllocator(OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator)) {}

void* Alloc(size_t size) override;
void Free(void* p) override;
};

using AllocatorPtr = std::shared_ptr<IAllocator>;
using AllocatorMap = std::map<OrtDevice, AllocatorPtr>;

void* AllocatorDefaultAlloc(size_t size);
void AllocatorDefaultFree(void* p);
void* AllocatorDefaultAllocAligned(size_t size, size_t alignment);
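A hypothetical call site for the shared default instance declared above (sketch only; the buffer size and variable names are illustrative):

```cpp
// Sketch: reuse the process-wide default CPU allocator instead of creating
// a fresh one with std::make_shared<CPUAllocator>() at every call site.
AllocatorPtr cpu_allocator = CPUAllocator::DefaultInstance();
void* p = cpu_allocator->Alloc(256);  // aligned CPU allocation
cpu_allocator->Free(p);
```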
6 changes: 3 additions & 3 deletions include/onnxruntime/core/framework/ort_value.h
@@ -18,7 +18,7 @@
class TensorSeq;
} // namespace onnxruntime

#endif
#endif // SHARED_PROVIDER

/**
Represents both tensors and non-tensors.
@@ -37,8 +37,8 @@
type_ = type;
}

void Init(void* pData, onnxruntime::MLDataType type, const std::function<void(void*)>& deleter) {
data_.reset(pData, deleter);
void Init(void* pData, onnxruntime::MLDataType type, std::function<void(void*)> deleter) {
data_.reset(pData, std::move(deleter));

[cpplint] include/onnxruntime/core/framework/ort_value.h:41: Add #include <utility> for move [build/include_what_you_use] [4]
type_ = type;
}
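Taking the `std::function` by value and moving it into the `shared_ptr` lets callers hand over an rvalue deleter without copying its captured state. A minimal self-contained sketch of the same signature change (`Holder` is an illustrative stand-in, not an ORT type):

```cpp
#include <functional>
#include <memory>
#include <utility>

// Stand-in for the Init pattern above: accept the deleter by value and move
// it into the shared_ptr, so an rvalue argument costs a move, not a copy.
class Holder {
 public:
  void Init(void* p, std::function<void(void*)> deleter) {
    data_.reset(p, std::move(deleter));
  }

 private:
  std::shared_ptr<void> data_;
};

int main() {
  Holder h;
  // The lambda is an rvalue, so its state is moved rather than copied.
  h.Init(::operator new(16), [](void* p) { ::operator delete(p); });
}
```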

39 changes: 32 additions & 7 deletions include/onnxruntime/core/graph/graph.h
@@ -567,6 +567,13 @@ class Node {
friend class Graph;
Node(NodeIndex index, Graph& graph) : index_(index), graph_(&graph), can_be_saved_(true) {}

protected:
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// internal only method to allow selected classes to directly alter the input/output definitions and arg counts
// made protected to facilitate testing
Definitions& MutableDefinitions() noexcept;
#endif

private:
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Node);

@@ -588,9 +595,6 @@ class Node {
#endif

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// internal only method to allow selected classes to directly alter the input/output definitions and arg counts
Definitions& MutableDefinitions() noexcept;

// internal only method to allow selected classes to directly alter the links between nodes.
Relationships& MutableRelationships() noexcept;

@@ -721,11 +725,12 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi

/** Replaces the initializer tensor with the same name as the given initializer tensor.
The replacement initializer tensor must have the same type and shape as the existing initializer tensor.
The new_initializer is expected to be either small or to reference external data stored in an OrtValue.

Note: This currently has linear time complexity. There is room for improvement but it would likely require changes to
how initializer tensors are stored and tracked.
*/
common::Status ReplaceInitializedTensor(ONNX_NAMESPACE::TensorProto new_initializer);
common::Status ReplaceInitializedTensor(const ONNX_NAMESPACE::TensorProto& new_initializer, const OrtValue& ort_value);

#if !defined(DISABLE_EXTERNAL_INITIALIZERS)
/** This function takes externally provided data for initializers with external data
@@ -745,6 +750,18 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
/** Add an initializer tensor to the Graph. */
void AddInitializedTensor(const ONNX_NAMESPACE::TensorProto& tensor_proto);

/// <summary>
/// Adds an initializer to the Graph. This method takes a tensor proto whose external
/// data pointer refers to ort_value. For small tensors (smaller than
/// utils::kSmallTensorExternalDataThreshold) the data is still contained within
/// tensor_proto; in that case the OrtValue is left unallocated and is not added
/// to ortvalue_initializers_.
/// </summary>
/// <param name="tensor_proto">tensor proto with external data pointing to OrtValue.</param>
/// <param name="ort_value_initializer">value that contains the initializer tensor. This may
/// be unallocated for small tensors.</param>
Status AddInitializedOrtValue(const ONNX_NAMESPACE::TensorProto& tensor_proto,
const OrtValue& ort_value_initializer);
#endif
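A hedged sketch of the intended call pattern, pieced together from the doc comment above (not compilable outside the repo; the OrtValue creation step is elided, and nothing here beyond AddInitializedOrtValue itself is taken from this diff):

```cpp
// Sketch only: describe the initializer with a TensorProto whose data
// location is EXTERNAL, keep the payload in an OrtValue, and register both.
// Small tensors instead keep their bytes inline in the TensorProto.
ONNX_NAMESPACE::TensorProto tensor_proto;
tensor_proto.set_name("my_weight");
tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
tensor_proto.add_dims(1024);
tensor_proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL);

OrtValue weight_value;  // assumed to already own the 1024 floats
ORT_RETURN_IF_ERROR(graph.AddInitializedOrtValue(tensor_proto, weight_value));
```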

/** Remove the initializer tensor with the provided name from the Graph. */
@@ -769,7 +786,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi

/** Populate `value` if an externally allocated OrtValue exists for an initializer with the given name.
If `check_outer_scope` is true and this is a subgraph, outer scope graphs are searched as well.
*/
bool GetOrtValueInitializer(const std::string& name, OrtValue& value) const;
bool GetOrtValueInitializer(const std::string& name, OrtValue& value, bool check_outer_scope = false) const;

/** Gets all the initializer tensors in this Graph. */
const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return name_to_initial_tensor_; }
@@ -1645,8 +1662,16 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
// so they can be used to resolve outer scope dependencies when running BuildConnections for the subgraphs.
common::Status SetOuterScopeNodeArgs(const std::unordered_set<std::string>& outer_scope_node_args);

// Implementation for initializer replacement
Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer, bool is_external);
/// <summary>
/// Replaces an existing initializer with new_initializer.
/// </summary>
/// <param name="new_initializer">replacement initializer</param>
/// <param name="ort_value">ort_value with data, may be empty</param>
/// <param name="must_replace_external">True when replacing an initializer that has external data
/// with a caller-provided OrtValue; in that case the original initializer is required to have
/// external data.</param>
/// <returns>Status indicating success or failure.</returns>
Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer,
OrtValue ort_value, bool must_replace_external);

template <typename StringRange> // range-initializer returning std::string
std::vector<NodeArg*> CreateNodeArgs(const StringRange& names,
6 changes: 2 additions & 4 deletions include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -57,8 +57,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
const IExecutionProvider& execution_provider /*required by constant folding*/,
const logging::Logger& logger,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD)

@@ -89,8 +88,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalB
const IExecutionProvider& cpu_execution_provider,
const logging::Logger& logger,
const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
concurrency::ThreadPool* intra_op_thread_pool = nullptr,
std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);
concurrency::ThreadPool* intra_op_thread_pool = nullptr);

#endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)

@@ -215,7 +215,7 @@ Status T5DecoderSubgraph::CreateInitialFeeds(
} else {
size_t total_size = static_cast<size_t>(sequence_length) * static_cast<size_t>(batch_beam_size);
size_t total_size_bytes = total_size * sizeof(int);
AllocatorPtr buffer_allocator = std::make_shared<onnxruntime::CPUAllocator>();
AllocatorPtr buffer_allocator = CPUAllocator::DefaultInstance();
// TODO: no need for an extra buffer. Copy directly to input_ids_data instead, like the user_cuda case above.
auto seq_copy = IAllocator::MakeUniquePtr<int>(buffer_allocator, total_size_bytes, false, stream);
int* seq_copy_ptr = seq_copy.get();
@@ -167,7 +167,7 @@ Status WhisperDecoderSubgraph::CreateInitialFeeds(
Tensor::InitOrtValue(DataTypeImpl::GetType<int32_t>(), input_ids_shape, allocator, input_ids);
int32_t* input_ids_data = input_ids.GetMutable<Tensor>()->MutableData<int32_t>();

AllocatorPtr buffer_allocator = std::make_shared<onnxruntime::CPUAllocator>();
AllocatorPtr buffer_allocator = CPUAllocator::DefaultInstance();
size_t total_size = static_cast<size_t>(static_cast<long long>(cur_len) * batch_beam_size * sizeof(int));
auto seq_copy = IAllocator::MakeUniquePtr<int>(buffer_allocator, total_size, false, stream);
int* seq_copy_ptr = seq_copy.get();
2 changes: 1 addition & 1 deletion onnxruntime/core/dlpack/dlpack_converter.cc
@@ -257,7 +257,7 @@
deleter(p);
};

ort_value.Init(p_tensor.release(), DataTypeImpl::GetType<Tensor>(), deleter);
ort_value.Init(p_tensor.release(), DataTypeImpl::GetType<Tensor>(), std::move(deleter));

[cpplint] onnxruntime/core/dlpack/dlpack_converter.cc:260: Add #include <utility> for move [build/include_what_you_use] [4]
return ort_value;
}

5 changes: 5 additions & 0 deletions onnxruntime/core/framework/allocator.cc
@@ -99,6 +99,11 @@
return AllocatorDefaultAllocAligned(size, alignment);
}

AllocatorPtr CPUAllocator::DefaultInstance() {
static AllocatorPtr instance = std::make_shared<CPUAllocator>();

[cpplint] onnxruntime/core/framework/allocator.cc:103: Add #include <memory> for make_shared<> [build/include_what_you_use] [4]
return instance;
}

void* CPUAllocator::Alloc(size_t size) {
const auto alignment = std::max(Info().device.GetAlignment(), MlasGetPreferredBufferAlignment());
return AllocatorDefaultAllocAligned(size, alignment);
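For background, the function-local static above is a Meyers-style singleton: since C++11, its initialization is guaranteed to run exactly once even under concurrent first calls ("magic statics"). A self-contained sketch of the pattern (names illustrative, not ORT's):

```cpp
#include <cassert>
#include <memory>

struct Allocator {};  // illustrative stand-in for IAllocator

using AllocatorPtr = std::shared_ptr<Allocator>;

// Same shape as CPUAllocator::DefaultInstance(): the static is initialized
// thread-safely on first use and shared by every subsequent caller.
AllocatorPtr DefaultInstance() {
  static AllocatorPtr instance = std::make_shared<Allocator>();
  return instance;
}

int main() {
  // Every call returns a handle to the same underlying object.
  assert(DefaultInstance().get() == DefaultInstance().get());
}
```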
10 changes: 10 additions & 0 deletions onnxruntime/core/framework/endian_utils.cc
@@ -48,6 +48,16 @@ void SwapByteOrderCopy(size_t element_size_in_bytes,
}
}

void SwapByteOrderInplace(size_t element_size_in_bytes, gsl::span<std::byte> bytes) {
ORT_ENFORCE(element_size_in_bytes > 0, "Expecting a positive element size");
ORT_ENFORCE(bytes.size_bytes() % element_size_in_bytes == 0, "Expecting the span size to be a multiple of the element size");
if (element_size_in_bytes > 1) {
for (size_t offset = 0, lim = bytes.size_bytes(); offset < lim; offset += element_size_in_bytes) {
std::reverse(bytes.begin() + offset, bytes.begin() + offset + element_size_in_bytes);
}
}
}
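A standalone demonstration of what the in-place swap does to 4-byte elements (plain C++, independent of ORT):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  uint32_t values[2] = {0x11223344u, 0xAABBCCDDu};
  auto* bytes = reinterpret_cast<std::byte*>(values);
  const size_t element_size = sizeof(uint32_t);
  // Same per-element std::reverse loop as SwapByteOrderInplace above.
  for (size_t offset = 0; offset < sizeof(values); offset += element_size) {
    std::reverse(bytes + offset, bytes + offset + element_size);
  }
  std::cout << std::hex << values[0] << ' ' << values[1] << '\n';
  // Prints: 44332211 ddccbbaa
}
```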

namespace detail {

Status CopyLittleEndian(size_t element_size_in_bytes,
15 changes: 15 additions & 0 deletions onnxruntime/core/framework/endian_utils.h
@@ -31,6 +31,21 @@ void SwapByteOrderCopy(size_t element_size_in_bytes,
gsl::span<const unsigned char> source_bytes,
gsl::span<unsigned char> destination_bytes);

/**
 * Swaps the byte order of the elements in the given byte span in place.
 *
 * This is a low-level function - please be sure to pass in valid arguments.
 * In particular:
 * - bytes should have a size that is a multiple of element_size_in_bytes.
 * - element_size_in_bytes should be greater than zero.
 *
 * @param element_size_in_bytes The size of an individual element, in bytes.
 * @param bytes The byte span whose elements are swapped in place.
 */
void SwapByteOrderInplace(size_t element_size_in_bytes,
gsl::span<std::byte> bytes);

namespace detail {

/**
7 changes: 4 additions & 3 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -17,6 +17,7 @@
#include "core/framework/resource_accountant.h"
#include "core/graph/function.h"
#include "core/graph/function_utils.h"
#include "core/graph/graph_utils.h"
#include "core/graph/graph_viewer.h"
#include "core/graph/model.h"
#include "core/graph/model_saving_options.h"
@@ -902,9 +903,9 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
}

// handle initializers
for (const auto& initialized_tensor : graph.GetAllInitializedTensors()) {
if (ep_graph.GetNodeArg(initialized_tensor.first) != nullptr) {
ep_graph.AddInitializedTensor(*initialized_tensor.second);
for (const auto& [name, _] : graph.GetAllInitializedTensors()) {
if (ep_graph.GetNodeArg(name) != nullptr) {
graph_utils::MakeInitializerCopyIfNotExist(graph, ep_graph, name);
}
}

12 changes: 4 additions & 8 deletions onnxruntime/core/framework/session_state.cc
@@ -300,17 +300,13 @@
return p_seq_exec_plan_->allocation_plan;
}

Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d,
Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& ort_value,
bool constant, bool sparse) {
auto p = initialized_tensors_.insert({ort_value_index, ort_value});
if (!p.second)
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "duplicated ort_value index:", ort_value_index,
". Do you have duplicated calls to SessionState::AddInitializedTensor function?");

if (d != nullptr && d->f != nullptr) {
deleter_for_initialized_tensors_.insert_or_assign(ort_value_index, *d);
}

if (constant) {
constant_initialized_tensors_.insert({ort_value_index, ort_value});
}
@@ -1620,16 +1616,16 @@
Env::Default(), graph_location, *graph_viewer_,
GetAllocator(OrtDevice()),
ort_value_name_idx_map_, initializer_allocation_order, *tensor_allocator,
[this, remove_initializers](const std::string& name, int idx, const OrtValue& value, const OrtCallback& d,
[this, remove_initializers](const std::string& name, int idx, const OrtValue& value,

[cpplint] onnxruntime/core/framework/session_state.cc:1619: Add #include <string> for string [build/include_what_you_use] [4]
bool constant, bool sparse) -> Status {
ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, &d, constant, sparse));
ORT_RETURN_IF_ERROR(AddInitializedTensor(idx, value, constant, sparse));
if (remove_initializers) {
graph_.RemoveInitializedTensor(name);
}
return Status::OK();
},
logger_, data_transfer_mgr_, external_data_loader_mgr_, *p_seq_exec_plan_, session_options,
memory_profile_func, name_to_buffered_tensor_, graph_.GetPrepacked()));
memory_profile_func, graph_.GetPrepacked()));

#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
// Record Weight allocation info on device
18 changes: 1 addition & 17 deletions onnxruntime/core/framework/session_state.h
@@ -18,7 +18,6 @@
#include "core/common/logging/logging.h"
#include "core/common/profiler.h"
#include "core/framework/allocation_planner.h"
#include "core/framework/callback.h"
#include "core/framework/data_transfer_manager.h"
#include "core/framework/external_data_loader_manager.h"
#include "core/framework/execution_providers.h"
@@ -102,9 +101,6 @@ class SessionState {
AllocatorMap* parent_allocators = nullptr);

~SessionState() {
for (auto& kvp : deleter_for_initialized_tensors_) {
kvp.second.f(kvp.second.param);
}
}

// Graph viewer. CreateGraphInfo must have been called previously.
@@ -143,12 +139,11 @@ class SessionState {
/**
* Adds an initialized tensor (weight) so that it can be used by the
* execution frame to setup the appropriate OrtValue vectors.
* This function will take a shallow copy of d if d is not NULL.
* If 'constant' is true the tensor value cannot be overridden by an input at runtime.
* If 'sparse' is true the tensor value represents a densified weight that was initially stored in the model
* as sparse tensor.
*/
Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, const OrtCallback* d, bool constant, bool sparse);
Status AddInitializedTensor(int ort_value_index, const OrtValue& ort_value, bool constant, bool sparse);

/**
* Gets the map of ort_value_index to initialized tensors (weights) so that it can be used by the
@@ -310,10 +305,6 @@ class SessionState {
const InlinedHashSet<NodeIndex>* GetToBeExecutedRange(gsl::span<int const> fetch_mlvalue_idxs) const;
#endif

std::unordered_map<std::string, std::unique_ptr<Tensor>>* GetMutableBufferedTensors() {
return &name_to_buffered_tensor_;
}

Status FinalizeSessionState(const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
const KernelRegistryManager& kernel_registry_manager,
bool remove_initializers = true,
@@ -509,7 +500,6 @@ class SessionState {

// This data structure is for uninitializing string tensors, munmap'ing memory
// regions, and closing file descriptors
InlinedHashMap<int, OrtCallback> deleter_for_initialized_tensors_;
InlinedVector<BufferUniquePtr> weights_buffers_;
std::optional<SequentialExecutionPlan> p_seq_exec_plan_;

@@ -607,12 +597,6 @@
// flag to indicate whether current session using any EP that create device stream dynamically.
bool has_device_stream_enabled_ep_ = false;
#endif

// Holds the tensors which provide memory buffers for TensorProtos
// Use case: in optimizer, transform a TensorProto to a new TensorProto whose memory buffer is
// allocated by CPU instead of by protobuf's arena. Arena-style memory allocators do not fully release
// an instance's memory, which may result in large memory consumption, a tradeoff made for speed.
std::unordered_map<std::string, std::unique_ptr<Tensor>> name_to_buffered_tensor_;
};

} // namespace onnxruntime