diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 9a0708d72b4f8..11ca73790ea79 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1454,13 +1454,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return Resolve(default_options); } - /// - /// This function converts all the graph TensorProto initializers into OrtValues - /// and creates a in-memory external data reference for each OrtValue. - /// - /// - Status ConvertInitializersIntoOrtValues(); - /** * @brief Converts a subset of graph TensorProto initializers into OrtValues and updates the graph proto. * diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc index 4f8c5607afce9..254c520b4e54a 100644 --- a/onnxruntime/core/framework/session_state_utils.cc +++ b/onnxruntime/core/framework/session_state_utils.cc @@ -140,12 +140,12 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_string<PATH_CHAR_TYPE>& proto_path, std::move(tensor), ort_value); } } else { - // for internal initializer, always allocate memory on device - tensor - ORT_RETURN_IF_ERROR(AllocateTensor(memory_buffer, tensor, type, tensor_shape, - use_device_allocator_for_initializers, alloc)); - if (device == default_cpu_device) { // deserialize directly to CPU tensor + // Do not use the arena for internal initializers, just as we do for OrtValue initializers. + ORT_RETURN_IF_ERROR(AllocateTensorOnDeviceOrMemory(/* use_device_allocator_for_initializers =*/true, + tensor_shape, type, + default_cpu_alloc, tensor)); ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, tensor)); Tensor::InitOrtValue(std::move(tensor), ort_value); return common::Status::OK(); @@ -154,13 +154,19 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_string<PATH_CHAR_TYPE>& proto_path, return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators"); } + // Allocate on the device either through the planned memory_buffer or directly, + // depending on use_device_allocator_for_initializers. + ORT_RETURN_IF_ERROR(AllocateTensor(memory_buffer, tensor, type, tensor_shape, + use_device_allocator_for_initializers, alloc)); + // deserialize to CPU first for non-CPU allocator, then copy // for internal initializer - // 1. allocate memory on CPU - deserialized_tensor - // 2. deserialize tensor_proto into a preallocated tensor (deserialized_tensor) + // 1. allocate memory on CPU - deserialized_tensor. Do not use the arena, to avoid wasting space on temporary buffers. + // 2. deserialize tensor_proto into a pre-allocated tensor (deserialized_tensor) + // 3. copy tensor from CPU to device - deserialized_tensor -> tensor (allocated above) -> ort_value Tensor deserialized_tensor; - ORT_RETURN_IF_ERROR(AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, + ORT_RETURN_IF_ERROR(AllocateTensorOnDeviceOrMemory(/* use_device_allocator_for_initializers =*/true, + tensor_shape, type, + default_cpu_alloc, deserialized_tensor)); ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, deserialized_tensor)); @@ -346,6 +352,13 @@ common::Status SaveInitializedTensors( << i.second << " bytes for " << i.first.ToString() << std::endl; } + // ??? Should we ignore this session option if the EP is explicitly providing the read only allocator?
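Note on the hunk above: the session option being hoisted out of the per-initializer loop is the public config entry kOrtSessionOptionsUseDeviceAllocatorForInitializers. A minimal usage sketch from the application side, assuming the public C++ API and that the key string matches the constant's value in session_options_config_keys.h (illustrative, not part of this change):

#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;
  // Request that initializer memory be allocated with the device allocator's
  // Reserve() instead of coming from the arena.
  so.AddConfigEntry("session.use_device_allocator_for_initializers", "1");
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);
  return 0;
}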
+ // bool have_readonly_initializer_allocator = alloc->Info().alloc_type == OrtReadOnlyAllocator; + // This option also means that the arena, if present, is bypassed and Reserve() is used. + const bool use_device_allocator_for_initializers = + session_options.config_options.GetConfigOrDefault( + kOrtSessionOptionsUseDeviceAllocatorForInitializers, "0") == "1"; + // 3. create weight tensors based on weights buffer for (const auto& entry : id_to_initialized_tensor) { // We check for cancellation for every initializer since mapping from disk can be costly @@ -375,12 +388,6 @@ common::Status SaveInitializedTensors( // TODO: if the tensor need be copied, does it have enough room? ORT_RETURN_IF_ERROR(planner.GetPreallocatedBuffer(ort_value_index, name, memory_buffer, alloc)); - // ??? Should we ignore this session option if the EP is explicitly providing the read only allocator? - // bool have_readonly_initializer_allocator = alloc->Info().alloc_type == OrtReadOnlyAllocator; - const bool use_device_allocator_for_initializers = - session_options.config_options.GetConfigOrDefault( - kOrtSessionOptionsUseDeviceAllocatorForInitializers, "0") == "1"; - // Check if we already have an OrtValue for this initializer on CPU if (OrtValue ort_value_from_graph; graph.GetOrtValueInitializer(name, ort_value_from_graph)) { diff --git a/onnxruntime/core/framework/tensor.cc b/onnxruntime/core/framework/tensor.cc index 133f21be97c46..eefd7825eca5b 100644 --- a/onnxruntime/core/framework/tensor.cc +++ b/onnxruntime/core/framework/tensor.cc @@ -93,14 +93,14 @@ Tensor::Tensor(MLDataType elt_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator) : alloc_info_(allocator->Info()) { ORT_ENFORCE(elt_type != nullptr); size_t len = Tensor::CalculateTensorStorageSize(elt_type, shape); void* p_data = nullptr; if (len > 0) { p_data = allocator->Alloc(len); } - Init(elt_type, shape, p_data, allocator, 0L); + Init(elt_type, shape, p_data, std::move(allocator), 0L); } Tensor::Tensor(MLDataType elt_type, const TensorShape& shape, void* p_data, std::shared_ptr<IAllocator> deleter, ptrdiff_t offset, gsl::span<const int64_t> strides) : alloc_info_(deleter->Info()) { ORT_ENFORCE(elt_type != nullptr); - Init(elt_type, shape, p_data, deleter, offset, strides); + Init(elt_type, shape, p_data, std::move(deleter), offset, strides); } void Tensor::InitOrtValue(MLDataType elt_type, const TensorShape& shape, std::shared_ptr<IAllocator> allocator, diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 8b599dc86d997..3d67314cf693a 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -1231,6 +1231,28 @@ Graph::Graph(const Model& owning_model, ArgNameToTypeMap name_to_type_map; const auto& model_path = ModelPath(); + // If the tensor proto data is large enough, move the data from the TensorProto into an OrtValue + // and add an in-memory external data reference to the TensorProto that points at the OrtValue; + // otherwise do nothing. + // This lambda must not be used on initializers that already have an external data reference.
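Context for the lambda defined next: the in-memory external-data reference it creates follows the convention described in the block comment removed from inference_session.cc later in this diff. A rough sketch of the flow, using the internal utilities named in the hunks (illustrative only):

OrtValue ort_value;
ORT_THROW_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto,
                                                CPUAllocator::DefaultInstance(), ort_value));
// With use_tensor_buffer == true the returned proto does not embed the bytes;
// its external_data fields point at live memory instead of a file:
//   location -> utils::kTensorProtoMemoryAddressTag (non-standard tag)
//   offset   -> address of the OrtValue's data buffer
//   length   -> size of that buffer in bytes
auto in_memory_proto = utils::TensorToTensorProto(ort_value.Get<Tensor>(), tensor_proto.name(),
                                                  /*use_tensor_buffer=*/true);
// The proto is only valid while the OrtValue is alive, hence the graph keeps the
// OrtValue in ortvalue_initializers_.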
+ auto put_large_tensor_in_ort_value = [this, &model_path](ONNX_NAMESPACE::TensorProto& tensor_proto) { + size_t size_in_bytes = 0; + ORT_THROW_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_in_bytes)); + if (size_in_bytes > utils::kSmallTensorExternalDataThreshold) { + OrtValue ort_value; + ORT_THROW_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto, + CPUAllocator::DefaultInstance(), ort_value)); + constexpr const bool use_tensor_buffer_true = true; + auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get<Tensor>(), tensor_proto.name(), + use_tensor_buffer_true); + assert(ort_value.IsAllocated()); + auto ins_result = ortvalue_initializers_.insert_or_assign(tensor_proto_to_add.name(), std::move(ort_value)); + ORT_ENFORCE(ins_result.second, "Unexpected duplicate insert or assign OrtValue for tensor: ", tensor_proto_to_add.name(), + " in the initializer list."); + tensor_proto = std::move(tensor_proto_to_add); + } + }; + // Process 'Constant' nodes // Put the 'TensorProto' stored in the 'Constant' nodes attribute into the graphs initializer list for (auto& node : graph_proto_->node()) { @@ -1250,6 +1272,8 @@ Graph::Graph(const Model& owning_model, } } + put_large_tensor_in_ort_value(*tensor); + // Ensure initializers are also graph inputs. if (ir_version_ < 4) { TypeProto t{utils::TypeProtoFromTensorProto(*tensor)}; @@ -1326,7 +1350,25 @@ Graph::Graph(const Model& owning_model, } // Copy initial tensors to a map. - for (auto& tensor : graph_proto_->initializer()) { + for (int i = 0, lim = graph_proto_->initializer_size(); i < lim; ++i) { + auto& tensor = *graph_proto_->mutable_initializer(i); + // If data is on disk, it will be loaded either by optimizers + // or during session state finalization. + // If data is already in memory, do nothing. + if (!utils::HasExternalData(tensor)) { + // sparse_tensor_names_ contains references to name strings to save memory. + // In case we replace the tensor_proto, we must remove + // the old reference first and then add a new one. + const bool is_sparse = sparse_tensor_names_.count(tensor.name()); + if (is_sparse) { + sparse_tensor_names_.erase(tensor.name()); + } + put_large_tensor_in_ort_value(tensor); + if (is_sparse) { + sparse_tensor_names_.emplace(tensor.name()); + } + } + auto p = name_to_initial_tensor_.emplace(tensor.name(), &tensor); if (!p.second) { LOGS(logger_, WARNING) << "Duplicate initializer (dense, sparse or ConstantNode): '" << tensor.name() @@ -3415,38 +3457,6 @@ Status Graph::Resolve(const ResolveOptions& options) { return ForThisAndAllSubgraphs(all_subgraphs, finalize_func); } -Status Graph::ConvertInitializersIntoOrtValues() { - std::vector<Graph*> all_subgraphs; - FindAllSubgraphs(all_subgraphs); - - auto put_weights_maybe_in_memory_func = [&](Graph& graph) -> Status { - // if we have any initializers that are not in memory, put them there.
- const auto& model_path = graph.ModelPath(); - auto& graph_proto = *graph.graph_proto_; - for (int i = 0, lim = graph_proto.initializer_size(); i < lim; ++i) { - auto& tensor_proto = *graph_proto.mutable_initializer(i); - if (utils::HasExternalData(tensor_proto)) { - continue; // ignore data on disk, that will be loaded either by EP or at session_state finalize - } - - size_t size_in_bytes = 0; - ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_in_bytes)); - if (size_in_bytes > utils::kSmallTensorExternalDataThreshold) { - OrtValue ort_value; - ORT_RETURN_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto, - CPUAllocator::DefaultInstance(), ort_value)); - constexpr const bool use_tensor_buffer_true = true; - auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get(), tensor_proto.name(), - use_tensor_buffer_true); - ORT_RETURN_IF_ERROR(graph.ReplaceInitializedTensor(tensor_proto_to_add, ort_value)); - } - } - return Status::OK(); - }; - - return ForThisAndAllSubgraphs(all_subgraphs, put_weights_maybe_in_memory_func); -} - void Graph::SetName(const std::string& name) { graph_proto_->set_name(name); } diff --git a/onnxruntime/core/graph/graph_utils.cc b/onnxruntime/core/graph/graph_utils.cc index c38e1fcff46b3..0480263befdd1 100644 --- a/onnxruntime/core/graph/graph_utils.cc +++ b/onnxruntime/core/graph/graph_utils.cc @@ -285,7 +285,7 @@ NodeArg& AddInitializer(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_ini return GetOrCreateNodeArg(graph, new_initializer); } -NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer) { +NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer) { const bool has_external_data = utils::HasExternalData(new_initializer); ORT_ENFORCE(!has_external_data, "Expecting an initializer that contains data inline"); @@ -293,11 +293,11 @@ NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::Tens ORT_THROW_IF_ERROR(utils::CreateTensorFromTensorProto(Env::Default(), graph.ModelPath(), new_initializer, tensor)); auto tensor_proto_with_ptr = utils::TensorToTensorProto(tensor, new_initializer.name(), true); - return AddInitializerWithExternalData(graph, tensor_proto_with_ptr, std::move(tensor)); + return AddInitializerWithOrtValue(graph, tensor_proto_with_ptr, std::move(tensor)); } -NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, - Tensor&& tensor) { +NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, + Tensor&& tensor) { OrtValue ort_value; if (utils::HasExternalDataInMemory(new_initializer)) { Tensor::InitOrtValue(std::move(tensor), ort_value); @@ -307,8 +307,8 @@ NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::Tens return GetOrCreateNodeArg(graph, new_initializer); } -NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, - OrtValue ort_value) { +NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, + OrtValue ort_value) { ORT_THROW_IF_ERROR(graph.AddInitializedOrtValue(new_initializer, ort_value)); return GetOrCreateNodeArg(graph, new_initializer); } diff --git a/onnxruntime/core/graph/graph_utils.h b/onnxruntime/core/graph/graph_utils.h index b6bf5927a3c6c..256a6fc81495d 100644 --- a/onnxruntime/core/graph/graph_utils.h +++ 
b/onnxruntime/core/graph/graph_utils.h @@ -45,8 +45,8 @@ NodeArg& AddInitializer(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_ini /// TensorProto with external data contained in ort_value /// ort_value with data /// -NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, - OrtValue ort_value); +NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, + OrtValue ort_value); /** Add a new initializer to 'graph'. * Checks that new_initializer does not already exist in 'graph' before adding it. @@ -55,7 +55,7 @@ NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::Tens * @returns The NodeArg for the new initializer. * @remarks No matching graph input is created, so the initializer will be constant. */ -NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, Tensor&& tensor); +NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer, Tensor&& tensor); /** Add a new initializer to 'graph'. * The function unpacks data into a tensor and converts new_initializer to a TensorProto with external data in memory. @@ -67,7 +67,7 @@ NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::Tens * @returns The NodeArg for the new initializer. * @remarks No matching graph input is created, so the initializer will be constant. */ -NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer); +NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer); /// /// If the initializer with the given name does not exist in the destination graph, but exists in the diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index 3f9b58f71bd23..9fd71b3b00cd0 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -111,7 +111,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size, utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); } - return graph_utils::AddInitializer(graph, initializer); + return graph_utils::AddInitializerWithOrtValue(graph, initializer); } static NodeArg* ConvertMaskToInt32(Graph& graph, NodeArg* mask_input, ProviderType provider_type, diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc index 86a7a4d6afbf8..87af11808506d 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc @@ -189,7 +189,7 @@ NodeArg* CreateInitializerFromVector(Graph& graph, "total_count: ", total_count, " values.size(): ", values.size()); utils::SetRawDataInTensorProto(const_tensor, values.data(), values.size() * sizeof(int64_t)); - return &graph_utils::AddInitializer(graph, const_tensor); + return &graph_utils::AddInitializerWithOrtValue(graph, const_tensor); } NodeArg* InsertNodesForValidIndices(Graph& graph, diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index 16e8955cb4486..9db3e85585035 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -95,7 +95,7 @@ static bool ConstantFoldShapeNode(Graph& graph, Node& node) { 
ONNX_NAMESPACE::TensorShapeProto result_shape; result_shape.add_dim()->set_dim_value(clamped_slice_length); constant_arg_out->SetShape(result_shape); - graph_utils::AddInitializer(graph, shape_constant); + graph_utils::AddInitializerWithOrtValue(graph, shape_constant); } return is_concrete_shape; // convert to constant if this is true @@ -317,11 +317,11 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, // Build the TensorProto that corresponds to the computed OrtValue and add it as initializer to the graph. auto* constant_arg_out = node->MutableOutputDefs()[fetch_idx]; const Tensor& out_tensor = ort_value.Get(); - constexpr const bool use_tensor_buffer_false = false; + constexpr const bool use_tensor_buffer_true = true; ONNX_NAMESPACE::TensorProto out_tensorproto = utils::TensorToTensorProto( out_tensor, constant_arg_out->Name(), - use_tensor_buffer_false); + use_tensor_buffer_true); ONNX_NAMESPACE::TensorShapeProto result_shape; for (auto& dim : out_tensor.Shape().GetDims()) { @@ -329,7 +329,12 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, } constant_arg_out->SetShape(result_shape); - graph.AddInitializedTensor(out_tensorproto); + // The data is too small and has been inlined. + if (!utils::HasExternalData(out_tensorproto)) { + ORT_THROW_IF_ERROR(graph.AddInitializedOrtValue(out_tensorproto, OrtValue())); + } else { + ORT_THROW_IF_ERROR(graph.AddInitializedOrtValue(out_tensorproto, ort_value)); + } } } } diff --git a/onnxruntime/core/optimizer/conv_add_fusion.cc b/onnxruntime/core/optimizer/conv_add_fusion.cc index 6478fa7d29d5b..e1fd199bfa943 100644 --- a/onnxruntime/core/optimizer/conv_add_fusion.cc +++ b/onnxruntime/core/optimizer/conv_add_fusion.cc @@ -79,7 +79,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie auto new_name = graph.GenerateNodeArgName("ConvAddFusion_B_" + B_input_name); new_conv_B_tensor_proto.set_name(new_name); - NodeArg& new_conv_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); + NodeArg& new_conv_B_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_conv_B_tensor_proto); graph_utils::ReplaceNodeInput(node, 2, new_conv_B_node_arg); } else { @@ -94,7 +94,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie auto new_name = graph.GenerateNodeArgName("ConvAddFusion_Add_B_" + add_B_tensor_proto->name()); new_conv_B_tensor_proto.set_name(new_name); - NodeArg& new_add_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); + NodeArg& new_add_B_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_conv_B_tensor_proto); graph_utils::AddNodeInput(node, 2, new_add_B_node_arg); } diff --git a/onnxruntime/core/optimizer/conv_bn_fusion.cc b/onnxruntime/core/optimizer/conv_bn_fusion.cc index a14639631d7a1..4c493f45a2b61 100644 --- a/onnxruntime/core/optimizer/conv_bn_fusion.cc +++ b/onnxruntime/core/optimizer/conv_bn_fusion.cc @@ -120,10 +120,10 @@ Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff new_conv_W_tensor_proto.set_name(new_W_name); new_conv_B_tensor_proto.set_name(new_B_name); - NodeArg& new_conv_W_node_arg = graph_utils::AddInitializer(graph, new_conv_W_tensor_proto); + NodeArg& new_conv_W_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_conv_W_tensor_proto); graph_utils::ReplaceNodeInput(node, 1, new_conv_W_node_arg); - auto& new_conv_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); + auto& 
new_conv_B_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_conv_B_tensor_proto); if (conv_inputs.size() == 3) { graph_utils::ReplaceNodeInput(node, 2, new_conv_B_node_arg); diff --git a/onnxruntime/core/optimizer/conv_mul_fusion.cc b/onnxruntime/core/optimizer/conv_mul_fusion.cc index e91a00729e9db..9563415ad56b6 100644 --- a/onnxruntime/core/optimizer/conv_mul_fusion.cc +++ b/onnxruntime/core/optimizer/conv_mul_fusion.cc @@ -90,7 +90,7 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef new_conv_W_tensor_proto.set_name(new_W_name); // Replace initializers of conv node - NodeArg& new_conv_W_node_arg = graph_utils::AddInitializer(graph, new_conv_W_tensor_proto); + NodeArg& new_conv_W_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_conv_W_tensor_proto); graph_utils::ReplaceNodeInput(conv_node, 1, new_conv_W_node_arg); if (is_3d) { @@ -100,7 +100,7 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef auto new_B_name = graph.GenerateNodeArgName("ConvMulFusion_Mul_B_" + mul_B_tensor_proto->name()); new_conv_B_tensor_proto.set_name(new_B_name); - NodeArg& new_conv_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); + NodeArg& new_conv_B_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_conv_B_tensor_proto); graph_utils::ReplaceNodeInput(conv_node, 2, new_conv_B_node_arg); } diff --git a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc index 96f75f07e32e1..6ea2a17d26125 100644 --- a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc +++ b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc @@ -53,7 +53,7 @@ static void ApplyNewInputValue(Graph& graph, Node& node, QDQ::InputIndex index, auto new_name = graph.GenerateNodeArgName("DoubleQDQRemoved_" + node.InputDefs()[index]->Name()); new_input_tensor.set_name(new_name); new_input_tensor.add_dims(1); - NodeArg& new_input = graph_utils::AddInitializer(graph, new_input_tensor); + NodeArg& new_input = graph_utils::AddInitializerWithOrtValue(graph, new_input_tensor); graph_utils::ReplaceNodeInput(node, index, new_input); } diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index f8fd807084d38..9e35550e2f845 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -474,7 +474,7 @@ static NodeArg* ExtractEmbedding(Graph& graph, utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(MLFloat16)); } - NodeArg& node_arg = graph_utils::AddInitializer(graph, initializer); + NodeArg& node_arg = graph_utils::AddInitializerWithOrtValue(graph, initializer); modified = true; return &node_arg; } diff --git a/onnxruntime/core/optimizer/fuse_initializers_transformer.cc b/onnxruntime/core/optimizer/fuse_initializers_transformer.cc index e604c688ee033..9a24687d9a4d9 100644 --- a/onnxruntime/core/optimizer/fuse_initializers_transformer.cc +++ b/onnxruntime/core/optimizer/fuse_initializers_transformer.cc @@ -137,8 +137,12 @@ static void FuseInitializerWithNode(Graph& graph, graph.RemoveEdge(node.Index(), next_node.Index(), 0, static_cast(next_node_arg_index)); // Add the new converted Tensor in next node as initializer potentially with external data - ONNX_NAMESPACE::TensorProto dst_tensor = utils::TensorToTensorProto(new_data.Get(), new_arg_name, false); - auto& new_arg = graph_utils::AddInitializer(graph, 
dst_tensor); + ONNX_NAMESPACE::TensorProto dst_tensor = utils::TensorToTensorProto(new_data.Get(), new_arg_name, true); + if (!utils::HasExternalData(dst_tensor)) { + new_data = OrtValue(); // Data is inline + } + + auto& new_arg = graph_utils::AddInitializerWithOrtValue(graph, dst_tensor, std::move(new_data)); graph_utils::ReplaceNodeInput(next_node, static_cast(next_node_arg_index), new_arg); } diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index bd730683a4c91..32a45e548ca0a 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -256,7 +256,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra axes_initializer_proto.add_dims(static_cast(1)); axes_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); axes_initializer_proto.add_int64_data(axis); - NodeArg* axes_arg = &graph_utils::AddInitializer(graph, axes_initializer_proto); + NodeArg* axes_arg = &graph_utils::AddInitializerWithOrtValue(graph, axes_initializer_proto); Node& squeeze_node = graph.AddNode(graph.GenerateNodeName("Squeeze"), "Squeeze", "Squeeze for Fused Gather nodes", {split_output_arg, axes_arg}, {original_output_arg}); @@ -272,7 +272,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra split_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); split_initializer_proto.add_dims(static_cast(split_values.size())); split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); - NodeArg* split_initializer_arg = &graph_utils::AddInitializer(graph, split_initializer_proto); + NodeArg* split_initializer_arg = &graph_utils::AddInitializerWithOrtValue(graph, split_initializer_proto); const auto split_node_name = graph.GenerateNodeName(nodes_to_fuse[0].get().Name() + "/GatherSliceToSplitFusion"); Node& split_node = graph.AddNode(split_node_name, "Split", "Split for Fused Gather nodes", {graph.GetNodeArg(node_arg->Name()), split_initializer_arg}, split_outputs); @@ -359,7 +359,7 @@ Status GatherToSliceFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le unsqueeze_axes_initializer_proto.add_dims(static_cast(1)); unsqueeze_axes_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); unsqueeze_axes_initializer_proto.add_int64_data(static_cast(0)); - NodeArg* unsqueeze_axes_arg = &graph_utils::AddInitializer(graph, unsqueeze_axes_initializer_proto); + NodeArg* unsqueeze_axes_arg = &graph_utils::AddInitializerWithOrtValue(graph, unsqueeze_axes_initializer_proto); for (size_t i = 0; i < range_input_defs.size(); ++i) { Node& unsqueeze_node = graph.AddNode(graph.GenerateNodeName("Unsqueeze_" + std::to_string(i)), "Unsqueeze", @@ -386,7 +386,7 @@ Status GatherToSliceFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le } else { slice_axes_initializer_proto.add_int32_data(static_cast(axis)); } - NodeArg* slice_axes_arg = &graph_utils::AddInitializer(graph, slice_axes_initializer_proto); + NodeArg* slice_axes_arg = &graph_utils::AddInitializerWithOrtValue(graph, slice_axes_initializer_proto); Node& slice_node = graph.AddNode(graph.GenerateNodeName("Slice"), "Slice", "Slice for Fused Gather nodes", {gather_node.MutableInputDefs()[0], unsqueeze_outputs[0], unsqueeze_outputs[1], slice_axes_arg, unsqueeze_outputs[2]}, diff --git a/onnxruntime/core/optimizer/group_query_attention_fusion.cc b/onnxruntime/core/optimizer/group_query_attention_fusion.cc index 
fcb744bea4df1..f6bfd29315c58 100644 --- a/onnxruntime/core/optimizer/group_query_attention_fusion.cc +++ b/onnxruntime/core/optimizer/group_query_attention_fusion.cc @@ -129,8 +129,8 @@ static std::vector MergeQkvWeightsForMatMulNBits( utils::SetRawDataInTensorProto(qkv_weight_initializer, merged_qkv_weight.data(), element_count * sizeof(uint8_t)); utils::SetRawDataInTensorProto(qkv_scale_initializer, merged_qkv_scale.data(), scale_elements_count * sizeof(MLFloat16)); - NodeArg& qkv_weight_arg = graph_utils::AddInitializer(graph, qkv_weight_initializer); - NodeArg& qkv_scale_arg = graph_utils::AddInitializer(graph, qkv_scale_initializer); + NodeArg& qkv_weight_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_weight_initializer); + NodeArg& qkv_scale_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_scale_initializer); std::vector result_node_args = {&qkv_weight_arg, &qkv_scale_arg}; @@ -160,7 +160,7 @@ static std::vector MergeQkvWeightsForMatMulNBits( utils::SetRawDataInTensorProto(qkv_zp_initializer, merged_qkv_zp.data(), zp_elements_count * sizeof(uint8_t)); - NodeArg& qkv_zp_arg = graph_utils::AddInitializer(graph, qkv_zp_initializer); + NodeArg& qkv_zp_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_zp_initializer); result_node_args.push_back(&qkv_zp_arg); } diff --git a/onnxruntime/core/optimizer/matmul_add_fusion.cc b/onnxruntime/core/optimizer/matmul_add_fusion.cc index fed72db71332a..5db61877811aa 100644 --- a/onnxruntime/core/optimizer/matmul_add_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_add_fusion.cc @@ -194,7 +194,7 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, shape_initializer_proto.add_dims(static_cast(shape.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); utils::SetRawDataInTensorProto(shape_initializer_proto, shape.data(), shape.size() * sizeof(int64_t)); - NodeArg* shape_arg = &graph_utils::AddInitializer(graph, shape_initializer_proto); + NodeArg* shape_arg = &graph_utils::AddInitializerWithOrtValue(graph, shape_initializer_proto); ONNX_NAMESPACE::TypeProto new_arg_type; const ONNX_NAMESPACE::TensorProto_DataType element_type = static_cast( gemm_input_defs[0]->TypeAsProto()->tensor_type().elem_type()); diff --git a/onnxruntime/core/optimizer/matmul_bn_fusion.cc b/onnxruntime/core/optimizer/matmul_bn_fusion.cc index 367fb42d7928d..63a15ad630ce9 100644 --- a/onnxruntime/core/optimizer/matmul_bn_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_bn_fusion.cc @@ -212,14 +212,14 @@ Status MatmulBNFusion::Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& matmul_b.ToProto(new_gemm_b_tensor); const std::string new_gemm_b_name = graph.GenerateNodeArgName("MatMulBnFusion_GemmB_" + matmul_b_tensor->name()); new_gemm_b_tensor.set_name(new_gemm_b_name); - NodeArg& new_gemm_b_node_arg = graph_utils::AddInitializer(graph, new_gemm_b_tensor); + NodeArg& new_gemm_b_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_gemm_b_tensor); // create bias tensorProto for new Gemm node from initializer. 
ONNX_NAMESPACE::TensorProto new_gemm_bias_tensor; bias.ToProto(new_gemm_bias_tensor); const std::string new_gemm_bias_name = graph.GenerateNodeArgName("MatMulBnFusion_GemmBias"); new_gemm_bias_tensor.set_name(new_gemm_bias_name); - NodeArg& new_gemm_bias_node_arg = graph_utils::AddInitializer(graph, new_gemm_bias_tensor); + NodeArg& new_gemm_bias_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_gemm_bias_tensor); Node& gemm_node = graph.AddNode( graph.GenerateNodeArgName("MatMulBnFusion_Gemm"), diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc index 3d1e5ccfdc4d5..bff9d2990118a 100644 --- a/onnxruntime/core/optimizer/nchwc_transformer.cc +++ b/onnxruntime/core/optimizer/nchwc_transformer.cc @@ -437,7 +437,7 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_W_tensor_proto.add_dims(conv_W_dims[i]); } - nchwc_conv_W_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_W_tensor_proto); + nchwc_conv_W_arg = &graph_utils::AddInitializerWithOrtValue(graph_, nchwc_conv_W_tensor_proto); filters_map->emplace(input_defs[1], nchwc_conv_W_arg); } @@ -464,7 +464,7 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels); - nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); + nchwc_conv_B_arg = &graph_utils::AddInitializerWithOrtValue(graph_, nchwc_conv_B_tensor_proto); aligned_biases_.emplace(input_defs[2], nchwc_conv_B_arg); } } @@ -580,7 +580,7 @@ Node& NchwcTransformerImpl::InsertReshape(NodeArg* input_arg, } shape_tensor_proto.add_dims(split_channels ? kNchwcDims + 1 : kNchwcDims); - shape_arg = &graph_utils::AddInitializer(graph_, shape_tensor_proto); + shape_arg = &graph_utils::AddInitializerWithOrtValue(graph_, shape_tensor_proto); } Node& reshape_node = graph_.AddNode(graph_.GenerateNodeName("Reshape"), @@ -896,7 +896,7 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { nchwc_conv_W_tensor_proto.add_dims(1); nchwc_conv_W_tensor_proto.add_dims(1); - auto* nchwc_conv_W_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_W_tensor_proto); + auto* nchwc_conv_W_arg = &graph_utils::AddInitializerWithOrtValue(graph_, nchwc_conv_W_tensor_proto); std::copy_n(bn_B.data(), channels, padded_buffer.data()); @@ -907,7 +907,7 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_channels); - auto* nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); + auto* nchwc_conv_B_arg = &graph_utils::AddInitializerWithOrtValue(graph_, nchwc_conv_B_tensor_proto); // Create the replacement node. 
std::string nchwc_node_name = graph_.GenerateNodeName(output_defs[0]->Name() + "_bn_nchwc"); diff --git a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc index 42d27de632b91..d0c88886e5a06 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc @@ -130,22 +130,22 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph, const log weights_proto_u8.set_name(weight_tensor_proto->name() + "_s8_2_u8"); weights_proto_u8.mutable_dims()->CopyFrom(weight_tensor_proto->dims()); utils::SetRawDataInTensorProto(weights_proto_u8, w_temp.data(), static_cast(w_temp.size())); - input_defs[w_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); + input_defs[w_idx] = &graph_utils::AddInitializerWithOrtValue(graph, weights_proto_u8); ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; QDQ::Int8TensorProto2Uint8(weight_zp_tensor_proto, weight_zp_proto_u8, graph, true); - input_defs[w_zp_idx] = &graph_utils::AddInitializer(graph, weight_zp_proto_u8); + input_defs[w_zp_idx] = &graph_utils::AddInitializerWithOrtValue(graph, weight_zp_proto_u8); ONNX_NAMESPACE::TensorProto r_proto_u8; r_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); r_proto_u8.set_name(r_tensor_proto->name() + "_s8_2_u8"); r_proto_u8.mutable_dims()->CopyFrom(r_tensor_proto->dims()); utils::SetRawDataInTensorProto(r_proto_u8, r_temp.data(), static_cast(r_temp.size())); - input_defs[r_idx] = &graph_utils::AddInitializer(graph, r_proto_u8); + input_defs[r_idx] = &graph_utils::AddInitializerWithOrtValue(graph, r_proto_u8); ONNX_NAMESPACE::TensorProto r_zp_proto_u8; QDQ::Int8TensorProto2Uint8(r_zp_tensor_proto, r_zp_proto_u8, graph, true); - input_defs[r_zp_idx] = &graph_utils::AddInitializer(graph, r_zp_proto_u8); + input_defs[r_zp_idx] = &graph_utils::AddInitializerWithOrtValue(graph, r_zp_proto_u8); return true; } diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc index 828165e99d840..bf1c8949e3883 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc @@ -61,7 +61,7 @@ static bool QDQ_S8_to_U8(Graph& graph, Node& q_node, Node& dq_node) { zp_tensor_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); zp_tensor_proto_u8.set_name(graph.GenerateNodeArgName("qdq_s8_to_u8_zp_conversion")); utils::SetRawDataInTensorProto(zp_tensor_proto_u8, &q_zp_value, sizeof(uint8_t)); - NodeArg* zp_u8_arg = &graph_utils::AddInitializer(graph, zp_tensor_proto_u8); + NodeArg* zp_u8_arg = &graph_utils::AddInitializerWithOrtValue(graph, zp_tensor_proto_u8); auto q_output_node_arg_name = graph.GenerateNodeArgName("qdq_s8_to_u8_quant"); NodeArg* q_output_arg = &graph.GetOrCreateNodeArg(q_output_node_arg_name, nullptr); diff --git a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc index f094f3c199f2a..101fedfb3e28f 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc @@ -43,12 +43,12 @@ bool ConvertS8WeightToU8(Graph& graph, Node& op_node, // The weights fits into S7, overflow is not a problem, no need to convert to U8 return false; } - input_defs[weights_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); + input_defs[weights_idx] 
= &graph_utils::AddInitializerWithOrtValue(graph, weights_proto_u8); // Convert weight zero point to uint8 ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; Int8TensorProto2Uint8(weight_zp_tensor_proto, weight_zp_proto_u8, graph, true); - input_defs[weight_zp_idx] = &graph_utils::AddInitializer(graph, weight_zp_proto_u8); + input_defs[weight_zp_idx] = &graph_utils::AddInitializerWithOrtValue(graph, weight_zp_proto_u8); return true; } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 34d7ba3c79775..dddf80252f727 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -439,23 +439,23 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, } } - auto weight_T_tp = utils::TensorToTensorProto(weight_dst, weight_dst_name, false); - auto scale_T_tp = utils::TensorToTensorProto(scale_dst, scale_dst_name, false); + auto weight_T_tp = utils::TensorToTensorProto(weight_dst, weight_dst_name, true); + auto scale_T_tp = utils::TensorToTensorProto(scale_dst, scale_dst_name, true); std::optional zp_T_tp; if (zp_dst) { - zp_T_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, false)); + zp_T_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true)); } auto& input_defs = replacement_node.MutableInputDefs(); - input_defs.push_back(&graph_utils::AddInitializer(graph, weight_T_tp)); + input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, weight_T_tp, std::move(weight_dst))); replacement_node.MutableInputArgsCount().push_back(1); - input_defs.push_back(&graph_utils::AddInitializer(graph, scale_T_tp)); + input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, scale_T_tp, std::move(scale_dst))); replacement_node.MutableInputArgsCount().push_back(1); if (zp_T_tp) { - input_defs.push_back(&graph_utils::AddInitializer(graph, zp_T_tp.value())); + input_defs.push_back(&graph_utils::AddInitializerWithOrtValue(graph, zp_T_tp.value(), std::move(*zp_dst))); replacement_node.MutableInputArgsCount().push_back(1); } diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc index c6898ca1da2e5..31cdc6e7fbc5c 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc @@ -170,14 +170,14 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph weight_scale_proto.set_name(graph.GenerateNodeArgName(node.Name() + "_weight_scale")); weight_scale_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); weight_scale_proto.mutable_float_data()->Add(scale); - weight_scale_arg = &graph_utils::AddInitializer(graph, weight_scale_proto); + weight_scale_arg = &graph_utils::AddInitializerWithOrtValue(graph, weight_scale_proto); // Weight zero point initializer. ONNX_NAMESPACE::TensorProto weight_zp_proto; weight_zp_proto.set_name(graph.GenerateNodeArgName(node.Name() + "_weight_zp")); weight_zp_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); weight_zp_proto.mutable_int32_data()->Add(static_cast(zp)); - NodeArg& weight_zp_arg = graph_utils::AddInitializer(graph, weight_zp_proto); + NodeArg& weight_zp_arg = graph_utils::AddInitializerWithOrtValue(graph, weight_zp_proto); // Q from float32 to int8. 
ONNX_NAMESPACE::TypeProto weight_q_type_proto; diff --git a/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc b/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc index 3eb3ee7e75c75..9bd91e7916ecb 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/where_dummy_dq.cc @@ -119,9 +119,9 @@ Status WhereDummyDq::InsertDummyDQ(Node& node, Graph& graph, bool& modified, con } // Start editing the graph - NodeArg& dummy_data_arg = graph_utils::AddInitializer(graph, dummy_data_proto); - NodeArg& dummy_scale_arg = graph_utils::AddInitializer(graph, dummy_scale_proto); - NodeArg& dummy_zp_arg = graph_utils::AddInitializer(graph, dummy_zp_proto); + NodeArg& dummy_data_arg = graph_utils::AddInitializerWithOrtValue(graph, dummy_data_proto); + NodeArg& dummy_scale_arg = graph_utils::AddInitializerWithOrtValue(graph, dummy_scale_proto); + NodeArg& dummy_zp_arg = graph_utils::AddInitializerWithOrtValue(graph, dummy_zp_proto); ONNX_NAMESPACE::TypeProto dummy_dq_type_proto = utils::TypeProtoFromTensorProto(*const_node_data_proto); dummy_dq_type_proto.mutable_tensor_type()->set_elem_type(const_node_data_proto->data_type()); diff --git a/onnxruntime/core/optimizer/relu_clip_fusion.cc b/onnxruntime/core/optimizer/relu_clip_fusion.cc index 07902fde04930..494c646778d10 100644 --- a/onnxruntime/core/optimizer/relu_clip_fusion.cc +++ b/onnxruntime/core/optimizer/relu_clip_fusion.cc @@ -97,7 +97,7 @@ Status FuseReluClip::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff mutable_next_node->AddAttribute("min", 0.f); } else { // Add the initialized tensor to the graph - auto* replacement_min_nodearg = &graph_utils::AddInitializer(graph, replacement_min); + auto* replacement_min_nodearg = &graph_utils::AddInitializerWithOrtValue(graph, replacement_min); // Replace the input def at the appropriate index of the Clip node auto& mutable_input_defs = mutable_next_node->MutableInputDefs(); diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc index 324905f953eec..daab9bba278aa 100644 --- a/onnxruntime/core/optimizer/reshape_fusion.cc +++ b/onnxruntime/core/optimizer/reshape_fusion.cc @@ -438,7 +438,7 @@ bool ReshapeFusion::Fuse_Subgraph(Node& reshape, Graph& graph, const logging::Lo shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); - auto& new_node_arg = graph_utils::AddInitializer(graph, shape_initializer_proto); + auto& new_node_arg = graph_utils::AddInitializerWithOrtValue(graph, shape_initializer_proto); // Safely remove concat parent nodes which have only one output for (int i = 0; i < concat_input_count; ++i) { @@ -492,7 +492,7 @@ bool ReshapeFusion::FuseContiguousReshapes(Node& reshape, Graph& graph) { shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); - NodeArg* shape_arg = &graph_utils::AddInitializer(graph, shape_initializer_proto); + NodeArg* shape_arg = &graph_utils::AddInitializerWithOrtValue(graph, shape_initializer_proto); Node& reshape_node = graph.AddNode(graph.GenerateNodeName(name + "_new_reshape"), "Reshape", "Reshape for " + name, 
{contiguous_reshapes[0].get().MutableInputDefs()[0], shape_arg}, {contiguous_reshapes.back().get().MutableOutputDefs()[0]}); diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc index 5c09e5225ab9c..60ab064465f2f 100644 --- a/onnxruntime/core/optimizer/stft_decomposition.cc +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -46,7 +46,7 @@ NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[T proto.add_dims(shape[i]); } utils::SetRawDataInTensorProto(proto, begin, element_count * sizeof(TDataType)); - return &graph_utils::AddInitializer(graph, proto); + return &graph_utils::AddInitializerWithOrtValue(graph, proto); } template @@ -225,8 +225,8 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve } const int64_t weight_shape[] = {dft_unique_bins, 1, 1, dft_size}; - auto real_weights = AddInitializer(graph, "stft_real_conv_weights", weight_shape, real_weights_data.data()); - auto imaginary_weights = AddInitializer(graph, "stft_imaginary_conv_weights", weight_shape, imag_weights_data.data()); + auto* real_weights = AddInitializer(graph, "stft_real_conv_weights", weight_shape, real_weights_data.data()); + auto* imaginary_weights = AddInitializer(graph, "stft_imaginary_conv_weights", weight_shape, imag_weights_data.data()); const int64_t signal_reshaped[] = {batch_size, 1, 1, signal_length}; auto signal_shape = AddShapeInitializer(graph, "stft_signal_shape", signal_reshaped); diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index 46311303639ab..00b3dc4dd8b4e 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -433,9 +433,23 @@ bool TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& ker auto& new_def = graph_.GetOrCreateNodeArg(new_def_name, provider_def->TypeAsProto()); TensorProto new_tensor_proto = *tensor_proto; - *(new_tensor_proto.mutable_name()) = new_def_name; - - ORT_IGNORE_RETURN_VALUE(graph_utils::AddInitializer(graph_, new_tensor_proto)); + *(new_tensor_proto.mutable_name()) = std::move(new_def_name); + + // Query any existing OrtValue for the original initializer. + // We check the outer scope because GetInitializer is called with true, so we may + // get references to parent graphs. + // We do this so the same OrtValue is re-used in subgraphs and no copies are made for big items. + constexpr const bool check_outer_scope_true = true; + OrtValue ort_value; + // The initializer can be in memory backed by an OrtValue, or it can be mapped from a flatbuffer.
+ if (utils::HasExternalDataInMemory(new_tensor_proto) && + graph_.GetOrtValueInitializer(name, ort_value, check_outer_scope_true)) { + // Re-use the same ort_value and proto that points to the same buffer + ORT_IGNORE_RETURN_VALUE(graph_utils::AddInitializerWithOrtValue(graph_, new_tensor_proto, + std::move(ort_value))); + } else { + ORT_IGNORE_RETURN_VALUE(graph_utils::AddInitializer(graph_, new_tensor_proto)); + } replacements.insert(std::make_pair(provider_def, &new_def)); } diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index 96c5ec8562a03..6a02ca3578da2 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -588,10 +588,10 @@ void ApiGraph::TransposeInitializer(std::string_view name, const std::vector& shape) { @@ -624,7 +624,7 @@ void ApiGraph::ReshapeInitializer(std::string_view name, const std::vector()->Reshape(new_shape); + } + + auto& new_node_arg = graph_utils::AddInitializerWithOrtValue(graph, new_tensor_proto, ort_value); graph_utils::ReplaceNodeWithInitializer(graph, node, new_node_arg); // Remove the Unsqueeze node and replace it with the initializer. diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 27e9f82a43557..5be46cd480004 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -446,7 +446,7 @@ inline bool HasExternalDataInMemory(const ONNX_NAMESPACE::TensorProto& ten_proto } // namespace utils namespace graph_utils { -inline NodeArg& AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer) { +inline NodeArg& AddInitializerWithOrtValue(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer) { return g_host->GraphUtils__AddInitializerWithExternalData(graph, new_initializer); } inline void MakeInitializerCopyIfNotExist(const Graph& src_graph, Graph& dst_graph, const std::string& name, diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index e3291cdce62c5..14f0892687ad1 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1421,29 +1421,6 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool } } - // We choose to convert initializers into OrtValues before partitioning here so plug-in EPs could - // take advantage of the initializers being in OrtValue format and not to deal with protobuf. - // - // The initializers data is transferred to an OrtValue. The original TensorProto is replaced - // with a TensorProto that has the same data type, shape and name. However, its external data - // is used in a non-standard way. The location is set to a string constant utils::kTensorProtoMemoryAddressTag, - // The file offset is set to the address of the OrtValue's data buffer, and the length is set to the size of the - // OrtValue's data buffer. Because this external location is non-standard, onnx code can not handle it, so we choose - // to do it as late as possible but before the partitioning so type and shape inference accesses the initializers - // before they are converted to OrtValues. 
- // - // If any transformations are applied later, they would not introduce any in-memory initializers, - // type and shape inference would run only on any newly added nodes and any new initializers - // will be converted at session finalization time. - // - // The conversion is performed using the following steps (within ConvertInitializersIntoOrtValues()) - // constexpr const bool use_tensor_buffer_true = true; - // auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get(), tensor_proto.name(), - // use_tensor_buffer_true); - // ORT_RETURN_IF_ERROR(graph.ReplaceInitializedTensor(tensor_proto_to_add, ort_value)); - - ORT_RETURN_IF_ERROR_SESSIONID_(graph.ConvertInitializersIntoOrtValues()); - // Do partitioning based on execution providers' capabilities. ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn, session_options_.config_options, *session_logger_, diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 960b9eff051be..146112b7dac76 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1433,7 +1433,7 @@ struct ProviderHostImpl : ProviderHost { NodeArg& GraphUtils__AddInitializerWithExternalData(Graph& graph, const ONNX_NAMESPACE::TensorProto& new_initializer) override { - return graph_utils::AddInitializerWithExternalData(graph, new_initializer); + return graph_utils::AddInitializerWithOrtValue(graph, new_initializer); } void GraphUtils__MakeInitializerCopyIfNotExist(const Graph& src_graph, Graph& dst_graph, diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index bcd6b62e649be..c3d8f2631feb4 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1017,7 +1017,7 @@ TEST_F(PlannerTest, LocationPlanningForInitializersOnlyUsedInANestedSubgraph) { tensor.add_float_data(1.0f); tensor.set_data_type(TensorProto_DataType_FLOAT); tensor.set_name("init_data"); - graph_utils::AddInitializerWithExternalData(main_graph, tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, tensor); // Main graph's inputs/outputs main_graph.SetInputs({&abs_data_in, &if_in}); @@ -1124,7 +1124,7 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG tensor.add_int64_data(1); tensor.set_data_type(TensorProto_DataType_INT64); tensor.set_name("init_data"); - graph_utils::AddInitializerWithExternalData(main_graph, tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, tensor); // Main graph's inputs/outputs main_graph.SetInputs({&abs_data_in, &if_in}); @@ -1549,7 +1549,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { for (int i = 0; i < 64 * 3 * 7 * 7; ++i) conv_0_weight_tensor.add_float_data(0.234f); conv_0_weight_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_0_weight_tensor.set_name("conv_0_weight"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_0_weight_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_0_weight_tensor); ONNX_NAMESPACE::TensorProto conv_1_weight_tensor; conv_1_weight_tensor.add_dims(64L); @@ -1559,7 +1559,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { conv_1_weight_tensor.set_data_type(TensorProto_DataType_FLOAT); for (int i = 0; i < 64 * 64; ++i) conv_1_weight_tensor.add_float_data(1.017f); conv_1_weight_tensor.set_name("conv_1_weight"); - 
graph_utils::AddInitializerWithExternalData(main_graph, conv_1_weight_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_1_weight_tensor); ONNX_NAMESPACE::TensorProto conv_2_weight_tensor; conv_2_weight_tensor.add_dims(64L); @@ -1569,7 +1569,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { for (int i = 0; i < 64 * 64 * 3 * 3; ++i) conv_2_weight_tensor.add_float_data(2.317f); conv_2_weight_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_2_weight_tensor.set_name("conv_2_weight"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_2_weight_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_2_weight_tensor); ONNX_NAMESPACE::TensorProto conv_3_weight_tensor; conv_3_weight_tensor.add_dims(256L); @@ -1579,7 +1579,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { for (int i = 0; i < 256 * 64; ++i) conv_3_weight_tensor.add_float_data(1.256f); conv_3_weight_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_3_weight_tensor.set_name("conv_3_weight"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_3_weight_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_3_weight_tensor); ONNX_NAMESPACE::TensorProto conv_4_weight_tensor; conv_4_weight_tensor.add_dims(256L); @@ -1589,7 +1589,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { for (int i = 0; i < 256 * 64; ++i) conv_4_weight_tensor.add_float_data(1.913f); conv_4_weight_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_4_weight_tensor.set_name("conv_4_weight"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_4_weight_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_4_weight_tensor); auto& conv_0_weight = main_graph.GetOrCreateNodeArg("conv_0_weight", &conv_0_weight_type); auto& conv_1_weight = main_graph.GetOrCreateNodeArg("conv_1_weight", &conv_1_weight_type); @@ -1602,35 +1602,35 @@ TEST_F(PlannerTest, ParaPlanCreation) { conv_0_bias_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_0_bias_tensor.set_name("conv_0_bias"); for (int i = 0; i < 64; ++i) conv_0_bias_tensor.add_float_data(1.123f); - graph_utils::AddInitializerWithExternalData(main_graph, conv_0_bias_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_0_bias_tensor); ONNX_NAMESPACE::TensorProto conv_1_bias_tensor; conv_1_bias_tensor.add_dims(64L); for (int i = 0; i < 64; ++i) conv_1_bias_tensor.add_float_data(2.234f); conv_1_bias_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_1_bias_tensor.set_name("conv_1_bias"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_1_bias_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_1_bias_tensor); ONNX_NAMESPACE::TensorProto conv_2_bias_tensor; conv_2_bias_tensor.add_dims(64L); for (int i = 0; i < 64; ++i) conv_2_bias_tensor.add_float_data(0.121f); conv_2_bias_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_2_bias_tensor.set_name("conv_2_bias"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_2_bias_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_2_bias_tensor); ONNX_NAMESPACE::TensorProto conv_3_bias_tensor; conv_3_bias_tensor.add_dims(256L); for (int i = 0; i < 256; ++i) conv_3_bias_tensor.add_float_data(1.201f); conv_3_bias_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_3_bias_tensor.set_name("conv_3_bias"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_3_bias_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_3_bias_tensor); ONNX_NAMESPACE::TensorProto conv_4_bias_tensor; 
conv_4_bias_tensor.add_dims(256L); for (int i = 0; i < 256; ++i) conv_4_bias_tensor.add_float_data(0.897f); conv_4_bias_tensor.set_data_type(TensorProto_DataType_FLOAT); conv_4_bias_tensor.set_name("conv_4_bias"); - graph_utils::AddInitializerWithExternalData(main_graph, conv_4_bias_tensor); + graph_utils::AddInitializerWithOrtValue(main_graph, conv_4_bias_tensor); auto& conv_0_bias = main_graph.GetOrCreateNodeArg("conv_0_bias", &conv_0_bias_type); auto& conv_1_bias = main_graph.GetOrCreateNodeArg("conv_1_bias", &conv_1_bias_type); diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index 1553469c52df7..13e379f1dacfc 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -67,7 +67,7 @@ static common::Status LoadInferenceSessionFromModel(FenceCudaTestInferenceSessio tensor_proto.set_data_type(PROTO_DATATYPE); \ for (auto v : value) tensor_proto.PROTO_ADD_DATA(v); \ tensor_proto.set_name(name); \ - return graph_utils::AddInitializer(graph, tensor_proto); \ + return graph_utils::AddInitializerWithOrtValue(graph, tensor_proto); \ } CREATE_INITIALIZER_FUNC(float, TensorProto_DataType_FLOAT, add_float_data) diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index cdcd3c2327421..9bdc0898c81c1 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -311,35 +311,30 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) { } } +#ifdef USE_CUDA // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator // if the relevant session option config flag is set TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { - // For this test we need to enable the arena-based allocator. - if (!DoesCpuAllocatorSupportArenaUsage()) { - GTEST_SKIP() << "CPU allocator does not support arena usage."; - } - AllocatorPtr cpu_allocator = CPUAllocator::DefaultInstance(); - // Part 1: Feature turned ON (i.e.) 
diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc
index cdcd3c2327421..9bdc0898c81c1 100644
--- a/onnxruntime/test/framework/session_state_test.cc
+++ b/onnxruntime/test/framework/session_state_test.cc
@@ -311,35 +311,30 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) {
   }
 }

+#ifdef USE_CUDA
 // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator
 // if the relevant session option config flag is set
 TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
-  // For this test we need to enable the arena-based allocator.
-  if (!DoesCpuAllocatorSupportArenaUsage()) {
-    GTEST_SKIP() << "CPU allocator does not support arena usage.";
-  }
   AllocatorPtr cpu_allocator = CPUAllocator::DefaultInstance();
-  // Part 1: Feature turned ON (i.e.) allocate from non-arena memory
-  {
-    std::basic_ostringstream<ORTCHAR_T> oss;
-    oss << ORT_TSTR("testdata/mul_1.onnx");
-    Status status;
-    std::shared_ptr<Model> model;
-    ASSERT_TRUE((status = Model::Load(oss.str(), model, nullptr, DefaultLoggingManager().DefaultLogger())).IsOK())
-        << status;
-    Graph& graph = model->MainGraph();
+  const auto& default_logger = DefaultLoggingManager().DefaultLogger();
+  auto setup_and_run_test = [&cpu_allocator, &default_logger](Model& model, bool use_device_allocator) -> AllocatorStats {
+    Graph& graph = model.MainGraph();
+    DataTransferManager dtm;

     ExecutionProviders execution_providers;
-    CPUExecutionProviderInfo epi{true};  // use an arena-based allocator for this EP
-    status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
-    ASSERT_TRUE(status.IsOK()) << status;
-
+    auto tmp_cuda_execution_provider = DefaultCudaExecutionProvider();
+    tmp_cuda_execution_provider->SetLogger(&default_logger);
+    EXPECT_STATUS_OK(dtm.RegisterDataTransfer(tmp_cuda_execution_provider->GetDataTransfer()));
+    EXPECT_STATUS_OK(execution_providers.Add(kCudaExecutionProvider, std::move(tmp_cuda_execution_provider)));
+
+    // Make sure the CPU allocator is registered
+    auto cpu_execution_provider = DefaultCpuExecutionProvider();
+    cpu_execution_provider->SetLogger(&default_logger);
+    EXPECT_STATUS_OK(dtm.RegisterDataTransfer(cpu_execution_provider->GetDataTransfer()));
+    EXPECT_STATUS_OK(execution_providers.Add(kCpuExecutionProvider, std::move(cpu_execution_provider)));

     KernelRegistryManager krm;
-    status = krm.RegisterKernels(execution_providers);
-    ASSERT_TRUE(status.IsOK()) << status;
+    EXPECT_STATUS_OK(krm.RegisterKernels(execution_providers));

-    DataTransferManager dtm;
     ExternalDataLoaderManager edlm;
     profiling::Profiler profiler;

@@ -348,20 +343,23 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
   sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
   sess_options.use_deterministic_compute = false;
   sess_options.enable_mem_reuse = true;
-  // disable allocating initialized tensor memory from the arena(by default it will be allocated by the arena)
-  ASSERT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsUseDeviceAllocatorForInitializers,
-                                                              "1"));
+
+  if (use_device_allocator) {
+    // disable allocating initialized tensor memory from the arena (by default it is allocated by the arena)
+    EXPECT_STATUS_OK(sess_options.config_options.AddConfigEntry(kOrtSessionOptionsUseDeviceAllocatorForInitializers,
+                                                                "1"));
+  }

   SessionState session_state(graph, execution_providers, nullptr, nullptr, dtm, edlm,
-                             DefaultLoggingManager().DefaultLogger(), profiler, sess_options);
+                             default_logger, profiler, sess_options);

   // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup
   auto graph_optimizer_registry = std::make_unique<GraphOptimizerRegistry>(&sess_options,
-                                                                           execution_providers.Get(onnxruntime::kCpuExecutionProvider),
-                                                                           &DefaultLoggingManager().DefaultLogger());
+                                                                           execution_providers.Get(kCudaExecutionProvider),
+                                                                           &default_logger);

   // Partition the graph
   GraphPartitioner partitioner(krm, execution_providers, std::move(graph_optimizer_registry));
-  ASSERT_STATUS_OK(partitioner.Partition(
+  EXPECT_STATUS_OK(partitioner.Partition(
       graph, session_state.GetMutableFuncMgr(),
       [&cpu_allocator](Graph& graph, bool& modified,
                        const IExecutionProvider& execution_provider,
                        const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
@@ -369,95 +367,49 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) {
             cpu_allocator, debug_graph_fn);
       },
       sess_options.config_options,
-      DefaultLoggingManager().DefaultLogger()));
+      default_logger));

-    ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
+    EXPECT_STATUS_OK(session_state.FinalizeSessionState(model.ModelPath(), krm));

-    // Fetch the CPU arena-allocator from the session state
-    OrtMemoryInfo mem_info(CPU, OrtArenaAllocator);
+    // Fetch the CUDA arena-allocator from the session state
+    OrtMemoryInfo mem_info(CUDA, OrtArenaAllocator);
     AllocatorPtr alloc = session_state.GetAllocator(mem_info);
-    ASSERT_TRUE(alloc != nullptr);
+    EXPECT_NE(alloc, nullptr);

-    // Get stats for the CPU arena-based allocator
+    // Get stats for the CUDA arena-based allocator
     AllocatorStats alloc_stats;
     static_cast<BFCArena*>(alloc.get())->GetStats(&alloc_stats);

-    // Assert that we have made 1 Reserve() call (for allocating memory for the sole initializer in the model)
-    ASSERT_EQ(alloc_stats.num_reserves, 1);
-  }
+    return alloc_stats;
+  };

-  // Part 2: Feature turned OFF (i.e.) allocate from arena memory (default behavior)
+  const ORTCHAR_T* model_path = ORT_TSTR("testdata/mul_1.onnx");
+
+  // Part 1: Feature turned ON (i.e.) allocate from non-arena memory
   {
-    std::basic_ostringstream<ORTCHAR_T> oss;
-    oss << ORT_TSTR("testdata/mul_1.onnx");
-    Status status;
     std::shared_ptr<Model> model;
-    ASSERT_TRUE((status = Model::Load(oss.str(), model, nullptr, DefaultLoggingManager().DefaultLogger())).IsOK())
-        << status;
-    Graph& graph = model->MainGraph();
+    ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, default_logger));

-    ExecutionProviders execution_providers;
-    CPUExecutionProviderInfo epi{true};  // use an arena-based allocator for this EP
-    status = execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique<CPUExecutionProvider>(epi));
-    ASSERT_TRUE(status.IsOK()) << status;
+    auto alloc_stats = setup_and_run_test(*model, /*use_device_allocator=*/true);

-    KernelRegistryManager krm;
-    status = krm.RegisterKernels(execution_providers);
-    ASSERT_TRUE(status.IsOK()) << status;
-
-    DataTransferManager dtm;
-    ExternalDataLoaderManager edlm;
-    profiling::Profiler profiler;
-
-    SessionOptions sess_options;
-    sess_options.enable_mem_pattern = false;
-    sess_options.execution_mode = ExecutionMode::ORT_SEQUENTIAL;
-    sess_options.use_deterministic_compute = false;
-    sess_options.enable_mem_reuse = true;
-
-    SessionState session_state(graph, execution_providers, nullptr, nullptr, dtm, edlm,
-                               DefaultLoggingManager().DefaultLogger(), profiler, sess_options);
-
-    // Create GraphOptimizerRegistry instance for providing predefined graph optimizers and selection functions for EPs to lookup
-    auto graph_optimizer_registry = std::make_unique<GraphOptimizerRegistry>(&sess_options,
-                                                                             execution_providers.Get(onnxruntime::kCpuExecutionProvider),
-                                                                             &DefaultLoggingManager().DefaultLogger());
-
-    // Partition the graph
-    GraphPartitioner partitioner(krm, execution_providers, std::move(graph_optimizer_registry));
-    ASSERT_STATUS_OK(partitioner.Partition(
-        graph, session_state.GetMutableFuncMgr(),
-        [&cpu_allocator](Graph& graph, bool& modified,
-                         const IExecutionProvider& execution_provider,
-                         const layout_transformation::DebugGraphFn& debug_graph_fn) -> Status {
-          return layout_transformation::TransformLayoutForEP(
-              graph, modified, execution_provider, cpu_allocator, debug_graph_fn);
-        },
-        sess_options.config_options,
-        DefaultLoggingManager().DefaultLogger()));
-
-    // Finalize the session state
-    ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm));
+    // Assert that we have made 1 Reserve() call (for allocating memory for the sole initializer in the model)
+    ASSERT_EQ(1, alloc_stats.num_reserves);
+  }

-    // Fetch the CPU arena-allocator from the session state
-    OrtMemoryInfo mem_info(CPU, OrtArenaAllocator);
-    AllocatorPtr alloc = session_state.GetAllocator(mem_info);
-    ASSERT_TRUE(alloc != nullptr);
+  // Part 2: Feature turned OFF (i.e.) allocate from arena memory (default behavior)
+  {
+    std::shared_ptr<Model> model;
+    ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, default_logger));

-    // Get stats for the CPU arena-based allocator
-    AllocatorStats alloc_stats;
-    static_cast<BFCArena*>(alloc.get())->GetStats(&alloc_stats);
+    auto alloc_stats = setup_and_run_test(*model, /*use_device_allocator=*/false);

-    // Assert that we have made no Reserve() calls
-    ASSERT_EQ(alloc_stats.num_reserves, 0);
+    // One Reserve() call should have been made (for allocating memory for the sole initializer in the model)
+    ASSERT_EQ(1, alloc_stats.num_reserves);

-    // Assert to ensure an allocation was made for the initializer through the arena allocator (Alloc() was invoked)
-    ASSERT_EQ(alloc_stats.num_allocs, 1);
+    // This counter comes from Reserve(); for the arena-based allocator the actual call went to the StreamAwareArena instance.
+    ASSERT_EQ(1, alloc_stats.num_allocs);
   }
 }

-#ifdef USE_CUDA
-
 namespace {
 using ParitionVerifierFn = std::function<void(const Graph& graph)>;
diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc
index 7371ad5cf0ded..6a3f2f974b9f5 100644
--- a/onnxruntime/test/ir/graph_test.cc
+++ b/onnxruntime/test/ir/graph_test.cc
@@ -2808,11 +2808,6 @@ TEST_F(GraphTest, ShapeInferenceAfterInitializerExternalization) {
   // First resolve should succeed
   ASSERT_STATUS_OK(graph.Resolve());

-  // Now trigger the in-memory externalization
-  // This converts initializers > 127 bytes to OrtValues with external data references
-  Status convert_status = graph.ConvertInitializersIntoOrtValues();
-  ASSERT_TRUE(convert_status.IsOK()) << "ConvertInitializersIntoOrtValues failed: " << convert_status.ErrorMessage();
-
   // Check if the initializer was actually externalized
   const ONNX_NAMESPACE::TensorProto* initializer_after = nullptr;
   ASSERT_TRUE(graph.GetInitializedTensor("split_sizes", initializer_after));
diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
index a746493d779f8..6a6545c68cb4f 100644
--- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
+++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc
@@ -299,7 +299,9 @@ void RunWithOneSessionMultiThreadsInference(PathString model_name, std::string s
   ASSERT_TRUE(HasCacheFileWithPrefix(params.trt_engine_cache_prefix));
 }

-TEST(TensorrtExecutionProviderTest, SessionCreationWithMultiThreadsAndInferenceWithMultiThreads) {
+// The test is disabled due to the issue described at
+// https://github.com/microsoft/onnxruntime/issues/26366
+TEST(TensorrtExecutionProviderTest, DISABLED_SessionCreationWithMultiThreadsAndInferenceWithMultiThreads) {
   std::vector<std::thread> threads;
   PathString model_name = ORT_TSTR("trt_execution_provider_multithreading_test.onnx");
   std::string graph_name = "multithreading_test";
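The session-state test above toggles kOrtSessionOptionsUseDeviceAllocatorForInitializers through SessionOptions::config_options and then checks the arena's Reserve()/Alloc() counters. For reference, a minimal sketch of flipping the same switch from the public C++ API; the string literal is the value that key maps to in onnxruntime_session_options_config_keys.h, and the model path simply reuses the test model for illustration.

// Sketch: enable the option from the public API. With it set, initializers are
// allocated via Reserve() on the device allocator instead of through the arena,
// which is the behavior Part 1 of the test above asserts.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "init_alloc_demo");
  Ort::SessionOptions so;
  so.AddConfigEntry("session.use_device_allocator_for_initializers", "1");
  Ort::Session session(env, ORT_TSTR("testdata/mul_1.onnx"), so);
  return 0;
}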
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index f2a7ee71a363a..c4a92e959b273 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -3486,7 +3486,7 @@ TEST(CApiTest, TestSharedAllocators) {
   ASSERT_TRUE(num_allocations == 6);

   size_t num_reserve_allocations = custom_allocator.NumReserveAllocations();
-  ASSERT_TRUE(num_reserve_allocations == 1);
+  ASSERT_EQ(3, num_reserve_allocations);

   // Ensure that there was no leak
   custom_allocator.LeakCheck();
diff --git a/orttraining/orttraining/core/optimizer/conv1d_replacement.cc b/orttraining/orttraining/core/optimizer/conv1d_replacement.cc
index ff220fcb067b8..56e4cc273b1df 100644
--- a/orttraining/orttraining/core/optimizer/conv1d_replacement.cc
+++ b/orttraining/orttraining/core/optimizer/conv1d_replacement.cc
@@ -121,7 +121,7 @@ void Conv1dToMatmul(Graph& graph, Node& conv, const std::string transformer_name
   initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
   InlinedVector<int64_t> initializer_proto_value{weight_squeeze_axis};
   initializer_proto.set_raw_data(initializer_proto_value.data(), initializer_proto_value.size() * sizeof(int64_t));
-  auto& axes_input = graph_utils::AddInitializer(graph, initializer_proto);
+  auto& axes_input = graph_utils::AddInitializerWithOrtValue(graph, initializer_proto);
   // Squeeze node doesn't have opschema here, so we need to set input args count manually
   weight_squeeze.MutableInputArgsCount().resize(2);
   graph_utils::AddNodeInput(weight_squeeze, 1, axes_input);
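The Megatron transformer hunks below repeat one sequence per partitioned weight: register the partition, rewire the consuming node, drop the full-size initializer, and record the rename. A sketch of that sequence under the signatures visible in this diff; ReplacePartitionedWeight is a hypothetical name, and the plain map stands in for the transformer's updated_weight_names_ member.

// Sketch of the repeated replace-initializer sequence; not part of the patch.
#include <string>
#include <unordered_map>
#include "core/graph/graph.h"
#include "core/graph/graph_utils.h"

namespace onnxruntime {
void ReplacePartitionedWeight(Graph& graph, Node& consumer, int input_index,
                              const std::string& old_name,
                              const ONNX_NAMESPACE::TensorProto& partition,
                              std::unordered_map<std::string, std::string>& updated_names) {
  // 1. Register the partitioned tensor; the graph takes ownership as an OrtValue.
  NodeArg& partition_arg = graph_utils::AddInitializerWithOrtValue(graph, partition);
  // 2. Point the consuming node at the new initializer.
  graph_utils::ReplaceNodeInput(consumer, input_index, partition_arg);
  // 3. Remove the full-size original and remember the rename.
  graph.RemoveInitializedTensor(old_name);
  updated_names.insert({old_name, partition_arg.Name()});
}
}  // namespace onnxruntime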
diff --git a/orttraining/orttraining/core/optimizer/megatron_transformer.cc b/orttraining/orttraining/core/optimizer/megatron_transformer.cc
index 7c429ae5cb643..837820ef2f9a1 100644
--- a/orttraining/orttraining/core/optimizer/megatron_transformer.cc
+++ b/orttraining/orttraining/core/optimizer/megatron_transformer.cc
@@ -453,15 +453,15 @@ Status MegatronTransformer::TransformGPT2MLP(Graph& graph, bool& modified,
     return skip_status;
   }

-  NodeArg& a_weight_partition_arg = graph_utils::AddInitializer(graph, a_weight_initializer_partition);
+  NodeArg& a_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, a_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(node, 1, a_weight_partition_arg);
   updated_weight_names_.insert({a_weight_arg->Name(), a_weight_partition_arg.Name()});

-  NodeArg& a_bias_partition_arg = graph_utils::AddInitializer(graph, a_bias_initializer_partition);
+  NodeArg& a_bias_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, a_bias_initializer_partition);
   graph_utils::ReplaceNodeInput(add_node, 1, a_bias_partition_arg);
   updated_weight_names_.insert({b_weight_arg->Name(), a_bias_partition_arg.Name()});

-  NodeArg& b_weight_partition_arg = graph_utils::AddInitializer(graph, b_weight_initializer_partition);
+  NodeArg& b_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, b_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(matmul2_node, 1, b_weight_partition_arg);
   updated_weight_names_.insert({a_bias_arg->Name(), b_weight_partition_arg.Name()});
@@ -600,15 +600,15 @@ Status MegatronTransformer::TransformBARTMLP(Graph& graph, bool& modified,
     return skip_status;
   }

-  NodeArg& dense_wi_weight_partition_arg = graph_utils::AddInitializer(graph, dense_wi_weight_initializer_partition);
+  NodeArg& dense_wi_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, dense_wi_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(*second_op, 0, dense_wi_weight_partition_arg);
   updated_weight_names_.insert({dense_wi_weight_arg->Name(), dense_wi_weight_partition_arg.Name()});

-  NodeArg& dense_wi_bias_partition_arg = graph_utils::AddInitializer(graph, dense_wi_bias_initializer_partition);
+  NodeArg& dense_wi_bias_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, dense_wi_bias_initializer_partition);
   graph_utils::ReplaceNodeInput(biasgelu_node, 1, dense_wi_bias_partition_arg);
   updated_weight_names_.insert({dense_wi_bias_arg->Name(), dense_wi_bias_partition_arg.Name()});

-  NodeArg& dense_wo_weight_partition_arg = graph_utils::AddInitializer(graph, dense_wo_weight_initializer_partition);
+  NodeArg& dense_wo_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, dense_wo_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(*transpose_op_ptr, 0, dense_wo_weight_partition_arg);
   updated_weight_names_.insert({dense_wo_weight_arg->Name(), dense_wo_weight_partition_arg.Name()});
@@ -814,15 +814,15 @@ Status MegatronTransformer::TransformGPT2Attention(Graph& graph, bool& modified,
                   [](Node* node_ptr) { return node_ptr != nullptr; });

   // Replace by the partition weights.
-  NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializer(graph, qkv_weight_initializer_partition);
+  NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(node, 1, qkv_weight_partition_arg);
   updated_weight_names_.insert({qkv_weight_arg->Name(), qkv_weight_partition_arg.Name()});

-  NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializer(graph, qkv_bias_initializer_partition);
+  NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_bias_initializer_partition);
   graph_utils::ReplaceNodeInput(add_node, 1, qkv_bias_partition_arg);
   updated_weight_names_.insert({qkv_bias_arg->Name(), qkv_bias_partition_arg.Name()});

-  NodeArg& dense_weight_partition_arg = graph_utils::AddInitializer(graph, dense_weight_initializer_partition);
+  NodeArg& dense_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, dense_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(matmul_node, 1, dense_weight_partition_arg);
   updated_weight_names_.insert({dense_weight_arg->Name(), dense_weight_partition_arg.Name()});
@@ -849,7 +849,7 @@ Status MegatronTransformer::TransformGPT2Attention(Graph& graph, bool& modified,
       val_partition.insert(val_partition.end(), val, val + size);
       val_partition[2] /= horizontal_parallel_size_;
       tensor_partition.set_raw_data(val_partition.data(), size * sizeof(int64_t));
-      NodeArg& node_arg_partition = graph_utils::AddInitializer(graph, tensor_partition);
+      NodeArg& node_arg_partition = graph_utils::AddInitializerWithOrtValue(graph, tensor_partition);
       graph_utils::ReplaceNodeInput(*node_ptr, 1, node_arg_partition);
       graph.RemoveInitializedTensor(shape_arg->Name());
     }
@@ -1130,7 +1130,7 @@ Status MegatronTransformer::TransformBARTAttention(Graph& graph, bool& modified,
   size_t i = 0;
   for (auto trans_ptr : weight_transpose_node_ptrs) {
     auto weight_name = trans_ptr->MutableInputDefs()[0]->Name();
-    NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializer(graph, qkv_weight_initializer_partitions[i]);
+    NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_weight_initializer_partitions[i]);
     graph_utils::ReplaceNodeInput(*trans_ptr, 0, qkv_weight_partition_arg);
     graph.RemoveInitializedTensor(weight_name);
     updated_weight_names_.insert({weight_name, qkv_weight_partition_arg.Name()});
@@ -1139,14 +1139,14 @@ Status MegatronTransformer::TransformBARTAttention(Graph& graph, bool& modified,
   i = 0;
   for (auto add_ptr : bias_add_node_ptrs) {
     auto bias_name = add_ptr->MutableInputDefs()[1]->Name();
-    NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializer(graph, qkv_bias_initializer_partitions[i]);
+    NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, qkv_bias_initializer_partitions[i]);
     graph_utils::ReplaceNodeInput(*add_ptr, 1, qkv_bias_partition_arg);
     graph.RemoveInitializedTensor(bias_name);
     updated_weight_names_.insert({bias_name, qkv_bias_partition_arg.Name()});
     i++;
   }

-  NodeArg& dense_weight_partition_arg = graph_utils::AddInitializer(graph, dense_weight_initializer_partition);
+  NodeArg& dense_weight_partition_arg = graph_utils::AddInitializerWithOrtValue(graph, dense_weight_initializer_partition);
   graph_utils::ReplaceNodeInput(*last_transpose, 0, dense_weight_partition_arg);
   graph.RemoveInitializedTensor(dense_weight_arg->Name());
   updated_weight_names_.insert({dense_weight_arg->Name(), dense_weight_partition_arg.Name()});
@@ -1178,7 +1178,7 @@ Status MegatronTransformer::TransformBARTAttention(Graph& graph, bool& modified,
       val_partition.insert(val_partition.end(), val, val + size);
       val_partition[idx] /= horizontal_parallel_size_;
       tensor_partition.set_raw_data(val_partition.data(), size * sizeof(int64_t));
-      NodeArg& node_arg_partition = graph_utils::AddInitializer(graph, tensor_partition);
+      NodeArg& node_arg_partition = graph_utils::AddInitializerWithOrtValue(graph, tensor_partition);
       graph_utils::ReplaceNodeInput(*node_ptr, 1, node_arg_partition);
       graph.RemoveInitializedTensor(shape_arg->Name());
     }
diff --git a/orttraining/orttraining/core/optimizer/qdq_fusion.cc b/orttraining/orttraining/core/optimizer/qdq_fusion.cc
index 42720dbbb11e5..6219d1cfafae1 100644
--- a/orttraining/orttraining/core/optimizer/qdq_fusion.cc
+++ b/orttraining/orttraining/core/optimizer/qdq_fusion.cc
@@ -45,7 +45,7 @@ int ReplaceOrCreateZeroPointInitializer(Graph& graph, Node& quantize_node) {
     // Since the quantize node has the zero point initializer input, replace it
     graph_utils::ReplaceNodeInput(quantize_node, 2,
-                                  graph_utils::AddInitializer(graph, zero_point_tensor_float));
+                                  graph_utils::AddInitializerWithOrtValue(graph, zero_point_tensor_float));
   } else {
     // The quantize node does not have the zero point optional input.
     // Create the zero point initializer to be 0.
@@ -55,7 +55,7 @@ int ReplaceOrCreateZeroPointInitializer(Graph& graph, Node& quantize_node) {
     // Since the input did not exist, add the newly created initializer as an input
     graph_utils::AddNodeInput(quantize_node, 2,
-                              graph_utils::AddInitializer(graph, zero_point_tensor_float));
+                              graph_utils::AddInitializerWithOrtValue(graph, zero_point_tensor_float));
   }

   return zero_point_type;
diff --git a/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc b/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc
index 84bf715c7c85a..3ab1636460adf 100644
--- a/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc
+++ b/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc
@@ -83,7 +83,7 @@ Status SceLossGradBiasFusion::ApplyImpl(Graph& graph, bool& modified, int graph_
       ignore_index_initializer_proto.set_name(graph.GenerateNodeArgName("sce_grad_ignore_index"));
       ignore_index_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64);
       ignore_index_initializer_proto.add_int64_data(static_cast<int64_t>(-1));
-      new_scegrad_node_inputs.emplace_back(&graph_utils::AddInitializer(graph, ignore_index_initializer_proto));
+      new_scegrad_node_inputs.emplace_back(&graph_utils::AddInitializerWithOrtValue(graph, ignore_index_initializer_proto));
     new_scegrad_node_inputs.emplace_back(bias_def);
     if (!p_reshape) {