diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index bd694f7b3b23c..866892979b749 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1220,7 +1220,10 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif #if !defined(ORT_MINIMAL_BUILD) - /** Gets the GraphProto representation of this Graph only. */ + /** Gets the GraphProto representation of this Graph only. + * This does not remove in-memory tags for graph initializers. + * Use ToGraphProto() const to get a GraphProto that can be serialized externally. + */ const ONNX_NAMESPACE::GraphProto& ToGraphProto(); /// @@ -1439,6 +1442,27 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi return Resolve(default_options); } + /// + /// This function converts all the graph TensorProto initializers into OrtValues + /// and creates an in-memory external data reference for each OrtValue. + /// + /// + Status ConvertInitializersIntoOrtValues(); + + /** + * @brief Converts a subset of graph TensorProto initializers into OrtValues and updates the graph proto. + * + * This function converts the specified TensorProto initializers in the graph into OrtValues and + * creates an in-memory external data reference for each OrtValue. It then updates the provided + * GraphProto with the modified initializers. + * + * @param iterators Span of iterators pointing to the initializers, in the order in which they should be processed + * @param output_graph_proto The GraphProto to be updated with the modified initializers + * @return Status Returns a Status object indicating success or any errors that occurred during conversion + */ + Status RegenerateInitializersAndReplaceInMemory(gsl::span iterators, + ONNX_NAMESPACE::GraphProto& output_graph_proto) const; + const std::unordered_set& GetOuterScopeNodeArgNames() const noexcept { return outer_scope_node_arg_names_; } @@ -1595,20 +1619,25 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi /// This function is used by ToGraphProto() to ensure in-memory external data references /// don't leak externally since they are non-standard. /// - /// It handles two scenarios: - /// - When GraphSynchronizationNeeded() is false: GraphProto is simply copied + /// It is used when GraphSynchronizationNeeded() is false: GraphProto is simply copied /// from graph_proto_ by ToGraphProto(). This copy includes both main graph /// and subgraph initializers. This function examines all initializers /// and inlines any in-memory data references. - /// - When GraphSynchronizationNeeded() is true: ToGraphProto() generates a new GraphProto - /// using ToGraphProtoInternal(). This doesn't transfer main graph initializers, which are - /// copied and inlined by ToGraphProto() itself. This function processes only the subgraph initializers - /// as needed. /// /// The GraphProto to process - /// Whether to process the main graph initializers - /// Status indicating success or failure /// - Status ProcessSubgraphsInMemoryData(ONNX_NAMESPACE::GraphProto& output_graph_proto, bool process_main) const; + /// Status indicating success or failure + Status ProcessSubgraphsInMemoryData(ONNX_NAMESPACE::GraphProto& output_graph_proto) const; + + /// + /// This function replaces all of the initializers within output_graph_proto + /// from this Graph instance. All in-memory initializers are regenerated and inlined.
+ /// This is necessary even if the graph_proto_ is already up to date because initializers() may + /// contain obsolete initializers that are no longer in use due to optimizations and contain obsolete + /// references to OrtValues that may no longer be around (since we like appending rather than replacing). + /// + /// Destination GraphProto to receive the updated initializers. + /// Status indicating success or failure. + Status RegenerateInitializersAndReplaceInMemory(ONNX_NAMESPACE::GraphProto& output_graph_proto) const; /// /// This function traverses the graph bottom up and externalizes diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 8ae7535da4413..e4f8cd6df678e 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -666,12 +666,16 @@ void Node::ToProto(NodeProto& proto, bool update_subgraphs) const { // Set attributes. proto.clear_attribute(); - for (const auto& attribute : attributes_) { + for (const auto& [name, attribute] : attributes_) { const gsl::not_null attr{proto.add_attribute()}; - *attr = attribute.second; // copy - if (update_subgraphs && attr->has_g()) { + *attr = attribute; // copy + if (update_subgraphs && utils::HasGraph(*attr)) { + auto find_hit = attr_to_subgraph_map_.find(name); + // Force ToGraphProto() const to be called so + // that any in-memory TensorProto initializers go back to being inlined + const Graph& subgraph = *find_hit->second; attr->clear_g(); - *attr->mutable_g() = attr_to_subgraph_map_.find(attribute.first)->second->ToGraphProto(); + *attr->mutable_g() = subgraph.ToGraphProto(); } } @@ -3381,7 +3385,12 @@ Status Graph::Resolve(const ResolveOptions& options) { return Status::OK(); }; - ORT_RETURN_IF_ERROR(ForThisAndAllSubgraphs(all_subgraphs, finalize_func)); + return ForThisAndAllSubgraphs(all_subgraphs, finalize_func); +} + +Status Graph::ConvertInitializersIntoOrtValues() { + std::vector all_subgraphs; + FindAllSubgraphs(all_subgraphs); auto put_weights_maybe_in_memory_func = [&](Graph& graph) -> Status { // if we have any initializers that are not in memory, put them there. 
@@ -4308,11 +4317,47 @@ Status InlineOrCopyInitializer(const Graph& src_graph, const ONNX_NAMESPACE::Ten } return Status::OK(); } - } // namespace -Status Graph::ProcessSubgraphsInMemoryData(ONNX_NAMESPACE::GraphProto& output_graph_proto, - bool process_main) const { +Status Graph::RegenerateInitializersAndReplaceInMemory(gsl::span iterators, + ONNX_NAMESPACE::GraphProto& output_graph_proto) const { + auto& mutable_initializers = *output_graph_proto.mutable_initializer(); + +#if !defined(DISABLE_SPARSE_TENSORS) + output_graph_proto.clear_sparse_initializer(); + + const auto& model_path = ModelPath(); + const bool has_sparse_initializers = !sparse_tensor_names_.empty(); + const auto sparse_end = sparse_tensor_names_.end(); + + for (const auto& iter : iterators) { + const auto& [name, tensor_proto] = *iter; + const auto& initializer = *tensor_proto; + if (!has_sparse_initializers || sparse_end == sparse_tensor_names_.find(name)) { + ORT_RETURN_IF_ERROR(InlineOrCopyInitializer(*this, initializer, + *mutable_initializers.Add())); + } else { + auto& sparse_initializer = *output_graph_proto.add_sparse_initializer(); + if (utils::HasExternalDataInMemory(initializer)) { + ONNX_NAMESPACE::TensorProto tensor_proto_inlined; + ORT_RETURN_IF_ERROR(InlineOrCopyInitializer(*this, initializer, + tensor_proto_inlined)); + ORT_RETURN_IF_ERROR(utils::DenseTensorToSparseTensorProto(tensor_proto_inlined, model_path, sparse_initializer)); + } else { + ORT_RETURN_IF_ERROR(utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer)); + } + } + } +#else + for (const auto& iter : iterators) { + const auto& [name, tensor_proto] = *iter; + ORT_RETURN_IF_ERROR(InlineOrCopyInitializer(*this, *tensor_proto, *mutable_initializers.Add())); + } +#endif + return Status::OK(); +} + +Status Graph::ProcessSubgraphsInMemoryData(ONNX_NAMESPACE::GraphProto& output_graph_proto) const { for (const auto& node : Nodes()) { if (node.ContainsSubgraph()) { // Let's find this node in the output_graph_proto @@ -4343,103 +4388,48 @@ Status Graph::ProcessSubgraphsInMemoryData(ONNX_NAMESPACE::GraphProto& output_gr "Subgraph ", name, " is referred to in GetAttributeNameToSubgraphMap, but not found in node ", node.Name(), " while attempting to recurse into it."); auto& result_subgraph = *sub_hit->mutable_g(); - ORT_RETURN_IF_ERROR(subgraph->ProcessSubgraphsInMemoryData(result_subgraph, process_main)); + ORT_RETURN_IF_ERROR(subgraph->ProcessSubgraphsInMemoryData(result_subgraph)); } } } - // When graph_proto is copied from graph_proto, initializers already present in the main graph - if (parent_graph_ != nullptr || process_main) { -#if !defined(DISABLE_SPARSE_TENSORS) - auto* mutable_initializers = output_graph_proto.mutable_initializer(); - const auto& model_path = ModelPath(); - const bool has_sparse_initializers = !sparse_tensor_names_.empty(); - const auto sparse_end = sparse_tensor_names_.end(); - - // We want to make sure that sparse initializers do not appear - // as dense duplicates within the initializers list. - std::optional> initializer_to_remove; - if (has_sparse_initializers) { - // We need to remove the dense initializers that are sparse tensors - initializer_to_remove.emplace(); - } - - for (auto first = mutable_initializers->begin(), end = mutable_initializers->end(); first != end; ++first) { - auto& initializer = *first; - if (utils::HasExternalDataInMemory(initializer)) { - // If the initializer has external data in memory, we need to inline it. 
- ORT_RETURN_IF_ERROR(InlineOrCopyInitializer(*this, initializer, initializer)); - } - if (has_sparse_initializers && sparse_end != sparse_tensor_names_.find(initializer.name())) { - auto& sparse_initializer = *output_graph_proto.add_sparse_initializer(); - ORT_RETURN_IF_ERROR(utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer)); - initializer_to_remove->insert(initializer.name()); - } - } - - // erase/remove dense initializers that are sparse tensors so no duplicates are present - if (initializer_to_remove && !initializer_to_remove->empty()) { - mutable_initializers->erase(std::remove_if( - mutable_initializers->begin(), mutable_initializers->end(), - [&initializer_to_remove](const ONNX_NAMESPACE::TensorProto& initializer) { - return initializer_to_remove->count(initializer.name()) > 0; - }), - mutable_initializers->end()); - } -#else - for (auto& initializer : *output_graph_proto.mutable_initializer()) { - if (utils::HasExternalDataInMemory(initializer)) { - // If the initializer has external data in memory, we need to inline it. - ORT_RETURN_IF_ERROR(InlineOrCopyInitializer(*this, initializer, initializer)); - } + // Filter in iterators for weights that are present in the name_to_initial_tensor_ map + // and preserve the order. This is needed for tests. + InlinedVector initializers_to_process; + initializers_to_process.reserve(name_to_initial_tensor_.size()); + for (const auto& tensor_proto : output_graph_proto.initializer()) { + auto hit = name_to_initial_tensor_.find(tensor_proto.name()); + if (hit != name_to_initial_tensor_.end()) { + initializers_to_process.push_back(hit); } -#endif } - return Status::OK(); + + output_graph_proto.clear_initializer(); + return RegenerateInitializersAndReplaceInMemory(initializers_to_process, output_graph_proto); } ONNX_NAMESPACE::GraphProto Graph::ToGraphProto() const { GraphProto result; if (!GraphProtoSyncNeeded()) { result = *graph_proto_; - ORT_THROW_IF_ERROR(ProcessSubgraphsInMemoryData(result, /*process_main*/ true)); + ORT_THROW_IF_ERROR(ProcessSubgraphsInMemoryData(result)); } else { + // Recursion is handled via Node::ToProto() const -> Graph::ToGraphProto() const (this method) + // so below we handle this graph only. 
ToGraphProtoInternal(result); - ORT_THROW_IF_ERROR(ProcessSubgraphsInMemoryData(result, /*process_main*/ false)); - - // Add initializers to parent graph by copy converting them from graph_proto_ - // ToGraphProtoInternal() does not copy initializers for the main graph - auto* mutable_initializers = result.mutable_initializer(); - -#if !defined(DISABLE_SPARSE_TENSORS) - const auto& model_path = ModelPath(); - const bool has_sparse_initializers = !sparse_tensor_names_.empty(); - const auto sparse_end = sparse_tensor_names_.end(); - - for (const auto& initializer : graph_proto_->initializer()) { - if (!has_sparse_initializers || sparse_end == sparse_tensor_names_.find(initializer.name())) { - ORT_THROW_IF_ERROR(InlineOrCopyInitializer(*this, initializer, - *mutable_initializers->Add())); - } else { - auto& sparse_initializer = *result.add_sparse_initializer(); - if (utils::HasExternalDataInMemory(initializer)) { - ONNX_NAMESPACE::TensorProto tensor_proto; - ORT_THROW_IF_ERROR(InlineOrCopyInitializer(*this, initializer, - tensor_proto)); - ORT_THROW_IF_ERROR(utils::DenseTensorToSparseTensorProto(tensor_proto, model_path, sparse_initializer)); - } else { - ORT_THROW_IF_ERROR(utils::DenseTensorToSparseTensorProto(initializer, model_path, sparse_initializer)); - } + InlinedVector initializers_to_process; + initializers_to_process.reserve(name_to_initial_tensor_.size()); + for (const auto& tensor_proto : graph_proto_->initializer()) { + auto hit = name_to_initial_tensor_.find(tensor_proto.name()); + if (hit != name_to_initial_tensor_.end()) { + initializers_to_process.push_back(hit); } } -#else - for (const auto& initializer : graph_proto_->initializer()) { - ORT_THROW_IF_ERROR(InlineOrCopyInitializer(*this, initializer, *mutable_initializers->Add())); - } -#endif - } + ORT_THROW_IF_ERROR(RegenerateInitializersAndReplaceInMemory(initializers_to_process, + result)); + } return result; } @@ -5235,23 +5225,7 @@ Status Graph::AddConstantProtoAsInitializer(const ONNX_NAMESPACE::NodeProto& nod tensor_proto.set_name(std::string(new_name.value())); } - // In the constant node, we won't have symbolic dims. 
- const auto tensor_shape = utils::GetTensorShapeFromTensorProto(tensor_proto); - auto ml_data = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto.data_type())->GetElementType(); - const size_t size_in_bytes = Tensor::CalculateTensorStorageSize(ml_data, tensor_shape); - - if (size_in_bytes > utils::kSmallTensorExternalDataThreshold) { - OrtValue ort_value; - ORT_RETURN_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), ModelPath(), tensor_proto, - CPUAllocator::DefaultInstance(), ort_value)); - - constexpr const bool use_tensor_buffer_true = true; - auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get(), tensor_proto.name(), - use_tensor_buffer_true); - ORT_RETURN_IF_ERROR(AddInitializedOrtValue(tensor_proto_to_add, ort_value)); - } else { - AddInitializedTensor(tensor_proto); - } + AddInitializedTensor(tensor_proto); if (GetNodeArg(tensor_proto.name()) == nullptr) { TypeProto t{utils::TypeProtoFromTensorProto(tensor_proto)}; diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index 616bc1257676f..3f9b58f71bd23 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -111,7 +111,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size, utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); } - return graph_utils::AddInitializerWithExternalData(graph, initializer); + return graph_utils::AddInitializer(graph, initializer); } static NodeArg* ConvertMaskToInt32(Graph& graph, NodeArg* mask_input, ProviderType provider_type, diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc index a98d0ea6f978b..86a7a4d6afbf8 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc @@ -189,7 +189,7 @@ NodeArg* CreateInitializerFromVector(Graph& graph, "total_count: ", total_count, " values.size(): ", values.size()); utils::SetRawDataInTensorProto(const_tensor, values.data(), values.size() * sizeof(int64_t)); - return &graph_utils::AddInitializerWithExternalData(graph, const_tensor); + return &graph_utils::AddInitializer(graph, const_tensor); } NodeArg* InsertNodesForValidIndices(Graph& graph, diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index 3d838d8aacfbb..16e8955cb4486 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -95,7 +95,7 @@ static bool ConstantFoldShapeNode(Graph& graph, Node& node) { ONNX_NAMESPACE::TensorShapeProto result_shape; result_shape.add_dim()->set_dim_value(clamped_slice_length); constant_arg_out->SetShape(result_shape); - graph_utils::AddInitializerWithExternalData(graph, shape_constant); + graph_utils::AddInitializer(graph, shape_constant); } return is_concrete_shape; // convert to constant if this is true @@ -317,11 +317,11 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, // Build the TensorProto that corresponds to the computed OrtValue and add it as initializer to the graph. 
auto* constant_arg_out = node->MutableOutputDefs()[fetch_idx]; const Tensor& out_tensor = ort_value.Get(); - constexpr const bool use_tensor_buffer_true = true; + constexpr const bool use_tensor_buffer_false = false; ONNX_NAMESPACE::TensorProto out_tensorproto = utils::TensorToTensorProto( out_tensor, constant_arg_out->Name(), - use_tensor_buffer_true); + use_tensor_buffer_false); ONNX_NAMESPACE::TensorShapeProto result_shape; for (auto& dim : out_tensor.Shape().GetDims()) { @@ -329,12 +329,7 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, } constant_arg_out->SetShape(result_shape); - // The data is too small and has been inlined. - if (!utils::HasExternalData(out_tensorproto)) { - ORT_THROW_IF_ERROR(graph.AddInitializedOrtValue(out_tensorproto, OrtValue())); - } else { - ORT_THROW_IF_ERROR(graph.AddInitializedOrtValue(out_tensorproto, ort_value)); - } + graph.AddInitializedTensor(out_tensorproto); } } } diff --git a/onnxruntime/core/optimizer/conv_add_fusion.cc b/onnxruntime/core/optimizer/conv_add_fusion.cc index c349adfccce53..6478fa7d29d5b 100644 --- a/onnxruntime/core/optimizer/conv_add_fusion.cc +++ b/onnxruntime/core/optimizer/conv_add_fusion.cc @@ -79,7 +79,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie auto new_name = graph.GenerateNodeArgName("ConvAddFusion_B_" + B_input_name); new_conv_B_tensor_proto.set_name(new_name); - NodeArg& new_conv_B_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_conv_B_tensor_proto); + NodeArg& new_conv_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); graph_utils::ReplaceNodeInput(node, 2, new_conv_B_node_arg); } else { @@ -94,7 +94,7 @@ Status ConvAddFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& modifie auto new_name = graph.GenerateNodeArgName("ConvAddFusion_Add_B_" + add_B_tensor_proto->name()); new_conv_B_tensor_proto.set_name(new_name); - NodeArg& new_add_B_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_conv_B_tensor_proto); + NodeArg& new_add_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); graph_utils::AddNodeInput(node, 2, new_add_B_node_arg); } diff --git a/onnxruntime/core/optimizer/conv_bn_fusion.cc b/onnxruntime/core/optimizer/conv_bn_fusion.cc index 8bf5420baddde..a14639631d7a1 100644 --- a/onnxruntime/core/optimizer/conv_bn_fusion.cc +++ b/onnxruntime/core/optimizer/conv_bn_fusion.cc @@ -120,10 +120,10 @@ Status ConvBNFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff new_conv_W_tensor_proto.set_name(new_W_name); new_conv_B_tensor_proto.set_name(new_B_name); - NodeArg& new_conv_W_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_conv_W_tensor_proto); + NodeArg& new_conv_W_node_arg = graph_utils::AddInitializer(graph, new_conv_W_tensor_proto); graph_utils::ReplaceNodeInput(node, 1, new_conv_W_node_arg); - auto& new_conv_B_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_conv_B_tensor_proto); + auto& new_conv_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); if (conv_inputs.size() == 3) { graph_utils::ReplaceNodeInput(node, 2, new_conv_B_node_arg); diff --git a/onnxruntime/core/optimizer/conv_mul_fusion.cc b/onnxruntime/core/optimizer/conv_mul_fusion.cc index dc50a150537f7..e91a00729e9db 100644 --- a/onnxruntime/core/optimizer/conv_mul_fusion.cc +++ b/onnxruntime/core/optimizer/conv_mul_fusion.cc @@ -90,7 +90,7 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef 
new_conv_W_tensor_proto.set_name(new_W_name); // Replace initializers of conv node - NodeArg& new_conv_W_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_conv_W_tensor_proto); + NodeArg& new_conv_W_node_arg = graph_utils::AddInitializer(graph, new_conv_W_tensor_proto); graph_utils::ReplaceNodeInput(conv_node, 1, new_conv_W_node_arg); if (is_3d) { @@ -100,7 +100,7 @@ Status ConvMulFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_ef auto new_B_name = graph.GenerateNodeArgName("ConvMulFusion_Mul_B_" + mul_B_tensor_proto->name()); new_conv_B_tensor_proto.set_name(new_B_name); - NodeArg& new_conv_B_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_conv_B_tensor_proto); + NodeArg& new_conv_B_node_arg = graph_utils::AddInitializer(graph, new_conv_B_tensor_proto); graph_utils::ReplaceNodeInput(conv_node, 2, new_conv_B_node_arg); } diff --git a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc index 7f214e656e0ab..96f75f07e32e1 100644 --- a/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc +++ b/onnxruntime/core/optimizer/double_qdq_pairs_remover.cc @@ -53,7 +53,7 @@ static void ApplyNewInputValue(Graph& graph, Node& node, QDQ::InputIndex index, auto new_name = graph.GenerateNodeArgName("DoubleQDQRemoved_" + node.InputDefs()[index]->Name()); new_input_tensor.set_name(new_name); new_input_tensor.add_dims(1); - NodeArg& new_input = graph_utils::AddInitializerWithExternalData(graph, new_input_tensor); + NodeArg& new_input = graph_utils::AddInitializer(graph, new_input_tensor); graph_utils::ReplaceNodeInput(node, index, new_input); } diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index ad25f95ac1186..f8fd807084d38 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -474,7 +474,7 @@ static NodeArg* ExtractEmbedding(Graph& graph, utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(MLFloat16)); } - NodeArg& node_arg = graph_utils::AddInitializerWithExternalData(graph, initializer); + NodeArg& node_arg = graph_utils::AddInitializer(graph, initializer); modified = true; return &node_arg; } diff --git a/onnxruntime/core/optimizer/fuse_initializers_transformer.cc b/onnxruntime/core/optimizer/fuse_initializers_transformer.cc index 388ab14dd51fe..e604c688ee033 100644 --- a/onnxruntime/core/optimizer/fuse_initializers_transformer.cc +++ b/onnxruntime/core/optimizer/fuse_initializers_transformer.cc @@ -137,12 +137,8 @@ static void FuseInitializerWithNode(Graph& graph, graph.RemoveEdge(node.Index(), next_node.Index(), 0, static_cast(next_node_arg_index)); // Add the new converted Tensor in next node as initializer potentially with external data - ONNX_NAMESPACE::TensorProto dst_tensor = utils::TensorToTensorProto(new_data.Get(), new_arg_name, true); - if (!utils::HasExternalData(dst_tensor)) { - new_data = OrtValue(); // Data is inline - } - - auto& new_arg = graph_utils::AddInitializerWithExternalData(graph, dst_tensor, std::move(new_data)); + ONNX_NAMESPACE::TensorProto dst_tensor = utils::TensorToTensorProto(new_data.Get(), new_arg_name, false); + auto& new_arg = graph_utils::AddInitializer(graph, dst_tensor); graph_utils::ReplaceNodeInput(next_node, static_cast(next_node_arg_index), new_arg); } diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index 
3cd06350df95d..bd730683a4c91 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -256,7 +256,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra axes_initializer_proto.add_dims(static_cast(1)); axes_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); axes_initializer_proto.add_int64_data(axis); - NodeArg* axes_arg = &graph_utils::AddInitializerWithExternalData(graph, axes_initializer_proto); + NodeArg* axes_arg = &graph_utils::AddInitializer(graph, axes_initializer_proto); Node& squeeze_node = graph.AddNode(graph.GenerateNodeName("Squeeze"), "Squeeze", "Squeeze for Fused Gather nodes", {split_output_arg, axes_arg}, {original_output_arg}); @@ -272,7 +272,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra split_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); split_initializer_proto.add_dims(static_cast(split_values.size())); split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); - NodeArg* split_initializer_arg = &graph_utils::AddInitializerWithExternalData(graph, split_initializer_proto); + NodeArg* split_initializer_arg = &graph_utils::AddInitializer(graph, split_initializer_proto); const auto split_node_name = graph.GenerateNodeName(nodes_to_fuse[0].get().Name() + "/GatherSliceToSplitFusion"); Node& split_node = graph.AddNode(split_node_name, "Split", "Split for Fused Gather nodes", {graph.GetNodeArg(node_arg->Name()), split_initializer_arg}, split_outputs); @@ -359,7 +359,7 @@ Status GatherToSliceFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le unsqueeze_axes_initializer_proto.add_dims(static_cast(1)); unsqueeze_axes_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); unsqueeze_axes_initializer_proto.add_int64_data(static_cast(0)); - NodeArg* unsqueeze_axes_arg = &graph_utils::AddInitializerWithExternalData(graph, unsqueeze_axes_initializer_proto); + NodeArg* unsqueeze_axes_arg = &graph_utils::AddInitializer(graph, unsqueeze_axes_initializer_proto); for (size_t i = 0; i < range_input_defs.size(); ++i) { Node& unsqueeze_node = graph.AddNode(graph.GenerateNodeName("Unsqueeze_" + std::to_string(i)), "Unsqueeze", @@ -386,7 +386,7 @@ Status GatherToSliceFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le } else { slice_axes_initializer_proto.add_int32_data(static_cast(axis)); } - NodeArg* slice_axes_arg = &graph_utils::AddInitializerWithExternalData(graph, slice_axes_initializer_proto); + NodeArg* slice_axes_arg = &graph_utils::AddInitializer(graph, slice_axes_initializer_proto); Node& slice_node = graph.AddNode(graph.GenerateNodeName("Slice"), "Slice", "Slice for Fused Gather nodes", {gather_node.MutableInputDefs()[0], unsqueeze_outputs[0], unsqueeze_outputs[1], slice_axes_arg, unsqueeze_outputs[2]}, diff --git a/onnxruntime/core/optimizer/matmul_add_fusion.cc b/onnxruntime/core/optimizer/matmul_add_fusion.cc index 761fe1854274e..fed72db71332a 100644 --- a/onnxruntime/core/optimizer/matmul_add_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_add_fusion.cc @@ -194,7 +194,7 @@ Status MatMulAddFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, shape_initializer_proto.add_dims(static_cast(shape.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); utils::SetRawDataInTensorProto(shape_initializer_proto, shape.data(), shape.size() * sizeof(int64_t)); - NodeArg* shape_arg = 
&graph_utils::AddInitializerWithExternalData(graph, shape_initializer_proto); + NodeArg* shape_arg = &graph_utils::AddInitializer(graph, shape_initializer_proto); ONNX_NAMESPACE::TypeProto new_arg_type; const ONNX_NAMESPACE::TensorProto_DataType element_type = static_cast( gemm_input_defs[0]->TypeAsProto()->tensor_type().elem_type()); diff --git a/onnxruntime/core/optimizer/matmul_bn_fusion.cc b/onnxruntime/core/optimizer/matmul_bn_fusion.cc index 725cb3fc33f04..367fb42d7928d 100644 --- a/onnxruntime/core/optimizer/matmul_bn_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_bn_fusion.cc @@ -212,14 +212,14 @@ Status MatmulBNFusion::Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& matmul_b.ToProto(new_gemm_b_tensor); const std::string new_gemm_b_name = graph.GenerateNodeArgName("MatMulBnFusion_GemmB_" + matmul_b_tensor->name()); new_gemm_b_tensor.set_name(new_gemm_b_name); - NodeArg& new_gemm_b_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_gemm_b_tensor); + NodeArg& new_gemm_b_node_arg = graph_utils::AddInitializer(graph, new_gemm_b_tensor); // create bias tensorProto for new Gemm node from initializer. ONNX_NAMESPACE::TensorProto new_gemm_bias_tensor; bias.ToProto(new_gemm_bias_tensor); const std::string new_gemm_bias_name = graph.GenerateNodeArgName("MatMulBnFusion_GemmBias"); new_gemm_bias_tensor.set_name(new_gemm_bias_name); - NodeArg& new_gemm_bias_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_gemm_bias_tensor); + NodeArg& new_gemm_bias_node_arg = graph_utils::AddInitializer(graph, new_gemm_bias_tensor); Node& gemm_node = graph.AddNode( graph.GenerateNodeArgName("MatMulBnFusion_Gemm"), diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc index 335209dbfadaf..f094a48e10c33 100644 --- a/onnxruntime/core/optimizer/nchwc_transformer.cc +++ b/onnxruntime/core/optimizer/nchwc_transformer.cc @@ -437,7 +437,7 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_W_tensor_proto.add_dims(conv_W_dims[i]); } - nchwc_conv_W_arg = &graph_utils::AddInitializerWithExternalData(graph_, nchwc_conv_W_tensor_proto); + nchwc_conv_W_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_W_tensor_proto); filters_map->emplace(input_defs[1], nchwc_conv_W_arg); } @@ -464,7 +464,7 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels); - nchwc_conv_B_arg = &graph_utils::AddInitializerWithExternalData(graph_, nchwc_conv_B_tensor_proto); + nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); aligned_biases_.emplace(input_defs[2], nchwc_conv_B_arg); } } @@ -580,7 +580,7 @@ Node& NchwcTransformerImpl::InsertReshape(NodeArg* input_arg, } shape_tensor_proto.add_dims(split_channels ? 
kNchwcDims + 1 : kNchwcDims); - shape_arg = &graph_utils::AddInitializerWithExternalData(graph_, shape_tensor_proto); + shape_arg = &graph_utils::AddInitializer(graph_, shape_tensor_proto); } Node& reshape_node = graph_.AddNode(graph_.GenerateNodeName("Reshape"), @@ -892,7 +892,7 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { nchwc_conv_W_tensor_proto.add_dims(1); nchwc_conv_W_tensor_proto.add_dims(1); - auto* nchwc_conv_W_arg = &graph_utils::AddInitializerWithExternalData(graph_, nchwc_conv_W_tensor_proto); + auto* nchwc_conv_W_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_W_tensor_proto); std::copy_n(bn_B.data(), channels, padded_buffer.data()); @@ -903,7 +903,7 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_channels); - auto* nchwc_conv_B_arg = &graph_utils::AddInitializerWithExternalData(graph_, nchwc_conv_B_tensor_proto); + auto* nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); // Create the replacement node. std::string nchwc_node_name = graph_.GenerateNodeName(output_defs[0]->Name() + "_bn_nchwc"); diff --git a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc index 42cd31b5bd7b4..42d27de632b91 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc @@ -130,22 +130,22 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph, const log weights_proto_u8.set_name(weight_tensor_proto->name() + "_s8_2_u8"); weights_proto_u8.mutable_dims()->CopyFrom(weight_tensor_proto->dims()); utils::SetRawDataInTensorProto(weights_proto_u8, w_temp.data(), static_cast(w_temp.size())); - input_defs[w_idx] = &graph_utils::AddInitializerWithExternalData(graph, weights_proto_u8); + input_defs[w_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; QDQ::Int8TensorProto2Uint8(weight_zp_tensor_proto, weight_zp_proto_u8, graph, true); - input_defs[w_zp_idx] = &graph_utils::AddInitializerWithExternalData(graph, weight_zp_proto_u8); + input_defs[w_zp_idx] = &graph_utils::AddInitializer(graph, weight_zp_proto_u8); ONNX_NAMESPACE::TensorProto r_proto_u8; r_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); r_proto_u8.set_name(r_tensor_proto->name() + "_s8_2_u8"); r_proto_u8.mutable_dims()->CopyFrom(r_tensor_proto->dims()); utils::SetRawDataInTensorProto(r_proto_u8, r_temp.data(), static_cast(r_temp.size())); - input_defs[r_idx] = &graph_utils::AddInitializerWithExternalData(graph, r_proto_u8); + input_defs[r_idx] = &graph_utils::AddInitializer(graph, r_proto_u8); ONNX_NAMESPACE::TensorProto r_zp_proto_u8; QDQ::Int8TensorProto2Uint8(r_zp_tensor_proto, r_zp_proto_u8, graph, true); - input_defs[r_zp_idx] = &graph_utils::AddInitializerWithExternalData(graph, r_zp_proto_u8); + input_defs[r_zp_idx] = &graph_utils::AddInitializer(graph, r_zp_proto_u8); return true; } diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc index 98c818b0c761b..828165e99d840 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc @@ -61,7 +61,7 @@ static bool QDQ_S8_to_U8(Graph& graph, Node& q_node, Node& dq_node) { 
zp_tensor_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); zp_tensor_proto_u8.set_name(graph.GenerateNodeArgName("qdq_s8_to_u8_zp_conversion")); utils::SetRawDataInTensorProto(zp_tensor_proto_u8, &q_zp_value, sizeof(uint8_t)); - NodeArg* zp_u8_arg = &graph_utils::AddInitializerWithExternalData(graph, zp_tensor_proto_u8); + NodeArg* zp_u8_arg = &graph_utils::AddInitializer(graph, zp_tensor_proto_u8); auto q_output_node_arg_name = graph.GenerateNodeArgName("qdq_s8_to_u8_quant"); NodeArg* q_output_arg = &graph.GetOrCreateNodeArg(q_output_node_arg_name, nullptr); diff --git a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc index 616144c0ccde0..f094f3c199f2a 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.cc @@ -43,12 +43,12 @@ bool ConvertS8WeightToU8(Graph& graph, Node& op_node, // The weights fits into S7, overflow is not a problem, no need to convert to U8 return false; } - input_defs[weights_idx] = &graph_utils::AddInitializerWithExternalData(graph, weights_proto_u8); + input_defs[weights_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); // Convert weight zero point to uint8 ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; Int8TensorProto2Uint8(weight_zp_tensor_proto, weight_zp_proto_u8, graph, true); - input_defs[weight_zp_idx] = &graph_utils::AddInitializerWithExternalData(graph, weight_zp_proto_u8); + input_defs[weight_zp_idx] = &graph_utils::AddInitializer(graph, weight_zp_proto_u8); return true; } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index dce69e2913582..34d7ba3c79775 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -439,23 +439,23 @@ Status DQMatMulToMatMulNBitsAction::ProcessNewNode(Graph& graph, } } - auto weight_T_tp = utils::TensorToTensorProto(weight_dst, weight_dst_name, true); - auto scale_T_tp = utils::TensorToTensorProto(scale_dst, scale_dst_name, true); + auto weight_T_tp = utils::TensorToTensorProto(weight_dst, weight_dst_name, false); + auto scale_T_tp = utils::TensorToTensorProto(scale_dst, scale_dst_name, false); std::optional zp_T_tp; if (zp_dst) { - zp_T_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, true)); + zp_T_tp.emplace(utils::TensorToTensorProto(*zp_dst, zp_dst_name, false)); } auto& input_defs = replacement_node.MutableInputDefs(); - input_defs.push_back(&graph_utils::AddInitializerWithExternalData(graph, weight_T_tp, std::move(weight_dst))); + input_defs.push_back(&graph_utils::AddInitializer(graph, weight_T_tp)); replacement_node.MutableInputArgsCount().push_back(1); - input_defs.push_back(&graph_utils::AddInitializerWithExternalData(graph, scale_T_tp, std::move(scale_dst))); + input_defs.push_back(&graph_utils::AddInitializer(graph, scale_T_tp)); replacement_node.MutableInputArgsCount().push_back(1); if (zp_T_tp) { - input_defs.push_back(&graph_utils::AddInitializerWithExternalData(graph, zp_T_tp.value(), std::move(*zp_dst))); + input_defs.push_back(&graph_utils::AddInitializer(graph, zp_T_tp.value())); replacement_node.MutableInputArgsCount().push_back(1); } diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc index 
aa6f9c5409de7..8caa67f266266 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc @@ -131,14 +131,14 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph weight_scale_proto.set_name(graph.GenerateNodeArgName(node.Name() + "_weight_scale")); weight_scale_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); weight_scale_proto.mutable_float_data()->Add(scale); - weight_scale_arg = &graph_utils::AddInitializerWithExternalData(graph, weight_scale_proto); + weight_scale_arg = &graph_utils::AddInitializer(graph, weight_scale_proto); // Weight zero point initializer. ONNX_NAMESPACE::TensorProto weight_zp_proto; weight_zp_proto.set_name(graph.GenerateNodeArgName(node.Name() + "_weight_zp")); weight_zp_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); weight_zp_proto.mutable_int32_data()->Add(static_cast(zp)); - NodeArg& weight_zp_arg = graph_utils::AddInitializerWithExternalData(graph, weight_zp_proto); + NodeArg& weight_zp_arg = graph_utils::AddInitializer(graph, weight_zp_proto); // Q from float32 to int8. ONNX_NAMESPACE::TypeProto weight_q_type_proto; diff --git a/onnxruntime/core/optimizer/relu_clip_fusion.cc b/onnxruntime/core/optimizer/relu_clip_fusion.cc index efd7022ab764b..07902fde04930 100644 --- a/onnxruntime/core/optimizer/relu_clip_fusion.cc +++ b/onnxruntime/core/optimizer/relu_clip_fusion.cc @@ -97,7 +97,7 @@ Status FuseReluClip::Apply(Graph& graph, Node& node, RewriteRuleEffect& rule_eff mutable_next_node->AddAttribute("min", 0.f); } else { // Add the initialized tensor to the graph - auto* replacement_min_nodearg = &graph_utils::AddInitializerWithExternalData(graph, replacement_min); + auto* replacement_min_nodearg = &graph_utils::AddInitializer(graph, replacement_min); // Replace the input def at the appropriate index of the Clip node auto& mutable_input_defs = mutable_next_node->MutableInputDefs(); diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc index 36213609f6b61..324905f953eec 100644 --- a/onnxruntime/core/optimizer/reshape_fusion.cc +++ b/onnxruntime/core/optimizer/reshape_fusion.cc @@ -438,7 +438,7 @@ bool ReshapeFusion::Fuse_Subgraph(Node& reshape, Graph& graph, const logging::Lo shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); - auto& new_node_arg = graph_utils::AddInitializerWithExternalData(graph, shape_initializer_proto); + auto& new_node_arg = graph_utils::AddInitializer(graph, shape_initializer_proto); // Safely remove concat parent nodes which have only one output for (int i = 0; i < concat_input_count; ++i) { @@ -492,7 +492,7 @@ bool ReshapeFusion::FuseContiguousReshapes(Node& reshape, Graph& graph) { shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); - NodeArg* shape_arg = &graph_utils::AddInitializerWithExternalData(graph, shape_initializer_proto); + NodeArg* shape_arg = &graph_utils::AddInitializer(graph, shape_initializer_proto); Node& reshape_node = graph.AddNode(graph.GenerateNodeName(name + "_new_reshape"), "Reshape", 
"Reshape for " + name, {contiguous_reshapes[0].get().MutableInputDefs()[0], shape_arg}, {contiguous_reshapes.back().get().MutableOutputDefs()[0]}); diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc index 74121508132dc..5c09e5225ab9c 100644 --- a/onnxruntime/core/optimizer/stft_decomposition.cc +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -46,7 +46,7 @@ NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[T proto.add_dims(shape[i]); } utils::SetRawDataInTensorProto(proto, begin, element_count * sizeof(TDataType)); - return &graph_utils::AddInitializerWithExternalData(graph, proto); + return &graph_utils::AddInitializer(graph, proto); } template diff --git a/onnxruntime/core/optimizer/transformer_memcpy.cc b/onnxruntime/core/optimizer/transformer_memcpy.cc index a320de2ee7a13..cc7682b2b418d 100644 --- a/onnxruntime/core/optimizer/transformer_memcpy.cc +++ b/onnxruntime/core/optimizer/transformer_memcpy.cc @@ -383,21 +383,7 @@ bool TransformerMemcpyImpl::ProcessInitializers(const KernelRegistryManager& ker TensorProto new_tensor_proto = *tensor_proto; *(new_tensor_proto.mutable_name()) = new_def_name; - // Query any OrtValue existing for the original initializer - // We are checking outer scope because GetInitializer is called with true, therefore, we potentially - // have references to parent graphs. - // We are doing this so the same OrtValue is re-used in subgraphs and no copies made for big items. - constexpr const bool check_outer_scope_true = true; - OrtValue ort_value; - // The initializer can be in memory with OrtValue or it can be a flatbuffer mapped. - if (utils::HasExternalDataInMemory(new_tensor_proto) && - graph_.GetOrtValueInitializer(name, ort_value, check_outer_scope_true)) { - // Re-use the same ort_value and proto that points to the same buffer - ORT_IGNORE_RETURN_VALUE(graph_utils::AddInitializerWithExternalData(graph_, new_tensor_proto, - std::move(ort_value))); - } else { - ORT_IGNORE_RETURN_VALUE(graph_utils::AddInitializer(graph_, new_tensor_proto)); - } + ORT_IGNORE_RETURN_VALUE(graph_utils::AddInitializer(graph_, new_tensor_proto)); replacements.insert(std::make_pair(provider_def, &new_def)); } diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index 48ea54434b805..3a95d2a53e8f5 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -586,10 +586,10 @@ void ApiGraph::TransposeInitializer(std::string_view name, const std::vector& shape) { @@ -622,7 +622,7 @@ void ApiGraph::ReshapeInitializer(std::string_view name, const std::vector()->Reshape(new_shape); - } - - auto& new_node_arg = graph_utils::AddInitializerWithExternalData(graph, new_tensor_proto, ort_value); + auto& new_node_arg = graph_utils::AddInitializer(graph, new_tensor_proto); graph_utils::ReplaceNodeWithInitializer(graph, node, new_node_arg); // Remove the Unsqueeze node and replace it with the initializer. 
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index f4f76a389030e..51922cda691b7 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1421,6 +1421,29 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool } } + // We choose to convert initializers into OrtValues before partitioning here so that plug-in EPs can + // take advantage of the initializers being in OrtValue format and do not have to deal with protobuf. + // + // The initializer data is transferred to an OrtValue. The original TensorProto is replaced + // with a TensorProto that has the same data type, shape and name. However, its external data + // is used in a non-standard way. The location is set to the string constant utils::kTensorProtoMemoryAddressTag, + // the file offset is set to the address of the OrtValue's data buffer, and the length is set to the size of the + // OrtValue's data buffer. Because this external location is non-standard, ONNX code cannot handle it, so we choose + // to do the conversion as late as possible, but before partitioning, so that type and shape inference can access + // the initializers before they are converted to OrtValues. + // + // If any transformations are applied later, they do not introduce any in-memory initializers; + // type and shape inference runs only on newly added nodes, and any new initializers + // are converted at session finalization time. + // + // The conversion is performed using the following steps (within ConvertInitializersIntoOrtValues()): + // constexpr const bool use_tensor_buffer_true = true; + // auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get(), tensor_proto.name(), + // use_tensor_buffer_true); + // ORT_RETURN_IF_ERROR(graph.ReplaceInitializedTensor(tensor_proto_to_add, ort_value)); + + ORT_RETURN_IF_ERROR_SESSIONID_(graph.ConvertInitializersIntoOrtValues()); + // Do partitioning based on execution providers' capabilities. ORT_RETURN_IF_ERROR_SESSIONID_(partitioner.Partition(graph, session_state_->GetMutableFuncMgr(), transform_layout_fn, session_options_.config_options, *session_logger_, diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index b86f3efeefafd..fced72ce3246d 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -67,7 +67,7 @@ static common::Status LoadInferenceSessionFromModel(FenceCudaTestInferenceSessio tensor_proto.set_data_type(PROTO_DATATYPE); \ for (auto v : value) tensor_proto.PROTO_ADD_DATA(v); \ tensor_proto.set_name(name); \ - return graph_utils::AddInitializerWithExternalData(graph, tensor_proto); \ + return graph_utils::AddInitializer(graph, tensor_proto); \ } CREATE_INITIALIZER_FUNC(float, TensorProto_DataType_FLOAT, add_float_data) diff --git a/onnxruntime/test/ir/graph_test.cc b/onnxruntime/test/ir/graph_test.cc index e2b54950e7b24..ca1166e19037c 100644 --- a/onnxruntime/test/ir/graph_test.cc +++ b/onnxruntime/test/ir/graph_test.cc @@ -1894,14 +1894,21 @@ TEST_F(GraphTest, AddRemoveInitializerHandling) { ASSERT_EQ(graph_proto_from_graph.initializer_size(), 2); auto validate_proto = [&](const GraphProto& proto) { + // Due to changes in the way we generate ToGraphProto() const, we cannot guarantee the order of initializers + // in the generated GraphProto.
auto initializers = proto.initializer(); - // we expect '2' to be before '1' due to the remove moving the last initializer into the slot of the one being - // removed in order to free memory and only move one entry - EXPECT_EQ(initializers[0].name(), init2.name()); - EXPECT_EQ(initializers[0].int32_data()[0], 2); - - EXPECT_EQ(initializers[1].name(), init.name()); - EXPECT_EQ(initializers[1].int32_data()[0], 1); + auto hit = std::find_if(initializers.begin(), initializers.end(), + [&init](const ONNX_NAMESPACE::TensorProto& t) { return t.name() == init.name(); }); + EXPECT_NE(hit, initializers.end()) + << "Initializer with name '" << init.name() << "' not found in the proto."; + EXPECT_EQ(hit->int32_data()[0], 1); + + hit = std::find_if(initializers.begin(), initializers.end(), + [&init2](const ONNX_NAMESPACE::TensorProto& t) { return t.name() == init2.name(); }); + EXPECT_NE(hit, initializers.end()) + << "Initializer with name '" << init2.name() << "' not found in the proto."; + + EXPECT_EQ(hit->int32_data()[0], 2); }; validate_proto(graph_proto_from_const_graph); diff --git a/orttraining/orttraining/core/optimizer/conv1d_replacement.cc b/orttraining/orttraining/core/optimizer/conv1d_replacement.cc index 90be9e24d3dd4..ff220fcb067b8 100644 --- a/orttraining/orttraining/core/optimizer/conv1d_replacement.cc +++ b/orttraining/orttraining/core/optimizer/conv1d_replacement.cc @@ -121,7 +121,7 @@ void Conv1dToMatmul(Graph& graph, Node& conv, const std::string transformer_name initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); InlinedVector initializer_proto_value{weight_squeeze_axis}; initializer_proto.set_raw_data(initializer_proto_value.data(), initializer_proto_value.size() * sizeof(int64_t)); - auto& axes_input = graph_utils::AddInitializerWithExternalData(graph, initializer_proto); + auto& axes_input = graph_utils::AddInitializer(graph, initializer_proto); // Squeeze node doesn't have opschema here, so we need to set input args count manually weight_squeeze.MutableInputArgsCount().resize(2); graph_utils::AddNodeInput(weight_squeeze, 1, axes_input); diff --git a/orttraining/orttraining/core/optimizer/megatron_transformer.cc b/orttraining/orttraining/core/optimizer/megatron_transformer.cc index 55286379fd273..7c429ae5cb643 100644 --- a/orttraining/orttraining/core/optimizer/megatron_transformer.cc +++ b/orttraining/orttraining/core/optimizer/megatron_transformer.cc @@ -453,15 +453,15 @@ Status MegatronTransformer::TransformGPT2MLP(Graph& graph, bool& modified, return skip_status; } - NodeArg& a_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, a_weight_initializer_partition); + NodeArg& a_weight_partition_arg = graph_utils::AddInitializer(graph, a_weight_initializer_partition); graph_utils::ReplaceNodeInput(node, 1, a_weight_partition_arg); updated_weight_names_.insert({a_weight_arg->Name(), a_weight_partition_arg.Name()}); - NodeArg& a_bias_partition_arg = graph_utils::AddInitializerWithExternalData(graph, a_bias_initializer_partition); + NodeArg& a_bias_partition_arg = graph_utils::AddInitializer(graph, a_bias_initializer_partition); graph_utils::ReplaceNodeInput(add_node, 1, a_bias_partition_arg); updated_weight_names_.insert({b_weight_arg->Name(), a_bias_partition_arg.Name()}); - NodeArg& b_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, b_weight_initializer_partition); + NodeArg& b_weight_partition_arg = graph_utils::AddInitializer(graph, b_weight_initializer_partition); 
graph_utils::ReplaceNodeInput(matmul2_node, 1, b_weight_partition_arg); updated_weight_names_.insert({a_bias_arg->Name(), b_weight_partition_arg.Name()}); @@ -600,15 +600,15 @@ Status MegatronTransformer::TransformBARTMLP(Graph& graph, bool& modified, return skip_status; } - NodeArg& dense_wi_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, dense_wi_weight_initializer_partition); + NodeArg& dense_wi_weight_partition_arg = graph_utils::AddInitializer(graph, dense_wi_weight_initializer_partition); graph_utils::ReplaceNodeInput(*second_op, 0, dense_wi_weight_partition_arg); updated_weight_names_.insert({dense_wi_weight_arg->Name(), dense_wi_weight_partition_arg.Name()}); - NodeArg& dense_wi_bias_partition_arg = graph_utils::AddInitializerWithExternalData(graph, dense_wi_bias_initializer_partition); + NodeArg& dense_wi_bias_partition_arg = graph_utils::AddInitializer(graph, dense_wi_bias_initializer_partition); graph_utils::ReplaceNodeInput(biasgelu_node, 1, dense_wi_bias_partition_arg); updated_weight_names_.insert({dense_wi_bias_arg->Name(), dense_wi_bias_partition_arg.Name()}); - NodeArg& dense_wo_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, dense_wo_weight_initializer_partition); + NodeArg& dense_wo_weight_partition_arg = graph_utils::AddInitializer(graph, dense_wo_weight_initializer_partition); graph_utils::ReplaceNodeInput(*transpose_op_ptr, 0, dense_wo_weight_partition_arg); updated_weight_names_.insert({dense_wo_weight_arg->Name(), dense_wo_weight_partition_arg.Name()}); @@ -814,15 +814,15 @@ Status MegatronTransformer::TransformGPT2Attention(Graph& graph, bool& modified, [](Node* node_ptr) { return node_ptr != nullptr; }); // Replace by the partition weights. - NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, qkv_weight_initializer_partition); + NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializer(graph, qkv_weight_initializer_partition); graph_utils::ReplaceNodeInput(node, 1, qkv_weight_partition_arg); updated_weight_names_.insert({qkv_weight_arg->Name(), qkv_weight_partition_arg.Name()}); - NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializerWithExternalData(graph, qkv_bias_initializer_partition); + NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializer(graph, qkv_bias_initializer_partition); graph_utils::ReplaceNodeInput(add_node, 1, qkv_bias_partition_arg); updated_weight_names_.insert({qkv_bias_arg->Name(), qkv_bias_partition_arg.Name()}); - NodeArg& dense_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, dense_weight_initializer_partition); + NodeArg& dense_weight_partition_arg = graph_utils::AddInitializer(graph, dense_weight_initializer_partition); graph_utils::ReplaceNodeInput(matmul_node, 1, dense_weight_partition_arg); updated_weight_names_.insert({dense_weight_arg->Name(), dense_weight_partition_arg.Name()}); @@ -849,7 +849,7 @@ Status MegatronTransformer::TransformGPT2Attention(Graph& graph, bool& modified, val_partition.insert(val_partition.end(), val, val + size); val_partition[2] /= horizontal_parallel_size_; tensor_partition.set_raw_data(val_partition.data(), size * sizeof(int64_t)); - NodeArg& node_arg_partition = graph_utils::AddInitializerWithExternalData(graph, tensor_partition); + NodeArg& node_arg_partition = graph_utils::AddInitializer(graph, tensor_partition); graph_utils::ReplaceNodeInput(*node_ptr, 1, node_arg_partition); graph.RemoveInitializedTensor(shape_arg->Name()); } @@ -1130,7 +1130,7 @@ Status 
MegatronTransformer::TransformBARTAttention(Graph& graph, bool& modified, size_t i = 0; for (auto trans_ptr : weight_transpose_node_ptrs) { auto weight_name = trans_ptr->MutableInputDefs()[0]->Name(); - NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, qkv_weight_initializer_partitions[i]); + NodeArg& qkv_weight_partition_arg = graph_utils::AddInitializer(graph, qkv_weight_initializer_partitions[i]); graph_utils::ReplaceNodeInput(*trans_ptr, 0, qkv_weight_partition_arg); graph.RemoveInitializedTensor(weight_name); updated_weight_names_.insert({weight_name, qkv_weight_partition_arg.Name()}); @@ -1139,14 +1139,14 @@ Status MegatronTransformer::TransformBARTAttention(Graph& graph, bool& modified, i = 0; for (auto add_ptr : bias_add_node_ptrs) { auto bias_name = add_ptr->MutableInputDefs()[1]->Name(); - NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializerWithExternalData(graph, qkv_bias_initializer_partitions[i]); + NodeArg& qkv_bias_partition_arg = graph_utils::AddInitializer(graph, qkv_bias_initializer_partitions[i]); graph_utils::ReplaceNodeInput(*add_ptr, 1, qkv_bias_partition_arg); graph.RemoveInitializedTensor(bias_name); updated_weight_names_.insert({bias_name, qkv_bias_partition_arg.Name()}); i++; } - NodeArg& dense_weight_partition_arg = graph_utils::AddInitializerWithExternalData(graph, dense_weight_initializer_partition); + NodeArg& dense_weight_partition_arg = graph_utils::AddInitializer(graph, dense_weight_initializer_partition); graph_utils::ReplaceNodeInput(*last_transpose, 0, dense_weight_partition_arg); graph.RemoveInitializedTensor(dense_weight_arg->Name()); updated_weight_names_.insert({dense_weight_arg->Name(), dense_weight_partition_arg.Name()}); @@ -1178,7 +1178,7 @@ Status MegatronTransformer::TransformBARTAttention(Graph& graph, bool& modified, val_partition.insert(val_partition.end(), val, val + size); val_partition[idx] /= horizontal_parallel_size_; tensor_partition.set_raw_data(val_partition.data(), size * sizeof(int64_t)); - NodeArg& node_arg_partition = graph_utils::AddInitializerWithExternalData(graph, tensor_partition); + NodeArg& node_arg_partition = graph_utils::AddInitializer(graph, tensor_partition); graph_utils::ReplaceNodeInput(*node_ptr, 1, node_arg_partition); graph.RemoveInitializedTensor(shape_arg->Name()); } diff --git a/orttraining/orttraining/core/optimizer/qdq_fusion.cc b/orttraining/orttraining/core/optimizer/qdq_fusion.cc index 4a5bdc1f8fcd2..42720dbbb11e5 100644 --- a/orttraining/orttraining/core/optimizer/qdq_fusion.cc +++ b/orttraining/orttraining/core/optimizer/qdq_fusion.cc @@ -45,7 +45,7 @@ int ReplaceOrCreateZeroPointInitializer(Graph& graph, Node& quantize_node) { // Since the quantize node has the zero point initializer input, replace it graph_utils::ReplaceNodeInput(quantize_node, 2, - graph_utils::AddInitializerWithExternalData(graph, zero_point_tensor_float)); + graph_utils::AddInitializer(graph, zero_point_tensor_float)); } else { // The quantize node does not have the zero point optional input. // Create the zero point initializer to be 0. 
@@ -55,7 +55,7 @@ int ReplaceOrCreateZeroPointInitializer(Graph& graph, Node& quantize_node) { // Since the input did not exist, add the newly created initializer as an input graph_utils::AddNodeInput(quantize_node, 2, - graph_utils::AddInitializerWithExternalData(graph, zero_point_tensor_float)); + graph_utils::AddInitializer(graph, zero_point_tensor_float)); } return zero_point_type; diff --git a/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc b/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc index 8c9c12ceb4497..84bf715c7c85a 100644 --- a/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc +++ b/orttraining/orttraining/core/optimizer/sce_loss_grad_bias_fusion.cc @@ -83,7 +83,7 @@ Status SceLossGradBiasFusion::ApplyImpl(Graph& graph, bool& modified, int graph_ ignore_index_initializer_proto.set_name(graph.GenerateNodeArgName("sce_grad_ignore_index")); ignore_index_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); ignore_index_initializer_proto.add_int64_data(static_cast(-1)); - new_scegrad_node_inputs.emplace_back(&graph_utils::AddInitializerWithExternalData(graph, ignore_index_initializer_proto)); + new_scegrad_node_inputs.emplace_back(&graph_utils::AddInitializer(graph, ignore_index_initializer_proto)); } new_scegrad_node_inputs.emplace_back(bias_def); if (!p_reshape) {
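For reference, the session-side conversion described in the inference_session.cc comment above can be sketched as follows. This is a simplified, hypothetical free function, not the actual body of Graph::ConvertInitializersIntoOrtValues(); the utils:: and Graph calls are the ones quoted in that comment and used elsewhere in this diff, and the include paths are assumed.

#include "core/framework/allocator.h"
#include "core/framework/ort_value.h"
#include "core/framework/tensorprotoutils.h"
#include "core/graph/graph.h"
#include "core/platform/env.h"

namespace onnxruntime {

// Hypothetical sketch of the per-initializer conversion: load the TensorProto into an
// OrtValue, then re-register a TensorProto of the same name/type/shape whose "external
// data" points at the OrtValue's buffer (utils::kTensorProtoMemoryAddressTag).
Status ConvertOneInitializerToOrtValue(Graph& graph,
                                       const ONNX_NAMESPACE::TensorProto& tensor_proto) {
  OrtValue ort_value;
  ORT_RETURN_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), graph.ModelPath(), tensor_proto,
                                                   CPUAllocator::DefaultInstance(), ort_value));

  // use_tensor_buffer = true makes the proto reference the OrtValue's memory instead of
  // embedding a copy of the bytes.
  constexpr bool use_tensor_buffer_true = true;
  ONNX_NAMESPACE::TensorProto proto_with_memory_ref =
      utils::TensorToTensorProto(ort_value.Get<Tensor>(), tensor_proto.name(), use_tensor_buffer_true);

  return graph.ReplaceInitializedTensor(proto_with_memory_ref, ort_value);
}

}  // namespace onnxruntime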