From dc06c255b4842d8e291de5b5d74c9aa062b15ef4 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 19 May 2023 14:33:45 -0700 Subject: [PATCH] fix transpose optimizer on GPU EP (#15988) ### Description Because of #15618, the default allocator changed to the device allocator, which will be a GPU allocator instead of a CPU one. In the transpose optimizer we expect to read data from initializers, so a CPU allocator is required here. This change fixes the transpose optimizer on GPU EP by passing a CPU allocator explicitly. Fixes the issue referred to in #15869, #15796 --- .../transpose_optimizer/optimizer_api_impl.cc | 5 ++--- .../transpose_optimizer/optimizer_utils.h | 3 ++- onnxruntime/core/session/inference_session.cc | 18 ++++++++++++------ .../test/framework/session_state_test.cc | 18 +++++++++++++++--- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc index 318ffe27173de..f47c5a922c6f9 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc @@ -911,9 +911,8 @@ PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& nod } Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvider& execution_provider, - const DebugGraphFn& debug_graph_fn) { - // sub graph recurse will be added later - auto api_graph = MakeApiGraph(graph, execution_provider.GetAllocator(OrtMemTypeDefault), nullptr); + AllocatorPtr cpu_allocator, const DebugGraphFn& debug_graph_fn) { + auto api_graph = MakeApiGraph(graph, cpu_allocator, nullptr); const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps(); for (auto& node : api_graph->Nodes()) { diff --git a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_utils.h b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_utils.h index 75c4d5cfe5ada..691e52c58e625 100644 
--- a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_utils.h +++ b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_utils.h @@ -84,11 +84,12 @@ const std::unordered_set& GetORTLayoutSensitiveOps(); /// graph to transform /// indicates whether the graph is modified during transformation /// execution provider for which the transformation needs to be performed +/// a CPU allocator used in layout transformation. /// Optional functor to debug the graph produced during layout transformation. /// This is called after layout transformation if new nodes are inserted, and again after those are optimized. /// Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvider& execution_provider, - const DebugGraphFn& debug_graph_fn = {}); + AllocatorPtr cpu_allocator, const DebugGraphFn& debug_graph_fn = {}); /// /// Checks if the opset of the Graph is supported by the layout transformer. diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 6d87d2bbb40fe..362d15b620ab0 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -942,7 +942,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph, bool const layout_transformer::DebugGraphFn& debug_graph_fn) -> Status { ORT_RETURN_IF_ERROR_SESSIONID_( layout_transformer::TransformLayoutForEP(graph_to_transform, modified, execution_provider, - debug_graph_fn)); + execution_providers_.GetDefaultCpuAllocator(), debug_graph_fn)); if (modified) { ORT_RETURN_IF_ERROR_SESSIONID_( @@ -1266,13 +1266,19 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, const ExecutionProviders& providers, KernelRegistryManager& kernel_registry_manager, SessionState& session_state) { + layout_transformer::TransformLayoutFunction transform_layout_fn = nullptr; + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // only provide NCWH to NHWC layout 
transformer if supported - layout_transformer::TransformLayoutFunction transform_layout_fn = layout_transformer::IsSupportedOpset(graph) - ? layout_transformer::TransformLayoutForEP - : nullptr; -#else // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) - layout_transformer::TransformLayoutFunction transform_layout_fn{}; + if (layout_transformer::IsSupportedOpset(graph)) { + transform_layout_fn = + [&providers](Graph& graph_to_transform, bool& modified, + const IExecutionProvider& execution_provider, + const layout_transformer::DebugGraphFn& debug_graph_fn) -> Status { + return layout_transformer::TransformLayoutForEP(graph_to_transform, modified, execution_provider, + providers.GetDefaultCpuAllocator(), debug_graph_fn); + }; + } #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) GraphPartitioner partitioner(kernel_registry_manager, providers); diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index 942e6b0a986a6..d25e462f0e422 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -157,7 +157,11 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) { GraphPartitioner partitioner(krm, execution_providers); status = partitioner.Partition(graph, session_state.GetMutableFuncMgr(), - layout_transformer::TransformLayoutForEP); + [&execution_providers](Graph& graph, bool& modified, + const IExecutionProvider& execution_provider, + const layout_transformer::DebugGraphFn& debug_graph_fn) -> Status { + return layout_transformer::TransformLayoutForEP(graph, modified, execution_provider, execution_providers.GetDefaultCpuAllocator(), debug_graph_fn); + }); ASSERT_TRUE(status.IsOK()) << status; ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm)); @@ -231,7 +235,11 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { // Partition the graph GraphPartitioner 
partitioner(krm, execution_providers); status = partitioner.Partition(graph, session_state.GetMutableFuncMgr(), - layout_transformer::TransformLayoutForEP); + [&execution_providers](Graph& graph, bool& modified, + const IExecutionProvider& execution_provider, + const layout_transformer::DebugGraphFn& debug_graph_fn) -> Status { + return layout_transformer::TransformLayoutForEP(graph, modified, execution_provider, execution_providers.GetDefaultCpuAllocator(), debug_graph_fn); + }); ASSERT_TRUE(status.IsOK()) << status; ASSERT_STATUS_OK(session_state.FinalizeSessionState(oss.str(), krm)); @@ -282,7 +290,11 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { // Partition the graph GraphPartitioner partitioner(krm, execution_providers); status = partitioner.Partition(graph, session_state.GetMutableFuncMgr(), - layout_transformer::TransformLayoutForEP); + [&execution_providers](Graph& graph, bool& modified, + const IExecutionProvider& execution_provider, + const layout_transformer::DebugGraphFn& debug_graph_fn) -> Status { + return layout_transformer::TransformLayoutForEP(graph, modified, execution_provider, execution_providers.GetDefaultCpuAllocator(), debug_graph_fn); + }); ASSERT_TRUE(status.IsOK()) << status; // Finalize the session state