10 changes: 6 additions & 4 deletions onnxruntime/core/framework/graph_partitioner.cc
@@ -177,10 +177,12 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) {
}

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// Run layout transformer only for EPs other than CPU EP and provided the preferred layout is NHWC
// CPU EP layout transformation happens later when level 3 transformers are run.
if (params.mode != GraphPartitioner::Mode::kAssignOnly &&
current_ep.GetPreferredLayout() == DataLayout::NHWC) {
// Run layout transformation for all EPs.
// For an EP that wants NHWC this will wrap layout sensitive nodes with Transpose nodes first.
// In both NCHW and NHWC EPs the EP specific transpose optimization is run last to optimize
// transposes for nodes assigned to the EP or unassigned nodes. This allows things like the
// EP aware Resize handling to be run.
if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get()) {
Comment on lines +180 to +185
Contributor Author
@skottmckay Apr 19, 2023

This feels a little too heavy as it only needs to be called for NCHW EPs with layout sensitive Resize currently.

The second call to GetCapability also muddies the water a little. Previously the EP could optimize and ignore assigned nodes (although this wasn't required, and an EP could still ask for an assigned node). With TryAssignNodes, the EP's GetCapability implementation now needs to process, at a minimum, all unassigned nodes and nodes already assigned to that EP. This matters for fusion and compilation, as we need complete lists of capabilities from the second call. If the EP ignored assigned nodes we could potentially end up in a bad state.

Still doesn't feel like we have the best solution.
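
A rough sketch (not from this PR) of what that requirement implies for an EP's GetCapability, assuming the current IExecutionProvider interface and a hypothetical NodeIsSupported helper:

// Sketch only - illustrates the constraint described above: once TryAssignNodes is used,
// GetCapability must cover unassigned nodes *and* nodes already assigned to this EP,
// otherwise the second call returns an incomplete capability list.
std::vector<std::unique_ptr<ComputeCapability>>
MyExampleEP::GetCapability(const onnxruntime::GraphViewer& graph_viewer,
                           const IKernelLookup& /*kernel_lookup*/) const {
  std::vector<std::unique_ptr<ComputeCapability>> result;
  for (const auto& node : graph_viewer.Nodes()) {
    const auto& assigned_ep = node.GetExecutionProviderType();
    if (!assigned_ep.empty() && assigned_ep != Type()) {
      continue;  // taken by another EP - safe to skip
    }

    // unassigned nodes and nodes already assigned to this EP must both be reported
    if (NodeIsSupported(node)) {  // hypothetical helper
      auto sub_graph = std::make_unique<IndexedSubGraph>();
      sub_graph->nodes.push_back(node.Index());
      result.push_back(std::make_unique<ComputeCapability>(std::move(sub_graph)));
    }
  }

  return result;
}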

Contributor

I have two questions here.

  1. Shouldn't layout transformation (more specifically, transpose optimization) be run for NCHW EPs (e.g., CUDA) only if L1 optimizations are enabled?
  2. In order to prevent this from running for all EPs, can we pass in a has_layout_sensitive_ops function with the params argument? The has_layout_sensitive_ops could be a lambda that wraps the existing GetEPLayoutSensitiveOps function.
// In inference_session.cc
auto has_layout_sensitive_ops_fn = [](const std::string& ep) -> bool {
  return !layout_transformation::GetEPLayoutSensitiveOps(ep).empty();
};

partitioner.Partition(graph, ..., transform_layout_fn, mode, debug_graph_fn, has_layout_sensitive_ops_fn);

// In graph_partitioner.cc::GetCapabilityForEP()
const bool ep_prefers_nhwc = current_ep.GetPreferredLayout() == DataLayout::NHWC;
const bool ep_needs_transpose_opt = params.has_layout_sensitive_ops.get() && params.has_layout_sensitive_ops(current_ep.Type());

if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get() && (ep_prefers_nhwc || ep_needs_transpose_opt)) {
  // Run layout transformation
}

Contributor
@adrianlizarraga Aug 10, 2023

Or instead of passing in a has_layout_sensitive_ops function, maybe we can directly pass in an ep_needs_layout_transformation function that bakes in all the checks (preferred layout is NHWC, or has layout sensitive ops with L1 opts enabled).
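
A rough sketch of that variant (hypothetical wiring, not from this PR), assuming the session's optimization level is reachable as session_options_.graph_optimization_level:

// In inference_session.cc - sketch only; bakes all checks into one predicate.
auto ep_needs_layout_transformation_fn = [&](const IExecutionProvider& ep) -> bool {
  const bool prefers_nhwc = ep.GetPreferredLayout() == DataLayout::NHWC;
  const bool has_sensitive_ops = !layout_transformation::GetEPLayoutSensitiveOps(ep).empty();
  const bool l1_enabled = session_options_.graph_optimization_level >= TransformerLevel::Level1;
  return prefers_nhwc || (has_sensitive_ops && l1_enabled);
};

partitioner.Partition(graph, ..., transform_layout_fn, mode, debug_graph_fn, ep_needs_layout_transformation_fn);

// In graph_partitioner.cc::GetCapabilityForEP()
const bool ep_needs_transform = params.ep_needs_layout_transformation.get() &&
                                params.ep_needs_layout_transformation(current_ep);

if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get() && ep_needs_transform) {
  // Run layout transformation
}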

Contributor

Just realized that GetEPLayoutSensitiveOps also returns FusedConv and others, so that wouldn't exactly work...

Contributor Author

  1. Yeah, I guess. We wouldn't have run the transpose optimizer prior to partitioning, so it would be more consistent to not suddenly run it here. Maybe there should also be a check saying you cannot use an EP that requests NHWC layout if level 1 optimizations are disabled (a rough sketch of such a check follows this list).

  2. I would have expected any meaningful EP to handle layout sensitive ops, so it's not clear how much value an extra function adds. Is it a performance concern? If the EP didn't have layout sensitive ops we wouldn't change the model, and the L1 optimizations also wouldn't run (they only run if the graph is modified).
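
A minimal sketch of the check mentioned in point 1 (hypothetical placement, not from this PR), assuming it runs during InferenceSession initialization where both the registered EPs and the session options are visible:

// Sketch only - reject an NHWC-preferring EP when level 1 optimizations are disabled.
for (const auto& ep : execution_providers_) {
  if (ep->GetPreferredLayout() == DataLayout::NHWC &&
      session_options_.graph_optimization_level < TransformerLevel::Level1) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "Execution provider ", ep->Type(),
                           " requires NHWC layout transformation, which needs level 1",
                           " graph optimizations to be enabled.");
  }
}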

Contributor

I would have expected any meaningful EP to handle layout sensitive ops so not clear how much value an extra function adds. Is it a performance concern?

I primarily suggested this extra function because your original comment on this code noted that "this feels a little too heavy as it only needs to be called for NCHW EPs with layout sensitive Resize currently." I was trying to suggest something that would avoid having to run an unnecessary layout transformation on EPs that don't need it. However, I'm not sure if this would even provide a meaningful performance improvement, just throwing it out there.

for (auto& capability : capabilities) {
TryAssignNodes(graph, *capability->sub_graph, ep_type);
}
@@ -92,7 +92,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function<void(ONNX_NAMES
// so supporting older opsets is unnecessary.

// NOTE: This should be in sync with GetLayoutSensitiveOps in
// /onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
// /onnxruntime/core/optimizer/transpose_optimization/transpose_optimizer.cc
REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 11);

REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 9);
@@ -23,17 +23,9 @@ const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
static std::unordered_set<std::string_view> ort_layout_sensitive_ops = []() {
const auto& layout_sensitive_ops = onnx_transpose_optimization::GetLayoutSensitiveOps();
std::unordered_set<std::string_view> ort_specific_ops =
{ "FusedConv",
"QLinearAveragePool",
"QLinearGlobalAveragePool"
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN) || defined(USE_WEBNN)
// The CUDA/ROCM Resize kernel is layout sensitive as it only handles NCHW input.
// The CPU kernel and ONNX spec are not limited to handling NCHW input so are not layout sensitive, and
// onnx_layout_transformation::HandleResize is used.
,
"Resize"
#endif
};
{"FusedConv",
"QLinearAveragePool",
"QLinearGlobalAveragePool"};

ort_specific_ops.insert(layout_sensitive_ops.cbegin(), layout_sensitive_ops.cend());
return ort_specific_ops;
@@ -42,6 +34,24 @@ const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
return ort_layout_sensitive_ops;
}

const std::unordered_set<std::string_view> GetEPLayoutSensitiveOps(const IExecutionProvider& execution_provider) {
Contributor

make this function static?

std::unordered_set<std::string_view> layout_sensitive_ops = GetORTLayoutSensitiveOps();

const auto& ep = execution_provider.Type();

// EPs where the Resize implementation only handles one layout - either NCHW or NHWC. The ONNX spec for Resize is
// not layout specific. We assume if the EP has a layout sensitive Resize it only handles its preferred layout,
// so when doing layout transformation we consider the Resize to be layout sensitive and can wrap the Resize
// in Transpose nodes to convert to the preferred layout, but we can't push any Transpose operations through the
// Resize in the general transpose optimization.
const auto& layout_sensitive_eps = EPsWithLayoutSensitiveResize();
if (layout_sensitive_eps.find(ep) != layout_sensitive_eps.end()) {
layout_sensitive_ops.insert("Resize");
}

return layout_sensitive_ops;
}

// Cost check for aggressively pushing the Transpose nodes involved in the layout transformation further out.
static CostCheckResult
PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
@@ -64,93 +74,104 @@ Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvid
const DebugGraphFn& debug_graph_fn) {
// We pass in nullptr for the new_node_ep param as new nodes will be assigned by the graph partitioner after
// TransformLayoutForEP returns.
// sub graph recurse will be added later.
// sub graph recurse will be added later
auto api_graph = MakeApiGraph(graph, cpu_allocator, /*new_node_ep*/ nullptr);
const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps();
const auto& layout_sensitive_ops = GetEPLayoutSensitiveOps(execution_provider);

// to convert to NHWC we need to wrap layout sensitive nodes to Transpose from NCHW to NHWC and back.
for (auto& node : api_graph->Nodes()) {
if (layout_sensitive_ops.count(node->OpType())) {
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}
CostCheckFn cost_check;

auto domain = node->Domain();
// Skip if domain is incorrect
if (domain != kOnnxDomain && domain != kMSDomain) {
continue;
}
// if converting to NHWC we need to wrap layout sensitive nodes to Transpose from NCHW to NHWC and back.
if (execution_provider.GetPreferredLayout() == DataLayout::NHWC) {
for (auto& node : api_graph->Nodes()) {
if (layout_sensitive_ops.count(node->OpType())) {
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}

// if already transformed then change the domain to kMSInternalNHWCDomain this way the EP
// knows this op is in the expected format.
if (node->GetAttributeIntDefault("channels_last", 0) == 1) {
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
// Changing the domain for the node requires creating a new node and replacing the old one
// therefore set the modified flag.
modified = true;
continue;
}
auto domain = node->Domain();
// Skip if domain is incorrect
if (domain != kOnnxDomain && domain != kMSDomain) {
continue;
}

// Skip if unknown rank
auto shape = api_graph->GetValueInfo(node->Inputs()[0])->Shape();
if (!shape.has_value()) {
continue;
}
// if already transformed then change the domain to kMSInternalNHWCDomain this way the EP
// knows this op is in the expected format.
if (node->GetAttributeIntDefault("channels_last", 0) == 1) {
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
// Changing the domain for the node requires creating a new node and replacing the old one
// therefore set the modified flag.
modified = true;
continue;
}

// Convert to channels last
size_t rank = shape->size();
// Skip if unknown rank
auto shape = api_graph->GetValueInfo(node->Inputs()[0])->Shape();
if (!shape.has_value()) {
continue;
}

bool has_channel_last_attr = node->GetAttributeInt("channels_last").has_value() ? true : false;
if (has_channel_last_attr) {
node->SetAttributeInt("channels_last", 1);
}
// Convert to channels last
size_t rank = shape->size();

auto input_perm = onnx_transpose_optimization::ChannelFirstToLastPerm(rank);
auto output_perm = onnx_transpose_optimization::ChannelLastToFirstPerm(rank);

// Except for resize and convolution ops, all the other layout sensitive ops only require layout transformation
// for 0th input and output. For resize, add the other relevant inputs which need conversion. For Conv - layout
// transformer only converts layout for 0th input, weights should be handled by every EP.
if (node->OpType() == "Resize") {
// Older versions of resize have a bug where ROI and Scales cannot be made empty inputs. To handle this case,
// we need to jump a few extra hoops to make sure its inputs are correctly handled.
//
// Current code skips layout conversion for ROI because it needs special handling as ROI size is 2*rank.
// Enable passing in ROI for layout conversion when an EP which supports ROI starts using layout transformer.
// NNAPI which currently uses layout transformer does not support it.
std::vector<const std::vector<int64_t>*> input_perms{&input_perm, nullptr};
for (size_t i = 2; i < node->Inputs().size(); i++) {
auto constant = api_graph->GetConstant(node->Inputs()[i]);
if (constant != nullptr && constant->Data().size() > 0) {
input_perms.push_back(&input_perm);
} else {
// TODO: Fix inconsistency. We should Transpose the non-const inputs so that the result of our changes
// is consistent - all layout specific inputs are in NHWC format when we're done.
// This may need to check the opset to see if it's safe so that an empty non-constant input doesn't
// have an invalid Transpose added to it.
// Caveat: Typically `scales` and `sizes` are constants so this may not happen in a production model.
input_perms.push_back(nullptr);
bool has_channel_last_attr = node->GetAttributeInt("channels_last").has_value() ? true : false;
if (has_channel_last_attr) {
node->SetAttributeInt("channels_last", 1);
}

auto input_perm = onnx_transpose_optimization::ChannelFirstToLastPerm(rank);
auto output_perm = onnx_transpose_optimization::ChannelLastToFirstPerm(rank);

// Except for resize and convolution ops, all the other layout sensitive ops only require layout transformation
// for 0th input and output. For resize, add the other relevant inputs which need conversion. For Conv - layout
// transformer only converts layout for 0th input, weights should be handled by every EP.
if (node->OpType() == "Resize") {
// Older versions of resize have a bug where ROI and Scales cannot be made empty inputs. To handle this case,
// we need to jump a few extra hoops to make sure its inputs are correctly handled.
//
// Current code skips layout conversion for ROI because it needs special handling as ROI size is 2*rank.
// Enable passing in ROI for layout conversion when an EP which supports ROI starts using layout transformer.
// NNAPI which currently uses layout transformer does not support it.
std::vector<const std::vector<int64_t>*> input_perms{&input_perm, nullptr};
for (size_t i = 2; i < node->Inputs().size(); i++) {
auto constant = api_graph->GetConstant(node->Inputs()[i]);
if (constant != nullptr && constant->Data().size() > 0) {
input_perms.push_back(&input_perm);
} else {
// TODO: Fix inconsistency. We should Transpose the non-const inputs so that the result of our changes
// is consistent - all layout specific inputs are in NHWC format when we're done.
// This may need to check the opset to see if it's safe so that an empty non-constant input doesn't
// have an invalid Transpose added to it.
// Caveat: Typically `scales` and `sizes` are constants so this may not happen in a production model.
input_perms.push_back(nullptr);
}
}
WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm});
} else {
WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});
}
WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm});
} else {
WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});

// TODO: Technically Resize doesn't need to change domain as the ONNX Resize spec is not layout sensitive.
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
modified = true;
}
}

// TODO: Technically Resize doesn't need to change domain as the ONNX Resize spec is not layout sensitive.
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
modified = true;
cost_check = PostLayoutTransformCostCheck;

// debug the changes made inserting Transpose nodes around layout sensitive ops.
if (debug_graph_fn) {
debug_graph_fn(graph);
}
}

// debug the changes made inserting Transpose nodes around layout sensitive ops.
if (debug_graph_fn) {
debug_graph_fn(graph);
} else {
// layout is fine for the EP but we still want to run the transpose optimizer one more time for the EP specific
// Transpose -> Resize logic.
cost_check = OrtEPCostCheck;
}

const auto max_node_idx = graph.MaxNodeIndex();
OptimizeResult result = onnx_transpose_optimization::Optimize(*api_graph, execution_provider.Type(),
PostLayoutTransformCostCheck);
OptimizeResult result =
onnx_transpose_optimization::Optimize(*api_graph, execution_provider.Type(), cost_check, OrtHandlers());

if (result.error_msg) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Layout/Transpose optimization for ", execution_provider.Type(),
@@ -926,18 +926,7 @@ static void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, con
node.SetInput(i, gather_output);
}

static bool HandleResize([[maybe_unused]] HandlerArgs& args) {
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN)
// The CUDA Resize kernel requires that the input is NCHW, so we can't push a Transpose through a Resize
// in ORT builds with CUDA enabled.
// The ROCm EP is generated from the CUDA EP kernel so the same applies to builds with ROCm enabled.
// The QNN EP requires the input to be NHWC, so the Resize handler is also not enabled for QNN builds.
//
// TODO: Remove this special case once the CUDA Resize kernel is implemented "generically" (i.e.) aligning with the
// generic nature of the ONNX spec.
// See https://github.com/microsoft/onnxruntime/pull/10824 for a similar fix applied to the CPU Resize kernel.
return false;
#else
bool HandleResize([[maybe_unused]] HandlerArgs& args) {
auto inputs = args.node.Inputs();
int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());

@@ -963,7 +952,6 @@ static bool HandleResize([[maybe_unused]] HandlerArgs& args) {
TransposeOutputs(args.ctx, args.node, args.perm);

return true;
#endif
}

constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
@@ -67,6 +67,7 @@ bool HandleSimpleNodeWithAxis(HandlerArgs& args, std::optional<int64_t> default_

// base handlers that are used by extended handlers. add from transpose_optimizer.cc as needed.
bool HandleReduceOps(HandlerArgs& args);
bool HandleResize([[maybe_unused]] HandlerArgs& args);

void TransposeInput(api::GraphRef& graph, api::NodeRef& node, size_t i,
const std::vector<int64_t>& perm,
@@ -838,5 +838,4 @@ onnxruntime::Graph& GraphFromApiGraph(onnx_transpose_optimization::api::GraphRef
onnxruntime::Node& NodeFromApiNode(onnx_transpose_optimization::api::NodeRef& node) {
return static_cast<ApiNode&>(node).Node();
}

} // namespace onnxruntime
@@ -5,12 +5,29 @@

#include <algorithm>
#include "core/graph/constants.h"
#include "core/framework/utils.h"
#include "core/optimizer/transpose_optimization/ort_optimizer_utils.h"

using namespace onnx_transpose_optimization;

namespace onnxruntime {

static bool EPAwareHandleResize(HandlerArgs& args) {
// Whilst Resize is not technically layout sensitive, some execution providers implement handling for only one
// layout. Due to that, only push a Transpose through a Resize once it is assigned and we know it's not being handled
// by an EP that only supports a single layout.
const auto& layout_sensitive_eps = EPsWithLayoutSensitiveResize();

const auto& provider = args.ctx.provider_type;
if (provider.empty() || layout_sensitive_eps.find(provider) != layout_sensitive_eps.end()) {
return false;
}

return HandleResize(args);
}

constexpr HandlerInfo ep_aware_resize_handler = {&FirstInput, &EPAwareHandleResize};

static bool HandleQLinearConcat(HandlerArgs& args) {
return HandleSimpleNodeWithAxis(args);
}
@@ -86,9 +103,17 @@ static bool HandleMaxPool(HandlerArgs& args) {
}

constexpr HandlerInfo max_pool_op_handler = {&FirstInput, &HandleMaxPool};

constexpr HandlerInfo node_1_inp_handler = {&FirstInput, &HandleSimpleNode};
constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOps};

const HandlerMap& OrtHandlers() {
static const HandlerMap extended_handler_map{
{"Resize", ep_aware_resize_handler},
};

return extended_handler_map;
}
// ORT contrib ops and special cased ONNX ops where we have EP specific handling
const HandlerMap& OrtExtendedHandlers() {
static const HandlerMap extended_handler_map = []() {
@@ -104,12 +129,37 @@ const HandlerMap& OrtExtendedHandlers() {
{"com.microsoft.QLinearSigmoid", node_1_inp_handler},
};

const auto& base_handlers = OrtHandlers();
std::for_each(base_handlers.begin(), base_handlers.end(), [&map](const auto& entry) { map.insert(entry); });

return map;
}();

return extended_handler_map;
}

// EPs that require Resize to stay in the current layout.
// The CUDA Resize kernel requires that the input is NCHW
// The ROCm EP is generated from the CUDA EP kernel so the same applies to it.
// TODO: Remove this special case once the CUDA Resize kernel is implemented "generically"
// i.e. aligning with the generic nature of the ONNX spec.
// See https://github.com/microsoft/onnxruntime/pull/10824 for a similar fix applied to the CPU Resize.
// The QNN EP requires the Resize to remain in NHWC once the layout transformer makes that adjustment
// and moves the node to the kMSInternalNHWCDomain domain. We need it to be in this list so that the layout
transformation inserts Transpose nodes around the Resize to convert from NCHW to NHWC. As there is no handler for
// the replacement Resize node in the kMSInternalNHWCDomain domain we will not push any Transpose nodes through it
// later.
const std::unordered_set<std::string_view> EPsWithLayoutSensitiveResize() {
static std::unordered_set<std::string_view> eps = {
kCudaExecutionProvider,
kRocmExecutionProvider,
kQnnExecutionProvider,
onnxruntime::utils::kInternalTestingExecutionProvider, // for testing the behavior
};

return eps;
}

CostCheckResult OrtEPCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
const std::vector<int64_t>& /*perm*/,
const std::unordered_set<std::string>& /*outputs_leading_to_transpose*/) {