Add EP specific Resize handling to the transpose optimizer #15552
```diff
@@ -177,10 +177,12 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) {
   }
 
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
-  // Run layout transformer only for EPs other than CPU EP and provided the preferred layout is NHWC
-  // CPU EP layout transformation happens later when level 3 transformers are run.
-  if (params.mode != GraphPartitioner::Mode::kAssignOnly &&
-      current_ep.GetPreferredLayout() == DataLayout::NHWC) {
+  // Run layout transformation for all EPs.
+  // For an EP that wants NHWC this will wrap layout sensitive nodes with Transpose nodes first.
+  // In both NCHW and NHWC EPs the EP specific transpose optimization is run last to optimize
+  // transposes for nodes assigned to the EP or unassigned nodes. This allows things like the
+  // EP aware Resize handling to be run.
+  if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get()) {
```
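The new comment describes the intended flow: layout transformation now runs for every EP, Transpose wrapping is only applied when the EP prefers NHWC, and the EP specific transpose optimization always runs last so the EP aware Resize handling can kick in. A condensed sketch of that flow is below; the cost check names come from the diff further down, while the simplified signatures and example EP names are illustrative assumptions rather than the real ORT API.

```cpp
#include <iostream>
#include <string>

enum class DataLayout { NCHW, NHWC };

// Stand-in for the Transpose wrapping done for layout sensitive nodes (NHWC EPs only).
void WrapLayoutSensitiveNodesInTransposes(const std::string& ep) {
  std::cout << "wrap layout sensitive nodes for " << ep << " (NCHW -> NHWC -> NCHW)\n";
}

// Stand-in for onnx_transpose_optimization::Optimize with an EP specific cost check.
void RunTransposeOptimizer(const std::string& ep, const std::string& cost_check) {
  std::cout << "optimize transposes for " << ep << " using " << cost_check << "\n";
}

// Condensed view of the per-EP flow after this PR.
void TransformLayoutForEPSketch(const std::string& ep, DataLayout preferred_layout) {
  if (preferred_layout == DataLayout::NHWC) {
    WrapLayoutSensitiveNodesInTransposes(ep);
    RunTransposeOptimizer(ep, "PostLayoutTransformCostCheck");
  } else {
    // Layout is already fine, but the EP specific Transpose -> Resize handling still runs.
    RunTransposeOptimizer(ep, "OrtEPCostCheck");
  }
}

int main() {
  TransformLayoutForEPSketch("ExampleNhwcEP", DataLayout::NHWC);
  TransformLayoutForEPSketch("ExampleNchwEP", DataLayout::NCHW);
}
```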
Contributor:

I have two questions here.

```cpp
// In inference_session.cc
auto has_layout_sensitive_ops_fn = [](const std::string& ep) -> bool {
  return !layout_transformation::GetEPLayoutSensitiveOps(ep).empty();
};

partitioner.Partition(graph, ..., transform_layout_fn, mode, debug_graph_fn, has_layout_sensitive_ops_fn);

// In graph_partitioner.cc::GetCapabilityForEP()
const bool ep_prefers_nhwc = current_ep.GetPreferredLayout() == DataLayout::NHWC;
const bool ep_needs_transpose_opt = params.has_layout_sensitive_ops.get() &&
                                    params.has_layout_sensitive_ops(current_ep.Type());
if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get() &&
    (ep_prefers_nhwc || ep_needs_transpose_opt)) {
  // Run layout transformation
}
```
Contributor:

Or instead of passing in a

Contributor:

Just realized that

Contributor:

I primarily suggested this extra function because your original comment on this code noted that "this feels a little too heavy as it only needs to be called for NCHW EPs with layout sensitive Resize currently." I was trying to suggest something that would avoid having to run an unnecessary layout transformation on EPs that don't need it. However, I'm not sure if this would even provide a meaningful performance improvement; just throwing it out there.
```cpp
  for (auto& capability : capabilities) {
    TryAssignNodes(graph, *capability->sub_graph, ep_type);
  }
```
```diff
@@ -23,17 +23,9 @@ const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
   static std::unordered_set<std::string_view> ort_layout_sensitive_ops = []() {
     const auto& layout_sensitive_ops = onnx_transpose_optimization::GetLayoutSensitiveOps();
     std::unordered_set<std::string_view> ort_specific_ops =
-        { "FusedConv",
-          "QLinearAveragePool",
-          "QLinearGlobalAveragePool"
-#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN) || defined(USE_WEBNN)
-          // The CUDA/ROCM Resize kernel is layout sensitive as it only handles NCHW input.
-          // The CPU kernel and ONNX spec are not limited to handling NCHW input so are not layout sensitive, and
-          // onnx_layout_transformation::HandleResize is used.
-          ,
-          "Resize"
-#endif
-        };
+        {"FusedConv",
+         "QLinearAveragePool",
+         "QLinearGlobalAveragePool"};
 
     ort_specific_ops.insert(layout_sensitive_ops.cbegin(), layout_sensitive_ops.cend());
     return ort_specific_ops;
```

```diff
@@ -42,6 +34,24 @@ const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
   return ort_layout_sensitive_ops;
 }
 
+const std::unordered_set<std::string_view> GetEPLayoutSensitiveOps(const IExecutionProvider& execution_provider) {
+  std::unordered_set<std::string_view> layout_sensitive_ops = GetORTLayoutSensitiveOps();
+
+  const auto& ep = execution_provider.Type();
+
+  // EPs where the Resize implementation only handles one layout - either NCHW or NHWC. The ONNX spec for Resize is
+  // not layout specific. We assume if the EP has a layout sensitive Resize it only handles its preferred layout,
+  // so when doing layout transformation we consider the Resize to be layout sensitive and can wrap the Resize
+  // in Transpose nodes to convert to the preferred layout, but we can't push any Transpose operations through the
+  // Resize in the general transpose optimization.
+  const auto& layout_sensitive_eps = EPsWithLayoutSensitiveResize();
+  if (layout_sensitive_eps.find(ep) != layout_sensitive_eps.end()) {
+    layout_sensitive_ops.insert("Resize");
+  }
+
+  return layout_sensitive_ops;
+}
+
 // Cost check for aggressively pushing the Transpose nodes involved in the layout transformation further out.
 static CostCheckResult
 PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
```

Contributor:

make this function static?
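GetEPLayoutSensitiveOps builds on the EPsWithLayoutSensitiveResize() helper, whose contents are not captured in this view. The standalone sketch below shows the intent with placeholder EP names; the real set and the IExecutionProvider-based signature live in the PR, and everything here is illustrative only.

```cpp
#include <iostream>
#include <string>
#include <unordered_set>

// Placeholder for the PR's EPsWithLayoutSensitiveResize(); the real EP names are not shown here.
static const std::unordered_set<std::string>& EPsWithLayoutSensitiveResize() {
  static const std::unordered_set<std::string> eps = {"ExampleNchwOnlyResizeEP"};
  return eps;
}

// Simplified stand-in for GetORTLayoutSensitiveOps().
static std::unordered_set<std::string> GetORTLayoutSensitiveOps() {
  return {"Conv", "FusedConv", "QLinearAveragePool", "QLinearGlobalAveragePool"};
}

// Mirrors the shape of the new GetEPLayoutSensitiveOps: start from the common set and add
// "Resize" only for EPs whose Resize kernel is tied to a single layout.
static std::unordered_set<std::string> GetEPLayoutSensitiveOps(const std::string& ep_type) {
  std::unordered_set<std::string> ops = GetORTLayoutSensitiveOps();
  if (EPsWithLayoutSensitiveResize().count(ep_type) != 0) {
    ops.insert("Resize");
  }
  return ops;
}

int main() {
  for (const char* ep : {"CPUExecutionProvider", "ExampleNchwOnlyResizeEP"}) {
    std::cout << ep << " -> Resize layout sensitive: "
              << (GetEPLayoutSensitiveOps(ep).count("Resize") ? "yes" : "no") << "\n";
  }
}
```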
```cpp
@@ -64,93 +74,104 @@ Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvid
const DebugGraphFn& debug_graph_fn) {
// We pass in nullptr for the new_node_ep param as new nodes will be assigned by the graph partitioner after
// TransformLayoutForEP returns.
// sub graph recurse will be added later.
// sub graph recurse will be added later
auto api_graph = MakeApiGraph(graph, cpu_allocator, /*new_node_ep*/ nullptr);
const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps();
const auto& layout_sensitive_ops = GetEPLayoutSensitiveOps(execution_provider);

// to convert to NHWC we need to wrap layout sensitive nodes to Transpose from NCHW to NHWC and back.
for (auto& node : api_graph->Nodes()) {
if (layout_sensitive_ops.count(node->OpType())) {
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}
CostCheckFn cost_check;

auto domain = node->Domain();
// Skip if domain is incorrect
if (domain != kOnnxDomain && domain != kMSDomain) {
continue;
}
// if converting to NHWC we need to wrap layout sensitive nodes to Transpose from NCHW to NHWC and back.
if (execution_provider.GetPreferredLayout() == DataLayout::NHWC) {
for (auto& node : api_graph->Nodes()) {
if (layout_sensitive_ops.count(node->OpType())) {
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}

// if already transformed then change the domain to kMSInternalNHWCDomain this way the EP
// knows this op is in the expected format.
if (node->GetAttributeIntDefault("channels_last", 0) == 1) {
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
// Changing the domain for the node requires creating a new node and replacing the old one
// therefore set the modified flag.
modified = true;
continue;
}
auto domain = node->Domain();
// Skip if domain is incorrect
if (domain != kOnnxDomain && domain != kMSDomain) {
continue;
}

// Skip if unknown rank
auto shape = api_graph->GetValueInfo(node->Inputs()[0])->Shape();
if (!shape.has_value()) {
continue;
}
// if already transformed then change the domain to kMSInternalNHWCDomain this way the EP
// knows this op is in the expected format.
if (node->GetAttributeIntDefault("channels_last", 0) == 1) {
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
// Changing the domain for the node requires creating a new node and replacing the old one
// therefore set the modified flag.
modified = true;
continue;
}

// Convert to channels last
size_t rank = shape->size();
// Skip if unknown rank
auto shape = api_graph->GetValueInfo(node->Inputs()[0])->Shape();
if (!shape.has_value()) {
continue;
}

bool has_channel_last_attr = node->GetAttributeInt("channels_last").has_value() ? true : false;
if (has_channel_last_attr) {
node->SetAttributeInt("channels_last", 1);
}
// Convert to channels last
size_t rank = shape->size();

auto input_perm = onnx_transpose_optimization::ChannelFirstToLastPerm(rank);
auto output_perm = onnx_transpose_optimization::ChannelLastToFirstPerm(rank);

// Except for resize and convolution ops, all the other layout sensitive ops only require layout transformation
// for 0th input and output. For resize, add the other relevant inputs which need conversion. For Conv - layout
// transformer only converts layout for 0th input, weights should be handled by every EP.
if (node->OpType() == "Resize") {
// Older versions of resize have a bug where ROI and Scales cannot be made empty inputs. To handle this case,
// we need to jump a few extra hoops to make sure its inputs are correctly handled.
//
// Current code skips layout conversion for ROI because it needs special handling as ROI size is 2*rank.
// Enable passing in ROI for layout conversion when an EP which supports ROI starts using layout transformer.
// NNAPI which currently uses layout transformer does not support it.
std::vector<const std::vector<int64_t>*> input_perms{&input_perm, nullptr};
for (size_t i = 2; i < node->Inputs().size(); i++) {
auto constant = api_graph->GetConstant(node->Inputs()[i]);
if (constant != nullptr && constant->Data().size() > 0) {
input_perms.push_back(&input_perm);
} else {
// TODO: Fix inconsistency. We should Transpose the non-const inputs so that the result of our changes
// is consistent - all layout specific inputs are in NHWC format when we're done.
// This may need to check the opset to see if it's safe so that an empty non-constant input doesn't
// have an invalid Transpose added to it.
// Caveat: Typically `scales` and `sizes` are constants so this may not happen in a production model.
input_perms.push_back(nullptr);
bool has_channel_last_attr = node->GetAttributeInt("channels_last").has_value() ? true : false;
if (has_channel_last_attr) {
node->SetAttributeInt("channels_last", 1);
}

auto input_perm = onnx_transpose_optimization::ChannelFirstToLastPerm(rank);
auto output_perm = onnx_transpose_optimization::ChannelLastToFirstPerm(rank);

// Except for resize and convolution ops, all the other layout sensitive ops only require layout transformation
// for 0th input and output. For resize, add the other relevant inputs which need conversion. For Conv - layout
// transformer only converts layout for 0th input, weights should be handled by every EP.
if (node->OpType() == "Resize") {
// Older versions of resize have a bug where ROI and Scales cannot be made empty inputs. To handle this case,
// we need to jump a few extra hoops to make sure its inputs are correctly handled.
//
// Current code skips layout conversion for ROI because it needs special handling as ROI size is 2*rank.
// Enable passing in ROI for layout conversion when an EP which supports ROI starts using layout transformer.
// NNAPI which currently uses layout transformer does not support it.
std::vector<const std::vector<int64_t>*> input_perms{&input_perm, nullptr};
for (size_t i = 2; i < node->Inputs().size(); i++) {
auto constant = api_graph->GetConstant(node->Inputs()[i]);
if (constant != nullptr && constant->Data().size() > 0) {
input_perms.push_back(&input_perm);
} else {
// TODO: Fix inconsistency. We should Transpose the non-const inputs so that the result of our changes
// is consistent - all layout specific inputs are in NHWC format when we're done.
// This may need to check the opset to see if it's safe so that an empty non-constant input doesn't
// have an invalid Transpose added to it.
// Caveat: Typically `scales` and `sizes` are constants so this may not happen in a production model.
input_perms.push_back(nullptr);
}
}
WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm});
} else {
WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});
}
WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm});
} else {
WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});

// TODO: Technically Resize doesn't need to change domain as the ONNX Resize spec is not layout sensitive.
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
modified = true;
}
}

// TODO: Technically Resize doesn't need to change domain as the ONNX Resize spec is not layout sensitive.
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
modified = true;
cost_check = PostLayoutTransformCostCheck;

// debug the changes made inserting Transpose nodes around layout sensitive ops.
if (debug_graph_fn) {
debug_graph_fn(graph);
}
}

// debug the changes made inserting Transpose nodes around layout sensitive ops.
if (debug_graph_fn) {
debug_graph_fn(graph);
} else {
// layout is fine for the EP but we still want to run the transpose optimizer one more time for the EP specific
// Transpose -> Resize logic.
cost_check = OrtEPCostCheck;
}

const auto max_node_idx = graph.MaxNodeIndex();
OptimizeResult result = onnx_transpose_optimization::Optimize(*api_graph, execution_provider.Type(),
PostLayoutTransformCostCheck);
OptimizeResult result =
onnx_transpose_optimization::Optimize(*api_graph, execution_provider.Type(), cost_check, OrtHandlers());

if (result.error_msg) {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Layout/Transpose optimization for ", execution_provider.Type(),
```
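The wrapping logic above relies on rank-based permutations to move between channel-first and channel-last layouts. The standalone sketch below is not the ORT implementation; the two helpers are re-derived here for illustration. It shows the perm values used for rank-4 activations, i.e. the Transpose perms inserted before and after a wrapped node.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Channel-first (NCHW...) to channel-last (NHWC...) permutation for a given rank.
std::vector<int64_t> ChannelFirstToLastPerm(size_t rank) {
  std::vector<int64_t> perm(rank);
  perm[0] = 0;
  for (size_t i = 1; i < rank - 1; ++i) perm[i] = static_cast<int64_t>(i) + 1;
  perm[rank - 1] = 1;
  return perm;
}

// Inverse permutation: channel-last back to channel-first.
std::vector<int64_t> ChannelLastToFirstPerm(size_t rank) {
  std::vector<int64_t> perm(rank);
  perm[0] = 0;
  perm[1] = static_cast<int64_t>(rank) - 1;
  for (size_t i = 2; i < rank; ++i) perm[i] = static_cast<int64_t>(i) - 1;
  return perm;
}

int main() {
  auto to_last = ChannelFirstToLastPerm(4);   // {0, 2, 3, 1}: NCHW -> NHWC
  auto to_first = ChannelLastToFirstPerm(4);  // {0, 3, 1, 2}: NHWC -> NCHW
  for (auto v : to_last) std::cout << v << ' ';
  std::cout << "| ";
  for (auto v : to_first) std::cout << v << ' ';
  std::cout << '\n';
}
```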
Contributor:
This feels a little too heavy as it only needs to be called for NCHW EPs with layout sensitive Resize currently.
The second call to GetCapability also muddies the water a little. Previously the EP could ignore already-assigned nodes as an optimization (although this wasn't required and an EP could ask for an assigned node). With the TryAssignNodes change, the EP implementation of GetCapability now needs to process, at a minimum, all unassigned nodes or nodes assigned to the EP. This matters for fusion and compilation as we need complete lists of capabilities from the second call. If the EP ignored assigned nodes we could potentially end up in a bad state.
Still doesn't feel like we have the best solution.