Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/graph_partitioner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,9 @@ static Status GetCapabilityForEP(const GetCapabilityForEPParams& params) {
}

#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
// Run layout transformer only for EPs other than CPU EP and provided the preferred layout is NHWC
// Run layout transformer for EPs with preferred layout of NHWC
// CPU EP layout transformation happens later when level 3 transformers are run.
if (params.mode != GraphPartitioner::Mode::kAssignOnly &&
if (params.mode != GraphPartitioner::Mode::kAssignOnly && params.transform_layout.get() &&
current_ep.GetPreferredLayout() == DataLayout::NHWC) {
for (auto& capability : capabilities) {
TryAssignNodes(graph, *capability->sub_graph, ep_type);
Expand Down
5 changes: 3 additions & 2 deletions onnxruntime/core/framework/kernel_registry_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,9 @@ Status KernelRegistryManager::SearchKernelRegistry(const Node& node,
auto create_error_message = [&node, &status](const std::string& prefix) {
std::ostringstream errormsg;
errormsg << prefix << node.OpType() << "(" << node.SinceVersion() << ")";
if (!node.Name().empty()) errormsg << " (node " << node.Name() << "). ";
if (!status.IsOK()) errormsg << status.ErrorMessage();
errormsg << " (node:'" << node.Name() << "' ep:'" << node.GetExecutionProviderType() << "'). ";
if (!status.IsOK())
errormsg << status.ErrorMessage();

return errormsg.str();
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function<void(ONNX_NAMES
// so supporting older opsets is unnecessary.

// NOTE: This should be in sync with GetLayoutSensitiveOps in
// /onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
// /onnxruntime/core/optimizer/transpose_optimization/transpose_optimizer.cc
REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 11);

REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 9);
Expand Down
19 changes: 11 additions & 8 deletions onnxruntime/core/optimizer/graph_transformer_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
const InlinedHashSet<std::string_view> cpu_ep = {onnxruntime::kCpuExecutionProvider};
#endif
const InlinedHashSet<std::string_view> dml_ep = {onnxruntime::kDmlExecutionProvider};
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();

switch (level) {
case TransformerLevel::Level1: {
// RewriteRule optimizations are the simplest (they generally remove unnecessary nodes and are cheap to run)
Expand Down Expand Up @@ -240,13 +242,14 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(

// run TransposeOptimizer last as it works in a slightly different way by moving Transpose nodes around.
// shouldn't affect the end result - just easier to debug any issue if it's last.
// local CPU allocator is enough as this allocator is finally passed to a local tensor.
// We will also benefit by using a local allocator as we don't need to pass allocator as parameter for EP API refactor
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
transformers.emplace_back(std::make_unique<TransposeOptimizer>(std::move(cpu_allocator)));
} break;

case TransformerLevel::Level2: {
// we run TransposeOptimizer again in Level2 for some CPU EP specific optimizations that can only be
// applied once nodes are assigned to the CPU EP (which happens between level 1 and level 2).
transformers.emplace_back(std::make_unique<TransposeOptimizer>(std::move(cpu_allocator), kCpuExecutionProvider));

const bool enable_quant_qdq_cleanup =
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQCleanup, "0") == "1";
#if !defined(DISABLE_CONTRIB_OPS)
Expand Down Expand Up @@ -366,16 +369,16 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
if (MlasNchwcGetBlockSize() > 1) {
transformers.emplace_back(std::make_unique<NchwcTransformer>());
}
AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();

auto cpu_registry = cpu_execution_provider.GetKernelRegistry();
auto nhwc_transformer = std::make_unique<NhwcTransformer>(std::move(cpu_allocator), std::move(cpu_registry));
if (nhwc_transformer->IsActive()) {
transformers.emplace_back(std::move(nhwc_transformer));
}
// NCHWCtransformer should have a higher priority versus this. Because NCHWCtransformer also do the similar things
// of fusion patterns and target on CPU. However, NCHWCtransformer will reorder the layout to nchwc which is only available for
// x86-64 cpu, not edge cpu like arm. But This transformer could be used by opencl-ep/cpu-ep. So
// we will prefer NhwcTransformer once ort runs on x86-64 CPU, otherwise ConvAddActivationFusion is enabled.

// NchwcTransformer must have a higher priority than ConvAddActivationFusion. NchwcTransformer does similar
// fusions targeting CPU but also reorders the layout to NCHWc which is expected to be more efficient but is
// only available on x86-64.
// PR #6351 implemented similar fusion-pattern for CUDA only, and can only fuse conv-add-relu,
// while we can fuse more activation.
transformers.emplace_back(std::make_unique<ConvAddActivationFusion>(cpu_ep));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,27 +13,91 @@ using namespace onnx_transpose_optimization;

namespace onnxruntime {
namespace layout_transformation {
namespace {
// Cost check used after layout transformation. It favours pushing the Transpose nodes that were inserted by the
// layout transformation as far out as possible.
//
// Returns CostCheckResult::kPushTranspose when `perm` is a channel-first <-> channel-last permutation and the node is
// not a Concat. Every other case is delegated to OrtEPCostCheck.
CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
                                             const std::vector<int64_t>& perm,
                                             const std::unordered_set<std::string>& outputs_leading_to_transpose) {
  // Concat is the exception to the aggressive push: moving a Transpose through it can add Transpose nodes to several
  // of its other inputs, which may hurt performance, so it always goes through the regular cost check below.
  const bool is_concat = node.OpType() == "Concat";

  if (!is_concat) {
    const auto rank = perm.size();
    const bool is_layout_perm = perm == ChannelFirstToLastPerm(rank) || perm == ChannelLastToFirstPerm(rank);
    if (is_layout_perm) {
      return CostCheckResult::kPushTranspose;
    }
  }

  // anything else uses the default ORT cost check
  return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
}

/// <summary>
/// Default predicate for deciding whether a node should have its layout changed. The starting point is the set
/// returned by GetORTLayoutSensitiveOps, with EP specific exceptions applied first where a build enables them.
///
/// Longer term, if required, the EP API could let the EP supply a delegate with its own logic so that the
/// EP specific cases do not need to be hardcoded here.
/// </summary>
/// <param name="node">Node to check</param>
/// <returns>true if the node should have its layout converted to NHWC.</returns>
bool ConvertNodeLayout(const api::NodeRef& node) {
  // only ops in the ONNX domain or the contrib op domain are candidates
  const auto node_domain = node.Domain();
  const bool is_candidate_domain = (node_domain == kOnnxDomain) || (node_domain == kMSDomain);
  if (!is_candidate_domain) {
    return false;
  }

  // EP specific special cases
#if defined(USE_XNNPACK)
  // XNNPACK supports NCHW and NHWC for Resize so we don't need to use the internal NHWC domain and wrap the Resize
  // with Transpose nodes. EPAwareHandleResize will allow an NCHW <-> NHWC Transpose to be pushed through
  // the Resize during transpose optimization.
  if (node.GetExecutionProviderType() == kXnnpackExecutionProvider && node.OpType() == "Resize") {
    return false;
  }
#endif

#if defined(USE_JSEP)
  // TODO(fs-eire): Remove special case handling of JSEP once NHWC Resize implementation is fixed
  // Until then Resize is left as-is, i.e. the node remains in the ONNX domain with the original input layout.
  if (node.GetExecutionProviderType() == kJsExecutionProvider && node.OpType() == "Resize") {
    return false;
  }
#endif

  // #if defined(USE_CUDA)
  //   if (node.GetExecutionProviderType() == kCudaExecutionProvider) {
  //     Update as per https://github.com/microsoft/onnxruntime/pull/17200 with CUDA ops that support NHWC
  //   }
  // #endif

  // default: convert if the op type is in the ORT layout sensitive set
  const auto& sensitive_ops = GetORTLayoutSensitiveOps();
  return sensitive_ops.find(node.OpType()) != sensitive_ops.end();
}
} // namespace

// Layout sensitive NCHW ops. TransformLayoutForEP will wrap these with Transpose nodes to convert the input
// data to NHWC and output data back to NCHW, and move the op to the internal NHWC domain (kMSInternalNHWCDomain).
// The EP requesting these ops MUST be able to handle the node with the operator in the kMSInternalNHWCDomain.
// The EP requesting these ops MUST be able to handle the node with the operator in the kMSInternalNHWCDomain domain.
// Once all the layout sensitive ops requested by the EP are wrapped the transpose optimizer will attempt to remove
// as many of the layout transposes as possible.
const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
static std::unordered_set<std::string_view> ort_layout_sensitive_ops = []() {
const auto& layout_sensitive_ops = onnx_transpose_optimization::GetLayoutSensitiveOps();
std::unordered_set<std::string_view> ort_specific_ops =
{ "FusedConv",
"QLinearAveragePool",
"QLinearGlobalAveragePool"
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN) || defined(USE_WEBNN)
// The CUDA/ROCM Resize kernel is layout sensitive as it only handles NCHW input.
// The CPU kernel and ONNX spec are not limited to handling NCHW input so are not layout sensitive, and
// onnx_layout_transformation::HandleResize is used.
,
"Resize"
#endif
};
{
"FusedConv",
"QLinearAveragePool",
"QLinearGlobalAveragePool",
// Whilst the ONNX spec doesn't specify a layout for Resize, we treat it as layout sensitive by default
// as EPs tend to only support one layout.
"Resize",
};

ort_specific_ops.insert(layout_sensitive_ops.cbegin(), layout_sensitive_ops.cend());
return ort_specific_ops;
Expand All @@ -42,45 +106,21 @@ const std::unordered_set<std::string_view>& GetORTLayoutSensitiveOps() {
return ort_layout_sensitive_ops;
}

// Cost check that aggressively pushes the Transpose nodes involved in the layout transformation further out.
// A channel-first <-> channel-last permutation on any node other than Concat is always pushed; all other cases use
// the default ORT cost check.
static CostCheckResult
PostLayoutTransformCostCheck(const api::GraphRef& graph, const api::NodeRef& node,
                             const std::vector<int64_t>& perm,
                             const std::unordered_set<std::string>& outputs_leading_to_transpose) {
  // Pushing through a Concat can result in Transpose nodes being added to multiple other inputs, which can
  // potentially be worse for performance, so Concat is never pushed unconditionally.
  if (node.OpType() == "Concat") {
    return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
  }

  const auto num_dims = perm.size();
  if (perm == ChannelFirstToLastPerm(num_dims)) {
    return CostCheckResult::kPushTranspose;
  }

  if (perm == ChannelLastToFirstPerm(num_dims)) {
    return CostCheckResult::kPushTranspose;
  }

  // not a layout permutation: fall back to the default ORT cost check
  return OrtEPCostCheck(graph, node, perm, outputs_leading_to_transpose);
}
}

Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvider& execution_provider,
AllocatorPtr cpu_allocator,
const DebugGraphFn& debug_graph_fn) {
// We pass in nullptr for the new_node_ep param as new nodes will be assigned by the graph partitioner after
// TransformLayoutForEP returns.
// sub graph recurse will be added later.
// sub graph recurse will be added later
auto api_graph = MakeApiGraph(graph, cpu_allocator, /*new_node_ep*/ nullptr);
const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps();

// to convert to NHWC we need to wrap layout sensitive nodes to Transpose from NCHW to NHWC and back.
for (auto& node : api_graph->Nodes()) {
if (layout_sensitive_ops.count(node->OpType())) {
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}

auto domain = node->Domain();
// Skip if domain is incorrect
if (domain != kOnnxDomain && domain != kMSDomain) {
continue;
}
if (node->GetExecutionProviderType() != execution_provider.Type()) {
continue;
}

if (ConvertNodeLayout(*node)) {
// if already transformed then change the domain to kMSInternalNHWCDomain this way the EP
// knows this op is in the expected format.
if (node->GetAttributeIntDefault("channels_last", 0) == 1) {
Expand Down Expand Up @@ -137,7 +177,6 @@ Status TransformLayoutForEP(Graph& graph, bool& modified, const IExecutionProvid
WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});
}

// TODO: Technically Resize doesn't need to change domain as the ONNX Resize spec is not layout sensitive.
SwapNodeOpTypeAndDomain(*api_graph, *node, node->OpType(), kMSInternalNHWCDomain);
modified = true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1242,18 +1242,7 @@ static void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, con
node.SetInput(i, gather_output);
}

static bool HandleResize([[maybe_unused]] HandlerArgs& args) {
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_QNN) || defined(USE_WEBNN)
// The CUDA Resize kernel requires that the input is NCHW, so we can't push a Transpose through a Resize
// in ORT builds with CUDA enabled.
// The ROCm EP is generated from the CUDA EP kernel so the same applies to builds with ROCm enabled.
// The QNN EP requires the input to be NHWC, so the Resize handler is also not enabled for QNN builds.
//
// TODO: Remove this special case once the CUDA Resize kernel is implemented "generically" (i.e.) aligning with the
// generic nature of the ONNX spec.
// See https://github.com/microsoft/onnxruntime/pull/10824 for a similar fix applied to the CPU Resize kernel.
return false;
#else
bool HandleResize([[maybe_unused]] HandlerArgs& args) {
auto inputs = args.node.Inputs();
int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());

Expand All @@ -1279,10 +1268,10 @@ static bool HandleResize([[maybe_unused]] HandlerArgs& args) {
TransposeOutputs(args.ctx, args.node, args.perm);

return true;
#endif
}

constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
// Not currently registered by default.
// constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};

static bool HandlePad(HandlerArgs& args) {
size_t rank = args.perm.size();
Expand Down Expand Up @@ -2034,15 +2023,19 @@ static const std::unordered_map<std::string_view, const HandlerInfo&> handler_ma
{"Split", split_handler},
{"Shape", shape_handler},
{"Pad", pad_handler},
{"Resize", resize_handler},
{"ReduceSum", reduce_op_handler},

// Execution providers tend to only implement Resize for specific layouts. Due to that, it's safer to not
// push a Transpose through a Resize unless the EP specifically checks that it can handle the change via an
// extended handler.
// {"Resize", resize_handler},

{"ReduceLogSum", reduce_op_handler},
{"ReduceLogSumExp", reduce_op_handler},
{"ReduceMax", reduce_op_handler},
{"ReduceMean", reduce_op_handler},
{"ReduceMin", reduce_op_handler},
{"ReduceProd", reduce_op_handler},
{"ReduceSum", reduce_op_handler},
{"ReduceSumSquare", reduce_op_handler},
{"ReduceL1", reduce_op_handler},
{"ReduceL2", reduce_op_handler},
Expand Down Expand Up @@ -2385,6 +2378,8 @@ OptimizeResult OptimizeImpl(OptimizerCtx& ctx) {
continue;
}

// NOTE: this bleeds ORT specific logic into the base optimizer, however we justify that for now because we expect
// the types that the ORT DQ provides to be added to the ONNX spec, at which point this special case can go away.
if (IsMSDomain(dq_domain) && !TransposeQuantizeDequantizeAxis(ctx.graph, perm_inv, *dq_node)) {
continue;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ bool HandleSimpleNodeWithAxis(HandlerArgs& args, std::optional<int64_t> default_

// base handlers that are used by extended handlers. add from transpose_optimizer.cc as needed.
bool HandleReduceOps(HandlerArgs& args);
bool HandleResize([[maybe_unused]] HandlerArgs& args);

void TransposeInput(api::GraphRef& graph, api::NodeRef& node, size_t i,
const std::vector<int64_t>& perm,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,35 @@

#include <algorithm>
#include "core/graph/constants.h"
#include "core/framework/utils.h"
#include "core/optimizer/transpose_optimization/ort_optimizer_utils.h"

using namespace onnx_transpose_optimization;

namespace onnxruntime {

// Resize handler that only pushes a Transpose through the node when the assigned EP is known to cope with it.
//
// Resize is not technically layout sensitive, but execution providers typically implement it for a single layout.
// The Transpose is therefore only pushed once the node has been assigned to an EP that supports multiple layouts,
// which is currently the CPU and XNNPACK EPs. Returns false (Transpose not pushed) in every other case.
static bool EPAwareHandleResize(HandlerArgs& args) {
  const auto assigned_ep = args.node.GetExecutionProviderType();
  const bool ep_supports_both_layouts = assigned_ep == kCpuExecutionProvider ||
                                        assigned_ep == kXnnpackExecutionProvider;
  if (!ep_supports_both_layouts) {
    return false;
  }

  // Only NCHW <-> NHWC is allowed for now, which requires a rank 4 permutation.
  // It's not clear any other sort of transpose has a valid usage in a real model.
  if (args.perm.size() != 4) {
    return false;
  }

  static const std::vector<int64_t> to_channels_last{0, 2, 3, 1};   // NCHW -> NHWC
  static const std::vector<int64_t> to_channels_first{0, 3, 1, 2};  // NHWC -> NCHW
  const bool is_layout_perm = args.perm == to_channels_last || args.perm == to_channels_first;

  return is_layout_perm ? HandleResize(args) : false;
}

constexpr HandlerInfo ep_aware_resize_handler = {&FirstInput, &EPAwareHandleResize};

// Handler that forwards to HandleSimpleNodeWithAxis, passing only `args` so that any remaining parameters of that
// helper take their declared defaults. Returns whatever HandleSimpleNodeWithAxis returns.
// NOTE(review): presumably used for the contrib QLinearConcat op - confirm against the handler map registration.
static bool HandleQLinearConcat(HandlerArgs& args) {
return HandleSimpleNodeWithAxis(args);
}
Expand Down Expand Up @@ -62,7 +85,7 @@ static bool HandleMaxPool(HandlerArgs& args) {
ORT_UNUSED_PARAMETER(args);
return false;
#else
if (args.node.GetExecutionProviderType() != "CPUExecutionProvider") {
if (args.node.GetExecutionProviderType() != kCpuExecutionProvider) {
return false;
}

Expand Down Expand Up @@ -103,6 +126,7 @@ static bool HandleContribQuantizeDequantizeLinear(HandlerArgs& args) {
}

constexpr HandlerInfo max_pool_op_handler = {&FirstInput, &HandleMaxPool};

constexpr HandlerInfo node_1_inp_handler = {&FirstInput, &HandleSimpleNode};
constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOps};
constexpr HandlerInfo contrib_quantize_dequantize_linear_handler = {&FirstInput,
Expand All @@ -113,6 +137,7 @@ const HandlerMap& OrtExtendedHandlers() {
static const HandlerMap extended_handler_map = []() {
HandlerMap map = {
{"MaxPool", max_pool_op_handler},
{"Resize", ep_aware_resize_handler},
{"com.microsoft.QuantizeLinear", contrib_quantize_dequantize_linear_handler},
{"com.microsoft.DequantizeLinear", contrib_quantize_dequantize_linear_handler},
{"com.microsoft.QLinearAdd", q_linear_binary_op_handler},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ namespace onnxruntime {
/// <summary>
/// Get the extended handlers for ORT specific transpose optimization.
/// These include handlers for contrib ops, and where we have an NHWC version of a layout sensitive op.
/// Extends the handlers returned by OrtHandlers.
/// </summary>
/// <returns>HandlerMap</returns>
const onnx_transpose_optimization::HandlerMap& OrtExtendedHandlers();
Expand Down
Loading