@@ -25,7 +25,7 @@ std::vector<std::unique_ptr<RewriteRule>> GenerateRewriteRules(TransformerLevel
If transformers_and_rules_to_enable is not empty, only the intersection of the predefined transformers/rules
and transformers_and_rules_to_enable is returned. */
std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerLevel level,
-gsl::span<const FreeDimensionOverride> free_dimension_overrides,
+const SessionOptions& session_options,
const IExecutionProvider& execution_provider /*required by constant folding*/,
const std::vector<std::string>& rules_and_transformers_to_enable = {});

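For orientation, here is a minimal sketch of calling this API the way the tests later in this diff do. The wrapper function is illustrative, not part of the PR: an empty final argument requests every predefined transformer for the level, while a non-empty list is intersected with the predefined set.

```cpp
#include "core/framework/session_options.h"
#include "core/optimizer/graph_transformer_utils.h"

namespace onnxruntime {
// Illustrative wrapper: build only the named Level1 transformers.
std::vector<std::unique_ptr<GraphTransformer>> MakeLevel1Transformers(
    const SessionOptions& session_options,
    const IExecutionProvider& cpu_execution_provider) {
  // {"ConstantFolding"} is intersected with the predefined Level1 set;
  // pass {} to get every predefined Level1 transformer instead.
  return optimizer_utils::GenerateTransformers(
      TransformerLevel::Level1, session_options, cpu_execution_provider,
      {"ConstantFolding"});
}
}  // namespace onnxruntime
```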
@@ -40,3 +40,9 @@ static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save
// Note that an alternative that avoids this runtime option is to train and export the model without denormals
// in the first place; that is recommended, because turning this option on may hurt model accuracy.
static const char* const kOrtSessionOptionsConfigSetDenormalAsZero = "session.set_denormal_as_zero";

// Controls whether a quantized model is run in QDQ (QuantizeLinear/DequantizeLinear) format.
// "0": disabled. ORT does not apply fusion logic for the QDQ format.
// "1": enabled. ORT applies fusion logic for the QDQ format.
// The default value is "1".
static const char* const kOrtSessionOptionsEnableQuantQDQ = "session.enable_quant_qdq";
@guoyu-wang (Contributor) commented on Feb 11, 2021:
If the qdq is by default enabled, is it better to make this "session.disable_quant_qdq"?
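For context on the default the reviewer is asking about, a minimal sketch of flipping the flag from the application side, assuming the public C++ wrapper Ort::SessionOptions::AddConfigEntry (which forwards to the C API's AddSessionConfigEntry) is available in this release; the model path is a placeholder:

```cpp
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_session_options_config_keys.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qdq-demo");
  Ort::SessionOptions so;
  // "1" (the default) keeps QDQ fusion enabled; "0" turns it off, so
  // DequantizeLinear nodes become eligible for constant folding.
  so.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQ, "0");
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // placeholder path
  return 0;
}
```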

7 changes: 7 additions & 0 deletions onnxruntime/core/optimizer/constant_folding.cc
@@ -13,9 +13,11 @@ using namespace onnxruntime::common;
namespace onnxruntime {

ConstantFolding::ConstantFolding(const IExecutionProvider& execution_provider,
bool skip_dequantize_linear,
const std::unordered_set<std::string>& compatible_execution_providers,
const std::unordered_set<std::string>& excluded_initializers) noexcept
: GraphTransformer("ConstantFolding", compatible_execution_providers),
skip_dequantize_linear_(skip_dequantize_linear),
excluded_initializers_(excluded_initializers),
execution_provider_(execution_provider) {
}
@@ -66,6 +68,11 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
continue;
}

// Skip constant folding for DequantizeLinear nodes so the QDQ format is preserved for later fusions.
if (skip_dequantize_linear_ && node->OpType() == "DequantizeLinear") {
continue;
}

ORT_RETURN_IF_ERROR(Recurse(*node, modified, graph_level, logger));

// Updating a node may allow shape inferencing to infer output shapes of following nodes,
2 changes: 2 additions & 0 deletions onnxruntime/core/optimizer/constant_folding.h
@@ -23,12 +23,14 @@ class ConstantFolding : public GraphTransformer {
\param execution_provider Execution provider instance to execute constant folding.
\param skip_dequantize_linear Whether to skip constant folding for DequantizeLinear nodes, preserving the QDQ format.
*/
ConstantFolding(const IExecutionProvider& execution_provider,
bool skip_dequantize_linear,
const std::unordered_set<std::string>& compatible_execution_providers = {},
const std::unordered_set<std::string>& excluded_initializers = {}) noexcept;

private:
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;

bool skip_dequantize_linear_;
const std::unordered_set<std::string> excluded_initializers_;
const IExecutionProvider& execution_provider_;
};
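A short usage sketch of the widened constructor (the helper function is illustrative; the call shape matches the test updates below). Note that the new flag sits between the execution provider and the two optional sets, so every existing call site has to be updated:

```cpp
#include "core/optimizer/constant_folding.h"

namespace onnxruntime {
// Illustrative: build a QDQ-preserving ConstantFolding transformer.
// The transformer stores a reference to the execution provider, so the
// caller must keep it alive for the transformer's lifetime.
std::unique_ptr<GraphTransformer> MakeQdqAwareConstantFolding(
    const IExecutionProvider& execution_provider) {
  // true => DequantizeLinear nodes are skipped and left for QDQ fusion.
  return onnxruntime::make_unique<ConstantFolding>(
      execution_provider, /*skip_dequantize_linear*/ true);
}
}  // namespace onnxruntime
```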
8 changes: 5 additions & 3 deletions onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -37,6 +37,7 @@
#include "core/optimizer/skip_layer_norm_fusion.h"
#include "core/optimizer/slice_elimination.h"
#include "core/optimizer/unsqueeze_elimination.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

namespace onnxruntime {
class IExecutionProvider;
@@ -110,20 +111,21 @@ std::unique_ptr<RuleBasedGraphTransformer> GenerateRuleBasedGraphTransformer(Tra
}

std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerLevel level,
-gsl::span<const FreeDimensionOverride> free_dimension_overrides,
+const SessionOptions& session_options,
const IExecutionProvider& execution_provider, /*required by constant folding*/
const std::vector<std::string>& transformers_and_rules_to_enable) {
std::vector<std::unique_ptr<GraphTransformer>> transformers;
std::unique_ptr<RuleBasedGraphTransformer> rule_transformer = nullptr;
bool enable_quant_qdq = session_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQ, "1") == "1";
switch (level) {
case TransformerLevel::Level1: {
std::unordered_set<std::string> l1_execution_providers = {};

transformers.emplace_back(onnxruntime::make_unique<CommonSubexpressionElimination>(l1_execution_providers));
-transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, l1_execution_providers));
+transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, enable_quant_qdq, l1_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<MatMulAddFusion>(l1_execution_providers));
transformers.emplace_back(onnxruntime::make_unique<ReshapeFusion>(l1_execution_providers));
-transformers.emplace_back(onnxruntime::make_unique<FreeDimensionOverrideTransformer>(free_dimension_overrides));
+transformers.emplace_back(onnxruntime::make_unique<FreeDimensionOverrideTransformer>(session_options.free_dimension_overrides));

rule_transformer = GenerateRuleBasedGraphTransformer(level, transformers_and_rules_to_enable, l1_execution_providers);
} break;
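The enable_quant_qdq lookup above is the standard session-config read: SessionOptions stores string key/value entries, and GetConfigOrDefault falls back to the supplied default when the key was never set. As a minimal sketch, the same check factored into a hypothetical helper (not part of the PR):

```cpp
#include "core/framework/session_options.h"
#include "core/session/onnxruntime_session_options_config_keys.h"

namespace onnxruntime {
// Hypothetical helper: the option defaults to "1" (enabled) when unset.
bool IsQuantQDQEnabled(const SessionOptions& session_options) {
  return session_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQ, "1") == "1";
}
}  // namespace onnxruntime
```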
3 changes: 1 addition & 2 deletions onnxruntime/core/session/inference_session.cc
@@ -52,7 +52,6 @@
#include "core/util/protobuf_parsing_utils.h"
#include "core/util/thread_utils.h"


using namespace ONNX_NAMESPACE;
using namespace onnxruntime::experimental;
using namespace onnxruntime::common;
@@ -1895,7 +1894,7 @@ void InferenceSession::AddPredefinedTransformers(GraphTransformerManager& transf
auto add_transformers = [&](TransformerLevel level) {
// Generate and register transformers for level
auto transformers_to_register =
-optimizer_utils::GenerateTransformers(level, session_options_.free_dimension_overrides,
+optimizer_utils::GenerateTransformers(level, session_options_,
*execution_providers_.Get(onnxruntime::kCpuExecutionProvider),
custom_list);
for (auto& entry : transformers_to_register) {
2 changes: 1 addition & 1 deletion onnxruntime/test/optimizer/cse_test.cc
@@ -283,7 +283,7 @@ TEST(CseTests, MergeConstants) {
ASSERT_TRUE(
graph_transformation_mgr.Register(onnxruntime::make_unique<CommonSubexpressionElimination>(), TransformerLevel::Level1).IsOK());
ASSERT_TRUE(
-graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1).IsOK());
+graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1).IsOK());
ASSERT_TRUE(
graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, DefaultLoggingManager().DefaultLogger()).IsOK());

71 changes: 65 additions & 6 deletions onnxruntime/test/optimizer/graph_transform_test.cc
@@ -35,6 +35,7 @@
#include "core/optimizer/gemm_activation_fusion.h"
#include "core/optimizer/graph_transformer.h"
#include "core/optimizer/graph_transformer_mgr.h"
#include "core/optimizer/graph_transformer_utils.h"
#include "core/optimizer/identity_elimination.h"
#include "core/optimizer/initializer.h"
#include "core/optimizer/layer_norm_fusion.h"
@@ -52,6 +53,7 @@
#include "core/optimizer/utils.h"
#include "core/platform/env.h"
#include "core/session/inference_session.h"
#include "core/session/onnxruntime_session_options_config_keys.h"
#include "core/util/math.h"
#include "gtest/gtest.h"
#include "test/capturing_sink.h"
@@ -145,7 +147,7 @@ TEST_F(GraphTransformationTests, ConstantFolding) {
std::unique_ptr<CPUExecutionProvider> e =
onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1);
+graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1);

ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));

@@ -163,7 +165,7 @@ TEST_F(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
std::unique_ptr<CPUExecutionProvider> e =
onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1);
+graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1);

// assign all nodes to CUDA. the constant folding should override this to perform the constant folding on cpu
for (auto& node : graph.Nodes()) {
@@ -244,7 +246,7 @@ TEST_F(GraphTransformationTests, ConstantFoldingSubgraph) {
std::unique_ptr<CPUExecutionProvider> e =
onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1);
+graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1);

ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));

@@ -269,7 +271,11 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithShapeToInitializer) {
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
std::unique_ptr<CPUExecutionProvider> e =
onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
-graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), compatible_eps, excluded_initializers), TransformerLevel::Level1);
+graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(),
+false /*skip_dequantize_linear*/,
+compatible_eps,
+excluded_initializers),
+TransformerLevel::Level1);

ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_).IsOK());

@@ -293,7 +299,10 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithScalarShapeToInitializer) {
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
std::unique_ptr<CPUExecutionProvider> e =
onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
-graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), compatible_eps), TransformerLevel::Level1);
+graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(),
+false /*skip_dequantize_linear*/,
+compatible_eps),
+TransformerLevel::Level1);

ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_).IsOK());

@@ -303,6 +312,57 @@
ASSERT_TRUE(op_to_count["Add"] == 1);
}

static void VerifyConstantFoldingWithDequantizeLinear(int quantize_linear_count,
int dequantize_linear_count,
int conv_count,
Graph& graph,
SessionOptions& session_options,
const Logger& logger) {
std::unique_ptr<CPUExecutionProvider> e =
onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());

bool has_constant_folding = false;
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
auto transformers = optimizer_utils::GenerateTransformers(TransformerLevel::Level1, session_options, *e.get(), {});
for (auto& transformer : transformers) {
if (transformer->Name() == "ConstantFolding") {
graph_transformation_mgr.Register(std::move(transformer), TransformerLevel::Level1);
has_constant_folding = true;
}
}

ASSERT_TRUE(has_constant_folding);
ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, logger).IsOK());

std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
ASSERT_TRUE(op_to_count["QuantizeLinear"] == quantize_linear_count);
ASSERT_TRUE(op_to_count["DequantizeLinear"] == dequantize_linear_count);
ASSERT_TRUE(op_to_count["Conv"] == conv_count);
}

TEST_F(GraphTransformationTests, ConstantFoldingWithDequantizeLinear) {
auto model_uri = MODEL_FOLDER "fusion/constant_folding_dequantizelinear.onnx";
std::shared_ptr<Model> model;
ASSERT_TRUE(Model::Load(model_uri, model, nullptr, *logger_).IsOK());
Graph& graph = model->MainGraph();
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
ASSERT_TRUE(op_to_count["QuantizeLinear"] == 1);
ASSERT_TRUE(op_to_count["DequantizeLinear"] == 3);
ASSERT_TRUE(op_to_count["Conv"] == 1);

SessionOptions session_options;
// Check that DequantizeLinear nodes aren't constant folded with the default setting.
VerifyConstantFoldingWithDequantizeLinear(1, 3, 1, graph, session_options, *logger_);

// Explicitly enable QDQ handling via kOrtSessionOptionsEnableQuantQDQ; the counts are unchanged.
session_options.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQ, "1");
VerifyConstantFoldingWithDequantizeLinear(1, 3, 1, graph, session_options, *logger_);

// Disable QDQ handling; two of the DequantizeLinear nodes are now constant folded.
session_options.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQ, "0");
VerifyConstantFoldingWithDequantizeLinear(1, 1, 1, graph, session_options, *logger_);
}

TEST_F(GraphTransformationTests, ShapeToInitializer) {
auto model_uri = MODEL_FOLDER "shape-add.onnx";
std::shared_ptr<Model> model;
@@ -1932,7 +1992,6 @@ TEST_F(GraphTransformationTests, AttentionFusionWithPastAndUnidirMaskTest) {
EXPECT_EQ(op_to_count["Softmax"], 0);
EXPECT_EQ(op_to_count["com.microsoft.Attention"], 1);


GraphViewer graph_viewer(graph);
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();

Binary file not shown.
@@ -94,7 +94,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GeneratePreTrainingTransformers(
if (config.enable_gelu_approximation) {
transformers.emplace_back(onnxruntime::make_unique<GeluApproximation>(compatible_eps));
}
-transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, compatible_eps, weights_to_train));
+transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, false /*skip_dequantize_linear*/, compatible_eps, weights_to_train));
transformers.emplace_back(onnxruntime::make_unique<ReshapeFusion>(compatible_eps));
transformers.emplace_back(onnxruntime::make_unique<ConcatSliceElimination>(compatible_eps));
transformers.emplace_back(onnxruntime::make_unique<ComputationReductionTransformer>(compatible_eps));