diff --git a/include/onnxruntime/core/optimizer/graph_transformer_utils.h b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
index b5d0287f5d495..9c9ff163600a0 100644
--- a/include/onnxruntime/core/optimizer/graph_transformer_utils.h
+++ b/include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -25,7 +25,7 @@ std::vector<std::unique_ptr<RewriteRule>> GenerateRewriteRules(TransformerLevel
 If transformers_and_rules_to_enable is not empty, it returns the intersection between the predefined transformers/rules
 and the transformers_and_rules_to_enable.
 */
 std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerLevel level,
-                                                                    gsl::span<const FreeDimensionOverride> free_dimension_overrides,
+                                                                    const SessionOptions& session_options,
                                                                     const IExecutionProvider& execution_provider /*required by constant folding*/,
                                                                     const std::vector<std::string>& rules_and_transformers_to_enable = {});
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index ff247f2072da3..0b9ebf27e7e3f 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -40,3 +40,9 @@ static const char* const kOrtSessionOptionsConfigSaveModelFormat = "session.save
 // Note that an alternative way not using this option at runtime is to train and export a model without denormals
 // and that's recommended because turning this option on may hurt model accuracy.
 static const char* const kOrtSessionOptionsConfigSetDenormalAsZero = "session.set_denormal_as_zero";
+
+// Controls whether a quantized model is treated as being in QDQ (QuantizeLinear/DequantizeLinear) format.
+// "0": disabled. ORT does not apply the fusion logic for the QDQ format.
+// "1": enabled. ORT applies the fusion logic for the QDQ format.
+// The default value is "1".
+static const char* const kOrtSessionOptionsEnableQuantQDQ = "session.enable_quant_qdq";
diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc
index bc2153d9518d0..1c53bba193e6f 100644
--- a/onnxruntime/core/optimizer/constant_folding.cc
+++ b/onnxruntime/core/optimizer/constant_folding.cc
@@ -13,9 +13,11 @@ using namespace onnxruntime::common;
 namespace onnxruntime {
 
 ConstantFolding::ConstantFolding(const IExecutionProvider& execution_provider,
+                                 bool skip_dequantize_linear,
                                  const std::unordered_set<std::string>& compatible_execution_providers,
                                  const std::unordered_set<std::string>& excluded_initializers) noexcept
     : GraphTransformer("ConstantFolding", compatible_execution_providers),
+      skip_dequantize_linear_(skip_dequantize_linear),
       excluded_initializers_(excluded_initializers),
       execution_provider_(execution_provider) {
 }
@@ -66,6 +68,11 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
       continue;
     }
 
+    // Do not constant fold DequantizeLinear, so that QDQ node groups are preserved for later fusions.
+    if (skip_dequantize_linear_ && node->OpType().compare("DequantizeLinear") == 0) {
+      continue;
+    }
+
     ORT_RETURN_IF_ERROR(Recurse(*node, modified, graph_level, logger));
 
     // Updating a node may allow shape inferencing to infer output shapes of following nodes,
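Reviewer note (not part of the patch): a minimal sketch of the new constructor contract, using only types this patch touches; the CPU provider setup is copied from the updated tests further down. Passing true skips DequantizeLinear nodes, which is what GenerateTransformers now requests when the QDQ option is on; every pre-existing call site below passes false to keep current behavior.

    #include "core/optimizer/constant_folding.h"
    #include "core/providers/cpu/cpu_execution_provider.h"

    // Sketch: fold all foldable nodes except DequantizeLinear.
    std::unique_ptr<CPUExecutionProvider> cpu_ep =
        onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
    auto folding = onnxruntime::make_unique<ConstantFolding>(*cpu_ep,
                                                             true /*skip_dequantize_linear*/);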
diff --git a/onnxruntime/core/optimizer/constant_folding.h b/onnxruntime/core/optimizer/constant_folding.h
index e4b3a7a48373b..30bc4b2c7c7be 100644
--- a/onnxruntime/core/optimizer/constant_folding.h
+++ b/onnxruntime/core/optimizer/constant_folding.h
@@ -23,12 +23,15 @@ class ConstantFolding : public GraphTransformer {
   \param execution_provider Execution provider instance to execute constant folding.
+  \param skip_dequantize_linear Whether to skip DequantizeLinear nodes so that QDQ node groups stay intact.
   */
   ConstantFolding(const IExecutionProvider& execution_provider,
+                  bool skip_dequantize_linear,
                   const std::unordered_set<std::string>& compatible_execution_providers = {},
                   const std::unordered_set<std::string>& excluded_initializers = {}) noexcept;
 
  private:
   Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
 
+  bool skip_dequantize_linear_;
   const std::unordered_set<std::string> excluded_initializers_;
   const IExecutionProvider& execution_provider_;
 };
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index b460808935251..d9e31a976f27a 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -37,6 +37,7 @@
 #include "core/optimizer/skip_layer_norm_fusion.h"
 #include "core/optimizer/slice_elimination.h"
 #include "core/optimizer/unsqueeze_elimination.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 
 namespace onnxruntime {
 class IExecutionProvider;
@@ -110,20 +111,21 @@ std::unique_ptr<RuleBasedGraphTransformer> GenerateRuleBasedGraphTransformer(Tra
 }
 
 std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerLevel level,
-                                                                    gsl::span<const FreeDimensionOverride> free_dimension_overrides,
+                                                                    const SessionOptions& session_options,
                                                                     const IExecutionProvider& execution_provider, /*required by constant folding*/
                                                                     const std::vector<std::string>& transformers_and_rules_to_enable) {
   std::vector<std::unique_ptr<GraphTransformer>> transformers;
   std::unique_ptr<RuleBasedGraphTransformer> rule_transformer = nullptr;
+  bool enable_quant_qdq = session_options.GetConfigOrDefault(kOrtSessionOptionsEnableQuantQDQ, "1") == "1";
   switch (level) {
     case TransformerLevel::Level1: {
       std::unordered_set<std::string> l1_execution_providers = {};
 
       transformers.emplace_back(onnxruntime::make_unique<CommonSubexpressionElimination>(l1_execution_providers));
-      transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, l1_execution_providers));
+      transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, enable_quant_qdq, l1_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<MatMulAddFusion>(l1_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<ReshapeFusion>(l1_execution_providers));
-      transformers.emplace_back(onnxruntime::make_unique<FreeDimensionOverrideTransformer>(free_dimension_overrides));
+      transformers.emplace_back(onnxruntime::make_unique<FreeDimensionOverrideTransformer>(session_options.free_dimension_overrides));
 
       rule_transformer = GenerateRuleBasedGraphTransformer(level, transformers_and_rules_to_enable, l1_execution_providers);
     } break;
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 3e98121fd8b47..4cd6376f0668e 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -52,7 +52,6 @@
 #include "core/util/protobuf_parsing_utils.h"
 #include "core/util/thread_utils.h"
 
-
 using namespace ONNX_NAMESPACE;
 using namespace onnxruntime::experimental;
 using namespace onnxruntime::common;
@@ -1895,7 +1894,7 @@ void InferenceSession::AddPredefinedTransformers(GraphTransformerManager& transf
   auto add_transformers = [&](TransformerLevel level) {
     // Generate and register transformers for level
     auto transformers_to_register =
-        optimizer_utils::GenerateTransformers(level, session_options_.free_dimension_overrides,
+        optimizer_utils::GenerateTransformers(level, session_options_,
                                               *execution_providers_.Get(onnxruntime::kCpuExecutionProvider),
                                               custom_list);
     for (auto& entry : transformers_to_register) {
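Reviewer note (not part of the patch): how a caller toggles this end to end. A hedged sketch of the internal flow; the AddConfigEntry call is the same one the new test below uses, and GenerateTransformers reads the entry back with the GetConfigOrDefault call shown above.

    #include "core/session/onnxruntime_session_options_config_keys.h"

    SessionOptions session_options;
    // Disable QDQ-aware handling; DequantizeLinear becomes eligible for constant folding.
    session_options.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQ, "0");
    // GenerateTransformers defaults the key to "1", so sessions that never set it
    // keep the QDQ-preserving behavior.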
diff --git a/onnxruntime/test/optimizer/cse_test.cc b/onnxruntime/test/optimizer/cse_test.cc
index cbae298289029..7fdf6c9f76671 100644
--- a/onnxruntime/test/optimizer/cse_test.cc
+++ b/onnxruntime/test/optimizer/cse_test.cc
@@ -283,7 +283,7 @@ TEST(CseTests, MergeConstants) {
   ASSERT_TRUE(
       graph_transformation_mgr.Register(onnxruntime::make_unique<CommonSubexpressionElimination>(), TransformerLevel::Level1).IsOK());
   ASSERT_TRUE(
-      graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1).IsOK());
+      graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1).IsOK());
   ASSERT_TRUE(
       graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1,
                                                  DefaultLoggingManager().DefaultLogger()).IsOK());
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index ffcb01192d58d..73a1dad6cfd09 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -35,6 +35,7 @@
 #include "core/optimizer/gemm_activation_fusion.h"
 #include "core/optimizer/graph_transformer.h"
 #include "core/optimizer/graph_transformer_mgr.h"
+#include "core/optimizer/graph_transformer_utils.h"
 #include "core/optimizer/identity_elimination.h"
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/layer_norm_fusion.h"
@@ -52,6 +53,7 @@
 #include "core/optimizer/utils.h"
 #include "core/platform/env.h"
 #include "core/session/inference_session.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
 #include "core/util/math.h"
 #include "gtest/gtest.h"
 #include "test/capturing_sink.h"
@@ -145,7 +147,7 @@ TEST_F(GraphTransformationTests, ConstantFolding) {
   std::unique_ptr<CPUExecutionProvider> e =
       onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
   onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1);
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1);
 
   ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
 
@@ -163,7 +165,7 @@ TEST_F(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
   std::unique_ptr<CPUExecutionProvider> e =
       onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
   onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1);
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1);
 
   // assign all nodes to CUDA. the constant folding should override this to perform the constant folding on cpu
   for (auto& node : graph.Nodes()) {
@@ -244,7 +246,7 @@ TEST_F(GraphTransformationTests, ConstantFoldingSubgraph) {
   std::unique_ptr<CPUExecutionProvider> e =
       onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
   onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get()), TransformerLevel::Level1);
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), false /*skip_dequantize_linear*/), TransformerLevel::Level1);
 
   ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
 
@@ -269,7 +271,11 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithShapeToInitializer) {
   onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
   std::unique_ptr<CPUExecutionProvider> e =
       onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
-  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), compatible_eps, excluded_initializers), TransformerLevel::Level1);
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(),
+                                                                              false /*skip_dequantize_linear*/,
+                                                                              compatible_eps,
+                                                                              excluded_initializers),
+                                    TransformerLevel::Level1);
 
   ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_).IsOK());
 
@@ -293,7 +299,10 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithScalarShapeToInitializer) {
   onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
   std::unique_ptr<CPUExecutionProvider> e =
       onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
-  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(), compatible_eps), TransformerLevel::Level1);
+  graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(*e.get(),
+                                                                              false /*skip_dequantize_linear*/,
+                                                                              compatible_eps),
+                                    TransformerLevel::Level1);
 
   ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_).IsOK());
 
@@ -303,6 +312,57 @@ TEST_F(GraphTransformationTests, ConstantFoldingWithScalarShapeToInitializer) {
   ASSERT_TRUE(op_to_count["Add"] == 1);
 }
 
+static void VerifyConstantFoldingWithDequantizeLinear(int quantize_linear_count,
+                                                      int dequantize_linear_count,
+                                                      int conv_count,
+                                                      Graph& graph,
+                                                      SessionOptions& session_options,
+                                                      const Logger& logger) {
+  std::unique_ptr<CPUExecutionProvider> e =
+      onnxruntime::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
+
+  bool has_constant_folding = false;
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  auto transformers = optimizer_utils::GenerateTransformers(TransformerLevel::Level1, session_options, *e.get(), {});
+  for (auto& transformer : transformers) {
+    if (transformer->Name() == "ConstantFolding") {
+      graph_transformation_mgr.Register(std::move(transformer), TransformerLevel::Level1);
+      has_constant_folding = true;
+    }
+  }
+
+  ASSERT_TRUE(has_constant_folding);
+  ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, logger).IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["QuantizeLinear"] == quantize_linear_count);
+  ASSERT_TRUE(op_to_count["DequantizeLinear"] == dequantize_linear_count);
+  ASSERT_TRUE(op_to_count["Conv"] == conv_count);
+}
+
+TEST_F(GraphTransformationTests, ConstantFoldingWithDequantizeLinear) {
+  auto model_uri = MODEL_FOLDER "fusion/constant_folding_dequantizelinear.onnx";
+  std::shared_ptr<Model> model;
+  ASSERT_TRUE(Model::Load(model_uri, model, nullptr, *logger_).IsOK());
+  Graph& graph = model->MainGraph();
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["QuantizeLinear"] == 1);
+  ASSERT_TRUE(op_to_count["DequantizeLinear"] == 3);
+  ASSERT_TRUE(op_to_count["Conv"] == 1);
+
+  SessionOptions session_options;
+  // Check that DequantizeLinear nodes are not constant folded with the default setting.
+  VerifyConstantFoldingWithDequantizeLinear(1, 3, 1, graph, session_options, *logger_);
+
+  // Set kOrtSessionOptionsEnableQuantQDQ to "1" to enable QDQ handling explicitly.
+  session_options.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQ, "1");
+  VerifyConstantFoldingWithDequantizeLinear(1, 3, 1, graph, session_options, *logger_);
+
+  // Set kOrtSessionOptionsEnableQuantQDQ to "0" to disable it; DequantizeLinear on constant paths is now folded.
+  session_options.AddConfigEntry(kOrtSessionOptionsEnableQuantQDQ, "0");
+  VerifyConstantFoldingWithDequantizeLinear(1, 1, 1, graph, session_options, *logger_);
+}
+
 TEST_F(GraphTransformationTests, ShapeToInitializer) {
   auto model_uri = MODEL_FOLDER "shape-add.onnx";
   std::shared_ptr<Model> model;
@@ -1932,7 +1992,6 @@ TEST_F(GraphTransformationTests, AttentionFusionWithPastAndUnidirMaskTest) {
   EXPECT_EQ(op_to_count["Softmax"], 0);
   EXPECT_EQ(op_to_count["com.microsoft.Attention"], 1);
 
-
   GraphViewer graph_viewer(graph);
   const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
 
diff --git a/onnxruntime/test/testdata/transform/fusion/constant_folding_dequantizelinear.onnx b/onnxruntime/test/testdata/transform/fusion/constant_folding_dequantizelinear.onnx
new file mode 100644
index 0000000000000..c8f922f0fa07f
Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/constant_folding_dequantizelinear.onnx differ
diff --git a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
index 4c274455ca83b..402a85ac5990d 100644
--- a/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
+++ b/orttraining/orttraining/core/optimizer/graph_transformer_utils.cc
@@ -94,7 +94,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GeneratePreTrainingTransformers(
   if (config.enable_gelu_approximation) {
     transformers.emplace_back(onnxruntime::make_unique<GeluApproximation>(compatible_eps));
   }
-  transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, compatible_eps, weights_to_train));
+  transformers.emplace_back(onnxruntime::make_unique<ConstantFolding>(execution_provider, false /*skip_dequantize_linear*/, compatible_eps, weights_to_train));
   transformers.emplace_back(onnxruntime::make_unique(compatible_eps));
   transformers.emplace_back(onnxruntime::make_unique(compatible_eps));
   transformers.emplace_back(onnxruntime::make_unique(compatible_eps));
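Reviewer note (not part of the patch): for completeness, a sketch of how an application would flip the same key through the public C API. The session config keys in onnxruntime_session_options_config_keys.h are meant to be set via OrtApi::AddSessionConfigEntry; status checks are elided here, so treat this as an assumption-laden sketch rather than tested code.

    #include <onnxruntime_c_api.h>

    const OrtApi* ort = OrtGetApiBase()->GetApi(ORT_API_VERSION);
    OrtSessionOptions* so = nullptr;
    ort->CreateSessionOptions(&so);
    // "0" disables the QDQ fusion path; omit the entry (or pass "1") for the default.
    ort->AddSessionConfigEntry(so, "session.enable_quant_qdq", "0");
    ort->ReleaseSessionOptions(so);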