6 changes: 3 additions & 3 deletions cmake/onnxruntime_providers_openvino.cmake
@@ -13,8 +13,8 @@
 
   # Header paths
   find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX)
-  if(OpenVINO_VERSION VERSION_LESS 2024.5)
-    message(FATAL_ERROR "OpenVINO 2024.5 and newer are supported. Please, use latest OpenVINO release")
+  if(OpenVINO_VERSION VERSION_LESS 2025.0)
+    message(FATAL_ERROR "OpenVINO 2025.0 and newer are supported. Please use the latest OpenVINO release")
   endif()
 
   if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4)
@@ -49,7 +49,7 @@
   endif()
   add_dependencies(onnxruntime_providers_openvino onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/)
-  target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen)
+  target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS} Eigen3::Eigen onnx_proto)
 
   target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\")
 
1 change: 1 addition & 0 deletions onnxruntime/core/optimizer/double_qdq_pairs_remover.cc
@@ -52,6 +52,7 @@ static void ApplyNewInputValue(Graph& graph, Node& node, QDQ::InputIndex index,
   input_init.ToProto(new_input_tensor);
   auto new_name = graph.GenerateNodeArgName("DoubleQDQRemoved_" + node.InputDefs()[index]->Name());
   new_input_tensor.set_name(new_name);
+  new_input_tensor.add_dims(1);
   NodeArg& new_input = graph_utils::AddInitializerWithExternalData(graph, new_input_tensor);
   graph_utils::ReplaceNodeInput(node, index, new_input);
 }
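
Context for the one-line addition above: without add_dims(1) the replacement initializer is a rank-0 (scalar) TensorProto, while the added call gives it shape [1], a 1-D tensor holding a single element. A minimal standalone sketch of the resulting proto, assuming the stock ONNX protobuf API (the helper name is illustrative, not part of the PR):

#include <onnx/onnx_pb.h>
#include <string>

// Builds a float initializer shaped the way the patched code leaves it:
// dims = [1] with exactly one element, rather than a rank-0 scalar.
ONNX_NAMESPACE::TensorProto make_1d_float_initializer(const std::string& name, float value) {
  ONNX_NAMESPACE::TensorProto t;
  t.set_name(name);
  t.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
  t.add_dims(1);            // shape becomes [1]; omitting this leaves the proto rank-0
  t.add_float_data(value);  // one element, matching the [1] shape
  return t;
}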
19 changes: 16 additions & 3 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -20,6 +20,7 @@
 #include "core/providers/openvino/ov_interface.h"
 #include "core/providers/openvino/ov_versions/capability.h"
 #include "core/providers/openvino/qdq_transformations/qdq_stripping.h"
+#include "core/providers/openvino/qdq_transformations/qdq_scales_fix.h"
 
 namespace onnxruntime {
 namespace openvino_ep {
@@ -117,7 +118,9 @@ BackendManager::BackendManager(SessionContext& session_context,
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
     if ((!session_context_.disable_dynamic_shapes &&
          (session_context_.device_type.find("CPU") != std::string::npos ||
-          session_context_.device_type.find("GPU") != std::string::npos)) ||
+          session_context_.device_type.find("GPU") != std::string::npos ||
+          (session_context_.device_type.find("NPU") != std::string::npos &&
+           session_context_.enable_causallm))) ||
         (subgraph_context_.is_ep_ctx_graph)) {
       LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
                          << "Creating backend Dynamic Shapes";
@@ -429,8 +432,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
 
   const auto& onnx_model_path_name = subgraph.ModelPath();
   // QDQ stripping enabled only for the NPU and experimentally on the GPU
-  if ((session_context_.device_type.find("NPU") != std::string::npos ||
-       session_context_.device_type.find("GPU") != std::string::npos) &&
+  if ((session_context_.device_type.find("NPU") != std::string::npos) &&
       (enable_ovep_qdq_optimizer || session_context_.so_share_ep_contexts)) {
     std::unique_ptr<onnxruntime::Model> model;
     Status status = CreateModelWithStrippedQDQNodes(subgraph, logger, session_context_.so_share_ep_contexts, enable_ovep_qdq_optimizer, model, shared_context_.shared_weights);
@@ -440,6 +442,17 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
     DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
     ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
     return model_proto;
+  } else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
+             enable_ovep_qdq_optimizer) {
+    // Create a copy of the model
+    std::unique_ptr<onnxruntime::Model> model;
+    Status status = qdq_scales_fix::Transform(subgraph, logger, model);
+    auto model_proto = model->ToProto();
+    model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
+    print_model_proto_duration();
+    DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
+    ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
+    return model_proto;
   } else {
     LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled";
     auto model = subgraph.CreateModel(logger);
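
Taken together, GetModelProtoFromFusedNode now routes QDQ handling three ways: NPU keeps the full QDQ-stripping pass, GPU gets only the new qdq_scales_fix transform, and everything else is copied through untouched. A hedged summary of that dispatch as a hypothetical standalone helper (the enum and function are not in the PR; names follow the diff):

#include <string>

enum class QdqPath { NpuStripping, GpuScalesFix, PlainCopy };

// Mirrors the if / else-if / else chain above.
inline QdqPath SelectQdqPath(const std::string& device_type,
                             bool enable_ovep_qdq_optimizer,
                             bool so_share_ep_contexts) {
  const bool is_npu = device_type.find("NPU") != std::string::npos;
  const bool is_gpu = device_type.find("GPU") != std::string::npos;
  if (is_npu && (enable_ovep_qdq_optimizer || so_share_ep_contexts))
    return QdqPath::NpuStripping;  // strip QDQ nodes for the NPU plugin
  if (is_gpu && enable_ovep_qdq_optimizer)
    return QdqPath::GpuScalesFix;  // new in this PR: only adjust QDQ scales
  return QdqPath::PlainCopy;       // no OVEP-side QDQ rewriting
}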
48 changes: 23 additions & 25 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -36,10 +36,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
   if (ValidateSubgraph(const_outputs_map_))
     return;
 
-  // Pre-requisite is provider_option "context" must be set
-  auto auto_unified_compile = ((hw_target.find("AUTO") == std::string::npos) ||
-                               (session_context_.OpenVINO_Version.at(0) >= 2024 &&
-                                session_context_.OpenVINO_Version.at(1) > 2));
   ov::AnyMap device_config;
   SetOVDeviceConfiguration(device_config);
   if (subgraph_context_.is_ep_ctx_graph) {
@@ -81,42 +77,46 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
       ORT_THROW(msg);
     }  // Delete stream after it is no longer needed
   } else {
+    std::shared_ptr<const onnxruntime::openvino_ep::OVNetwork> ov_model;
     std::string model = model_proto->SerializeAsString();
     if (!subgraph_context.has_dynamic_input_shape) {
      model_proto.reset();
     }
+    bool eligible_for_cpu_fallback = session_context_.device_type.find("NPU") != std::string::npos &&
+                                     !session_context_.so_disable_cpu_ep_fallback &&
+                                     !subgraph_context_.is_ep_ctx_graph;
+#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
+    eligible_for_cpu_fallback = false;
+#endif
+    auto auto_unified_compile = (hw_target.find("AUTO") == std::string::npos);
+
+    // Unified compile is efficient with cache_dir cached-model loading, which bypasses read_model.
+    // It does not support models with external weights, dynamic input shapes, EPCtx ONNX cached models,
+    // reshape, enable_causallm, or the NPU CPU fallback.
+
+    auto is_unified_compile = (!session_context_.has_external_weights &&
+                               !subgraph_context_.has_dynamic_input_shape &&
+                               !session_context_.so_context_enable &&
+                               session_context_.reshape.empty() &&
+                               !enable_causallm &&
+                               !eligible_for_cpu_fallback &&
+                               auto_unified_compile);
     try {
-      // SetOVDeviceConfiguration(device_config);
-      if (!session_context_.has_external_weights &&
-          !subgraph_context_.has_dynamic_input_shape &&
-          !session_context_.so_context_enable &&
-          session_context_.reshape.empty() &&
-          !enable_causallm &&
-          auto_unified_compile) {
-        // Unified OV compile_model is efficient when ov model caching is enabled
-        // Unified OV compile_model API is supported with AUTO from version 2024.3 and above
-        // Inputs with static dimensions
-        // Not enabled for models with external weights and when ep context is set.
-
+      if (is_unified_compile) {
         exe_network_ = OVCore::Get()->CompileModel(model,
                                                    hw_target,
                                                    device_config,
                                                    subgraph_context_.subgraph_name);
       } else {  // For all other types use ov::ov_core read_model() to generate OV IR
                 // followed by ov::ov_core compile_model()
-        auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
+        ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
         exe_network_ = OVCore::Get()->CompileModel(
             ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
       }
       LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
     } catch (const OnnxRuntimeException& ex) {
       std::string exception_str = ex.what();
-      bool eligible_for_cpu_fallback = session_context_.device_type.find("NPU") != std::string::npos &&
-                                       !session_context_.so_disable_cpu_ep_fallback &&
-                                       !subgraph_context_.is_ep_ctx_graph;
-#if defined(OPENVINO_DISABLE_NPU_FALLBACK)
-      eligible_for_cpu_fallback = false;
-#endif
+
       if (eligible_for_cpu_fallback) {
         LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU."
                               << "Falling back to OV CPU for execution";
@@ -125,8 +125,6 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_proto,
       device_config.clear();
       SetOVDeviceConfiguration(device_config);
       try {
-        // Recreate the model with CPU device type
-        auto ov_model = CreateOVModel(std::move(model), session_context_, const_outputs_map_);
         exe_network_ = OVCore::Get()->CompileModel(
             ov_model, hw_target, device_config, enable_causallm, subgraph_context_.subgraph_name);
       } catch (std::string const& msg) {
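
Two things happen in this file: ov_model and eligible_for_cpu_fallback are hoisted out of the try block so the CPU-fallback path can reuse the already-built model (the old code re-ran CreateOVModel on a moved-from string), and the unified-compile decision is folded into a single is_unified_compile flag. The two compile paths map onto distinct OpenVINO 2.0 C++ entry points; a minimal sketch, assuming the public ov::Core buffer overloads (illustrative, not OVEP code):

#include <openvino/openvino.hpp>
#include <string>

// Single-step ("unified") path: hand the serialized ONNX buffer straight to
// compile_model, so a cache_dir hit can skip model parsing entirely.
ov::CompiledModel CompileUnified(ov::Core& core, const std::string& model_buffer,
                                 const std::string& device, const ov::AnyMap& config) {
  return core.compile_model(model_buffer, ov::Tensor{}, device, config);
}

// Two-step path: materialize an ov::Model first. This is what the EP needs when
// it must reshape inputs, rewrite the graph, or keep the model for CPU fallback.
ov::CompiledModel CompileTwoStep(ov::Core& core, const std::string& model_buffer,
                                 const std::string& device, const ov::AnyMap& config) {
  std::shared_ptr<ov::Model> model = core.read_model(model_buffer, ov::Tensor{});
  return core.compile_model(model, device, config);
}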
16 changes: 16 additions & 0 deletions onnxruntime/core/providers/openvino/ov_interface.h
@@ -133,6 +133,22 @@ class OVInferRequest {
       auto tensor_ptr = std::make_shared<ov::Tensor>(type, shape, const_cast<void*>(ort_ptr));
       SetTensor(name, tensor_ptr);
       cached_binding = {tensor_ptr, ort_ptr};
+    } else if (ort_ptr == nullptr) {
+      // a null ort_ptr is expected for a tensor that has 0 elements.
+      // for example, a tensor of shape=[1, 8, 0, 64], which is valid.
+      // So, we check to see if at least one shape entry is 0.
+      auto contains_zero = [](const ov::Shape& shape) {
+        for (auto& s : shape)
+          if (s == 0) return true;
+        return false;
+      };
+      if (contains_zero(shape)) {
+        // if there are zero elements (i.e. at least one shape entry is 0),
+        // then create and set the tensor anyway.
+        auto tensor_ptr = std::make_shared<ov::Tensor>(type, shape);
+        SetTensor(name, tensor_ptr);
+        cached_binding = {tensor_ptr, ort_ptr};
+      }
     }
   }

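
A short standalone sketch of the zero-element case the new branch covers, assuming the standard ov::Tensor API: a shape containing a 0 is legal, such a tensor holds no data, and that is why a null ort_ptr is acceptable for it.

#include <openvino/openvino.hpp>
#include <cassert>

int main() {
  // Zero-sized dimensions are valid in OpenVINO; the element count is the
  // product of the dims, so this tensor holds no elements at all.
  ov::Tensor empty(ov::element::f32, ov::Shape{1, 8, 0, 64});
  assert(empty.get_size() == 0);
  assert(empty.get_shape()[2] == 0);
  return 0;
}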
24 changes: 24 additions & 0 deletions onnxruntime/core/providers/openvino/ov_protobuf_utils.cpp
@@ -0,0 +1,24 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#include "ov_protobuf_utils.h"
+
+#include "core/graph/onnx_protobuf.h"
+#include "core/common/common.h"
+
+namespace onnxruntime {
+namespace openvino_ep {
+float get_float_initializer_data(const void* initializer) {
+  const auto* tp = reinterpret_cast<const ONNX_NAMESPACE::TensorProto*>(initializer);
+  ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT)));
+  // ORT_ENFORCE(initializer.dims_size() == 1);
+  return tp->float_data(0);
+}
+void set_float_initializer_data(const void* initializer, float data) {
+  auto* tp = (ONNX_NAMESPACE::TensorProto*)(initializer);
+  ORT_ENFORCE((tp->has_data_type() && (tp->data_type() == ONNX_NAMESPACE::TensorProto_DataType_FLOAT)));
+  // ORT_ENFORCE(initializer.dims_size() == 1);
+  tp->set_float_data(0, data);
+}
+}  // namespace openvino_ep
+}  // namespace onnxruntime
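
A hedged usage sketch for these helpers; the halve_scale caller below is hypothetical. The void* signatures let callers touch a TensorProto without pulling the protobuf headers into their own translation units, which is presumably why qdq_scales_fix goes through this indirection.

#include "core/graph/onnx_protobuf.h"
#include "core/providers/openvino/ov_protobuf_utils.h"

// Reads a FLOAT initializer's first value and writes an adjusted one back,
// the kind of in-place fix-up a QDQ scale-repair pass performs.
void halve_scale(ONNX_NAMESPACE::TensorProto& scale_initializer) {
  using namespace onnxruntime::openvino_ep;
  const float old_scale = get_float_initializer_data(&scale_initializer);
  set_float_initializer_data(&scale_initializer, old_scale * 0.5f);
}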
10 changes: 10 additions & 0 deletions onnxruntime/core/providers/openvino/ov_protobuf_utils.h
@@ -0,0 +1,10 @@
+// Copyright (C) Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+namespace onnxruntime {
+namespace openvino_ep {
+float get_float_initializer_data(const void* initializer);
+void set_float_initializer_data(const void* initializer, float data);
+}  // namespace openvino_ep
+}  // namespace onnxruntime
10 changes: 4 additions & 6 deletions onnxruntime/core/providers/openvino/ov_versions/capability.cc
@@ -41,16 +41,14 @@ GetCapability::GetCapability(const EPCtxHandler& ep_ctx_handler,
     npu_qdq_optimizer_enabled = true;  // see data_ops.cc ~615 where we check for int16 types for gpu, this may change to a better approach later
   }
 
-#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5
-  data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled);
-#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 6
-  data_ops_ = new DataOps(graph_viewer_, V_2024_6, device_type_, npu_qdq_optimizer_enabled);
-#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0
+#if OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 0
   data_ops_ = new DataOps(graph_viewer_, V_2025_0, device_type_, npu_qdq_optimizer_enabled);
 #elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 1
   data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled);
+#elif OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR == 2
+  data_ops_ = new DataOps(graph_viewer_, V_2025_2, device_type_, npu_qdq_optimizer_enabled);
 #else
-  data_ops_ = new DataOps(graph_viewer_, V_2025_1, device_type_, npu_qdq_optimizer_enabled);
+  data_ops_ = new DataOps(graph_viewer_, V_2025_2, device_type_, npu_qdq_optimizer_enabled);
 #endif
 }
 
13 changes: 8 additions & 5 deletions onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
@@ -229,6 +229,7 @@ std::vector<SupportedOp> supported_op_mode = {
     {"Sigmoid", V_2020_4, {"CPU", "GPU"}},
     {"Sign", V_2020_4, {"CPU"}},
     {"Sign", V_2022_1, {"GPU"}},
+    {"SimplifiedLayerNormalization", V_2025_2, {"CPU", "GPU"}},
     {"Sin", V_2022_1, {"CPU", "GPU"}},
     {"Sinh", V_2020_4, {"CPU"}},
     {"Size", V_2022_1, {"CPU", "GPU"}},
@@ -402,7 +403,7 @@ void DataOps::populate_op_mode_supported() {
 
   // populate unsupportedmode_t
   {
-    UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
+    UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1, V_2025_2},
                              [this](const Node* node, const InitializedTensorSet&) {
                                // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch)
                                for (size_t i = 0; i < node->InputDefs().size(); i++) {
@@ -418,7 +419,8 @@ void DataOps::populate_op_mode_supported() {
   }
   {
     UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
-                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
+                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1,
+                              V_2025_2},
                              [this](const Node* node, const InitializedTensorSet&) {
                                const auto& input_args = node->InputDefs();
                                const auto& input_arg = (input_args.size() > 1) ? input_args[1] : input_args[0];
@@ -437,7 +439,8 @@ void DataOps::populate_op_mode_supported() {
   }
   {
     UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2,
-                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1},
+                              V_2024_3, V_2024_4, V_2024_5, V_2024_6, V_2025_0, V_2025_1,
+                              V_2025_2},
                              [this](const Node* node, const InitializedTensorSet&) {
                                // If the operator is unsqueeze
                                // If axes is an input, then we cannot produce a static graph.
@@ -452,8 +455,8 @@ void DataOps::populate_op_mode_supported() {
     op_list_.insert({"Unsqueeze", obj});
   }
   {
-    UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5, V_2024_6,
-                              V_2025_0, V_2025_1},
+    UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5,
+                              V_2024_6, V_2025_0, V_2025_1, V_2025_2},
                              [this](const Node* node, const InitializedTensorSet&) {
                                // check for attributes
                                auto& upsample_attr = node->GetAttributes();
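
The data_ops.cc hunks all follow one pattern: each rule carries the set of OpenVINO versions it applies to, so every new release enum (V_2025_2 here) has to be appended, or the rule presumably stops applying to builds against that release. A simplified stand-in for the real types, just to show the shape of the check (illustrative, not the actual struct):

#include <functional>
#include <set>

enum VersionNumSketch { V_2025_1_s, V_2025_2_s };

struct UnsupportedOpModeSketch {
  std::set<VersionNumSketch> ver;  // versions the rejection rule applies to
  std::function<bool()> rejects;   // true means the node is rejected for OVEP

  // Omitting V_2025_2_s from `ver` would make 2025.2 builds skip this rule.
  bool applies(VersionNumSketch active) const { return ver.count(active) != 0; }
};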
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/openvino/ov_versions/data_ops.h
@@ -35,7 +35,8 @@ enum versionNum {
   V_2024_5,
   V_2024_6,
   V_2025_0,
-  V_2025_1
+  V_2025_1,
+  V_2025_2
 };
 
 using VersionNum = enum versionNum;