Merged
77 changes: 74 additions & 3 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -90,7 +90,12 @@ BackendManager::BackendManager(SessionContext& session_context,
"[OpenVINO-EP] Bounded dynamic model execution using provider option reshape_input is not supported for OVEP EPContext model";
ORT_THROW(exception_str);
}
model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph);
if (subgraph_context_.is_ep_ctx_ovir_encapsulated) {
model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.onnx_model_path_name.replace_extension("xml").string(), subgraph);
} else {
model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph);
}

} else {
model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger);
}
@@ -236,7 +241,9 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie
std::ofstream blob_file(blob_filename,
std::ios::out | std::ios::trunc | std::ios::binary);
if (!blob_file) {
ORT_THROW("Unable to open file for epctx model dump.");
std::ostringstream err_msg;
err_msg << "Unable to open file for epctx model dump: " << blob_filename;
ORT_THROW(err_msg.str());
}
compiled_model.export_model(blob_file);
model_blob_str = blob_filename.filename().string();
@@ -375,6 +382,56 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
return false;
}

static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
for (std::size_t i = 0; i < node_indices.size(); i++) {
gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
for (auto& output : node->OutputDefs()) {
if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)
return true;
}
}
return false;
}

static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
return type_proto && type_proto->has_tensor_type() &&
(type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
}

// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();

for (size_t i = 0; i < node_indices.size(); i++) {
gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));

if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
const auto& input_defs = node->InputDefs();

if (node->OpType() == "DequantizeLinear") {
// DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
// Check quantized input tensor and optional zero point
if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
(input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
return true;
}
} else if (node->OpType() == "QuantizeLinear") {
// QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
const auto& output_defs = node->OutputDefs();
if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
(input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
return true;
}
}
}
}
return false;
}
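
To make the condition concrete, here is a small self-contained sketch; it exercises the same elem_type test that Is16BitTensor applies to a NodeArg's TypeAsProto(), using the ONNX protobuf types directly rather than the full GraphViewer walk (the include path is an assumption):

#include <cassert>
#include <onnx/onnx_pb.h>  // assumed header path for the generated ONNX protobufs

int main() {
  ONNX_NAMESPACE::TypeProto tp;
  tp.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_UINT16);
  // Same predicate as Is16BitTensor: tensor type present and elem_type is UINT16 or INT16.
  const bool is_16bit = tp.has_tensor_type() &&
                        (tp.tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
                         tp.tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
  assert(is_16bit);
  // A DequantizeLinear whose input 0 (or zero point, input 2) has such a type, or a
  // QuantizeLinear whose output 0 does, makes the graph-level check return true.
  return 0;
}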

static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
[[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
[[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -433,6 +490,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
}
#endif

// Check if the graph is QDQ and has int16 or uint16 quantization
// If so, we will apply the QDQ scales fix transformation (for GPU device only)
bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);

const auto& onnx_model_path_name = subgraph.ModelPath();
// QDQ stripping enabled only for the NPU and experimentally on the GPU
if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -446,7 +507,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
enable_ovep_qdq_optimizer) {
is_qdq_graph_uint16_or_int16) {
// Create a copy of the model
std::unique_ptr<onnxruntime::Model> model;
Status status = qdq_scales_fix::Transform(subgraph, logger, model);
@@ -456,6 +517,16 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if (IsModelBF16(subgraph)) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled";
std::unique_ptr<onnxruntime::Model> model;
Status status = bfloat16_fix::Transform(subgraph, logger, model);
auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
print_model_proto_duration();
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled";
auto model = subgraph.CreateModel(logger);
40 changes: 40 additions & 0 deletions onnxruntime/core/providers/openvino/backend_utils.cc
@@ -150,6 +150,11 @@ CreateOVModel(std::string&& model,
LOGS_DEFAULT(INFO) << log_tag << "Reshaping the ov tensor to specified shape";
ov_model->reshape(session_context.reshape);
}

if (!session_context.layout.empty()) {
LOGS_DEFAULT(INFO) << log_tag << "Setting the ov tensor layout to specified layout";
ov_model = Set_Layout(ov_model, session_context.layout);
}
// Check for Constant Folding
if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) {
ov::pass::ConstantFolding pass_const_obj;
@@ -199,6 +204,41 @@ GetOutputTensor(Ort::KernelContext& context,
return context.GetOutput(index, output_shape);
}

std::shared_ptr<OVNetwork> Set_Layout(std::shared_ptr<OVNetwork> ov_model, const layout_t& layout) {
ov::preprocess::PrePostProcessor preproc(ov_model);

const auto& inputs = ov_model->inputs();
const auto& outputs = ov_model->outputs();

auto find_tensor_index = [](const std::vector<ov::Output<ov::Node>>& tensors, const std::string& name) -> std::optional<size_t> {
for (size_t i = 0; i < tensors.size(); ++i) {
const auto& tensor = tensors[i];
if (tensor.get_any_name() == name || tensor.get_tensor().get_names().count(name) > 0) {
return i;
}
}
return std::nullopt;
};

for (const auto& [tensor_name, layout_value] : layout) {
bool tensor_found = false;

if (auto input_idx = find_tensor_index(inputs, tensor_name)) {
preproc.input(*input_idx).tensor().set_layout(layout_value);
tensor_found = true;
} else if (auto output_idx = find_tensor_index(outputs, tensor_name)) {
preproc.output(*output_idx).tensor().set_layout(layout_value);
tensor_found = true;
}

if (!tensor_found) {
LOGS_DEFAULT(WARNING) << "Tensor '" << tensor_name << "' not found in model inputs or outputs";
}
}

return preproc.build();
}
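
For orientation, a minimal usage sketch of the new helper; the tensor names and layout strings below are hypothetical, and in the EP the map comes from session_context.layout (see CreateOVModel above). It assumes backend_utils.h is included so Set_Layout and the OVNetwork alias are visible:

#include <map>
#include <memory>
#include <string>
#include <openvino/openvino.hpp>

void apply_layout_example(std::shared_ptr<OVNetwork>& ov_model) {
  std::map<std::string, ov::Layout> layout = {
      {"input_1", ov::Layout("NHWC")},   // hypothetical input tensor name
      {"output_1", ov::Layout("NC")},    // hypothetical output tensor name
  };
  // Rebuilds the model through ov::preprocess::PrePostProcessor::build(); unknown
  // tensor names only produce a warning, as in the implementation above.
  ov_model = Set_Layout(ov_model, layout);
}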

int GetFirstAvailableDevice(SessionContext& session_context) {
int i = 0;
// Get the first available VAD-M device and set the device to busy
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/openvino/backend_utils.h
@@ -79,6 +79,8 @@ int GetFirstAvailableDevice(SessionContext& session_context);

void FillOutputsWithConstantData(std::shared_ptr<ov::Node> node, Ort::UnownedValue& out_tensor);

std::shared_ptr<OVNetwork> Set_Layout(std::shared_ptr<OVNetwork> ov_model, const layout_t& layout);

template <typename T>
void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ov::Node> node);

107 changes: 18 additions & 89 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -59,7 +59,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
};
// If the EPContext node with OVIR Encapsulation, then create
// an executable network from EP_CACHE_CONTEXT using read_model() & compile_model()
exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream,
exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream->stream_,
hw_target,
device_config,
enable_causallm,
@@ -98,6 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
!subgraph_context_.has_dynamic_input_shape &&
!session_context_.so_context_enable &&
session_context_.reshape.empty() &&
session_context_.layout.empty() &&
!enable_causallm &&
!eligible_for_cpu_fallback &&
auto_unified_compile);
@@ -213,101 +214,29 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (!session_context_.load_config.empty()) {
const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;

if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) {
if (target_config.find("NPU") != target_config.end()) {
auto npu_genai_config = target_config.at("NPU");
CausalLMConfig().ApplyConfig(npu_genai_config, device_config);
} else {
LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found.";
}
}
// Extract device names from device string and apply their configs
// Examples: "GPU" -> ["GPU"], "AUTO:GPU.0,CPU" -> ["AUTO", "GPU", "CPU"]
auto apply_device_config = [&](std::string_view device) {
if (device.empty()) return;

if (session_context_.device_type.find("NPU") != std::string::npos) {
auto npuw_config = target_config.at("NPU");

// Check if "NPU_USE_NPUW" exists and is set to "YES"
auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW");
if (npu_use_npuw_it != npuw_config.end() &&
npu_use_npuw_it->second.is<std::string>() &&
npu_use_npuw_it->second.as<std::string>() == "YES") {
// Only add NPUW-related keys if NPU_USE_NPUW is "YES"
for (const auto& [key, value] : npuw_config) {
if (key.find("NPUW") != std::string::npos) {
if (!value.is<std::string>()) {
LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key;
continue;
}
device_config[key] = value;
}
}
} else {
// Check if there are any "NPUW" keys and log a warning
if (std::any_of(npuw_config.begin(), npuw_config.end(),
[&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) {
LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'.";
}
}
}
auto find_device_type_mode = [&](const std::string& device_type) -> std::string {
std::string device_mode = "";
auto delimiter_pos = device_type.find(':');
if (delimiter_pos != std::string::npos) {
std::stringstream str_stream(device_type.substr(0, delimiter_pos));
std::getline(str_stream, device_mode, ',');
}
return device_mode;
};

// Parse device types like "AUTO:CPU,GPU" and extract individual devices
auto parse_individual_devices = [&](const std::string& device_type) -> std::vector<std::string> {
std::vector<std::string> devices;
auto delimiter_pos = device_type.find(':');
if (delimiter_pos != std::string::npos) {
std::stringstream str_stream(device_type.substr(delimiter_pos + 1));
std::string device;
while (std::getline(str_stream, device, ',')) {
devices.emplace_back(device);
}
} else {
devices.emplace_back(device_type);
}
return devices;
};
// Remove device index: "GPU.0" -> "GPU"
auto base_device = device.substr(0, device.find('.'));

// Set properties, Validation will be handled by OpenVINO Core
auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options) {
for (const auto& [key, value] : config_options) {
if ((key.find("NPUW") != std::string::npos) ||
((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) {
continue;
if (auto config_it = target_config.find(std::string(base_device)); config_it != target_config.end()) {
for (const auto& [key, value] : config_it->second) {
device_config[key] = value;
}
OVCore::Get()->core.set_property(device, ov::AnyMap{{key, value}});
}
};

// Check if the device type is AUTO, HETERO, or MULTI
if (session_context_.device_type.find("AUTO") == 0 ||
session_context_.device_type.find("HETERO") == 0 ||
session_context_.device_type.find("MULTI") == 0) {
//// Parse to get the device mode (e.g., "AUTO:CPU,GPU" -> "AUTO")
std::unordered_set<std::string> supported_mode = {"AUTO", "HETERO", "MULTI"};
auto device_mode = find_device_type_mode(session_context_.device_type);
ORT_ENFORCE(supported_mode.find(device_mode) != supported_mode.end(), " Invalid device mode is passed : ", session_context_.device_type);
// Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"])
auto individual_devices = parse_individual_devices(session_context_.device_type);
if (!device_mode.empty()) individual_devices.emplace_back(device_mode);

// Set properties only for individual devices (e.g., "CPU", "GPU")
for (const std::string& device : individual_devices) {
if (target_config.count(device)) {
// Set properties for the device
set_target_properties(device, target_config.at(device));
// Parse device string by splitting on ':' and ',' delimiters
const auto& device_str = session_context_.device_type;
for (size_t start = 0, pos = 0; pos <= device_str.size(); ++pos) {
if (pos == device_str.size() || device_str[pos] == ':' || device_str[pos] == ',') {
if (pos > start) {
apply_device_config(std::string_view(device_str).substr(start, pos - start));
}
}
} else {
if (target_config.count(session_context_.device_type)) {
set_target_properties(session_context_.device_type,
target_config.at(session_context_.device_type));
start = pos + 1;
}
}
}
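
To illustrate how the new delimiter scan behaves, here is a standalone sketch with a hypothetical device string; the real loop reads session_context_.device_type and calls apply_device_config instead of printing:

#include <iostream>
#include <string>
#include <string_view>

int main() {
  const std::string device_str = "AUTO:GPU.0,CPU";  // hypothetical example
  for (size_t start = 0, pos = 0; pos <= device_str.size(); ++pos) {
    if (pos == device_str.size() || device_str[pos] == ':' || device_str[pos] == ',') {
      if (pos > start) {
        std::string_view device = std::string_view(device_str).substr(start, pos - start);
        // Strip the device index before the load_config lookup, e.g. "GPU.0" -> "GPU".
        std::string_view base_device = device.substr(0, device.find('.'));
        std::cout << device << " -> " << base_device << '\n';
      }
      start = pos + 1;
    }
  }
  // Prints: AUTO -> AUTO, GPU.0 -> GPU, CPU -> CPU
  return 0;
}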
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/openvino/contexts.h
@@ -70,6 +70,7 @@ class SharedContext : public WeakSingleton<SharedContext> {

using config_t = std::map<std::string, ov::AnyMap>;
using reshape_t = std::map<std::string, ov::PartialShape>;
using layout_t = std::map<std::string, ov::Layout>;

struct ProviderInfo {
std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and
@@ -88,6 +89,7 @@ struct ProviderInfo {
// (GPU) feature. If blob files are already present,
// it will be directly loaded.
reshape_t reshape{}; // Used for reshaping the ov input tensor shape at runtime.
layout_t layout{}; // Used for specifying the ov input/output tensor layout at runtime.
std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint
// Defines what model should be provided with more performant
// bounded resource first
@@ -110,7 +112,7 @@ struct ProviderInfo {
const ConfigOptions* config_options{NULL};
const std::unordered_set<std::string> valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision",
"load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer",
"enable_causallm", "disable_dynamic_shapes", "reshape_input"};
"enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"};
};
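
As a hedged sketch of how the new fields sit next to the existing ones when a ProviderInfo is populated (the tensor name and values are illustrative; the user-facing "reshape_input" and "layout" strings are parsed into these maps elsewhere in the EP):

ProviderInfo info;
info.device_type = "GPU";
info.reshape["input_1"] = ov::PartialShape{1, 3, 224, 224};  // hypothetical tensor name and shape
info.layout["input_1"] = ov::Layout("NCHW");                 // new layout_t entry for the same tensor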

// Holds context applicable to the entire EP instance.
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/ibackend.h
@@ -19,7 +19,7 @@ class IBackend {
virtual ~IBackend() = default;
virtual void RewindKVCache(size_t index) {}
};
using ptr_stream_t = std::unique_ptr<std::istream>;
using ptr_stream_t = std::unique_ptr<ModelBlobWrapper>;
class BackendFactory {
public:
static std::shared_ptr<IBackend>
10 changes: 7 additions & 3 deletions onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc
@@ -100,7 +100,8 @@
return Status::OK();
}

std::unique_ptr<std::istream> EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const {
std::unique_ptr<ModelBlobWrapper>
EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const {
auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin();
auto node = graph_viewer.GetNode(first_index);
ORT_ENFORCE(node != nullptr);
@@ -113,10 +114,11 @@
bool embed_mode = static_cast<bool>(attrs.at(EMBED_MODE).i());

std::unique_ptr<std::istream> result;
std::filesystem::path blob_filepath{};
if (embed_mode) {
result.reset((std::istream*)new std::istringstream(ep_cache_context));
} else {
auto blob_filepath = so_context_file_path;
blob_filepath = so_context_file_path;
if (blob_filepath.empty() && !graph_viewer.ModelPath().empty()) {
blob_filepath = graph_viewer.ModelPath();
}
@@ -126,16 +128,18 @@
}

bool isXML = backend_utils::IsModelStreamXML(*result);
std::filesystem::path native_blob_path{};
if (!isXML) {
// If the model stream is not an XML (i.e. precompiled blob), the OpenVINO SDK version that it was
// exported with must match the version that is currently running.
native_blob_path = std::move(blob_filepath);
ORT_ENFORCE((attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_),
"EPCtx blob was exported / is compatible with OpenVINO SDK version " + attrs.at(EP_SDK_VER).s() +
", but OpenVINO SDK version currently in use is " + openvino_sdk_version_);
}

LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node";
return result;
return std::make_unique<ModelBlobWrapper>(std::move(result), native_blob_path);

[cpplint] Line 142: Add #include <utility> for move [build/include_what_you_use] [4]
[cpplint] Line 142: Add #include <memory> for make_unique<> [build/include_what_you_use] [4]
}
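
The ModelBlobWrapper type itself is defined elsewhere in this PR; from its usage here (the two-argument construction above and the model_stream->stream_ dereference in basic_backend.cc), its shape is presumably close to the following sketch (the second member's name is an assumption):

#include <filesystem>
#include <istream>
#include <memory>
#include <utility>

struct ModelBlobWrapper {
  ModelBlobWrapper(std::unique_ptr<std::istream> stream, std::filesystem::path native_blob_path)
      : stream_(std::move(stream)), native_blob_path_(std::move(native_blob_path)) {}
  std::unique_ptr<std::istream> stream_;      // embedded blob or opened file stream
  std::filesystem::path native_blob_path_;    // non-empty only for precompiled (non-XML) blobs
};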

bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const {