Merged
77 changes: 74 additions & 3 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -90,7 +90,12 @@ BackendManager::BackendManager(SessionContext& session_context,
"[OpenVINO-EP] Bounded dynamic model execution using provider option reshape_input is not supported for OVEP EPContext model";
ORT_THROW(exception_str);
}
model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph);
if (subgraph_context_.is_ep_ctx_ovir_encapsulated) {
model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.onnx_model_path_name.replace_extension("xml").string(), subgraph);
} else {
model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph);
}

} else {
model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger);
}
@@ -236,7 +241,9 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie
std::ofstream blob_file(blob_filename,
std::ios::out | std::ios::trunc | std::ios::binary);
if (!blob_file) {
ORT_THROW("Unable to open file for epctx model dump.");
std::ostringstream err_msg;
err_msg << "Unable to open file for epctx model dump: " << blob_filename;
ORT_THROW(err_msg.str());
}
compiled_model.export_model(blob_file);
model_blob_str = blob_filename.filename().string();
@@ -375,6 +382,56 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) {
return false;
}

static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) {
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
for (std::size_t i = 0; i < node_indices.size(); i++) {
gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
for (auto& output : node->OutputDefs()) {
if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16)
return true;
}
}
return false;
}

static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) {
const auto* type_proto = node_arg ? node_arg->TypeAsProto() : nullptr;
return type_proto && type_proto->has_tensor_type() &&
(type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
}

// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization
static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) {
std::unordered_set<std::string> qdq_ops = {"QuantizeLinear", "DequantizeLinear"};
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();

for (size_t i = 0; i < node_indices.size(); i++) {
gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));

if (qdq_ops.find(node->OpType()) != qdq_ops.end()) {
const auto& input_defs = node->InputDefs();

if (node->OpType() == "DequantizeLinear") {
// DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output]
// Check quantized input tensor and optional zero point
if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) ||
(input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
return true;
}
} else if (node->OpType() == "QuantizeLinear") {
// QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output]
const auto& output_defs = node->OutputDefs();
if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) ||
(input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) {
return true;
}
}
}
}
return false;
}
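
To make the condition concrete, here is a small self-contained sketch; it exercises the same elem_type test that Is16BitTensor applies to a NodeArg's TypeAsProto(), using the ONNX protobuf types directly rather than the full GraphViewer walk (the include path is an assumption):

#include <cassert>
#include <onnx/onnx_pb.h>  // assumed header path for the generated ONNX protobufs

int main() {
  ONNX_NAMESPACE::TypeProto tp;
  tp.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_UINT16);
  // Same predicate as Is16BitTensor: tensor type present and elem_type is UINT16 or INT16.
  const bool is_16bit = tp.has_tensor_type() &&
                        (tp.tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 ||
                         tp.tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16);
  assert(is_16bit);
  // A DequantizeLinear whose input 0 (or zero point, input 2) has such a type, or a
  // QuantizeLinear whose output 0 does, makes the graph-level check return true.
  return 0;
}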

static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name,
[[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto,
[[maybe_unused]] const onnxruntime::Node& fused_node) {
@@ -433,6 +490,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
}
#endif

// Check if the graph is QDQ and has int16 or uint16 quantization
// If so, we will apply the QDQ scales fix transformation (for GPU device only)
bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph);

const auto& onnx_model_path_name = subgraph.ModelPath();
// QDQ stripping enabled only for the NPU and experimentally on the GPU
if ((session_context_.device_type.find("NPU") != std::string::npos) &&
@@ -446,7 +507,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if ((session_context_.device_type.find("GPU") != std::string::npos) &&
enable_ovep_qdq_optimizer) {
is_qdq_graph_uint16_or_int16) {
// Create a copy of the model
std::unique_ptr<onnxruntime::Model> model;
Status status = qdq_scales_fix::Transform(subgraph, logger, model);
@@ -456,6 +517,16 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node,
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else if (IsModelBF16(subgraph)) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled";
std::unique_ptr<onnxruntime::Model> model;
Status status = bfloat16_fix::Transform(subgraph, logger, model);
auto model_proto = model->ToProto();
model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION);
print_model_proto_duration();
DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node);
ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
return model_proto;
} else {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled";
auto model = subgraph.CreateModel(logger);
40 changes: 40 additions & 0 deletions onnxruntime/core/providers/openvino/backend_utils.cc
@@ -150,6 +150,11 @@ CreateOVModel(std::string&& model,
LOGS_DEFAULT(INFO) << log_tag << "Reshaping the ov tensor to specified shape";
ov_model->reshape(session_context.reshape);
}

if (!session_context.layout.empty()) {
LOGS_DEFAULT(INFO) << log_tag << "Setting the ov tensor layout to specified layout";
ov_model = Set_Layout(ov_model, session_context.layout);
}
// Check for Constant Folding
if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) {
ov::pass::ConstantFolding pass_const_obj;
@@ -199,6 +204,41 @@ GetOutputTensor(Ort::KernelContext& context,
return context.GetOutput(index, output_shape);
}

std::shared_ptr<OVNetwork> Set_Layout(std::shared_ptr<OVNetwork> ov_model, const layout_t& layout) {
ov::preprocess::PrePostProcessor preproc(ov_model);

const auto& inputs = ov_model->inputs();
const auto& outputs = ov_model->outputs();

auto find_tensor_index = [](const std::vector<ov::Output<ov::Node>>& tensors, const std::string& name) -> std::optional<size_t> {
for (size_t i = 0; i < tensors.size(); ++i) {
const auto& tensor = tensors[i];
if (tensor.get_any_name() == name || tensor.get_tensor().get_names().count(name) > 0) {
return i;
}
}
return std::nullopt;
};

for (const auto& [tensor_name, layout_value] : layout) {
bool tensor_found = false;

if (auto input_idx = find_tensor_index(inputs, tensor_name)) {
preproc.input(*input_idx).tensor().set_layout(layout_value);
tensor_found = true;
} else if (auto output_idx = find_tensor_index(outputs, tensor_name)) {
preproc.output(*output_idx).tensor().set_layout(layout_value);
tensor_found = true;
}

if (!tensor_found) {
LOGS_DEFAULT(WARNING) << "Tensor '" << tensor_name << "' not found in model inputs or outputs";
}
}

return preproc.build();
}
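
For orientation, a minimal usage sketch of the new helper; the tensor names and layout strings below are hypothetical, and in the EP the map comes from session_context.layout (see CreateOVModel above). It assumes backend_utils.h is included so Set_Layout and the OVNetwork alias are visible:

#include <map>
#include <memory>
#include <string>
#include <openvino/openvino.hpp>

void apply_layout_example(std::shared_ptr<OVNetwork>& ov_model) {
  std::map<std::string, ov::Layout> layout = {
      {"input_1", ov::Layout("NHWC")},   // hypothetical input tensor name
      {"output_1", ov::Layout("NC")},    // hypothetical output tensor name
  };
  // Rebuilds the model through ov::preprocess::PrePostProcessor::build(); unknown
  // tensor names only produce a warning, as in the implementation above.
  ov_model = Set_Layout(ov_model, layout);
}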

int GetFirstAvailableDevice(SessionContext& session_context) {
int i = 0;
// Get the first available VAD-M device and set the device to busy
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/openvino/backend_utils.h
@@ -79,6 +79,8 @@ int GetFirstAvailableDevice(SessionContext& session_context);

void FillOutputsWithConstantData(std::shared_ptr<ov::Node> node, Ort::UnownedValue& out_tensor);

std::shared_ptr<OVNetwork> Set_Layout(std::shared_ptr<OVNetwork> ov_model, const layout_t& layout);

template <typename T>
void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ov::Node> node);

107 changes: 18 additions & 89 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -59,7 +59,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
};
// If the EPContext node with OVIR Encapsulation, then create
// an executable network from EP_CACHE_CONTEXT using read_model() & compile_model()
exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream,
exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream->stream_,
hw_target,
device_config,
enable_causallm,
@@ -98,6 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr<ONNX_NAMESPACE::ModelProto>& model_pr
!subgraph_context_.has_dynamic_input_shape &&
!session_context_.so_context_enable &&
session_context_.reshape.empty() &&
session_context_.layout.empty() &&
!enable_causallm &&
!eligible_for_cpu_fallback &&
auto_unified_compile);
@@ -213,101 +214,29 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
if (!session_context_.load_config.empty()) {
const std::map<std::string, ov::AnyMap>& target_config = session_context_.load_config;

if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) {
if (target_config.find("NPU") != target_config.end()) {
auto npu_genai_config = target_config.at("NPU");
CausalLMConfig().ApplyConfig(npu_genai_config, device_config);
} else {
LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found.";
}
}
// Extract device names from device string and apply their configs
// Examples: "GPU" -> ["GPU"], "AUTO:GPU.0,CPU" -> ["AUTO", "GPU", "CPU"]
auto apply_device_config = [&](std::string_view device) {
if (device.empty()) return;

if (session_context_.device_type.find("NPU") != std::string::npos) {
auto npuw_config = target_config.at("NPU");

// Check if "NPU_USE_NPUW" exists and is set to "YES"
auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW");
if (npu_use_npuw_it != npuw_config.end() &&
npu_use_npuw_it->second.is<std::string>() &&
npu_use_npuw_it->second.as<std::string>() == "YES") {
// Only add NPUW-related keys if NPU_USE_NPUW is "YES"
for (const auto& [key, value] : npuw_config) {
if (key.find("NPUW") != std::string::npos) {
if (!value.is<std::string>()) {
LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key;
continue;
}
device_config[key] = value;
}
}
} else {
// Check if there are any "NPUW" keys and log a warning
if (std::any_of(npuw_config.begin(), npuw_config.end(),
[&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) {
LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'.";
}
}
}
auto find_device_type_mode = [&](const std::string& device_type) -> std::string {
std::string device_mode = "";
auto delimiter_pos = device_type.find(':');
if (delimiter_pos != std::string::npos) {
std::stringstream str_stream(device_type.substr(0, delimiter_pos));
std::getline(str_stream, device_mode, ',');
}
return device_mode;
};

// Parse device types like "AUTO:CPU,GPU" and extract individual devices
auto parse_individual_devices = [&](const std::string& device_type) -> std::vector<std::string> {
std::vector<std::string> devices;
auto delimiter_pos = device_type.find(':');
if (delimiter_pos != std::string::npos) {
std::stringstream str_stream(device_type.substr(delimiter_pos + 1));
std::string device;
while (std::getline(str_stream, device, ',')) {
devices.emplace_back(device);
}
} else {
devices.emplace_back(device_type);
}
return devices;
};
// Remove device index: "GPU.0" -> "GPU"
auto base_device = device.substr(0, device.find('.'));

// Set properties, Validation will be handled by OpenVINO Core
auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options) {
for (const auto& [key, value] : config_options) {
if ((key.find("NPUW") != std::string::npos) ||
((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) {
continue;
if (auto config_it = target_config.find(std::string(base_device)); config_it != target_config.end()) {
for (const auto& [key, value] : config_it->second) {
device_config[key] = value;
}
OVCore::Get()->core.set_property(device, ov::AnyMap{{key, value}});
}
};

// Check if the device type is AUTO, HETERO, or MULTI
if (session_context_.device_type.find("AUTO") == 0 ||
session_context_.device_type.find("HETERO") == 0 ||
session_context_.device_type.find("MULTI") == 0) {
//// Parse to get the device mode (e.g., "AUTO:CPU,GPU" -> "AUTO")
std::unordered_set<std::string> supported_mode = {"AUTO", "HETERO", "MULTI"};
auto device_mode = find_device_type_mode(session_context_.device_type);
ORT_ENFORCE(supported_mode.find(device_mode) != supported_mode.end(), " Invalid device mode is passed : ", session_context_.device_type);
// Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"])
auto individual_devices = parse_individual_devices(session_context_.device_type);
if (!device_mode.empty()) individual_devices.emplace_back(device_mode);

// Set properties only for individual devices (e.g., "CPU", "GPU")
for (const std::string& device : individual_devices) {
if (target_config.count(device)) {
// Set properties for the device
set_target_properties(device, target_config.at(device));
// Parse device string by splitting on ':' and ',' delimiters
const auto& device_str = session_context_.device_type;
for (size_t start = 0, pos = 0; pos <= device_str.size(); ++pos) {
if (pos == device_str.size() || device_str[pos] == ':' || device_str[pos] == ',') {
if (pos > start) {
apply_device_config(std::string_view(device_str).substr(start, pos - start));
}
}
} else {
if (target_config.count(session_context_.device_type)) {
set_target_properties(session_context_.device_type,
target_config.at(session_context_.device_type));
start = pos + 1;
}
}
}
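
To illustrate how the new delimiter scan behaves, here is a standalone sketch with a hypothetical device string; the real loop reads session_context_.device_type and calls apply_device_config instead of printing:

#include <iostream>
#include <string>
#include <string_view>

int main() {
  const std::string device_str = "AUTO:GPU.0,CPU";  // hypothetical example
  for (size_t start = 0, pos = 0; pos <= device_str.size(); ++pos) {
    if (pos == device_str.size() || device_str[pos] == ':' || device_str[pos] == ',') {
      if (pos > start) {
        std::string_view device = std::string_view(device_str).substr(start, pos - start);
        // Strip the device index before the load_config lookup, e.g. "GPU.0" -> "GPU".
        std::string_view base_device = device.substr(0, device.find('.'));
        std::cout << device << " -> " << base_device << '\n';
      }
      start = pos + 1;
    }
  }
  // Prints: AUTO -> AUTO, GPU.0 -> GPU, CPU -> CPU
  return 0;
}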
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/openvino/contexts.h
@@ -70,6 +70,7 @@ class SharedContext : public WeakSingleton<SharedContext> {

using config_t = std::map<std::string, ov::AnyMap>;
using reshape_t = std::map<std::string, ov::PartialShape>;
using layout_t = std::map<std::string, ov::Layout>;

struct ProviderInfo {
std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and
@@ -88,6 +89,7 @@ struct ProviderInfo {
// (GPU) feature. If blob files are already present,
// it will be directly loaded.
reshape_t reshape{}; // Used for reshaping the ov input tensor shape at runtime.
layout_t layout{}; // Used for specifying the ov input/output tensor layout at runtime.
std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint
// Defines what model should be provided with more performant
// bounded resource first
@@ -110,7 +112,7 @@ struct ProviderInfo {
const ConfigOptions* config_options{NULL};
const std::unordered_set<std::string> valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision",
"load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer",
"enable_causallm", "disable_dynamic_shapes", "reshape_input"};
"enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"};
};
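
As a hedged sketch of how the new fields sit next to the existing ones when a ProviderInfo is populated (the tensor name and values are illustrative; the user-facing "reshape_input" and "layout" strings are parsed into these maps elsewhere in the EP):

ProviderInfo info;
info.device_type = "GPU";
info.reshape["input_1"] = ov::PartialShape{1, 3, 224, 224};  // hypothetical tensor name and shape
info.layout["input_1"] = ov::Layout("NCHW");                 // new layout_t entry for the same tensor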

// Holds context applicable to the entire EP instance.
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/openvino/ibackend.h
@@ -19,7 +19,7 @@ class IBackend {
virtual ~IBackend() = default;
virtual void RewindKVCache(size_t index) {}
};
using ptr_stream_t = std::unique_ptr<std::istream>;
using ptr_stream_t = std::unique_ptr<ModelBlobWrapper>;
class BackendFactory {
public:
static std::shared_ptr<IBackend>
10 changes: 7 additions & 3 deletions onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc
@@ -100,7 +100,8 @@
return Status::OK();
}

std::unique_ptr<std::istream> EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const {
std::unique_ptr<ModelBlobWrapper>
EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const {
auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin();
auto node = graph_viewer.GetNode(first_index);
ORT_ENFORCE(node != nullptr);
@@ -113,10 +114,11 @@
bool embed_mode = static_cast<bool>(attrs.at(EMBED_MODE).i());

std::unique_ptr<std::istream> result;
std::filesystem::path blob_filepath{};
if (embed_mode) {
result.reset((std::istream*)new std::istringstream(ep_cache_context));
} else {
auto blob_filepath = so_context_file_path;
blob_filepath = so_context_file_path;
if (blob_filepath.empty() && !graph_viewer.ModelPath().empty()) {
blob_filepath = graph_viewer.ModelPath();
}
@@ -126,16 +128,18 @@
}

bool isXML = backend_utils::IsModelStreamXML(*result);
std::filesystem::path native_blob_path{};
if (!isXML) {
// If the model stream is not an XML (i.e. precompiled blob), the OpenVINO SDK version that it was
// exported with must match the version that is currently running.
native_blob_path = std::move(blob_filepath);
ORT_ENFORCE((attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_),
"EPCtx blob was exported / is compatible with OpenVINO SDK version " + attrs.at(EP_SDK_VER).s() +
", but OpenVINO SDK version currently in use is " + openvino_sdk_version_);
}

LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node";
return result;
return std::make_unique<ModelBlobWrapper>(std::move(result), native_blob_path);

[cpplint] Line 142: Add #include <utility> for move [build/include_what_you_use] [4]
[cpplint] Line 142: Add #include <memory> for make_unique<> [build/include_what_you_use] [4]
}
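
The ModelBlobWrapper type itself is defined elsewhere in this PR; from its usage here (the two-argument construction above and the model_stream->stream_ dereference in basic_backend.cc), its shape is presumably close to the following sketch (the second member's name is an assumption):

#include <filesystem>
#include <istream>
#include <memory>
#include <utility>

struct ModelBlobWrapper {
  ModelBlobWrapper(std::unique_ptr<std::istream> stream, std::filesystem::path native_blob_path)
      : stream_(std::move(stream)), native_blob_path_(std::move(native_blob_path)) {}
  std::unique_ptr<std::istream> stream_;      // embedded blob or opened file stream
  std::filesystem::path native_blob_path_;    // non-empty only for precompiled (non-XML) blobs
};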

bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const {