diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
index c80b8c0c164b6..f40ea6591059e 100644
--- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
@@ -43,6 +43,9 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_mode_post_run";
 // Set RPC control latency for QNN HTP backend
 static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";
 
+// Set the QNN LoRA config file path, used to apply a LoRA adapter to a QNN context binary
+static const char* const kOrtRunOptionsConfigQnnLoraConfig = "qnn.lora_config";
+
 // Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true.
 // The value should be an integer. If the value is not set, the default value is 0 and
 // ORT session only captures one cuda graph before another capture is requested.
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 26d792c008edc..ba2c8d8cc86fd 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -52,6 +52,69 @@ static const char* DlError() {
 #endif
 }
 
+Status ReadBinaryFromFile(const std::string& file_path, uint8_t* buffer, size_t buffer_size) {
+  ORT_RETURN_IF(nullptr == buffer, "Binary buffer is nullptr");
+  std::ifstream in(file_path, std::ifstream::binary);
+  ORT_RETURN_IF(!in, "Failed to open input file: ", file_path.c_str());
+  ORT_RETURN_IF(!in.read(reinterpret_cast<char*>(buffer), buffer_size), "Failed to read the contents of: ", file_path.c_str());
+  return Status::OK();
+}
+
+Status QnnBackendManager::ParseLoraConfig(std::string lora_config_path) {
+  // The QNN LoRA config file contains a single line: the graph name first, followed by
+  // the path to the QNN LoRA context binary, separated by a semicolon (;).
+  // Example: <graph_name>;<lora_context_binary_path>
+  LOGS_DEFAULT(INFO) << "Loading LoRA config " << lora_config_path;
+  std::ifstream file(lora_config_path);
+  std::string line;
+
+  if (file.is_open()) {
+    if (std::getline(file, line)) {
+      std::istringstream ss(line);
+      std::string graph_name;
+      std::string lora_adapter_bin_path;
+
+      if (std::getline(ss, graph_name, ';') && std::getline(ss, lora_adapter_bin_path)) {
+        size_t buffer_size = std::filesystem::file_size(lora_adapter_bin_path.c_str());
+        ORT_RETURN_IF(0 == buffer_size, "Received path to an empty file. Nothing to deserialize.");
+
+        // Wrap the raw adapter bytes in a QNN context buffer descriptor.
+        std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(buffer_size);
+        void* voidBufferPtr = static_cast<void*>(buffer.get());
+        QnnContext_Buffer_t contextBuffer{QNN_CONTEXT_BUFFER_VERSION_1,
+                                          {QNN_CONTEXTMEMTYPE_RAW, {{voidBufferPtr, buffer_size}}}};
+
+        auto status = ReadBinaryFromFile(lora_adapter_bin_path,
+                                         reinterpret_cast<uint8_t*>(buffer.get()),
+                                         buffer_size);
+        ORT_RETURN_IF(status != Status::OK(), "Failed to read binary data.");
+
+        // Find the context that owns the named graph, then apply the updatable section to it.
+        Qnn_GraphHandle_t graph;
+        bool graph_retrieve_success = false;
+        for (size_t cIdx = 0; cIdx < contexts_.size(); cIdx++) {
+          auto graph_retrieve_rt = qnn_interface_.graphRetrieve(contexts_[cIdx], graph_name.c_str(), &graph);
+          if (QNN_SUCCESS != graph_retrieve_rt) {
+            continue;
+          }
+          graph_retrieve_success = true;
+
+          auto context_apply_binary_section_rt = qnn_interface_.contextApplyBinarySection(
+              contexts_[cIdx], graph, QNN_CONTEXT_SECTION_UPDATABLE, &contextBuffer, profile_backend_handle_, nullptr);
+          ORT_RETURN_IF(QNN_SUCCESS != context_apply_binary_section_rt, "Failed to apply binary section.");
+          break;
+        }
+        ORT_RETURN_IF_NOT(graph_retrieve_success, "Failed to retrieve graph: ", graph_name, " and apply binary section.");
+      }
+    }
+    file.close();
+  } else {
+    LOGS_DEFAULT(ERROR) << "Unable to open LoRA config " << lora_config_path;
+  }
+
+  return Status::OK();
+}
+
 template <typename F, class T>
 Status QnnBackendManager::GetQnnInterfaceProvider(const char* lib_path,
                                                   const char* interface_provider_name,
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
index 3592af41f03df..39fc66071981c 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
@@ -140,6 +140,8 @@ class QnnBackendManager : public std::enable_shared_from_this<QnnBackendManager>
                          const Qnn_Tensor_t& qnn_tensor, Qnn_MemHandle_t& mem_handle);
 
+  Status ParseLoraConfig(std::string lora_config);
+
  private:
   Status LoadBackend();
 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index a5813dc2a4adc..ae8b1ea17980f 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -1202,6 +1202,12 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) {
     }
   }
 
+  std::string lora_config;
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnLoraConfig, lora_config)) {
+    LOGS_DEFAULT(VERBOSE) << "lora_config: " << lora_config;
+    ORT_RETURN_IF_ERROR(qnn_backend_manager_->ParseLoraConfig(lora_config));
+  }
+
   return Status::OK();
 }
 
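For reference, the LoRA config file consumed by ParseLoraConfig is a single semicolon-separated line, as documented in the comment above the parsing code. A hypothetical example, with an invented graph name and adapter path:

qnn_graph_0;/data/local/tmp/lora_adapter.bin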
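And a minimal sketch of how an application might exercise the new qnn.lora_config run option through the ORT C++ API. The model path, backend library, and config file name below are placeholders for illustration, not part of this change:

// Sketch: set qnn.lora_config on a per-run basis so OnRunStart forwards the
// path to QnnBackendManager::ParseLoraConfig. All file names are placeholders.
#include <onnxruntime_cxx_api.h>

#include <string>
#include <unordered_map>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_lora_example");

  Ort::SessionOptions session_options;
  std::unordered_map<std::string, std::string> qnn_options;
  qnn_options["backend_path"] = "QnnHtp.dll";  // placeholder HTP backend library
  session_options.AppendExecutionProvider("QNN", qnn_options);

  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);

  // Point this run at a LoRA config file; the file holds one line:
  // <graph_name>;<lora_context_binary_path>
  Ort::RunOptions run_options;
  run_options.AddConfigEntry("qnn.lora_config", "lora_config.txt");

  // ... create input tensors and call session.Run(run_options, ...) as usual.
  return 0;
}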