Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
69b662d
rename onnxruntime_qnn_ctx_gen to ep_weight_sharing_ctx_gen
HectorSVC Feb 3, 2025
9ec8b1f
rename
HectorSVC Feb 3, 2025
906cfd5
update the code to use C++ API
HectorSVC Feb 3, 2025
eb6104b
Merge branch 'main' into ep_weight_sharing_ctx_gen
HectorSVC Feb 12, 2025
eaf7041
When generate EPContext model, share QnnBackendManager across session…
HectorSVC Feb 14, 2025
3800cbc
Add -e option to make it work for other EPs
HectorSVC Feb 14, 2025
1a135ea
Merge branch 'main' into ep_weight_sharing_ctx_gen
HectorSVC Feb 14, 2025
c8cfc2e
update comments
HectorSVC Feb 14, 2025
c8eb395
add new UT
HectorSVC Feb 15, 2025
0b3dbed
fix UT
HectorSVC Feb 15, 2025
ff59f37
Merge branch 'main' into ep_weight_sharing_ctx_gen
HectorSVC Feb 19, 2025
de47a5c
remove include folder not used
HectorSVC Feb 19, 2025
dea89d7
remove all ORT internal dependencies, use ONNX API for model file update
HectorSVC Feb 20, 2025
aec117c
update README.md
HectorSVC Feb 20, 2025
92d8423
format
HectorSVC Feb 20, 2025
57783ae
address review comments
HectorSVC Feb 24, 2025
23e4cc0
build ep_weight_sharing_ctx_gen dynamic link to onnxruntime lib
HectorSVC Feb 25, 2025
b4a2e3a
add session option ep.stop_share_ep_contexts to tell ORT session when…
HectorSVC Feb 25, 2025
896f29f
fix linux build
HectorSVC Feb 26, 2025
387b2ba
Merge branch 'main' into ep_weight_sharing_ctx_gen
HectorSVC Mar 3, 2025
33d1b3e
revert merge issue
HectorSVC Mar 3, 2025
c106b51
fix merge issue
HectorSVC Mar 3, 2025
175c1ed
fix UT failure caused by another PR which changed the default EPConte…
HectorSVC Mar 4, 2025
c86a126
resolve the issue caused by another PR
HectorSVC Mar 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmake/onnxruntime_python.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1029,7 +1029,7 @@ if (onnxruntime_USE_QNN)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_qnn_ctx_gen>
$<TARGET_FILE:ep_weight_sharing_ctx_gen>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
Expand Down
33 changes: 18 additions & 15 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -1289,31 +1289,34 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)

if(onnxruntime_USE_QNN)
#qnn ctx generator
set(onnxruntime_qnn_ctx_gen_src_dir ${TEST_SRC_DIR}/qnn_ctx_gen)
set(onnxruntime_qnn_ctx_gen_src_patterns
"${onnxruntime_qnn_ctx_gen_src_dir}/*.cc"
"${onnxruntime_qnn_ctx_gen_src_dir}/*.h")
set(ep_weight_sharing_ctx_gen_src_dir ${TEST_SRC_DIR}/ep_weight_sharing_ctx_gen)
set(ep_weight_sharing_ctx_gen_src_patterns
"${ep_weight_sharing_ctx_gen_src_dir}/*.cc"
"${ep_weight_sharing_ctx_gen_src_dir}/*.h")

file(GLOB onnxruntime_qnn_ctx_gen_src CONFIGURE_DEPENDS
${onnxruntime_qnn_ctx_gen_src_patterns}
file(GLOB ep_weight_sharing_ctx_gen_src CONFIGURE_DEPENDS
${ep_weight_sharing_ctx_gen_src_patterns}
)
onnxruntime_add_executable(onnxruntime_qnn_ctx_gen ${onnxruntime_qnn_ctx_gen_src})
target_include_directories(onnxruntime_qnn_ctx_gen PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
${CMAKE_CURRENT_BINARY_DIR})
onnxruntime_add_executable(ep_weight_sharing_ctx_gen ${ep_weight_sharing_ctx_gen_src})
target_include_directories(ep_weight_sharing_ctx_gen PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR})
if (WIN32)
target_compile_options(onnxruntime_qnn_ctx_gen PRIVATE ${disabled_warnings})
target_compile_options(ep_weight_sharing_ctx_gen PRIVATE ${disabled_warnings})
if (NOT DEFINED SYS_PATH_LIB)
set(SYS_PATH_LIB shlwapi)
endif()
endif()

if(WIN32)
target_link_libraries(onnxruntime_qnn_ctx_gen PRIVATE debug dbghelp advapi32)
if (onnxruntime_BUILD_SHARED_LIB)
set(ep_weight_sharing_ctx_gen_libs onnxruntime_common onnxruntime ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE})
target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE ${ep_weight_sharing_ctx_gen_libs})
if (WIN32)
target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE debug dbghelp advapi32)
endif()
else()
target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE onnxruntime_session ${onnxruntime_test_providers_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE})
endif()
target_link_libraries(onnxruntime_qnn_ctx_gen PRIVATE onnx_test_runner_common onnxruntime_test_utils onnxruntime_common onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers onnx_test_data_proto ${onnxruntime_test_providers_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS})

set_target_properties(onnxruntime_qnn_ctx_gen PROPERTIES FOLDER "ONNXRuntimeTest")
set_target_properties(ep_weight_sharing_ctx_gen PROPERTIES FOLDER "ONNXRuntimeTest")
endif()

# shared lib
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -315,9 +315,12 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed
// in case user need to merge/connect multiple EPContext nodes in one model
static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix";

// Share EP related resources across EPs
// Share EP related resources across sessions
static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";

// Stop sharing EP related resources across sessions from this point on
static const char* const kOrtSessionOptionStopShareEpContexts = "ep.stop_share_ep_contexts";

// Use this config when dumping EP context model with an external initializers file
// All initializers will be inside the external data file if specified, otherwise all in Onnx file
static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,10 @@ Status QnnBackendManager::InitializeProfiling() {
QnnProfile_Level_t qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
if (ProfilingLevel::BASIC == profiling_level_merge_) {
qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
LOGS_DEFAULT(VERBOSE) << "Profiling level set to basic.";
} else if (ProfilingLevel::DETAILED == profiling_level_merge_) {
qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED;
LOGS_DEFAULT(VERBOSE) << "Profiling level set to detailed.";
}
Qnn_ErrorHandle_t result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_);
ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile! Error: ", QnnErrorHandleToString(result));
Expand Down
42 changes: 31 additions & 11 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
share_ep_contexts_ =
config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_;

stop_share_ep_contexts_ =
config_options->GetConfigOrDefault(kOrtSessionOptionStopShareEpContexts, "0") == "1";
LOGS_DEFAULT(VERBOSE) << "User specified option - stop share EP contexts across sessions: " << stop_share_ep_contexts_;
}

static const std::string BACKEND_PATH = "backend_path";
Expand Down Expand Up @@ -384,17 +388,27 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
}
}

qnn_backend_manager_ = qnn::QnnBackendManager::Create(
qnn::QnnBackendManagerConfig{backend_path,
profiling_level_etw,
profiling_level,
profiling_file_path,
context_priority,
qnn_saver_path,
device_id_,
htp_arch,
soc_model,
enable_htp_weight_sharing});
// For context binary generation with weight sharing enabled, use the QnnBackendManager from the shared context if it exists,
// So that all graphs from later sessions will be compiled into the same QNN context
if (context_cache_enabled_ && share_ep_contexts_ && SharedContext::GetInstance().GetSharedQnnBackendManager()) {
qnn_backend_manager_ = SharedContext::GetInstance().GetSharedQnnBackendManager();
// Clear the QnnBackendManager from singleton to stop the resource share
if (stop_share_ep_contexts_) {
SharedContext::GetInstance().ResetSharedQnnBackendManager();
}
} else {
qnn_backend_manager_ = qnn::QnnBackendManager::Create(
qnn::QnnBackendManagerConfig{backend_path,
profiling_level_etw,
profiling_level,
profiling_file_path,
context_priority,
qnn_saver_path,
device_id_,
htp_arch,
soc_model,
enable_htp_weight_sharing});
}

#if defined(_WIN32)
if (onnxruntime::logging::EtwRegistrationManager::SupportsETW()) {
Expand Down Expand Up @@ -1037,6 +1051,12 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
qnn_context_embed_mode_,
max_spill_fill_buffer_size,
logger));

if (share_ep_contexts_ && !stop_share_ep_contexts_ &&
nullptr == SharedContext::GetInstance().GetSharedQnnBackendManager()) {
ORT_RETURN_IF_NOT(SharedContext::GetInstance().SetSharedQnnBackendManager(qnn_backend_manager_),
"Failed to set shared QnnBackendManager.");
}
}
return Status::OK();
}
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ class QNNExecutionProvider : public IExecutionProvider {
uint32_t default_rpc_control_latency_ = 0;
bool enable_HTP_FP16_precision_ = true;
bool share_ep_contexts_ = false;
bool stop_share_ep_contexts_ = false;
bool enable_spill_fill_buffer_ = false;
#if defined(_WIN32)
onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr;
Expand Down
26 changes: 26 additions & 0 deletions onnxruntime/core/providers/qnn/shared_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,39 @@ class SharedContext {
return graph_exist;
}

bool SetSharedQnnBackendManager(std::shared_ptr<qnn::QnnBackendManager>& qnn_backend_manager) {
const std::lock_guard<std::mutex> lock(mtx_);

if (qnn_backend_manager_ != nullptr) {
if (qnn_backend_manager_ == qnn_backend_manager) {
return true;
}
return false;
}
qnn_backend_manager_ = qnn_backend_manager;
return true;
}

std::shared_ptr<qnn::QnnBackendManager> GetSharedQnnBackendManager() {
const std::lock_guard<std::mutex> lock(mtx_);
return qnn_backend_manager_;
}

void ResetSharedQnnBackendManager() {
const std::lock_guard<std::mutex> lock(mtx_);
qnn_backend_manager_.reset();
}

private:
SharedContext() = default;
~SharedContext() = default;

ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SharedContext);

// Used for passing through QNN models (deserialized from context binary) across sessions
std::vector<std::unique_ptr<qnn::QnnModel>> shared_qnn_models_;
// Used for compiling multiple models into same QNN context binary
std::shared_ptr<qnn::QnnBackendManager> qnn_backend_manager_;
// Producer sessions can be in parallel
// Consumer sessions have to be after producer sessions initialized
std::mutex mtx_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@

This tool provides a way to generate ONNX models that wrap a QNN context binary, with weight sharing enabled. The options available for the tool are listed below:

`onnxruntime_qnn_ctx_gen [options...] model_path,model_path`
`ep_weight_sharing_ctx_gen [options...] model_1_path,model_2_path`

./onnxruntime_qnn_ctx_gen -v -i "soc_model|60 htp_graph_finalization_optimization_mode|3" -C "ep.context_enable|1 ep.context_embed_mode|0" /mnt/c/model1.onnx,/mnt/c/model2.onnx
./ep_weight_sharing_ctx_gen -e qnn -v -i "soc_model|60 htp_graph_finalization_optimization_mode|3" /mnt/c/model1.onnx,/mnt/c/model2.onnx

Options:


-e [qnn|tensorrt|openvino|vitisai]: Specifies the compile-based execution provider: qnn, tensorrt, openvino, or vitisai. Defaults to qnn.

-v: Show verbose information.

-C: [session_config_entries]: Specify session configuration entries as key-value pairs: -C "<key1>|<val1> <key2>|<val2>"
Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
[Example] -C "ep.context_enable|1 ep.context_embed_mode|0"
[Example] -C "ep.context_enable|1 ep.context_embed_mode|0". These values are set by default, so this option can be omitted.

-i: [provider_options]: Specify QNN EP specific runtime options as key value pairs. Different runtime options available are:
[Usage]: -i '<key1>|<value1> <key2>|<value2>'
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Copyright (c) 2023 NVIDIA Corporation.
// Licensed under the MIT License.

#include "command_args_parser.h"
Expand Down Expand Up @@ -29,28 +28,30 @@ namespace qnnctxgen {

/*static*/ void CommandLineParser::ShowUsage() {
printf(
"onnxruntime_qnn_ctx_gen [options...] model1_path,model2_path\n"
"Example: ./onnxruntime_qnn_ctx_gen -i \"soc_model|60 htp_graph_finalization_optimization_mode|3\" -C \"ep.context_node_name_prefix|_part1\" ./model1.onnx,./model2.onnx\n"
"ep_weight_sharing_ctx_gen [options...] model1_path,model2_path\n"
"Example: ./ep_weight_sharing_ctx_gen -i \"soc_model|60 htp_graph_finalization_optimization_mode|3\" -C \"ep.context_node_name_prefix|_part1\" ./model1.onnx,./model2.onnx\n"
"Options:\n"
"\t-e [qnn|tensorrt|openvino|vitisai]: Specifies the compile based provider 'qnn','tensorrt','openvino', 'vitisai'. "
"Default:'qnn'.\n"
"\t-v: Show verbose information.\n"
"\t-C: Specify session configuration entries as key-value pairs: -C \"<key1>|<value1> <key2>|<value2>\" \n"
"\t Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n"
"\t Force ep.context_enable to 1 and ep.context_embed_mode to 0. Change ep.context_file_path is not allowed."
"\t [Example] -C \"ep.context_node_name_prefix|_part1\" \n"
"\t-i: Specify QNN EP specific runtime options as key value pairs. Different runtime options available are: \n"
"\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
"\t [Usage]: -i '<key1>|<value1> <key2>|<value2>'\n"
"\n"
"\t [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. default to HTP backend\n"
"\t [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
"\t [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: '0', '1', '2', '3', default is '0'.\n"
"\t [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
"\t [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
"\t [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. default to HTP backend\n"
"\t [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
"\t [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: '0', '1', '2', '3', default is '0'.\n"
"\t [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
"\t [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
"\t [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
"\t Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
"\t [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n"
"\t [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
"\t Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n"
"\t [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary."
"\t [QNN only] [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n"
"\t [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
"\t Defaults to '1' (QNN EP handles the graph I/O quantization and dequantization). \n"
"\t [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary."
"\t [Example] -i \"vtcm_mb|8 htp_arch|73\" \n"
"\n"
"\t-h: help\n");
Expand Down Expand Up @@ -109,8 +110,22 @@ static bool ParseSessionConfigs(const std::string& configs_string,

/*static*/ bool CommandLineParser::ParseArguments(TestConfig& test_config, int argc, ORTCHAR_T* argv[]) {
int ch;
while ((ch = getopt(argc, argv, ORT_TSTR("o:u:i:C:vh"))) != -1) {
while ((ch = getopt(argc, argv, ORT_TSTR("e:o:u:i:C:vh"))) != -1) {
switch (ch) {
case 'e':
if (!CompareCString(optarg, ORT_TSTR("qnn"))) {
test_config.machine_config.provider_type_name = onnxruntime::kQnnExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("openvino"))) {
test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("tensorrt"))) {
test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider;
} else if (!CompareCString(optarg, ORT_TSTR("vitisai"))) {
test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider;
} else {
fprintf(stderr, "The execution provider is not included in this tool.\n");
return false;
}
break;
case 'v':
test_config.run_config.f_verbose = true;
break;
Expand Down Expand Up @@ -162,7 +177,7 @@ static bool ParseSessionConfigs(const std::string& configs_string,
'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer'])");
}

test_config.run_config.qnn_options[key] = value;
test_config.run_config.provider_options[key] = value;
}
break;
}
Expand Down
Loading
Loading