diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 3e713b69671e7..fb0c379522691 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -455,7 +455,7 @@ if (onnxruntime_USE_CUDA) list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64) else() if(onnxruntime_CUDNN_HOME) - list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64) + list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64) endif() list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64) endif() diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index e7e989cf17f20..ddf390db3d1c3 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -5,7 +5,7 @@ /// /// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT_V2. -/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally. +/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally. /// Going forward, new trt provider options are to be supported via this struct and usage of the publicly defined /// OrtTensorRTProviderOptions will be deprecated over time. /// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions. @@ -31,4 +31,7 @@ struct OrtTensorRTProviderOptionsV2 { int trt_force_sequential_engine_build; // force building TensorRT engine sequentially. Default 0 = false, nonzero = true int trt_context_memory_sharing_enable; // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true int trt_layer_norm_fp32_fallback; // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true + int trt_timing_cache_enable; // enable TensorRT timing cache. Default 0 = false, nonzero = true + int trt_force_timing_cache; // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true + int trt_detailed_build_log; // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 7d14f43064f00..10c67963b85cc 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -117,6 +117,32 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<std::string, float>& dynamic_range_map) { +std::vector<char> loadTimingCacheFile(const std::string inFileName) { + std::ifstream iFile(inFileName, std::ios::in | std::ios::binary); + if (!iFile) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read timing cache from: " << inFileName + << ". 
A new timing cache will be generated and written."; + return std::vector<char>(); + } + iFile.seekg(0, std::ifstream::end); + size_t fsize = iFile.tellg(); + iFile.seekg(0, std::ifstream::beg); + std::vector<char> content(fsize); + iFile.read(content.data(), fsize); + iFile.close(); + return content; +} + +inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) { + std::ofstream oFile(outFileName, std::ios::out | std::ios::binary); + if (!oFile) { + LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName; + return; + } + oFile.write((char*)blob->data(), blob->size()); + oFile.close(); +} } // namespace namespace google { @@ -312,7 +338,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv } dump_subgraphs_ = info.dump_subgraphs; engine_cache_enable_ = info.engine_cache_enable; - if (engine_cache_enable_ || int8_enable_) { + timing_cache_enable_ = info.timing_cache_enable; + force_timing_cache_match_ = info.force_timing_cache; + detailed_build_log_ = info.detailed_build_log; + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { cache_path_ = info.engine_cache_path; } engine_decryption_enable_ = info.engine_decryption_enable; @@ -386,7 +415,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true); } - if (engine_cache_enable_ || int8_enable_) { + const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable); + if (!timing_cache_enable_env.empty()) { + timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true); + } + + const std::string detailed_build_log_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDetailedBuildLog); + if (!detailed_build_log_env.empty()) { + detailed_build_log_ = (std::stoi(detailed_build_log_env) == 0 ? false : true); + } + + const std::string timing_force_match_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kForceTimingCache); + if (!timing_force_match_env.empty()) { + force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? 
false : true); + } + + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath); cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath); if (!engine_cache_path.empty() && cache_path_.empty()) { @@ -438,7 +482,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv dla_core_ = 0; } - if (engine_cache_enable_ || int8_enable_) { + if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) { if (!cache_path_.empty() && !fs::is_directory(cache_path_)) { if (!fs::create_directory(cache_path_)) { throw std::runtime_error("Failed to create directory " + cache_path_); @@ -1373,6 +1417,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector + std::string timing_cache_path = ""; + if (timing_cache_enable_) { + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + timing_cache_path = GetTimingCachePath(cache_path_, prop); + } + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr; + if (timing_cache_enable_) { + std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); + } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; + } + } + // Build engine + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); + } trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config)); if (trt_engine == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not build engine for fused node: " + fused_node.Name()); } + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " << + std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } if (engine_cache_enable_) { std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize()); size_t engine_size = serializedModel->size(); @@ -1438,7 +1512,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector(serializedModel->data()), engine_size); } - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path; + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path; + } + // serialize and save timing cache + if (timing_cache_enable_) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; + } } } } @@ -1504,7 +1591,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectornode_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_, - dynamic_range_map, 
engine_decryption_enable_, engine_decryption_, engine_encryption_}; + dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, + force_timing_cache_match_, detailed_build_log_}; *state = p.release(); return 0; }; @@ -1545,6 +1633,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision); const std::string engine_cache_path = cache_path + ".engine"; const std::string profile_cache_path = cache_path + ".profile"; + std::string timing_cache_path = ""; + if (timing_cache_enable_) { + cudaDeviceProp prop; + CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_)); + timing_cache_path = GetTimingCachePath(cache_path_, prop); + } if (trt_state->engine_cache_enable && trt_engine == nullptr) { std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in); std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in); @@ -1779,11 +1873,35 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector trt_config->setDLACore(trt_state->dla_core); } + // Load timing cache from file. Create a fresh cache if the file doesn't exist + std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr; + if (trt_state->timing_cache_enable) { + std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path); + timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size())); + if (timing_cache == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not create timing cache: " + timing_cache_path); + } + trt_config->setTimingCache(*timing_cache, force_timing_cache_match_); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path; + } + } + // Build engine { auto lock = GetApiLock(); + std::chrono::steady_clock::time_point engine_build_start; + if (detailed_build_log_) { + engine_build_start = std::chrono::steady_clock::now(); + } *(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>( trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config)); + if (detailed_build_log_) { + auto engine_build_stop = std::chrono::steady_clock::now(); + LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " << + std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl; + } } if (trt_state->engine == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine."); @@ -1809,6 +1927,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector + // serialize and save timing cache + if (trt_state->timing_cache_enable) { + auto timing_cache = trt_config->getTimingCache(); + std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()}; + if (timingCacheHostData == nullptr) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT EP could not serialize timing cache: " + timing_cache_path); + } + saveTimingCacheFile(timing_cache_path, timingCacheHostData.get()); + if (detailed_build_log_) { + LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path; + } + } + // Build context if (trt_state->context_memory_sharing_enable) { *(trt_state->context) = std::unique_ptr<nvinfer1::IExecutionContext>( diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 784657e54dd44..cb87b31e01b96 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ 
b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -30,6 +30,9 @@ static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LI static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD"; static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE"; static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK"; +static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE"; +static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE"; +static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE"; // Old env variable for backward compatibility static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH"; } // namespace tensorrt_env_vars @@ -114,6 +117,9 @@ struct TensorrtFuncState { bool engine_decryption_enable = false; int (*engine_decryption)(const char*, char*, size_t*) = nullptr; int (*engine_encryption)(const char*, char*, size_t) = nullptr; + bool timing_cache_enable = true; + bool force_timing_cache = false; + bool detailed_build_log = false; }; // Logical device representation. @@ -176,6 +182,9 @@ class TensorrtExecutionProvider : public IExecutionProvider { bool engine_decryption_enable_ = false; int (*engine_decryption_)(const char*, char*, size_t*) = nullptr; int (*engine_encryption_)(const char*, char*, size_t) = nullptr; + bool timing_cache_enable_ = false; + bool force_timing_cache_match_ = false; + bool detailed_build_log_ = false; std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"}; std::unordered_map> parsers_; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index 2db405d5120b6..ae06e6ce1338d 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -27,11 +27,14 @@ constexpr const char* kCachePath = "trt_engine_cache_path"; constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable"; constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path"; constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine_build"; -// add new provider option name here. +// add new provider option name here. 
constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable"; constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback"; +constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable"; +constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match"; +constexpr const char* kDetailedBuildLog = "trt_detailed_build_log"; } // namespace provider_option_names -} // namespace tensorrt +} // namespace tensorrt TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) { TensorrtExecutionProviderInfo info{}; @@ -57,15 +60,17 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions .AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name) .AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table) .AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable) - .AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core) + .AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core) .AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs) .AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kCachePath, info.engine_cache_path) .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable) - .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path) + .AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path) .AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build) .AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable) .AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback) + .AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable) + .AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache) .Parse(options)); // add new provider option here. return info; @@ -93,6 +98,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE // add new provider option here. 
{tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)}, {tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)}, + {tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)}, }; return options; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 1f1fdb679f2a9..1a2e5e01af464 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -17,10 +17,10 @@ struct TensorrtExecutionProviderInfo { void* user_compute_stream{nullptr}; bool has_trt_options{false}; int max_partition_iterations{1000}; - int min_subgraph_size{1}; + int min_subgraph_size{1}; size_t max_workspace_size{1 << 30}; bool fp16_enable{false}; - bool int8_enable{false}; + bool int8_enable{false}; std::string int8_calibration_table_name{""}; bool int8_use_native_calibration_table{false}; bool dla_enable{false}; @@ -33,6 +33,9 @@ struct TensorrtExecutionProviderInfo { bool force_sequential_engine_build{false}; bool context_memory_sharing_enable{false}; bool layer_norm_fp32_fallback{false}; + bool timing_cache_enable{false}; + bool force_timing_cache{false}; + bool detailed_build_log{false}; static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h index ad1fdf00227ec..22702bec3b504 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h @@ -168,6 +168,17 @@ std::string GetCachePath(const std::string& root, const std::string& name) { } } +/* + * Get the timing cache file path, named by GPU compute capability + * + */ +std::string GetTimingCachePath(const std::string& root, cudaDeviceProp prop) { + // append the GPU compute capability, since a timing cache built for a different capability is invalid and TRT will throw when loading it + const std::string timing_cache_name = "TensorrtExecutionProvider_cache_cc" + + std::to_string(prop.major * 10 + prop.minor) + ".timing"; + return GetCachePath(root, timing_cache_name); +} + /* * Get cache by type * diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index dd6915878eff5..9b4b8236e0b23 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -70,6 +70,8 @@ struct Tensorrt_Provider : Provider { info.force_sequential_engine_build = options.trt_force_sequential_engine_build != 0; info.context_memory_sharing_enable = options.trt_context_memory_sharing_enable != 0; info.layer_norm_fp32_fallback = options.trt_layer_norm_fp32_fallback != 0; + info.timing_cache_enable = options.trt_timing_cache_enable != 0; + info.detailed_build_log = options.trt_detailed_build_log != 0; return std::make_shared<TensorrtProviderFactory>(info); } @@ -137,6 +139,8 @@ struct Tensorrt_Provider : Provider { trt_options.trt_force_sequential_engine_build = internal_options.force_sequential_engine_build; trt_options.trt_context_memory_sharing_enable 
= internal_options.context_memory_sharing_enable; trt_options.trt_layer_norm_fp32_fallback = internal_options.layer_norm_fp32_fallback; + trt_options.trt_timing_cache_enable = internal_options.timing_cache_enable; + trt_options.trt_force_timing_cache = internal_options.force_timing_cache; } ProviderOptions GetProviderOptions(const void* provider_options) override { diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 0e82fdb0097b2..8e70dd24ac10a 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1274,7 +1274,10 @@ OrtTensorRTProviderOptionsV2 OrtTensorRTProviderOptionsToOrtTensorRTProviderOpti trt_options_converted.trt_engine_decryption_lib_path = legacy_trt_options->trt_engine_decryption_lib_path; trt_options_converted.trt_force_sequential_engine_build = legacy_trt_options->trt_force_sequential_engine_build; // Add new provider option below - // Use default value as this field is not available in OrtTensorRTProviderOptionsV + // Use default value as this field is not available in OrtTensorRTProviderOptions + trt_options_converted.trt_timing_cache_enable = 0; + trt_options_converted.trt_force_timing_cache = 0; + trt_options_converted.trt_detailed_build_log = 0; trt_options_converted.trt_context_memory_sharing_enable = 0; trt_options_converted.trt_layer_norm_fp32_fallback = 0; return trt_options_converted; @@ -1601,6 +1604,9 @@ ORT_API_STATUS_IMPL(OrtApis::CreateTensorRTProviderOptions, _Outptr_ OrtTensorRT (*out)->trt_force_sequential_engine_build = false; (*out)->trt_context_memory_sharing_enable = false; (*out)->trt_layer_norm_fp32_fallback = false; + (*out)->trt_timing_cache_enable = false; + (*out)->trt_force_timing_cache = false; + (*out)->trt_detailed_build_log = false; return nullptr; #else ORT_UNUSED_PARAMETER(out); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index dd24ce51e1111..667073063d4cc 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -366,6 +366,9 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance( nullptr, 0, 0, + 0, + 0, + 0, 0}; for (auto option : it->second) { if (option.first == "device_id") { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f6ad3b5d75f77..552274b77bbfb 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -121,6 +121,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device bool trt_force_sequential_engine_build = false; bool trt_context_memory_sharing_enable = false; bool trt_layer_norm_fp32_fallback = false; + bool trt_timing_cache_enable = false; + bool trt_force_timing_cache = false; + bool trt_detailed_build_log = false; #ifdef _MSC_VER std::string ov_string = ToUTF8String(performance_test_config.run_config.ep_runtime_config_string); @@ -268,6 +271,30 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device } else { ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_layer_norm_fp32_fallback' should be a boolean i.e. true or false. 
Default value is false.\n"); + } else if (key == "trt_timing_cache_enable") { + if (value == "true" || value == "True") { + trt_timing_cache_enable = true; + } else if (value == "false" || value == "False") { + trt_timing_cache_enable = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_timing_cache_enable' should be a boolean i.e. true or false. Default value is false.\n"); + } + } else if (key == "trt_force_timing_cache") { + if (value == "true" || value == "True") { + trt_force_timing_cache = true; + } else if (value == "false" || value == "False") { + trt_force_timing_cache = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_force_timing_cache' should be a boolean i.e. true or false. Default value is false.\n"); + } + } else if (key == "trt_detailed_build_log") { + if (value == "true" || value == "True") { + trt_detailed_build_log = true; + } else if (value == "false" || value == "False") { + trt_detailed_build_log = false; + } else { + ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_detailed_build_log' should be a boolean i.e. true or false. Default value is false.\n"); + } } else { ORT_THROW("[ERROR] [TensorRT] wrong key type entered. Choose from the following runtime key options that are available for TensorRT. ['device_id', 'trt_max_partition_iterations', 'trt_min_subgraph_size', 'trt_max_workspace_size', 'trt_fp16_enable', 'trt_int8_enable', 'trt_int8_calibration_table_name', 'trt_int8_use_native_calibration_table', 'trt_dla_enable', 'trt_dla_core', 'trt_dump_subgraphs', 'trt_engine_cache_enable', 'trt_engine_cache_path', 'trt_engine_decryption_enable', 'trt_engine_decryption_lib_path', 'trt_force_sequential_engine_build', 'trt_context_memory_sharing_enable', 'trt_layer_norm_fp32_fallback'] \n"); } @@ -293,6 +320,9 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device tensorrt_options.trt_force_sequential_engine_build = trt_force_sequential_engine_build; tensorrt_options.trt_context_memory_sharing_enable = trt_context_memory_sharing_enable; tensorrt_options.trt_layer_norm_fp32_fallback = trt_layer_norm_fp32_fallback; + tensorrt_options.trt_timing_cache_enable = trt_timing_cache_enable; + tensorrt_options.trt_force_timing_cache = trt_force_timing_cache; + tensorrt_options.trt_detailed_build_log = trt_detailed_build_log; session_options.AppendExecutionProvider_TensorRT_V2(tensorrt_options); OrtCUDAProviderOptions cuda_options; diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc index 2039c65b53aa6..b27ded96d85a9 100644 --- a/onnxruntime/test/perftest/performance_runner.cc +++ b/onnxruntime/test/perftest/performance_runner.cc @@ -114,7 +114,9 @@ Status PerformanceRunner::Run() { } // warm up + initial_inference_result_.start = std::chrono::high_resolution_clock::now(); ORT_RETURN_IF_ERROR(RunOneIteration()); + initial_inference_result_.end = std::chrono::high_resolution_clock::now(); // TODO: start profiling // if (!performance_test_config_.run_config.profile_file.empty()) @@ -139,9 +141,12 @@ Status PerformanceRunner::Run() { std::chrono::duration<double> session_create_duration = session_create_end_ - session_create_start_; // TODO: end profiling // if (!performance_test_config_.run_config.profile_file.empty()) session_object->EndProfiling(); + auto first_inference_duration = + std::chrono::duration_cast<std::chrono::milliseconds>(initial_inference_result_.end - initial_inference_result_.start).count(); std::chrono::duration<double> inference_duration = 
performance_result_.end - performance_result_.start; std::cout << "Session creation time cost: " << session_create_duration.count() << " s\n" + << "First inference time cost: " << first_inference_duration << " ms\n" << "Total inference time cost: " << performance_result_.total_time_cost << " s\n" // sum of time taken by each request << "Total inference requests: " << performance_result_.time_costs.size() << "\n" << "Average inference time cost: " << performance_result_.total_time_cost / performance_result_.time_costs.size() * 1000 << " ms\n" diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h index aae68fd2d289f..da2df9c39f44c 100644 --- a/onnxruntime/test/perftest/performance_runner.h +++ b/onnxruntime/test/perftest/performance_runner.h @@ -106,6 +106,7 @@ class PerformanceRunner { private: std::chrono::time_point<std::chrono::high_resolution_clock> session_create_start_; std::chrono::time_point<std::chrono::high_resolution_clock> session_create_end_; + PerformanceResult initial_inference_result_; PerformanceResult performance_result_; PerformanceTestConfig performance_test_config_; std::unique_ptr<TestModelInfo> test_model_info_; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 2073beb0cfc83..1da491fe4f9b0 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -683,7 +683,7 @@ TEST_P(ModelTest, Run) { if (test_case_name.find(ORT_TSTR("FLOAT16")) != std::string::npos) { OrtTensorRTProviderOptionsV2 params{0, 0, nullptr, 1000, 1, 1 << 30, 1, // enable fp16 - 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0}; + 0, nullptr, 0, 0, 0, 0, 0, nullptr, 0, nullptr, 0, 0, 0, 0, 0, 0}; ortso.AppendExecutionProvider_TensorRT_V2(params); } else { OrtTensorRTProviderOptionsV2* ep_option = nullptr; diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index 2e139e0f57e68..2d38bf7b4b3ba 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -11,6 +11,8 @@ #include "core/providers/tensorrt/tensorrt_execution_provider_utils.h" #include #include +#include +#include using namespace std; using namespace ONNX_NAMESPACE; @@ -151,6 +153,9 @@ void RunWithOneSessionSingleThreadInference(std::string model_name, std::string nullptr, 0, 0, + 0, + 0, + 0, 0}; params.trt_engine_cache_enable = 1; @@ -222,6 +227,9 @@ void RunWithOneSessionMultiThreadsInference(std::string model_name, std::string nullptr, 0, 0, + 0, + 0, + 0, 0}; params.trt_engine_cache_enable = 1; @@ -386,6 +394,9 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { nullptr, 0, 0, + 0, + 0, + 0, 0}; if (cache_type.compare("engine") == 0) { @@ -498,9 +509,68 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { } } else if (cache_type.compare("timing") == 0) { // add test code here + + /* The following code block tests the timing cache functionality of ORT TRT, including: + * - timing cache serialization/de-serialization + * - the benefit of using a timing cache for both dynamic and static input shapes + */ + uint64_t compilation_without_cache_ms, compilation_with_cache_ms; + + params.trt_timing_cache_enable = 1; + // std::chrono + { + auto start = chrono::steady_clock::now(); + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = 
session_object.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // run inference + // TRT timing cache should be created whether the input shape is static or dynamic + status = session_object.Run(run_options, feeds, output_names, &fetches); + auto end = chrono::steady_clock::now(); + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); + ASSERT_TRUE(IsCacheExistedByType("./", ".timing")); + compilation_without_cache_ms = chrono::duration_cast<chrono::milliseconds>(end - start).count(); + } + + // get new session and reinitialize model + // a second identical inference should reuse the cache and therefore build faster + if (input_type.compare("static") == 0) { + { + InferenceSession session_object_new{so, GetEnvironment()}; + { + auto start = chrono::steady_clock::now(); + std::unique_ptr<IExecutionProvider> execution_provider = TensorrtExecutionProviderWithOptions(&params); + EXPECT_TRUE(session_object_new.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); + auto status = session_object_new.Load(model_name); + ASSERT_TRUE(status.IsOK()); + status = session_object_new.Initialize(); + ASSERT_TRUE(status.IsOK()); + + // run inference + // TRT timing cache should be created whether the input shape is static or dynamic + status = session_object_new.Run(run_options, feeds, output_names, &fetches); + // TODO narrow down actual compilation section + auto end = chrono::steady_clock::now(); + + ASSERT_TRUE(status.IsOK()); + VerifyOutputs(fetches, expected_dims_mul_m, expected_values_mul_m); + ASSERT_TRUE(IsCacheExistedByType("./", ".timing")); + compilation_with_cache_ms = chrono::duration_cast<chrono::milliseconds>(end - start).count(); + } + } + ASSERT_TRUE(compilation_with_cache_ms <= compilation_without_cache_ms); + } else { + // TODO test dynamic shapes + } } // clean up caches + RemoveCachesByType("./", ".timing"); RemoveCachesByType("./", ".engine"); RemoveCachesByType("./", ".profile"); } @@ -515,11 +585,13 @@ TEST_P(TensorrtExecutionProviderCacheTest, Run) { * We have following test parameters: * - engine_static: engine cache enabled with non-dynamic input shape * - engine_dynamic: engine cache enabled with dynamic input shape - * - timing_static: will be added - * - timing_dynamic: will be added + * - timing_static: timing cache enabled, static input shape + * - timing_dynamic: timing cache enabled, dynamic input shape */ INSTANTIATE_TEST_SUITE_P(TensorrtExecutionProviderCacheTests, TensorrtExecutionProviderCacheTest, testing::Values("engine_static", - "engine_dynamic"), + "engine_dynamic", + "timing_static", + "timing_dynamic"), [](const ::testing::TestParamInfo<TensorrtExecutionProviderCacheTest::ParamType>& info) {return info.param;}); TEST(TensorrtExecutionProviderTest, FunctionTest) {
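
For reviewers who want to exercise the new options, below is a minimal usage sketch (not part of the patch). It mirrors the ort_test_session.cc pattern above: the trt_timing_cache_enable, trt_force_timing_cache and trt_detailed_build_log fields and the ORT_TENSORRT_TIMING_CACHE_ENABLE / ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE / ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE environment variables come from this diff; the model path, cache directory and the remaining option values are placeholders chosen for illustration.

// Usage sketch only: enable the TensorRT timing cache through the V2 provider options,
// following onnxruntime/test/perftest/ort_test_session.cc above. Paths are placeholders.
#include <onnxruntime_cxx_api.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "trt_timing_cache_demo");
  Ort::SessionOptions session_options;

  OrtTensorRTProviderOptionsV2 trt_options{};          // remaining fields left zero/null for brevity
  trt_options.device_id = 0;
  trt_options.trt_max_partition_iterations = 1000;
  trt_options.trt_min_subgraph_size = 1;
  trt_options.trt_max_workspace_size = 1 << 30;
  trt_options.trt_engine_cache_enable = 1;             // existing option; the timing cache is stored under the same cache path
  trt_options.trt_engine_cache_path = "./trt_cache";   // placeholder directory
  trt_options.trt_timing_cache_enable = 1;             // new: load/save a per-compute-capability .timing file
  trt_options.trt_force_timing_cache = 0;              // new: only reuse a cache recorded on a matching device profile
  trt_options.trt_detailed_build_log = 1;              // new: log per-engine build times

  session_options.AppendExecutionProvider_TensorRT_V2(trt_options);
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);  // placeholder model
  return 0;
}

The same three switches can also be toggled per process via the ORT_TENSORRT_TIMING_CACHE_ENABLE, ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE and ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE environment variables added in tensorrt_execution_provider.h.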