diff --git a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
index 026fc3b2dc0a0..c9cd2a00ec167 100644
--- a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
+++ b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
@@ -7,8 +7,10 @@
  * - `kDeviceId`: Specifies the GPU device ID to use.
  * - `kHasUserComputeStream`: Indicates whether a user-provided compute stream is used.
  * - `kUserComputeStream`: Specifies the user-provided compute stream.
+ * - `kUserAuxStreamArray`: Specifies the user-provided array of auxiliary streams.
  * - `kMaxWorkspaceSize`: Sets the maximum workspace size for GPU memory allocation.
  * - 'kMaxSharedMemSize': Sets the maximum amount of shared memory that TensorRT kernels are allowed to use
+ * - `kLengthAuxStreamArray`: Specifies the length of the auxiliary stream array (`kUserAuxStreamArray`); also sets the maximum number of auxiliary streams for TensorRT execution.
  * - `kDumpSubgraphs`: Enables or disables dumping of subgraphs for debugging.
  * - `kDetailedBuildLog`: Enables or disables detailed build logs for debugging.
  * - `kProfilesMinShapes`: Specifies the minimum shapes for profiling.
@@ -24,8 +26,10 @@ namespace provider_option_names {
 constexpr const char* kDeviceId = "device_id";
 constexpr const char* kHasUserComputeStream = "has_user_compute_stream";
 constexpr const char* kUserComputeStream = "user_compute_stream";
+constexpr const char* kUserAuxStreamArray = "user_aux_stream_array";
 constexpr const char* kMaxWorkspaceSize = "nv_max_workspace_size";
 constexpr const char* kMaxSharedMemSize = "nv_max_shared_mem_size";
+constexpr const char* kLengthAuxStreamArray = "nv_length_aux_stream_array";
 constexpr const char* kDumpSubgraphs = "nv_dump_subgraphs";
 constexpr const char* kDetailedBuildLog = "nv_detailed_build_log";
 constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";
diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc
index e2a8005aba1da..836f407ebb436 100644
--- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc
+++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc
@@ -984,6 +984,17 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
     stream_ = nullptr;  // Will be created in compute function
   }
 
+  if (info.user_aux_stream_array != nullptr) {
+    if (info.auxiliary_streams <= 0) {
+      ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "Auxiliary stream count must be greater than 0 when external auxiliary streams are provided"));
+    }
+    external_aux_streams_ = true;
+    aux_streams_ = reinterpret_cast<cudaStream_t*>(info.user_aux_stream_array);
+  } else {
+    external_aux_streams_ = false;
+    aux_streams_ = nullptr;
+  }
+
   std::string profile_min_shapes, profile_max_shapes, profile_opt_shapes;
 
   // incase the EP context is dumped the engine cache has to be enabled
@@ -3033,6 +3044,11 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP select an optimization profile for the current context failed");
     }
 
+    // Set auxiliary streams if provided by the user
+    if (external_aux_streams_ && aux_streams_ != nullptr) {
+      trt_context->setAuxStreams(aux_streams_, static_cast<int32_t>(auxiliary_streams_));
+    }
+
    // Check before using trt_engine
    if (trt_engine == nullptr) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "No engine is found.");
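For reference, the `setAuxStreams()` call sites this change adds (one above, and a second in the precompiled-engine path below) map onto TensorRT's auxiliary-stream contract: TensorRT runs independent parts of a network on auxiliary streams and creates them internally when none are supplied. Below is a minimal sketch of that contract, assuming the TensorRT 8.6+ `ICudaEngine`/`IExecutionContext` API; the helper name `AttachAuxStreams` is illustrative and not part of this PR.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>
#include <cuda_runtime_api.h>
#include <NvInfer.h>

// Hand a caller-owned set of CUDA streams to a TensorRT execution context.
void AttachAuxStreams(const nvinfer1::ICudaEngine& engine,
                      nvinfer1::IExecutionContext& context,
                      std::vector<cudaStream_t>& user_streams) {
  // The engine reports how many auxiliary streams it can actually use, so
  // clamp to avoid handing TensorRT more streams than it will consume.
  const int32_t usable = std::min(engine.getNbAuxStreams(),
                                  static_cast<int32_t>(user_streams.size()));
  if (usable > 0) {
    // setAuxStreams() must be called before enqueueV3(); TensorRT then
    // synchronizes the auxiliary streams with the main compute stream.
    context.setAuxStreams(user_streams.data(), usable);
  }
}
```

The EP itself passes `auxiliary_streams_` through unclamped; since the same option value also sets the maximum number of auxiliary streams (per the header comment above), array length and stream budget stay consistent by construction.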
is found."); @@ -3444,6 +3460,11 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra } } + // Set auxiliary stream if provided by user + if (external_aux_streams_ && aux_streams_ != nullptr) { + trt_context->setAuxStreams(aux_streams_, (int32_t)auxiliary_streams_); + } + // Start CUDA graph capture with the correct stream // Note: We need to set the stream and start capture here because this is where we have access to the actual compute stream // Get the graph annotation ID that was stored during OnRunStart diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h index bb8f687db094f..5c6ca20d75ec6 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h @@ -349,6 +349,8 @@ class NvExecutionProvider : public IExecutionProvider { mutable NvExecutionProviderInfo info_; bool external_stream_ = false; cudaStream_t stream_ = nullptr; + bool external_aux_streams_ = false; + cudaStream_t* aux_streams_ = nullptr; int max_partition_iterations_ = 1000; size_t min_subgraph_size_ = 1; size_t max_workspace_size_ = 0; diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc index f25718114891b..74e16079a7cad 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.cc @@ -16,6 +16,7 @@ NvExecutionProviderInfo NvExecutionProviderInfo::FromProviderOptions(const Provi const ConfigOptions& session_options) { NvExecutionProviderInfo info{}; void* user_compute_stream = nullptr; + void* user_aux_stream_array = nullptr; void* onnx_bytestream = nullptr; void* external_data_bytestream = nullptr; ORT_THROW_IF_ERROR( @@ -41,8 +42,17 @@ NvExecutionProviderInfo NvExecutionProviderInfo::FromProviderOptions(const Provi user_compute_stream = reinterpret_cast(address); return Status::OK(); }) + .AddValueParser( + nv::provider_option_names::kUserAuxStreamArray, + [&user_aux_stream_array](const std::string& value_str) -> Status { + size_t address; + ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(value_str, address)); + user_aux_stream_array = reinterpret_cast(address); + return Status::OK(); + }) .AddAssignmentToReference(nv::provider_option_names::kMaxWorkspaceSize, info.max_workspace_size) .AddAssignmentToReference(nv::provider_option_names::kMaxSharedMemSize, info.max_shared_mem_size) + .AddAssignmentToReference(nv::provider_option_names::kLengthAuxStreamArray, info.auxiliary_streams) .AddAssignmentToReference(nv::provider_option_names::kDumpSubgraphs, info.dump_subgraphs) .AddAssignmentToReference(nv::provider_option_names::kDetailedBuildLog, info.detailed_build_log) .AddAssignmentToReference(nv::provider_option_names::kProfilesMinShapes, info.profile_min_shapes) @@ -56,6 +66,7 @@ NvExecutionProviderInfo NvExecutionProviderInfo::FromProviderOptions(const Provi info.user_compute_stream = user_compute_stream; info.has_user_compute_stream = (user_compute_stream != nullptr); + info.user_aux_stream_array = user_aux_stream_array; info.onnx_bytestream = onnx_bytestream; info.external_data_bytestream = external_data_bytestream; @@ -98,8 +109,10 @@ ProviderOptions NvExecutionProviderInfo::ToProviderOptions(const NvExecutionProv {nv::provider_option_names::kDeviceId, 
@@ -98,8 +109,10 @@ ProviderOptions NvExecutionProviderInfo::ToProviderOptions(const NvExecutionProv
       {nv::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)},
       {nv::provider_option_names::kHasUserComputeStream, MakeStringWithClassicLocale(info.has_user_compute_stream)},
       {nv::provider_option_names::kUserComputeStream, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.user_compute_stream))},
+      {nv::provider_option_names::kUserAuxStreamArray, MakeStringWithClassicLocale(reinterpret_cast<size_t>(info.user_aux_stream_array))},
       {nv::provider_option_names::kMaxWorkspaceSize, MakeStringWithClassicLocale(info.max_workspace_size)},
       {nv::provider_option_names::kMaxSharedMemSize, MakeStringWithClassicLocale(info.max_shared_mem_size)},
+      {nv::provider_option_names::kLengthAuxStreamArray, MakeStringWithClassicLocale(info.auxiliary_streams)},
       {nv::provider_option_names::kDumpSubgraphs, MakeStringWithClassicLocale(info.dump_subgraphs)},
       {nv::provider_option_names::kDetailedBuildLog, MakeStringWithClassicLocale(info.detailed_build_log)},
       {nv::provider_option_names::kProfilesMinShapes, MakeStringWithClassicLocale(info.profile_min_shapes)},
diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h
index 372e8196f38c2..26f392ad446a3 100644
--- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h
+++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider_info.h
@@ -21,6 +21,7 @@ struct NvExecutionProviderInfo {
   int device_id{0};
   bool has_user_compute_stream{false};
   void* user_compute_stream{nullptr};
+  void* user_aux_stream_array{nullptr};
   int max_partition_iterations{1000};
   int min_subgraph_size{1};
   size_t max_workspace_size{0};
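Putting it together, the caller allocates the streams, hands the array's address and length to the EP, and keeps both alive until the session is released, since `NvExecutionProvider` stores the raw pointer rather than copying the array. An end-to-end sketch, with the caveat that the registration name `"NvTensorRtRtx"` and the model path are assumptions for illustration:

```cpp
#include <cstddef>
#include <string>
#include <unordered_map>
#include <vector>
#include <cuda_runtime_api.h>
#include <onnxruntime_cxx_api.h>

int main() {
  // The caller owns the auxiliary streams and the array that holds them.
  std::vector<cudaStream_t> aux_streams(2);
  for (auto& s : aux_streams) cudaStreamCreate(&s);

  Ort::Env env;
  Ort::SessionOptions so;
  so.AppendExecutionProvider(
      "NvTensorRtRtx",  // assumed registration name for this EP
      {{"user_aux_stream_array",
        std::to_string(reinterpret_cast<size_t>(aux_streams.data()))},
       {"nv_length_aux_stream_array", std::to_string(aux_streams.size())}});

  {
    Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // illustrative path
    // ... session.Run(...): the EP forwards the array to
    // IExecutionContext::setAuxStreams() before execution.
  }  // destroy the session before the streams it borrows

  for (auto& s : aux_streams) cudaStreamDestroy(s);
  return 0;
}
```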