Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions include/onnxruntime/core/framework/execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,24 @@ class IExecutionProvider {
*/
virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); }

/**
Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for
the provider. Currently only CUDA execution provider supports it.
*/
virtual bool IsGraphCaptureEnabled() const { return false; }

/**
Indicate whether the graph has been captured and instantiated. Currently
only CUDA execution provider supports it.
*/
virtual bool IsGraphCaptured() const { return false; }

/**
Run the instantiated graph. Currently only CUDA execution provider supports
it.
*/
virtual common::Status ReplayGraph() { return Status::OK(); }

/**
Called when session creation is complete
This provides an opportunity for execution providers to optionally synchronize and
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ struct OrtCUDAProviderOptionsV2 {
// (will be overridden by contents of `default_memory_arena_cfg` is it exists)
OrtArenaCfg* default_memory_arena_cfg; // BFC Arena config flags.
int cudnn_conv_use_max_workspace; // flag specifying if maximum workspace can be used in cudnn conv algo search.
int enable_cuda_graph; // flag specifying if the CUDA graph is to be captured for the model.
};
70 changes: 69 additions & 1 deletion onnxruntime/core/providers/cuda/cuda_execution_provider.cc
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,10 @@ CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId de

// CUDA malloc/free is expensive so always use an arena
allocator_ = CreateCudaAllocator(device_id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, default_memory_arena_cfg);

#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
cuda_graph_.SetStream(stream_);
#endif
}

CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
Expand All @@ -155,6 +159,35 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
}
}

#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const {
return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_;
}

void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
cuda_graph_.Reset();
cuda_graph_.CaptureBegin();
}

void CUDAExecutionProvider::PerThreadContext::CaptureEnd() {
cuda_graph_.CaptureEnd();
is_graph_captured_ = true;
}

bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptured() const {
return is_graph_captured_;
}

Status CUDAExecutionProvider::PerThreadContext::ReplayGraph() {
ORT_ENFORCE(IsGraphCaptured());
return cuda_graph_.Replay();
}

void CUDAExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() {
++regular_run_count_before_graph_capture_;
}
#endif

CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& info)
: IExecutionProvider{onnxruntime::kCudaExecutionProvider},
info_{info} {
Expand Down Expand Up @@ -331,17 +364,38 @@ Status CUDAExecutionProvider::OnRunStart() {
auto& current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent();
CUDA_RETURN_IF_ERROR(cudaEventCreate(&current_deferred_release_event, cudaEventDisableTiming));
deferred_release_cpu_ptr_.emplace(current_deferred_release_event, DeferredReleaseCPUPtrs());

if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) {
LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model";
GetPerThreadContext().CaptureBegin();
}
return Status::OK();
}

Status CUDAExecutionProvider::OnRunEnd(bool sync_stream) {
if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) {
if (GetPerThreadContext().IsGraphCaptureAllowed()) {
GetPerThreadContext().CaptureEnd();
// CUDA work issued to a capturing stream doesn’t actually run on the GPU,
// so run the captured graph here to actually execute the work.
ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph());
} else {
GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture();
}
}
// record deferred release event on default stream, and release per_thread_context
auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent();
CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, static_cast<cudaStream_t>(GetComputeStream())));
if (sync_stream) {
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(GetComputeStream())));
}
ReleasePerThreadContext();

// If cuda graph is enabled, the per thread context will not be released
// because the per thread cuda graph needs to be maintained and replayed for
// the next run.
if (!IsGraphCaptureEnabled()) {
ReleasePerThreadContext();
}
std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true;

Expand All @@ -360,6 +414,20 @@ Status CUDAExecutionProvider::SetComputeStream(void* stream) {
return Status::OK();
}

#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
bool CUDAExecutionProvider::IsGraphCaptureEnabled() const {
return info_.enable_cuda_graph;
}

bool CUDAExecutionProvider::IsGraphCaptured() const {
return GetPerThreadContext().IsGraphCaptured();
}

Status CUDAExecutionProvider::ReplayGraph() {
return GetPerThreadContext().ReplayGraph();
}
#endif

namespace cuda {
// opset 1 to 9
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyFromHost);
Expand Down
27 changes: 27 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "core/framework/execution_provider.h"
#include "core/platform/ort_mutex.h"
#include "core/providers/cuda/cuda_execution_provider_info.h"
#include "core/providers/cuda/cuda_graph.h"
#include "core/providers/cuda/cuda_pch.h"
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/providers/cuda/shared_inc/cuda_call.h"
Expand Down Expand Up @@ -94,6 +95,12 @@ class CUDAExecutionProvider : public IExecutionProvider {

std::unique_ptr<profiling::EpProfiler> GetProfiler() override;

#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
bool IsGraphCaptureEnabled() const override;
bool IsGraphCaptured() const override;
Status ReplayGraph() override;
#endif

private:
CUDAExecutionProviderInfo info_;
cudaDeviceProp device_prop_;
Expand Down Expand Up @@ -157,6 +164,15 @@ class CUDAExecutionProvider : public IExecutionProvider {
return allocator_;
}

#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
bool IsGraphCaptureAllowed() const;
void CaptureBegin();
void CaptureEnd();
bool IsGraphCaptured() const;
Status ReplayGraph();
void IncrementRegularRunCountBeforeGraphCapture();
#endif

private:
cudaStream_t stream_ = nullptr;
cublasHandle_t cublas_handle_ = nullptr;
Expand All @@ -173,6 +189,17 @@ class CUDAExecutionProvider : public IExecutionProvider {
std::unique_ptr<cuda::IConstantBuffer<BFloat16>> constant_ones_bfloat16_;

AllocatorPtr allocator_;

#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
// Cuda graph with multi threads will be supported in the future, so cuda_graph_
// is put under PerThreadContext.
CUDAGraph cuda_graph_;
bool is_graph_captured_ = false;
int regular_run_count_before_graph_capture_ = 0;
const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations.

#endif

};

using PerThreadContextMap = std::unordered_map<const CUDAExecutionProvider*, std::weak_ptr<PerThreadContext>>;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ constexpr const char* kGpuExternalAlloc = "gpu_external_alloc";
constexpr const char* kGpuExternalFree = "gpu_external_free";
constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
constexpr const char* kCudnnConvUseMaxWorkspace = "cudnn_conv_use_max_workspace";
constexpr const char* kEnableCudaGraph = "enable_cuda_graph";
} // namespace provider_option_names
} // namespace cuda

Expand Down Expand Up @@ -92,6 +93,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
*ort_cudnn_conv_algo_search_mapping, info.cudnn_conv_algo_search)
.AddAssignmentToReference(cuda::provider_option_names::kDoCopyInDefaultStream, info.do_copy_in_default_stream)
.AddAssignmentToReference(cuda::provider_option_names::kCudnnConvUseMaxWorkspace, info.cudnn_conv_use_max_workspace)
.AddAssignmentToReference(cuda::provider_option_names::kEnableCudaGraph, info.enable_cuda_graph)
.Parse(options));

CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
Expand All @@ -112,6 +114,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution
EnumToName(*ort_cudnn_conv_algo_search_mapping, info.cudnn_conv_algo_search)},
{cuda::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)},
{cuda::provider_option_names::kCudnnConvUseMaxWorkspace, MakeStringWithClassicLocale(info.cudnn_conv_use_max_workspace)},
{cuda::provider_option_names::kEnableCudaGraph, MakeStringWithClassicLocale(info.enable_cuda_graph)}
};

return options;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ struct CUDAExecutionProviderInfo {
// If set to true, try to use as much as possible memory for algo search.
bool cudnn_conv_use_max_workspace{false};

bool enable_cuda_graph{false};

static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info);
static ProviderOptions ToProviderOptions(const OrtCUDAProviderOptionsV2& info);
Expand Down
89 changes: 89 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_graph.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/cuda/cuda_graph.h"

#include "core/providers/cuda/cuda_common.h"
#include <cuda_runtime_api.h>
#include <driver_types.h>


namespace onnxruntime {

CUDAGraph::CUDAGraph(cudaStream_t stream) : stream_(stream) {
#if (defined(CUDA_VERSION) && CUDA_VERSION < 10000)
ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
#endif
}

void CUDAGraph::SetStream(cudaStream_t stream) {
stream_ = stream;
}

void CUDAGraph::CaptureBegin() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
ORT_ENFORCE(!has_graph_exec_,
"This cuda graph has already captured a graph. "
"Create a new instance to capture a new graph.");

CUDA_CALL_THROW(cudaStreamSynchronize(stream_));
// For now cuda graph can only work with a single thread. In the future, we
// will support multiple threads. For multiple threads with multiple graphs
// and streams, `cudaStreamCaptureModeGlobal` needs to be changed to
// `cudaStreamCaptureModeThreadLocal`
CUDA_CALL_THROW(cudaStreamBeginCapture(stream_, cudaStreamCaptureModeGlobal));
#else
ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
#endif
}

void CUDAGraph::CaptureEnd() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
CUDA_CALL_THROW(cudaStreamEndCapture(stream_, &graph_));
if (graph_ == NULL) {
ORT_THROW("CUDAGraph::CaptureEnd: graph_ is NULL");
}

has_graph_ = true;
CUDA_CALL_THROW(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
has_graph_exec_ = true;
CUDA_CALL_THROW(cudaGraphDestroy(graph_));
has_graph_ = false;
#else
ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
#endif
}

Status CUDAGraph::Replay() {
// Although this function is not thread safe, the lock is not needed here because
// CUDA EP maintains a separate cuda graph per thread
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_;
CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec_, stream_));
CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_));
#else
ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
#endif
return Status::OK();
}

void CUDAGraph::Reset() {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
if (has_graph_) {
CUDA_CALL_THROW(cudaGraphDestroy(graph_));
has_graph_ = false;
}
if (has_graph_exec_) {
CUDA_CALL_THROW(cudaGraphExecDestroy(graph_exec_));
has_graph_exec_ = false;
}
#else
ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
#endif
}

CUDAGraph::~CUDAGraph() {
Reset();
}

} // namespace onnxruntime
38 changes: 38 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_graph.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/common/common.h"
#include "core/platform/ort_mutex.h"
#include "core/providers/cuda/cuda_pch.h"

namespace onnxruntime {

using CaptureId_t = unsigned long long;

struct CUDAGraph {
CUDAGraph() {};
CUDAGraph(cudaStream_t stream);
~CUDAGraph();

void SetStream(cudaStream_t stream);
void CaptureBegin();
void CaptureEnd();
Status Replay();
void Reset();

private:
#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
cudaGraph_t graph_ = NULL;
cudaGraphExec_t graph_exec_ = NULL;
#endif

bool has_graph_ = false;
bool has_graph_exec_ = false;

CaptureId_t id_;
cudaStream_t stream_ = nullptr; // Does not own the stream
};

} // namespace onnxruntime
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/cuda/cuda_provider_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ struct CUDA_Provider : Provider {
info.user_compute_stream = params->user_compute_stream;
info.default_memory_arena_cfg = params->default_memory_arena_cfg;
info.cudnn_conv_use_max_workspace = params->cudnn_conv_use_max_workspace != 0;
info.enable_cuda_graph = params->enable_cuda_graph != 0;

return std::make_shared<CUDAProviderFactory>(info);
}
Expand All @@ -216,6 +217,7 @@ struct CUDA_Provider : Provider {
cuda_options.user_compute_stream = internal_options.user_compute_stream;
cuda_options.default_memory_arena_cfg = internal_options.default_memory_arena_cfg;
cuda_options.cudnn_conv_use_max_workspace = internal_options.cudnn_conv_use_max_workspace;
cuda_options.enable_cuda_graph = internal_options.enable_cuda_graph;
}

ProviderOptions GetProviderOptions(const void* provider_options) override {
Expand Down
Loading