microsoft · hariharans29 · Mar 7, 2022 · Dec 9, 2021 · Jan 29, 2022 · Jan 31, 2022
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
@@ -164,6 +164,24 @@ class IExecutionProvider {
   */
   virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); }
 
+  /**
+     Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for
+     the provider. Currently only CUDA execution provider supports it.
+   */
+  virtual bool IsGraphCaptureEnabled() const { return false; }
+
+  /**
+     Indicate whether the graph has been captured and instantiated. Currently
+     only CUDA execution provider supports it.
+   */
+  virtual bool IsGraphCaptured() const { return false; }
+
+  /**
+     Run the instantiated graph. Currently only CUDA execution provider supports
+     it.
+   */
+  virtual common::Status ReplayGraph() { return Status::OK(); }
+
   /**
      Called when session creation is complete
      This provides an opportunity for execution providers to optionally synchronize and

diff --git a/include/onnxruntime/core/providers/cuda/cuda_provider_options.h b/include/onnxruntime/core/providers/cuda/cuda_provider_options.h
@@ -25,4 +25,5 @@ struct OrtCUDAProviderOptionsV2 {
                                                            // (will be overridden by contents of `default_memory_arena_cfg` is it exists)
   OrtArenaCfg* default_memory_arena_cfg;                   // BFC Arena config flags.
   int cudnn_conv_use_max_workspace;                        // flag specifying if maximum workspace can be used in cudnn conv algo search.
+  int enable_cuda_graph;                                   // flag specifying if the CUDA graph is to be captured for the model.
 };
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -136,6 +136,10 @@ CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId de
 
   // CUDA malloc/free is expensive so always use an arena
   allocator_ = CreateCudaAllocator(device_id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, default_memory_arena_cfg);
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  cuda_graph_.SetStream(stream_);
+#endif
 }
 
 CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
@@ -155,6 +159,35 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
   }
 }
 
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const {
+  return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_;
+}
+
+void CUDAExecutionProvider::PerThreadContext::CaptureBegin()  {
+  cuda_graph_.Reset();
+  cuda_graph_.CaptureBegin();
+}
+
+void CUDAExecutionProvider::PerThreadContext::CaptureEnd() {
+  cuda_graph_.CaptureEnd();
+  is_graph_captured_ = true;
+}
+
+bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptured() const {
+  return is_graph_captured_;
+}
+
+Status CUDAExecutionProvider::PerThreadContext::ReplayGraph() {
+  ORT_ENFORCE(IsGraphCaptured());
+  return cuda_graph_.Replay();
+}
+
+void CUDAExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture() {
+  ++regular_run_count_before_graph_capture_;
+}
+#endif
+
 CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& info)
     : IExecutionProvider{onnxruntime::kCudaExecutionProvider},
       info_{info} {
@@ -331,17 +364,38 @@ Status CUDAExecutionProvider::OnRunStart() {
   auto& current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent();
   CUDA_RETURN_IF_ERROR(cudaEventCreate(&current_deferred_release_event, cudaEventDisableTiming));
   deferred_release_cpu_ptr_.emplace(current_deferred_release_event, DeferredReleaseCPUPtrs());
+
+  if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) {
+    LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model";
+    GetPerThreadContext().CaptureBegin();
+  }
   return Status::OK();
 }
 
 Status CUDAExecutionProvider::OnRunEnd(bool sync_stream) {
+  if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) {
+    if (GetPerThreadContext().IsGraphCaptureAllowed()) {
+      GetPerThreadContext().CaptureEnd();
+      // CUDA work issued to a capturing stream doesn’t actually run on the GPU,
+      // so run the captured graph here to actually execute the work.
+      ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph());
+    } else {
+      GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture();
+    }
+  }
   // record deferred release event on default stream, and release per_thread_context
   auto current_deferred_release_event = GetPerThreadContext().GetCurrentDeferredReleaseEvent();
   CUDA_RETURN_IF_ERROR(cudaEventRecord(current_deferred_release_event, static_cast<cudaStream_t>(GetComputeStream())));
   if (sync_stream) {
     CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(static_cast<cudaStream_t>(GetComputeStream())));
   }
-  ReleasePerThreadContext();
+
+  // If cuda graph is enabled, the per thread context will not be released
+  // because the per thread cuda graph needs to be maintained and replayed for
+  // the next run.
+  if (!IsGraphCaptureEnabled()) {
+    ReleasePerThreadContext();
+  }
   std::lock_guard<OrtMutex> lock(deferred_release_cpu_ptr_mutex_);
   deferred_release_cpu_ptr_[current_deferred_release_event].recorded = true;
 
@@ -360,6 +414,20 @@ Status CUDAExecutionProvider::SetComputeStream(void* stream) {
   return Status::OK();
 }
 
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+bool CUDAExecutionProvider::IsGraphCaptureEnabled() const {
+  return info_.enable_cuda_graph;
+}
+
+bool CUDAExecutionProvider::IsGraphCaptured() const {
+  return GetPerThreadContext().IsGraphCaptured();
+}
+
+Status CUDAExecutionProvider::ReplayGraph() {
+  return GetPerThreadContext().ReplayGraph();
+}
+#endif
+
 namespace cuda {
 // opset 1 to 9
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MemcpyFromHost);

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -11,6 +11,7 @@
 #include "core/framework/execution_provider.h"
 #include "core/platform/ort_mutex.h"
 #include "core/providers/cuda/cuda_execution_provider_info.h"
+#include "core/providers/cuda/cuda_graph.h"
 #include "core/providers/cuda/cuda_pch.h"
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
@@ -94,6 +95,12 @@ class CUDAExecutionProvider : public IExecutionProvider {
 
   std::unique_ptr<profiling::EpProfiler> GetProfiler() override;
 
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  bool IsGraphCaptureEnabled() const override;
+  bool IsGraphCaptured() const override;
+  Status ReplayGraph() override;
+#endif
+
  private:
   CUDAExecutionProviderInfo info_;
   cudaDeviceProp device_prop_;
@@ -157,6 +164,15 @@ class CUDAExecutionProvider : public IExecutionProvider {
       return allocator_;
     }
 
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  bool IsGraphCaptureAllowed() const;
+  void CaptureBegin();
+  void CaptureEnd();
+  bool IsGraphCaptured() const;
+  Status ReplayGraph();
+  void IncrementRegularRunCountBeforeGraphCapture();
+#endif
+
    private:
     cudaStream_t stream_ = nullptr;
     cublasHandle_t cublas_handle_ = nullptr;
@@ -173,6 +189,17 @@ class CUDAExecutionProvider : public IExecutionProvider {
     std::unique_ptr<cuda::IConstantBuffer<BFloat16>> constant_ones_bfloat16_;
 
     AllocatorPtr allocator_;
+
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+    // Cuda graph with multi threads will be supported in the future, so cuda_graph_
+    // is put under PerThreadContext.
+    CUDAGraph cuda_graph_;
+    bool is_graph_captured_ = false;
+    int regular_run_count_before_graph_capture_ = 0;
+    const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations.
+
+#endif
+
   };
 
   using PerThreadContextMap = std::unordered_map<const CUDAExecutionProvider*, std::weak_ptr<PerThreadContext>>;

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.cc
@@ -22,6 +22,7 @@ constexpr const char* kGpuExternalAlloc = "gpu_external_alloc";
 constexpr const char* kGpuExternalFree = "gpu_external_free";
 constexpr const char* kGpuExternalEmptyCache = "gpu_external_empty_cache";
 constexpr const char* kCudnnConvUseMaxWorkspace = "cudnn_conv_use_max_workspace";
+constexpr const char* kEnableCudaGraph = "enable_cuda_graph";
 }  // namespace provider_option_names
 }  // namespace cuda
 
@@ -92,6 +93,7 @@ CUDAExecutionProviderInfo CUDAExecutionProviderInfo::FromProviderOptions(const P
               *ort_cudnn_conv_algo_search_mapping, info.cudnn_conv_algo_search)
           .AddAssignmentToReference(cuda::provider_option_names::kDoCopyInDefaultStream, info.do_copy_in_default_stream)
           .AddAssignmentToReference(cuda::provider_option_names::kCudnnConvUseMaxWorkspace, info.cudnn_conv_use_max_workspace)
+          .AddAssignmentToReference(cuda::provider_option_names::kEnableCudaGraph, info.enable_cuda_graph)
           .Parse(options));
 
   CUDAExecutionProviderExternalAllocatorInfo alloc_info{alloc, free, empty_cache};
@@ -112,6 +114,7 @@ ProviderOptions CUDAExecutionProviderInfo::ToProviderOptions(const CUDAExecution
        EnumToName(*ort_cudnn_conv_algo_search_mapping, info.cudnn_conv_algo_search)},
       {cuda::provider_option_names::kDoCopyInDefaultStream, MakeStringWithClassicLocale(info.do_copy_in_default_stream)},
       {cuda::provider_option_names::kCudnnConvUseMaxWorkspace, MakeStringWithClassicLocale(info.cudnn_conv_use_max_workspace)},
+      {cuda::provider_option_names::kEnableCudaGraph, MakeStringWithClassicLocale(info.enable_cuda_graph)}
   };
 
   return options;

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h b/onnxruntime/core/providers/cuda/cuda_execution_provider_info.h
@@ -52,6 +52,8 @@ struct CUDAExecutionProviderInfo {
   // If set to true, try to use as much as possible memory for algo search.
   bool cudnn_conv_use_max_workspace{false};
 
+  bool enable_cuda_graph{false};
+
   static CUDAExecutionProviderInfo FromProviderOptions(const ProviderOptions& options);
   static ProviderOptions ToProviderOptions(const CUDAExecutionProviderInfo& info);
   static ProviderOptions ToProviderOptions(const OrtCUDAProviderOptionsV2& info);

diff --git a/onnxruntime/core/providers/cuda/cuda_graph.cc b/onnxruntime/core/providers/cuda/cuda_graph.cc
@@ -0,0 +1,89 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/cuda/cuda_graph.h"
+
+#include "core/providers/cuda/cuda_common.h"
+#include <cuda_runtime_api.h>
+#include <driver_types.h>
+
+
+namespace onnxruntime {
+
+CUDAGraph::CUDAGraph(cudaStream_t stream) : stream_(stream) {
+#if (defined(CUDA_VERSION) && CUDA_VERSION < 10000)
+  ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
+#endif
+}
+
+void CUDAGraph::SetStream(cudaStream_t stream) {
+  stream_ = stream;
+}
+
+void CUDAGraph::CaptureBegin() {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  ORT_ENFORCE(!has_graph_exec_,
+              "This cuda graph has already captured a graph. "
+              "Create a new instance to capture a new graph.");
+
+  CUDA_CALL_THROW(cudaStreamSynchronize(stream_));
+  // For now cuda graph can only work with a single thread. In the future, we
+  // will support multiple threads. For multiple threads with multiple graphs
+  // and streams, `cudaStreamCaptureModeGlobal` needs to be changed to
+  // `cudaStreamCaptureModeThreadLocal`
+  CUDA_CALL_THROW(cudaStreamBeginCapture(stream_, cudaStreamCaptureModeGlobal));
+#else
+  ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
+#endif
+}
+
+void CUDAGraph::CaptureEnd() {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  CUDA_CALL_THROW(cudaStreamEndCapture(stream_, &graph_));
+  if (graph_ == NULL) {
+    ORT_THROW("CUDAGraph::CaptureEnd: graph_ is NULL");
+  }
+
+  has_graph_ = true;
+  CUDA_CALL_THROW(cudaGraphInstantiate(&graph_exec_, graph_, NULL, NULL, 0));
+  has_graph_exec_ = true;
+  CUDA_CALL_THROW(cudaGraphDestroy(graph_));
+  has_graph_ = false;
+#else
+  ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
+#endif
+}
+
+Status CUDAGraph::Replay() {
+  // Although this function is not thread safe, the lock is not needed here because
+  // CUDA EP maintains a separate cuda graph per thread
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_;
+  CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec_, stream_));
+  CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_));
+#else
+  ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
+#endif
+  return Status::OK();
+}
+
+void CUDAGraph::Reset() {
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  if (has_graph_) {
+    CUDA_CALL_THROW(cudaGraphDestroy(graph_));
+    has_graph_ = false;
+  }
+  if (has_graph_exec_) {
+    CUDA_CALL_THROW(cudaGraphExecDestroy(graph_exec_));
+    has_graph_exec_ = false;
+  }
+#else
+  ORT_THROW("CUDA graphs can only be used in Onnxruntime built with CUDA >= 10.0");
+#endif
+}
+
+CUDAGraph::~CUDAGraph() {
+  Reset();
+}
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h
@@ -0,0 +1,38 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/platform/ort_mutex.h"
+#include "core/providers/cuda/cuda_pch.h"
+
+namespace onnxruntime {
+
+using CaptureId_t = unsigned long long;
+
+struct CUDAGraph {
+  CUDAGraph() {};
+  CUDAGraph(cudaStream_t stream);
+  ~CUDAGraph();
+
+  void SetStream(cudaStream_t stream);
+  void CaptureBegin();
+  void CaptureEnd();
+  Status Replay();
+  void Reset();
+
+private:
+#if defined(CUDA_VERSION) && CUDA_VERSION >= 10000
+  cudaGraph_t graph_ = NULL;
+  cudaGraphExec_t graph_exec_ = NULL;
+#endif
+
+  bool has_graph_ = false;
+  bool has_graph_exec_ = false;
+
+  CaptureId_t id_;
+  cudaStream_t stream_ = nullptr; // Does not own the stream
+};
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
@@ -199,6 +199,7 @@ struct CUDA_Provider : Provider {
     info.user_compute_stream = params->user_compute_stream;
     info.default_memory_arena_cfg = params->default_memory_arena_cfg;
     info.cudnn_conv_use_max_workspace = params->cudnn_conv_use_max_workspace != 0;
+    info.enable_cuda_graph = params->enable_cuda_graph != 0;
 
     return std::make_shared<CUDAProviderFactory>(info);
   }
@@ -216,6 +217,7 @@ struct CUDA_Provider : Provider {
     cuda_options.user_compute_stream = internal_options.user_compute_stream;
     cuda_options.default_memory_arena_cfg = internal_options.default_memory_arena_cfg;
     cuda_options.cudnn_conv_use_max_workspace = internal_options.cudnn_conv_use_max_workspace;
+    cuda_options.enable_cuda_graph = internal_options.enable_cuda_graph;
   }
 
   ProviderOptions GetProviderOptions(const void* provider_options) override {