@@ -514,6 +514,10 @@ std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(i
514514 return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg (device));
515515}
516516
// Global synchronization primitives used to serialize CUDA graph capture
// across backend contexts: the counter is incremented (under ggml_cuda_lock)
// before cudaStreamBeginCapture and decremented after cudaStreamEndCapture;
// ggml_cuda_lock_cv is notified when the count of in-flight captures drops
// back to zero, so waiters can block until no capture is active.
// NOTE(review): the counter's initial value of 0 relies on zero-initialization
// of static-storage-duration objects — confirm no earlier writer assumes otherwise.
std::mutex ggml_cuda_lock;
std::condition_variable ggml_cuda_lock_cv;
std::atomic<int> ggml_cuda_lock_counter;
520+
517521// cuda buffer
518522
519523struct ggml_backend_cuda_buffer_context {
@@ -2685,6 +2689,11 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
26852689
26862690 CUDA_CHECK (cudaStreamEndCapture (cuda_ctx->stream (), &cuda_ctx->cuda_graph ->graph ));
26872691 graph_evaluated_or_captured = true ; // CUDA graph has been captured
2692+
2693+ std::lock_guard<std::mutex> lock (ggml_cuda_lock);
2694+ if (ggml_cuda_lock_counter.fetch_sub (1 , std::memory_order_relaxed) == 1 ) {
2695+ ggml_cuda_lock_cv.notify_all ();
2696+ }
26882697 } else {
26892698 graph_evaluated_or_captured = true ; // ggml graph has been directly evaluated
26902699 }
@@ -2760,7 +2769,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
27602769 }
27612770 }
27622771
2763- if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
2772+ if (use_cuda_graph && cuda_graph_update_required) {
2773+ // Start CUDA graph capture
2774+ if (ggml_cuda_lock_counter.fetch_add (1 , std::memory_order_relaxed) == 0 ) {
2775+ ggml_cuda_lock_counter.fetch_sub (1 , std::memory_order_relaxed);
2776+ std::lock_guard<std::mutex> lock (ggml_cuda_lock);
2777+ ggml_cuda_lock_counter.fetch_add (1 , std::memory_order_relaxed);
2778+ }
2779+
27642780 CUDA_CHECK (cudaStreamBeginCapture (cuda_ctx->stream (), cudaStreamCaptureModeRelaxed));
27652781 }
27662782
0 commit comments