From f39a785e4ed2142e1f6ec9d39ad8780da95403e4 Mon Sep 17 00:00:00 2001
From: Oliver Simons
Date: Fri, 20 Feb 2026 11:04:39 +0100
Subject: [PATCH] Add WIP implementation on cuda graph logic

---
 ggml/src/ggml-cuda/common.cuh   | 16 ++++++++--------
 ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a3256d59dd0..189d331b315 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1149,8 +1149,8 @@ struct ggml_cuda_graph {
     size_t num_nodes = 0;
     std::vector<cudaGraphNode_t> nodes;
     bool disable_due_to_gpu_arch = false;
-    bool disable_due_to_too_many_updates = false;
-    int number_consecutive_updates = 0;
+    bool disable_due_to_too_many_rebuilds = false;
+    int number_consecutive_rebuilds = 0;
     std::vector<ggml_graph_node_properties> props;
 
     // these are extra tensors (inputs) that participate in the ggml graph but are not nodes
@@ -1161,19 +1161,19 @@ struct ggml_cuda_graph {
     void record_update(bool use_graph, bool update_required) {
         if (use_graph && update_required) {
-            number_consecutive_updates++;
+            number_consecutive_rebuilds++;
         } else {
-            number_consecutive_updates = 0;
+            number_consecutive_rebuilds = 0;
         }
 
-        if (number_consecutive_updates >= 4) {
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
-            disable_due_to_too_many_updates = true;
+        if (number_consecutive_rebuilds >= 4) {
+            GGML_LOG_INFO("%s: disabling CUDA graphs due to too many consecutive rebuilds\n", __func__);
+            disable_due_to_too_many_rebuilds = true;
         }
     }
 
     bool is_enabled() const {
         static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
-        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
+        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_rebuilds);
     }
 #endif
 };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ffa35eeb654..50ba12b458d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3048,10 +3048,10 @@ static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_c
 #endif // CUDART_VERSION >= 12000
 
     if (stat == cudaErrorGraphExecUpdateFailure) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
-#endif
-
+//#ifndef NDEBUG
+        // GGML_LOG_INFO("%s: CUDA graph update failed due to %d\n", __func__, static_cast<int>(result_info));
+//#endif
+        graph->record_update(true, true);
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
         (void)cudaGetLastError();
@@ -3059,6 +3059,7 @@ static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_c
         graph->instance = nullptr;
         CUDA_CHECK(cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0));
     } else {
+        graph->record_update(true, false);
         GGML_ASSERT(stat == cudaSuccess);
     }
 }
@@ -3937,8 +3938,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     if (graph->is_enabled()) {
         cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
         use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
-
-        graph->record_update(use_cuda_graph, cuda_graph_update_required);
     }
 
 #endif // USE_CUDA_GRAPH