From a874a2c9d3cbc8a278f73ac8620ecb85d6f53acc Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Fri, 20 Feb 2026 12:51:52 +0530 Subject: [PATCH 1/4] Improve CUDA graph capture Currently, CUDA graphs are eagerly enabled on the first call to ggml_backend_cuda_graph_compute. If the graph properties keep changing (4+ consecutive updates), the graph is permanently disabled. This is suboptimal because: - The first call always incurs CUDA graph capture overhead even if the graph is unstable - Once permanently disabled, CUDA graphs never re-enable even after the graph stabilizes (e.g., switching from prompt processing to decode) The new approach delays CUDA graph activation until warmup completes: the same cgraph must be called at least twice with matching properties before CUDA graph capture begins. This avoids wasted capture overhead on volatile graphs and allows graphs to become eligible once they stabilize. This also fixes issues such as https://github.com/ggml-org/llama.cpp/discussions/19708 --- ggml/src/ggml-cuda/common.cuh | 17 ++-------------- ggml/src/ggml-cuda/ggml-cuda.cu | 35 ++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 24 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a3256d59dd06..36d8a3aaab29 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1149,8 +1149,7 @@ struct ggml_cuda_graph { size_t num_nodes = 0; std::vector nodes; bool disable_due_to_gpu_arch = false; - bool disable_due_to_too_many_updates = false; - int number_consecutive_updates = 0; + bool warmup_complete = false; std::vector props; // these are extra tensors (inputs) that participate in the ggml graph but are not nodes @@ -1159,21 +1158,9 @@ struct ggml_cuda_graph { // ref: https://github.com/ggml-org/llama.cpp/pull/19165 std::vector extra; - void record_update(bool use_graph, bool update_required) { - if (use_graph && update_required) { - number_consecutive_updates++; - } else { - number_consecutive_updates = 0; - } - if (number_consecutive_updates >= 4) { - GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); - disable_due_to_too_many_updates = true; - } - } - bool is_enabled() const { static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates); + return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env); } #endif }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ffa35eeb654c..64444340f20b 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2979,10 +2979,6 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx const void * graph_key = ggml_cuda_graph_get_key(cgraph); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); - if (graph->instance == nullptr) { - res = true; - } - // Check if the graph size has changed if (graph->props.size() != (size_t)cgraph->n_nodes) { res = true; @@ -3931,14 +3927,35 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, #ifdef USE_CUDA_GRAPH graph_key = ggml_cuda_graph_get_key(cgraph); - use_cuda_graph = ggml_cuda_graph_set_enabled(cuda_ctx, graph_key); + ggml_cuda_graph_set_enabled(cuda_ctx, graph_key); ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (graph->is_enabled()) { - cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph); - use_cuda_graph = ggml_cuda_graph_check_compability(cgraph); - - graph->record_update(use_cuda_graph, cuda_graph_update_required); + bool graph_compatible = ggml_cuda_graph_check_compability(cgraph); + if(graph_compatible) { + bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph); + + if (!graph->warmup_complete) { + // Warmup: need at least 2 calls with no property change on the 2nd call + if (!properties_changed) { + graph->warmup_complete = true; + GGML_LOG_DEBUG("%s: CUDA graph warmup complete\n", __func__); + use_cuda_graph = true; + cuda_graph_update_required = true; + } + // else: properties changed or first call — execute directly (use_cuda_graph stays false) + } else { + // Post-warmup: normal CUDA graph operation + if (properties_changed) { + // Properties changed — reset warmup, execute directly until stable again + graph->warmup_complete = false; + GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__); + } else { + use_cuda_graph = true; + cuda_graph_update_required = (graph->instance == nullptr); + } + } + } } #endif // USE_CUDA_GRAPH From a11e5502a77c659a81975911d58d5851a00afd6e Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Sat, 21 Feb 2026 09:03:00 +0530 Subject: [PATCH 2/4] Update ggml/src/ggml-cuda/ggml-cuda.cu MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 64444340f20b..bfdc8c2a5965 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3931,9 +3931,9 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cuda_graph * graph = cuda_ctx->cuda_graph(graph_key); if (graph->is_enabled()) { - bool graph_compatible = ggml_cuda_graph_check_compability(cgraph); - if(graph_compatible) { - bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph); + const bool graph_compatible = ggml_cuda_graph_check_compability(cgraph); + if (graph_compatible) { + const bool properties_changed = ggml_cuda_graph_update_required(cuda_ctx, cgraph); if (!graph->warmup_complete) { // Warmup: need at least 2 calls with no property change on the 2nd call From 27b965bc36cdaca4e2afee42404a672ed67723d3 Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Sat, 21 Feb 2026 09:05:45 +0530 Subject: [PATCH 3/4] Remove EM dashes --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index bfdc8c2a5965..5d6b87760e40 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3943,11 +3943,11 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, use_cuda_graph = true; cuda_graph_update_required = true; } - // else: properties changed or first call — execute directly (use_cuda_graph stays false) + // else: properties changed or first call - execute directly (use_cuda_graph stays false) } else { // Post-warmup: normal CUDA graph operation if (properties_changed) { - // Properties changed — reset warmup, execute directly until stable again + // Properties changed - reset warmup, execute directly until stable again graph->warmup_complete = false; GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__); } else { From 692ceeffae0f9a4e898aa708d3ab9e424b7d400e Mon Sep 17 00:00:00 2001 From: Gaurav Garg Date: Sat, 21 Feb 2026 13:32:08 +0530 Subject: [PATCH 4/4] Update ggml/src/ggml-cuda/ggml-cuda.cu Co-authored-by: Aman Gupta --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 5d6b87760e40..7e6d3303549a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3952,7 +3952,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, GGML_LOG_DEBUG("%s: CUDA graph warmup reset\n", __func__); } else { use_cuda_graph = true; - cuda_graph_update_required = (graph->instance == nullptr); + cuda_graph_update_required = graph->instance == nullptr; } } }