From f39a785e4ed2142e1f6ec9d39ad8780da95403e4 Mon Sep 17 00:00:00 2001
From: Oliver Simons
Date: Fri, 20 Feb 2026 11:04:39 +0100
Subject: [PATCH] Add WIP implementation on cuda graph logic

---
 ggml/src/ggml-cuda/common.cuh   | 16 ++++++++--------
 ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a3256d59dd0..189d331b315 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1149,8 +1149,8 @@ struct ggml_cuda_graph {
     size_t num_nodes = 0;
     std::vector<cudaGraphNode_t> nodes;
     bool disable_due_to_gpu_arch = false;
-    bool disable_due_to_too_many_updates = false;
-    int number_consecutive_updates = 0;
+    bool disable_due_to_too_many_rebuilds = false;
+    int number_consecutive_rebuilds = 0;
     std::vector<ggml_graph_node_properties> props;
 
     // these are extra tensors (inputs) that participate in the ggml graph but are not nodes
@@ -1161,19 +1161,19 @@ struct ggml_cuda_graph {
     void record_update(bool use_graph, bool update_required) {
         if (use_graph && update_required) {
-            number_consecutive_updates++;
+            number_consecutive_rebuilds++;
         } else {
-            number_consecutive_updates = 0;
+            number_consecutive_rebuilds = 0;
         }
 
-        if (number_consecutive_updates >= 4) {
-            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
-            disable_due_to_too_many_updates = true;
+        if (number_consecutive_rebuilds >= 4) {
+            GGML_LOG_INFO("%s: disabling CUDA graphs due to too many consecutive rebuilds\n", __func__);
+            disable_due_to_too_many_rebuilds = true;
         }
     }
 
     bool is_enabled() const {
         static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
-        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_updates);
+        return !(disable_due_to_gpu_arch || disable_cuda_graphs_due_to_env || disable_due_to_too_many_rebuilds);
     }
 #endif
 };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ffa35eeb654..50ba12b458d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3048,10 +3048,10 @@ static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_c
 #endif // CUDART_VERSION >= 12000
 
     if (stat == cudaErrorGraphExecUpdateFailure) {
-#ifndef NDEBUG
-        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
-#endif
-
+//#ifndef NDEBUG
+        // GGML_LOG_INFO("%s: CUDA graph update failed due to %d\n", __func__, static_cast<int>(result_info));
+//#endif
+        graph->record_update(true, true);
         // The pre-existing graph exec cannot be updated due to violated constraints
         // so instead clear error and re-instantiate
         (void)cudaGetLastError();
@@ -3059,6 +3059,7 @@ static void ggml_cuda_graph_update_executable(ggml_backend_cuda_context * cuda_c
         graph->instance = nullptr;
         CUDA_CHECK(cudaGraphInstantiate(&graph->instance, graph->graph, NULL, NULL, 0));
     } else {
+        graph->record_update(true, false);
         GGML_ASSERT(stat == cudaSuccess);
     }
 }
@@ -3937,8 +3938,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     if (graph->is_enabled()) {
         cuda_graph_update_required = ggml_cuda_graph_update_required(cuda_ctx, cgraph);
         use_cuda_graph = ggml_cuda_graph_check_compability(cgraph);
-
-        graph->record_update(use_cuda_graph, cuda_graph_update_required);
     }
 
 #endif // USE_CUDA_GRAPH