From 2b6aec1400d14d6047baab90e1fee8c6e1fd46f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Jan 2026 21:53:55 +0200 Subject: [PATCH 1/4] cuda : fix nkvo --- ggml/src/ggml-cuda/fattn.cu | 5 ----- src/llama-graph.cpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 195904ee206..721edd99944 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -310,8 +310,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const } } - const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs)); - const int cc = ggml_cuda_info().devices[device].cc; switch (K->ne[0]) { @@ -334,9 +332,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const if (!gqa_opt_applies) { return BEST_FATTN_KERNEL_NONE; } - if (!V_is_K_view) { - return BEST_FATTN_KERNEL_NONE; - } break; default: return BEST_FATTN_KERNEL_NONE; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b3198b7e3a2..16d42c4ae3d 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1630,11 +1630,6 @@ ggml_tensor * llm_graph_context::build_attn_mha( hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); cb(cur, LLAMA_TENSOR_NAME_FATTN, il); - if (!cparams.offload_kqv) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu); - } - ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); From 2695eb444596c1ae734223e33f1dc2ef189b0f7d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 29 Jan 2026 12:43:44 +0200 Subject: [PATCH 2/4] cont : more robust cuda graph node property matching --- ggml/src/ggml-cuda/common.cuh | 4 ++- ggml/src/ggml-cuda/ggml-cuda.cu | 55 ++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 3335f443aeb..7f3042e97a5 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1127,7 +1127,9 @@ struct ggml_cuda_graph_node_properties { int32_t flags; int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; - void * src_address[GGML_MAX_SRC]; + void * src_data[GGML_MAX_SRC]; + int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS]; + size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS]; int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e9df0ea4a7c..ceee33d8707 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2916,6 +2916,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { } static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) { + memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); props->node_address = node->data; props->node_op = node->op; props->flags = node->flags; @@ -2924,7 +2925,16 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties props->nb[i] = node->nb[i]; } for (int i = 0; i < GGML_MAX_SRC; i++) { - 
props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr; + if (!node->src[i]) { + continue; + } + + props->src_data[i] = node->src[i]->data; + + for (int j = 0; j < GGML_MAX_DIMS; j++) { + props->src_ne[i][j] = node->src[i]->ne[j]; + props->src_nb[i][j] = node->src[i]->nb[j]; + } } memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS); } @@ -2948,12 +2958,27 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_ } } - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (node->src[i] && - node->src[i]->data != props->src_address[i] && - node->op != GGML_OP_VIEW - ) { - return false; + if (node->op != GGML_OP_VIEW) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (!node->src[i]) { + if (props->src_data[i] != nullptr) { + return false; + } + continue; + } + + if (node->src[i]->data != props->src_data[i]) { + return false; + } + + // TODO: this is not ideal since it requires too much memory - figure out an optimization + // ref: https://github.com/ggml-org/llama.cpp/pull/19165 + for (int j = 0; j < GGML_MAX_DIMS; j++) { + if (node->src[i]->ne[j] != props->src_ne[i][j] || + node->src[i]->nb[j] != props->src_nb[i][j]) { + return false; + } + } } } @@ -2974,7 +2999,6 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) { } static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { - bool res = false; const void * graph_key = ggml_cuda_graph_get_key(cgraph); @@ -2985,9 +3009,9 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx } // Check if the graph size has changed - if (graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) { + if (graph->props.size() != (size_t)cgraph->n_nodes) { res = true; - graph->props.resize(cgraph->n_nodes + cgraph->n_leafs); + graph->props.resize(cgraph->n_nodes); } // Loop over nodes in GGML graph to determine if CUDA graph update is required @@ -3003,17 +3027,6 @@ static bool 
ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]); } - for (int i = 0; i < cgraph->n_leafs; i++) { - bool props_match = true; - if (!res) { - props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &graph->props[cgraph->n_nodes + i]); - } - if (!props_match) { - res = true; - } - ggml_cuda_graph_node_set_properties(&graph->props[cgraph->n_nodes + i], cgraph->leafs[i]); - } - return res; } From 6e2ac828f5a25025df0196bb6c438714b9053f1a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 29 Jan 2026 13:06:19 +0200 Subject: [PATCH 3/4] cont : restore pre-leafs implementation --- ggml/src/ggml-cuda/common.cuh | 3 +- ggml/src/ggml-cuda/ggml-cuda.cu | 53 ++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 7f3042e97a5..b3e321483cb 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1128,8 +1128,6 @@ struct ggml_cuda_graph_node_properties { int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; void * src_data[GGML_MAX_SRC]; - int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS]; - size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS]; int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; @@ -1151,6 +1149,7 @@ struct ggml_cuda_graph { bool disable_due_to_too_many_updates = false; int number_consecutive_updates = 0; std::vector props; + std::vector extra; void record_update(bool use_graph, bool update_required) { if (use_graph && update_required) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ceee33d8707..2b638d131de 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -70,17 +70,18 @@ #include #include #include -#include +#include #include #include #include #include #include -#include -#include -#include +#include +#include +#include #include #include +#include 
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -2930,11 +2931,6 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties } props->src_data[i] = node->src[i]->data; - - for (int j = 0; j < GGML_MAX_DIMS; j++) { - props->src_ne[i][j] = node->src[i]->ne[j]; - props->src_nb[i][j] = node->src[i]->nb[j]; - } } memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS); } @@ -2970,15 +2966,6 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_ if (node->src[i]->data != props->src_data[i]) { return false; } - - // TODO: this is not ideal since it requires too much memory - figure out an optimization - // ref: https://github.com/ggml-org/llama.cpp/pull/19165 - for (int j = 0; j < GGML_MAX_DIMS; j++) { - if (node->src[i]->ne[j] != props->src_ne[i][j] || - node->src[i]->nb[j] != props->src_nb[i][j]) { - return false; - } - } } } @@ -3016,8 +3003,13 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx // Loop over nodes in GGML graph to determine if CUDA graph update is required // and store properties to allow this comparison for the next token + std::unordered_set seen_node; + std::vector srcs_extra; for (int i = 0; i < cgraph->n_nodes; i++) { bool props_match = true; + + seen_node.insert(cgraph->nodes[i]); + if (!res) { props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]); } @@ -3025,6 +3017,31 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx res = true; } ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]); + + for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { + ggml_tensor * src = cgraph->nodes[i]->src[src_idx]; + if (src && seen_node.find(src) == seen_node.end()) { + srcs_extra.push_back(src); + } + } + } + + if (graph->extra.size() != (size_t) srcs_extra.size()) { + res = true; + graph->extra.resize(srcs_extra.size()); + } + + for (size_t i = 0; i < 
srcs_extra.size(); ++i) { + bool props_match = true; + + if (!res) { + props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]); + } + + if (!props_match) { + res = true; + } + ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]); } return res; } From 73097f5fc0965bab8447822398d91464fce0f037 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 29 Jan 2026 14:31:48 +0200 Subject: [PATCH 4/4] cont : comments + static_assert --- ggml/src/ggml-cuda/common.cuh | 9 ++++++++- ggml/src/ggml-cuda/ggml-cuda.cu | 5 ++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index b3e321483cb..43280644e48 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1122,7 +1122,7 @@ struct ggml_tensor_extra_gpu { #endif struct ggml_cuda_graph_node_properties { - void * node_address; + void * node_data; ggml_op node_op; int32_t flags; int64_t ne[GGML_MAX_DIMS]; @@ -1131,6 +1131,8 @@ struct ggml_cuda_graph_node_properties { int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; +static_assert(std::is_trivial::value, "ggml_cuda_graph_node_properties must be trivial"); + struct ggml_cuda_graph { #ifdef USE_CUDA_GRAPH ~ggml_cuda_graph() { @@ -1149,6 +1151,11 @@ struct ggml_cuda_graph { bool disable_due_to_too_many_updates = false; int number_consecutive_updates = 0; std::vector props; + + // these are extra tensors (inputs) that participate in the ggml graph but are not nodes + // their properties also have to match in order to be able to safely reuse a CUDA graph + // ref: https://github.com/ggml-org/llama.cpp/pull/18583 + // ref: https://github.com/ggml-org/llama.cpp/pull/19165 std::vector extra; void record_update(bool use_graph, bool update_required) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2b638d131de..842f4e7941e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ 
b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2918,7 +2918,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) { memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); - props->node_address = node->data; + props->node_data = node->data; props->node_op = node->op; props->flags = node->flags; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -2936,8 +2936,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties } static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) { - if (node->data != props->node_address && - node->op != GGML_OP_VIEW) { + if (node->data != props->node_data && node->op != GGML_OP_VIEW) { return false; }