From 2b6aec1400d14d6047baab90e1fee8c6e1fd46f2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 28 Jan 2026 21:53:55 +0200 Subject: [PATCH 1/4] cuda : fix nkvo --- ggml/src/ggml-cuda/fattn.cu | 5 ----- src/llama-graph.cpp | 5 ----- 2 files changed, 10 deletions(-) diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu index 195904ee206..721edd99944 100644 --- a/ggml/src/ggml-cuda/fattn.cu +++ b/ggml/src/ggml-cuda/fattn.cu @@ -310,8 +310,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const } } - const bool V_is_K_view = V->view_src && (V->view_src == K || (V->view_src == K->view_src && V->view_offs == K->view_offs)); - const int cc = ggml_cuda_info().devices[device].cc; switch (K->ne[0]) { @@ -334,9 +332,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const if (!gqa_opt_applies) { return BEST_FATTN_KERNEL_NONE; } - if (!V_is_K_view) { - return BEST_FATTN_KERNEL_NONE; - } break; default: return BEST_FATTN_KERNEL_NONE; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b3198b7e3a2..16d42c4ae3d 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1630,11 +1630,6 @@ ggml_tensor * llm_graph_context::build_attn_mha( hparams.attn_soft_cap ? 
hparams.f_attn_logit_softcapping : 0.0f); cb(cur, LLAMA_TENSOR_NAME_FATTN, il); - if (!cparams.offload_kqv) { - // all nodes between the KV store and the attention output are run on the CPU - ggml_backend_sched_set_tensor_backend(sched, cur, backend_cpu); - } - ggml_flash_attn_ext_add_sinks(cur, sinks); ggml_flash_attn_ext_set_prec (cur, GGML_PREC_F32); From 2695eb444596c1ae734223e33f1dc2ef189b0f7d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 29 Jan 2026 12:43:44 +0200 Subject: [PATCH 2/4] cont : more robust cuda graph node property matching --- ggml/src/ggml-cuda/common.cuh | 4 ++- ggml/src/ggml-cuda/ggml-cuda.cu | 55 ++++++++++++++++++++------------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 3335f443aeb..7f3042e97a5 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1127,7 +1127,9 @@ struct ggml_cuda_graph_node_properties { int32_t flags; int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; - void * src_address[GGML_MAX_SRC]; + void * src_data[GGML_MAX_SRC]; + int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS]; + size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS]; int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e9df0ea4a7c..ceee33d8707 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2916,6 +2916,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { } static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) { + memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); props->node_address = node->data; props->node_op = node->op; props->flags = node->flags; @@ -2924,7 +2925,16 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties props->nb[i] = node->nb[i]; } for (int i = 0; i < GGML_MAX_SRC; i++) { - 
props->src_address[i] = node->src[i] ? node->src[i]->data : nullptr; + if (!node->src[i]) { + continue; + } + + props->src_data[i] = node->src[i]->data; + + for (int j = 0; j < GGML_MAX_DIMS; j++) { + props->src_ne[i][j] = node->src[i]->ne[j]; + props->src_nb[i][j] = node->src[i]->nb[j]; + } } memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS); } @@ -2948,12 +2958,27 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_ } } - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (node->src[i] && - node->src[i]->data != props->src_address[i] && - node->op != GGML_OP_VIEW - ) { - return false; + if (node->op != GGML_OP_VIEW) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (!node->src[i]) { + if (props->src_data[i] != nullptr) { + return false; + } + continue; + } + + if (node->src[i]->data != props->src_data[i]) { + return false; + } + + // TODO: this is not ideal since it requires too much memory - figure out an optimization + // ref: https://github.com/ggml-org/llama.cpp/pull/19165 + for (int j = 0; j < GGML_MAX_DIMS; j++) { + if (node->src[i]->ne[j] != props->src_ne[i][j] || + node->src[i]->nb[j] != props->src_nb[i][j]) { + return false; + } + } } } @@ -2974,7 +2999,6 @@ static const void * ggml_cuda_graph_get_key(ggml_cgraph * cgraph) { } static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph) { - bool res = false; const void * graph_key = ggml_cuda_graph_get_key(cgraph); @@ -2985,9 +3009,9 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx } // Check if the graph size has changed - if (graph->props.size() != (size_t)cgraph->n_nodes + cgraph->n_leafs) { + if (graph->props.size() != (size_t)cgraph->n_nodes) { res = true; - graph->props.resize(cgraph->n_nodes + cgraph->n_leafs); + graph->props.resize(cgraph->n_nodes); } // Loop over nodes in GGML graph to determine if CUDA graph update is required @@ -3003,17 +3027,6 @@ static bool 
ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]); } - for (int i = 0; i < cgraph->n_leafs; i++) { - bool props_match = true; - if (!res) { - props_match = ggml_cuda_graph_node_properties_match(cgraph->leafs[i], &graph->props[cgraph->n_nodes + i]); - } - if (!props_match) { - res = true; - } - ggml_cuda_graph_node_set_properties(&graph->props[cgraph->n_nodes + i], cgraph->leafs[i]); - } - return res; } From 6e2ac828f5a25025df0196bb6c438714b9053f1a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 29 Jan 2026 13:06:19 +0200 Subject: [PATCH 3/4] cont : restore pre-leafs implementation --- ggml/src/ggml-cuda/common.cuh | 3 +- ggml/src/ggml-cuda/ggml-cuda.cu | 53 ++++++++++++++++++++++----------- 2 files changed, 36 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 7f3042e97a5..b3e321483cb 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1128,8 +1128,6 @@ struct ggml_cuda_graph_node_properties { int64_t ne[GGML_MAX_DIMS]; size_t nb[GGML_MAX_DIMS]; void * src_data[GGML_MAX_SRC]; - int64_t src_ne[GGML_MAX_SRC][GGML_MAX_DIMS]; - size_t src_nb[GGML_MAX_SRC][GGML_MAX_DIMS]; int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; @@ -1151,6 +1149,7 @@ struct ggml_cuda_graph { bool disable_due_to_too_many_updates = false; int number_consecutive_updates = 0; std::vector props; + std::vector extra; void record_update(bool use_graph, bool update_required) { if (use_graph && update_required) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ceee33d8707..2b638d131de 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -70,17 +70,18 @@ #include #include #include -#include +#include #include #include #include #include #include -#include -#include -#include +#include +#include +#include #include #include +#include 
static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); @@ -2930,11 +2931,6 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties } props->src_data[i] = node->src[i]->data; - - for (int j = 0; j < GGML_MAX_DIMS; j++) { - props->src_ne[i][j] = node->src[i]->ne[j]; - props->src_nb[i][j] = node->src[i]->nb[j]; - } } memcpy(props->op_params, node->op_params, GGML_MAX_OP_PARAMS); } @@ -2970,15 +2966,6 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_ if (node->src[i]->data != props->src_data[i]) { return false; } - - // TODO: this is not ideal since it requires too much memory - figure out an optimization - // ref: https://github.com/ggml-org/llama.cpp/pull/19165 - for (int j = 0; j < GGML_MAX_DIMS; j++) { - if (node->src[i]->ne[j] != props->src_ne[i][j] || - node->src[i]->nb[j] != props->src_nb[i][j]) { - return false; - } - } } } @@ -3016,8 +3003,13 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx // Loop over nodes in GGML graph to determine if CUDA graph update is required // and store properties to allow this comparison for the next token + std::unordered_set seen_node; + std::vector srcs_extra; for (int i = 0; i < cgraph->n_nodes; i++) { bool props_match = true; + + seen_node.insert(cgraph->nodes[i]); + if (!res) { props_match = ggml_cuda_graph_node_properties_match(cgraph->nodes[i], &graph->props[i]); } @@ -3025,6 +3017,31 @@ static bool ggml_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx res = true; } ggml_cuda_graph_node_set_properties(&graph->props[i], cgraph->nodes[i]); + + for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { + ggml_tensor * src = cgraph->nodes[i]->src[src_idx]; + if (src && seen_node.find(src) == seen_node.end()) { + srcs_extra.push_back(src); + } + } + } + + if (graph->extra.size() != (size_t) srcs_extra.size()) { + res = true; + graph->extra.resize(srcs_extra.size()); + } + + for (size_t i = 0; i < 
srcs_extra.size(); ++i) { + bool props_match = true; + + if (!res) { + props_match = ggml_cuda_graph_node_properties_match(srcs_extra[i], &graph->extra[i]); + } + + if (!props_match) { + res = true; + } + ggml_cuda_graph_node_set_properties(&graph->extra[i], srcs_extra[i]); } return res; } From 73097f5fc0965bab8447822398d91464fce0f037 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 29 Jan 2026 14:31:48 +0200 Subject: [PATCH 4/4] cont : comments + static_assert --- ggml/src/ggml-cuda/common.cuh | 9 ++++++++- ggml/src/ggml-cuda/ggml-cuda.cu | 5 ++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index b3e321483cb..43280644e48 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1122,7 +1122,7 @@ struct ggml_tensor_extra_gpu { #endif struct ggml_cuda_graph_node_properties { - void * node_address; + void * node_data; ggml_op node_op; int32_t flags; int64_t ne[GGML_MAX_DIMS]; @@ -1131,6 +1131,8 @@ struct ggml_cuda_graph_node_properties { int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)]; }; +static_assert(std::is_trivial::value, "ggml_cuda_graph_node_properties must be trivial"); + struct ggml_cuda_graph { #ifdef USE_CUDA_GRAPH ~ggml_cuda_graph() { @@ -1149,6 +1151,11 @@ struct ggml_cuda_graph { bool disable_due_to_too_many_updates = false; int number_consecutive_updates = 0; std::vector props; + + // these are extra tensors (inputs) that participate in the ggml graph but are not nodes + // their properties also have to match in order to be able to safely reuse a CUDA graph + // ref: https://github.com/ggml-org/llama.cpp/pull/18583 + // ref: https://github.com/ggml-org/llama.cpp/pull/19165 std::vector extra; void record_update(bool use_graph, bool update_required) { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2b638d131de..842f4e7941e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ 
b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2918,7 +2918,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) { static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) { memset(props, 0, sizeof(ggml_cuda_graph_node_properties)); - props->node_address = node->data; + props->node_data = node->data; props->node_op = node->op; props->flags = node->flags; for (int i = 0; i < GGML_MAX_DIMS; i++) { @@ -2936,8 +2936,7 @@ static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties } static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_graph_node_properties * props) { - if (node->data != props->node_address && - node->op != GGML_OP_VIEW) { + if (node->data != props->node_data && node->op != GGML_OP_VIEW) { return false; }