From 773909b75cd83559f76d72fb72219001a8c71d4b Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sat, 1 Nov 2025 14:30:00 +0800
Subject: [PATCH 01/18] CUDA: add stream-based concurrency

---
 ggml/src/ggml-cuda/common.cuh   |  46 ++++-
 ggml/src/ggml-cuda/ggml-cuda.cu | 298 ++++++++++++++++++++++++++++++--
 2 files changed, 324 insertions(+), 20 deletions(-)
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 99ec96869a7..d3ab8b6c0cd 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -972,6 +972,32 @@ struct ggml_cuda_graph {
 #endif
 };
 
+struct ggml_cuda_concurrent_event {
+    std::vector<cudaEvent_t> per_stream_events;
+    cudaEvent_t              fork_event;
+    cudaEvent_t              join_event;
+
+    int                                          n_streams = 0;
+    std::unordered_map<const ggml_tensor *, int> stream_mapping;
+
+    const ggml_tensor * join_node;
+
+    ggml_cuda_concurrent_event() = default;
+
+    explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) {
+        per_stream_events.resize(n_streams);
+
+        for (size_t i = 0; i < per_stream_events.size(); ++i) {
+            cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming);
+        }
+
+        cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming);
+        cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming);
+    }
+};
+
+using ggml_cuda_stream_context = std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event>;
+
 struct ggml_backend_cuda_context {
     int device;
     std::string name;
@@ -982,11 +1008,15 @@ struct ggml_backend_cuda_context {
 
     std::unique_ptr<ggml_cuda_graph> cuda_graph;
 
+    int curr_stream_no = 0;
+
     explicit ggml_backend_cuda_context(int device) :
         device(device),
         name(GGML_CUDA_NAME + std::to_string(device)) {
     }
 
+    ggml_cuda_stream_context concurrent_stream_context;
+
     ~ggml_backend_cuda_context();
 
     cudaStream_t stream(int device, int stream) {
@@ -997,9 +1027,9 @@ struct ggml_backend_cuda_context {
         return streams[device][stream];
     }
 
-    cudaStream_t stream() {
-        return stream(device, 0);
-    }
+    cudaStream_t stream() { return stream(device, curr_stream_no); }
+
+    ggml_cuda_stream_context & stream_context() { return concurrent_stream_context; }
 
     cublasHandle_t cublas_handle(int device) {
         if (cublas_handles[device] == nullptr) {
@@ -1015,15 +1045,15 @@ struct ggml_backend_cuda_context {
     }
 
     // pool
-    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES];
+    std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS];
 
-    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device);
+    static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device, int stream_no);
 
     ggml_cuda_pool & pool(int device) {
-        if (pools[device] == nullptr) {
-            pools[device] = new_pool_for_device(device);
+        if (pools[device][curr_stream_no] == nullptr) {
+            pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no);
         }
-        return *pools[device];
+        return *pools[device][curr_stream_no];
     }
 
     ggml_cuda_pool & pool() {
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 0b29074f33d..510388fa5ec 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -521,7 +521,8 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
 };
 #endif // defined(GGML_USE_VMM)
 
-std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
+std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int                  device,
+                                                                               [[maybe_unused]] int stream_no) {
 #if defined(GGML_USE_VMM)
     if (ggml_cuda_info().devices[device].vmm) {
         return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
@@ -3036,7 +3037,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 #ifndef NDEBUG
     const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
     GGML_ASSERT(unary_ops.size() == num_unary);
-#endif
+#endif;
 
     //TODO: remove special case once ggml_can_fuse can handle empty nodes
     std::initializer_list<enum ggml_op> topk_moe_ops =
@@ -3192,18 +3193,44 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
+    bool                         is_concurrent_event_active = false;
+    ggml_cuda_concurrent_event * concurrent_event           = nullptr;
+
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
         // With the use of CUDA graphs, the execution will be performed by the graph launch.
         if (!use_cuda_graph || cuda_graph_update_required) {
-
             [[maybe_unused]] int prev_i = 0;
 
+            ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
+
             for (int i = 0; i < cgraph->n_nodes; i++) {
                 ggml_tensor * node = cgraph->nodes[i];
+                if (is_concurrent_event_active) {
+                    GGML_ASSERT(concurrent_event);
+
+                    if (node == concurrent_event->join_node) {
+                        cuda_ctx->curr_stream_no = 0;
+                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                            CUDA_CHECK(cudaEventRecord(concurrent_event->per_stream_events[i - 1],
+                                                       cuda_ctx->stream(cuda_ctx->device, i)));
+                            CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->per_stream_events[i - 1]));
+                        }
+
+                        is_concurrent_event_active = false;
+                        concurrent_event           = nullptr;
+
+                    } else {
+                        GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end());
+                        const int stream_mapping = concurrent_event->stream_mapping[node];
+                        cuda_ctx->curr_stream_no = stream_mapping;
+                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
+                    }
+                }
+                prev_i = i;
+
 #ifdef GGML_CUDA_DEBUG
                 const int nodes_fused = i - prev_i - 1;
-                prev_i = i;
                 if (nodes_fused > 0) {
                     GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
                 }
@@ -3213,6 +3240,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     continue;
                 }
 
+
+                // start of fusion operations
                 static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
                 if (!disable_fusion) {
 
@@ -3476,16 +3505,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         continue;
                     }
 
+                    //TODO: fix this
+                    static const bool graph_opt = (getenv("GGML_CUDA_GRAPH_OPT") != nullptr) && atoi(getenv("GGML_CUDA_GRAPH_OPT")) == 1;
+
                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
-                        ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
-                        i += 2;
-                        continue;
+                        if (strncmp(cgraph->nodes[i+2]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) {
+                            ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
+                            i += 2;
+                            continue;
+                        }
                     }
 
                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
-                        ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
-                        i++;
-                        continue;
+                        if (strncmp(cgraph->nodes[i+1]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) {
+                            ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
+                            i++;
+                            continue;
+                        }
                     }
 
                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
@@ -3505,13 +3541,35 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 }
 #else
                 GGML_UNUSED(integrated);
-#endif // NDEBUG
+#endif  // NDEBUG
 
                 bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
                 if (!ok) {
                     GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
                 }
                 GGML_ASSERT(ok);
+
+                if (!is_concurrent_event_active) {
+                    //const ggml_tensor * adjusted_node = node;
+                    // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD),
+                    // we can safely use the previous node to check if it can be forked
+                    if (stream_ctx.find(node) != stream_ctx.end()) {
+                        concurrent_event = &stream_ctx[node];
+
+                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+
+                        cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
+                        GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
+                        CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
+
+                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                            cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
+                            CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
+                        }
+
+                        is_concurrent_event_active = true;
+                    }
+               }
             }
         }
 
@@ -3651,6 +3709,222 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
     }
 }
 
+static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
+
+    static bool enable_graph_optimization = [] {
+        const char * env = getenv("GGML_CUDA_GRAPH_OPT");
+        return env != nullptr && atoi(env) == 1;
+    }();
+
+    if (!enable_graph_optimization) {
+        return;
+    }
+
+    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "cuda graph optimization is only supported on single GPU");
+    GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
+
+    ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
+    stream_context.clear();
+
+    std::unordered_map<const ggml_tensor *, int> fan_out;
+    std::unordered_map<const ggml_tensor *, int> node_indices;
+
+    const auto & is_empty = [](const ggml_tensor * node) -> bool {
+        return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE ||
+               node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
+    };
+
+    const auto & is_src_of = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool {
+        for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
+            if (dst->src[s] == src) {
+                return true;
+            }
+        }
+        // implicit dependency if they view the same tensor
+        const ggml_tensor * dst2 = dst->view_src ? dst->view_src : dst;
+        const ggml_tensor * src2 = src->view_src ? src->view_src : src;
+        if (dst2 == src2) {
+            return true;
+        }
+        return false;
+    };
+
+    for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
+        const ggml_tensor * node = cgraph->nodes[node_idx];
+        node_indices[node]       = node_idx;
+
+        if (is_empty(node)) {
+            continue;
+        }
+        for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
+            const ggml_tensor * node = cgraph->nodes[node_idx]->src[src_idx];
+            //TODO: check why nrows > 1 fails, probably related to CUDA graphs
+            if (node && !is_empty(node) && ggml_nrows(node) <= 1) {
+                fan_out[node] += 1;
+            }
+        }
+    }
+
+    //Target Q, K, V
+    const int min_fan_out = 3;
+    const int max_fan_out = 3;
+
+    std::vector<std::pair<int, int>> concurrent_node_ranges;
+    for (const auto & [root_node, count] : fan_out) {
+        if (count >= min_fan_out && count <= max_fan_out) {
+            const int root_node_idx = node_indices[root_node];
+
+            bool is_part_of_event = false;
+            for (const auto & [start, end] : concurrent_node_ranges) {
+                if (root_node_idx >= start && root_node_idx <= end) {
+                    is_part_of_event = true;
+                }
+            }
+
+            if (is_part_of_event) {
+                continue;
+            }
+
+            std::vector<std::vector<const ggml_tensor *>> nodes_per_branch;
+            for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) {
+                const ggml_tensor * node = cgraph->nodes[i];
+                if (!is_empty(node) && is_src_of(node, root_node)) {
+                    nodes_per_branch.push_back({ node });
+                }
+            }
+
+            GGML_ASSERT(nodes_per_branch.size() == (size_t) count);
+
+            //find the join point
+            const ggml_tensor * join_node = nullptr;
+
+            auto belongs_to_branch = [&](const ggml_tensor * node, std::vector<const ggml_tensor *> & branch) -> bool {
+                for (const ggml_tensor * n : branch) {
+                    if (n == node) {
+                        return false;
+                    }
+
+                    if (is_src_of(node, n)) {
+                        return true;
+                    }
+                }
+                return false;
+            };
+
+            for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) {
+                const ggml_tensor * curr_node = cgraph->nodes[i];
+
+                int num_joins = 0;
+                for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
+                    if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) {
+                        num_joins++;
+                    }
+                }
+
+                if (num_joins >= 2) {
+                    join_node = curr_node;
+                    break;
+                }
+
+                bool found_branch = false;
+                for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
+                    if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) {
+                        //continue accumulating
+                        nodes_per_branch[branch_idx].push_back(curr_node);
+                        found_branch = true;
+                    } else {
+                        if (std::find(nodes_per_branch[branch_idx].begin(), nodes_per_branch[branch_idx].end(),
+                                      curr_node) != nodes_per_branch[branch_idx].end()) {
+                            found_branch = true;
+                        }
+                    }
+                }
+
+                if (!found_branch) {
+                    if (is_empty(curr_node)) {
+                        // we can put it in any branch because it will be ignored
+                        nodes_per_branch[0].push_back({ curr_node });
+                    }
+                }
+            }
+
+            if (join_node) {
+                //Create ggml_cuda_concurrent_event
+                ggml_cuda_concurrent_event concurrent_event(nodes_per_branch.size());
+                concurrent_event.join_node = join_node;
+
+                for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
+                    for (const ggml_tensor * n : nodes_per_branch[branch_idx]) {
+                        concurrent_event.stream_mapping[n] = branch_idx + 1;
+                    }
+                }
+
+                int fork_node_idx = node_indices[root_node];
+                int join_node_idx = node_indices[join_node];
+
+                int       current_branch_idx = 0;
+                int       current_node_idx   = fork_node_idx + 1;
+                const int n_branches         = nodes_per_branch.size();
+
+                int total_branch_nodes = 0;
+                for (std::vector<const ggml_tensor *> branch_nodes : nodes_per_branch) {
+                    total_branch_nodes += branch_nodes.size();
+                }
+
+                // there are other nodes in the middle which are unaccounted for
+                // usually (cpy) nodes, then ignore this fork
+                if (join_node_idx - fork_node_idx - 1 != total_branch_nodes) {
+                    GGML_LOG_DEBUG(
+                        "Skipping %s because the number of nodes in the middle is not equal to the total number of "
+                        "branch nodes %d != %d\n",
+                        root_node->name, join_node_idx - fork_node_idx - 1, total_branch_nodes);
+                    continue;
+                }
+
+                GGML_ASSERT(cuda_ctx->stream_context().find(root_node) == cuda_ctx->stream_context().end());
+                cuda_ctx->stream_context().emplace(root_node, concurrent_event);
+                GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node);
+                concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx);
+
+                // interleave tensors to extend lifetimes so that ggml graph doesn't recycle them
+                // example transformation:
+                // [attn-norm, QMul, QNorm, QRope, KMul, KNorm, KRope, VMul, attn] ->
+                // [attn-norm, QMul, KMul, VMul, QNorm, VNorm, QRope, KRope, attn]
+                // TODO: This breaks fusion within streams, how do we fix this?
+                while (current_node_idx < join_node_idx) {
+                    std::vector<const ggml_tensor *> & branch_nodes = nodes_per_branch[current_branch_idx];
+
+                    bool has_node = false;
+                    for (std::vector<const ggml_tensor *> branch_node : nodes_per_branch) {
+                        has_node |= branch_node.size() > 0;
+                    }
+
+                    GGML_ASSERT(has_node);
+
+                    if (branch_nodes.empty()) {
+                        current_branch_idx = (current_branch_idx + 1) % n_branches;
+                        continue;
+                    }
+
+                    cgraph->nodes[current_node_idx] = const_cast<ggml_tensor *>(branch_nodes.front());
+                    current_node_idx++;
+                    branch_nodes.erase(branch_nodes.begin());
+
+                    // append all empty nodes
+                    while (!branch_nodes.empty() && is_empty(branch_nodes.front())) {
+                        cgraph->nodes[current_node_idx] = const_cast<ggml_tensor *>(branch_nodes.front());
+                        current_node_idx++;
+                        branch_nodes.erase(branch_nodes.begin());
+                    }
+
+                    current_branch_idx = (current_branch_idx + 1) % n_branches;
+                }
+            }
+        }
+    }
+}
+
 static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .get_name                = */ ggml_backend_cuda_get_name,
     /* .free                    = */ ggml_backend_cuda_free,
@@ -3665,7 +3939,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
     /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
     /* .event_record            = */ ggml_backend_cuda_event_record,
     /* .event_wait              = */ ggml_backend_cuda_event_wait,
-    /* .graph_optimize          = */ NULL,
+    /* .graph_optimize          = */ ggml_backend_cuda_graph_optimize,
 };
 
 static ggml_guid_t ggml_backend_cuda_guid() {

From fb2e2fbf337c18b464f419781887a83f0091fdc6 Mon Sep 17 00:00:00 2001
From: Carl Philipp Klemm <carl@uvos.xyz>
Date: Thu, 6 Nov 2025 13:35:19 +0100
Subject: [PATCH 02/18] HIP: fix hipStreamWaitEvent define and nodiscard
 warnings

---
 ggml/src/ggml-cuda/common.cuh    | 7 ++++---
 ggml/src/ggml-cuda/vendors/hip.h | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index d3ab8b6c0cd..78eaf9cdc1e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -25,6 +25,7 @@
 #include <cfloat>
 #include <cstdio>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #if defined(GGML_USE_HIP)
@@ -988,11 +989,11 @@ struct ggml_cuda_concurrent_event {
         per_stream_events.resize(n_streams);
 
         for (size_t i = 0; i < per_stream_events.size(); ++i) {
-            cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming);
+            CUDA_CHECK(cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming));
         }
 
-        cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming);
-        cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming);
+        CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming));
+        CUDA_CHECK(cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming));
     }
 };
 
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index 890c1036498..b7d6edf7fcb 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -105,7 +105,7 @@
 #define cudaStreamNonBlocking hipStreamNonBlocking
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStreamWaitEvent hipStreamWaitEvent
 #define cudaGraphExec_t hipGraphExec_t
 #define cudaGraphNode_t hipGraphNode_t
 #define cudaKernelNodeParams hipKernelNodeParams

From 952526c61ca2aede276dc7553cae1f75f803ea17 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 13 Nov 2025 11:08:05 +0800
Subject: [PATCH 03/18] ggml-cuda: fix fusion inside stream

---
 ggml/src/ggml-cuda/common.cuh   | 10 ++++-
 ggml/src/ggml-cuda/ggml-cuda.cu | 65 +++++++++++++++++++++++----------
 2 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 78eaf9cdc1e..21d9b2126e3 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -997,7 +997,15 @@ struct ggml_cuda_concurrent_event {
     }
 };
 
-using ggml_cuda_stream_context = std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event>;
+struct ggml_cuda_stream_context {
+    std::vector<const ggml_tensor *> original_graph;
+    std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> concurrent_events;
+
+    void reset() {
+        original_graph.clear();
+        concurrent_events.clear();
+    }
+};
 
 struct ggml_backend_cuda_context {
     int device;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 510388fa5ec..90f5f618747 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3203,6 +3203,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             [[maybe_unused]] int prev_i = 0;
 
             ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
+            if (stream_ctx.concurrent_events.size() > 0) {
+                cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_graph.data());
+            }
 
             for (int i = 0; i < cgraph->n_nodes; i++) {
                 ggml_tensor * node = cgraph->nodes[i];
@@ -3226,6 +3229,26 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         cuda_ctx->curr_stream_no = stream_mapping;
                         GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
                     }
+                } else if (i - prev_i > 1) {
+
+                    //the previous node was fused
+                    const ggml_tensor * prev_node = cgraph->nodes[i - 1];
+                    if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) {
+                        concurrent_event = &stream_ctx.concurrent_events[prev_node];
+
+                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+
+                        cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
+                        GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
+                        CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
+
+                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                            cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
+                            CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
+                        }
+
+                        is_concurrent_event_active = true;
+                    }
                 }
                 prev_i = i;
 
@@ -3505,23 +3528,16 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         continue;
                     }
 
-                    //TODO: fix this
-                    static const bool graph_opt = (getenv("GGML_CUDA_GRAPH_OPT") != nullptr) && atoi(getenv("GGML_CUDA_GRAPH_OPT")) == 1;
-
                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
-                        if (strncmp(cgraph->nodes[i+2]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) {
-                            ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
-                            i += 2;
-                            continue;
-                        }
+                        ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
+                        i += 2;
+                        continue;
                     }
 
                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
-                        if (strncmp(cgraph->nodes[i+1]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) {
-                            ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
-                            i++;
-                            continue;
-                        }
+                        ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
+                        i++;
+                        continue;
                     }
 
                     if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
@@ -3553,8 +3569,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     //const ggml_tensor * adjusted_node = node;
                     // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD),
                     // we can safely use the previous node to check if it can be forked
-                    if (stream_ctx.find(node) != stream_ctx.end()) {
-                        concurrent_event = &stream_ctx[node];
+                    if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) {
+                        concurrent_event = &stream_ctx.concurrent_events[node];
 
                         GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
 
@@ -3725,7 +3741,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
 
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
-    stream_context.clear();
+    stream_context.reset();
 
     std::unordered_map<const ggml_tensor *, int> fan_out;
     std::unordered_map<const ggml_tensor *, int> node_indices;
@@ -3771,6 +3787,15 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     const int max_fan_out = 3;
 
     std::vector<std::pair<int, int>> concurrent_node_ranges;
+
+    //save the original graph
+    std::vector<const ggml_tensor *> original_graph;
+    original_graph.reserve(cgraph->n_nodes);
+    for (int i = 0; i < cgraph->n_nodes; ++i) {
+        original_graph.push_back(cgraph->nodes[i]);
+    }
+    cuda_ctx->stream_context().original_graph = std::move(original_graph);
+
     for (const auto & [root_node, count] : fan_out) {
         if (count >= min_fan_out && count <= max_fan_out) {
             const int root_node_idx = node_indices[root_node];
@@ -3799,7 +3824,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             //find the join point
             const ggml_tensor * join_node = nullptr;
 
-            auto belongs_to_branch = [&](const ggml_tensor * node, std::vector<const ggml_tensor *> & branch) -> bool {
+            const auto & belongs_to_branch = [&](const ggml_tensor * node, std::vector<const ggml_tensor *> & branch) -> bool {
                 for (const ggml_tensor * n : branch) {
                     if (n == node) {
                         return false;
@@ -3882,8 +3907,9 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
                     continue;
                 }
 
-                GGML_ASSERT(cuda_ctx->stream_context().find(root_node) == cuda_ctx->stream_context().end());
-                cuda_ctx->stream_context().emplace(root_node, concurrent_event);
+                std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> & concurrent_events = cuda_ctx->stream_context().concurrent_events;
+                GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end());
+                concurrent_events.emplace(root_node, concurrent_event);
                 GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node);
                 concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx);
 
@@ -3891,7 +3917,6 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
                 // example transformation:
                 // [attn-norm, QMul, QNorm, QRope, KMul, KNorm, KRope, VMul, attn] ->
                 // [attn-norm, QMul, KMul, VMul, QNorm, VNorm, QRope, KRope, attn]
-                // TODO: This breaks fusion within streams, how do we fix this?
                 while (current_node_idx < join_node_idx) {
                     std::vector<const ggml_tensor *> & branch_nodes = nodes_per_branch[current_branch_idx];
 

From b87eb3ea7a7df06e345642b55d81c9b8957c359d Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 13 Nov 2025 16:23:25 +0800
Subject: [PATCH 04/18] ggml-cuda: fix bug w.r.t first stream launch

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 90f5f618747..e3578645149 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3203,6 +3203,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             [[maybe_unused]] int prev_i = 0;
 
             ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
+
             if (stream_ctx.concurrent_events.size() > 0) {
                 cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_graph.data());
             }
@@ -3236,7 +3237,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) {
                         concurrent_event = &stream_ctx.concurrent_events[prev_node];
 
-                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, prev_node->name);
 
                         cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
                         GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
@@ -3248,6 +3249,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         }
 
                         is_concurrent_event_active = true;
+                        cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
+                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
                     }
                 }
                 prev_i = i;

From 8b048e8b4c285131ac1737f66d186479eb732425 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 13 Nov 2025 20:49:49 +0800
Subject: [PATCH 05/18] ggml-cuda: format

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e3578645149..d015fbce152 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3037,7 +3037,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
 #ifndef NDEBUG
     const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY);
     GGML_ASSERT(unary_ops.size() == num_unary);
-#endif;
+#endif
 
     //TODO: remove special case once ggml_can_fuse can handle empty nodes
     std::initializer_list<enum ggml_op> topk_moe_ops =

From 35ad678106bcfeacc9cbd9f9c7fbece2d21af020 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 14 Nov 2025 10:56:30 +0800
Subject: [PATCH 06/18] ggml-cuda: improve assert message

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d015fbce152..ecbd8886b1d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3740,7 +3740,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         return;
     }
 
-    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "cuda graph optimization is only supported on single GPU");
+    GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
     GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
 
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();

From 232c47fee63129359ed51041c8ee20a9583bf0f8 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 14 Nov 2025 23:26:51 +0800
Subject: [PATCH 07/18] ggml-cuda: use lambda instead of duplicating code

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 57 ++++++++++++---------------------
 1 file changed, 20 insertions(+), 37 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index ecbd8886b1d..478dba13de0 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3196,6 +3196,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     bool                         is_concurrent_event_active = false;
     ggml_cuda_concurrent_event * concurrent_event           = nullptr;
 
+    const auto try_launch_concurrent_event = [&](const ggml_tensor * node) {
+        if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) {
+            concurrent_event = &stream_ctx.concurrent_events[node];
+
+            GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
+
+            cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
+            GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
+            CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
+
+            for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
+                CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
+            }
+        }
+    };
+
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
         // With the use of CUDA graphs, the execution will be performed by the graph launch.
@@ -3216,6 +3233,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     if (node == concurrent_event->join_node) {
                         cuda_ctx->curr_stream_no = 0;
                         for (int i = 1; i <= concurrent_event->n_streams; ++i) {
+                            // Wait on join events of forked streams in the main stream 
                             CUDA_CHECK(cudaEventRecord(concurrent_event->per_stream_events[i - 1],
                                                        cuda_ctx->stream(cuda_ctx->device, i)));
                             CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->per_stream_events[i - 1]));
@@ -3234,24 +3252,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
                     //the previous node was fused
                     const ggml_tensor * prev_node = cgraph->nodes[i - 1];
-                    if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) {
-                        concurrent_event = &stream_ctx.concurrent_events[prev_node];
-
-                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, prev_node->name);
-
-                        cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
-                        GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
-                        CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
-
-                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
-                            cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
-                            CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
-                        }
-
-                        is_concurrent_event_active = true;
-                        cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
-                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
-                    }
+                    try_launch_concurrent_event(prev_node);
                 }
                 prev_i = i;
 
@@ -3569,25 +3570,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 GGML_ASSERT(ok);
 
                 if (!is_concurrent_event_active) {
-                    //const ggml_tensor * adjusted_node = node;
-                    // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD),
-                    // we can safely use the previous node to check if it can be forked
-                    if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) {
-                        concurrent_event = &stream_ctx.concurrent_events[node];
-
-                        GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
-
-                        cudaStream_t main_stream = cuda_ctx->stream();  // this should be stream 0
-                        GGML_ASSERT(cuda_ctx->curr_stream_no == 0);
-                        CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream));
-
-                        for (int i = 1; i <= concurrent_event->n_streams; ++i) {
-                            cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i);
-                            CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event));
-                        }
-
-                        is_concurrent_event_active = true;
-                    }
+                    try_launch_concurrent_event(node);
                }
             }
         }

From c70a9dbb7c703cea1c78e4859d9536e9e3801210 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 14 Nov 2025 23:38:01 +0800
Subject: [PATCH 08/18] ggml-cuda: add some more comments

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 478dba13de0..b6baef72d61 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3193,6 +3193,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     // flag used to determine whether it is an integrated_gpu
     const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated;
 
+    ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
     bool                         is_concurrent_event_active = false;
     ggml_cuda_concurrent_event * concurrent_event           = nullptr;
 
@@ -3219,9 +3220,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
         if (!use_cuda_graph || cuda_graph_update_required) {
             [[maybe_unused]] int prev_i = 0;
 
-            ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
-
             if (stream_ctx.concurrent_events.size() > 0) {
+                //Restore the original graph to enable fusion within the streams
                 cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_graph.data());
             }
 

From 790ddd4a3b0c3e163399912484d1de6abdb2781f Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Fri, 14 Nov 2025 23:53:15 +0800
Subject: [PATCH 09/18] ggml-cuda: add more detailed comments about concurrency

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b6baef72d61..b89a445a1f7 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3768,7 +3768,15 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         }
     }
 
-    //Target Q, K, V
+    // Target Q, K, V for concurrency
+    // this is a more general way to find nodes which can be candidates for concurrency (although it has not been tested for anything else):
+    // 1. find fan-out (fork) nodes where the same input is used at least N times (in QKV, it would be "attn-norm")
+    // 2. find the join node, where 2 or more of the outputs are required (in QKV, this would "KQ" or "flash-attn")
+    // 3. account for all branches from the fork to the join
+    // 4. To extend lifetimes of the tensors, we interleave the branches (see below for more details)
+    // 5. save the original cgraph and restore it in graph_compute, to enable fusion within streams
+    // See discussion: https://github.com/ggml-org/llama.cpp/pull/16991#issuecomment-3522620030
+
     const int min_fan_out = 3;
     const int max_fan_out = 3;
 

From 3c717272c8ededaa5f8dd9d88a25ecd4d80cb468 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sat, 15 Nov 2025 09:23:58 +0800
Subject: [PATCH 10/18] ggml-cuda: rename + remove unused var

---
 ggml/src/ggml-cuda/common.cuh   | 10 ++++------
 ggml/src/ggml-cuda/ggml-cuda.cu |  6 +++---
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 21d9b2126e3..5d74c1cacfd 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -974,9 +974,8 @@ struct ggml_cuda_graph {
 };
 
 struct ggml_cuda_concurrent_event {
-    std::vector<cudaEvent_t> per_stream_events;
+    std::vector<cudaEvent_t> join_events;
     cudaEvent_t              fork_event;
-    cudaEvent_t              join_event;
 
     int                                          n_streams = 0;
     std::unordered_map<const ggml_tensor *, int> stream_mapping;
@@ -986,14 +985,13 @@ struct ggml_cuda_concurrent_event {
     ggml_cuda_concurrent_event() = default;
 
     explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) {
-        per_stream_events.resize(n_streams);
+        join_events.resize(n_streams);
 
-        for (size_t i = 0; i < per_stream_events.size(); ++i) {
-            CUDA_CHECK(cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming));
+        for (size_t i = 0; i < join_events.size(); ++i) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&join_events[i], cudaEventDisableTiming));
         }
 
         CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming));
-        CUDA_CHECK(cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming));
     }
 };
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b89a445a1f7..2b27193f63d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3233,10 +3233,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     if (node == concurrent_event->join_node) {
                         cuda_ctx->curr_stream_no = 0;
                         for (int i = 1; i <= concurrent_event->n_streams; ++i) {
-                            // Wait on join events of forked streams in the main stream 
-                            CUDA_CHECK(cudaEventRecord(concurrent_event->per_stream_events[i - 1],
+                            // Wait on join events of forked streams in the main stream
+                            CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1],
                                                        cuda_ctx->stream(cuda_ctx->device, i)));
-                            CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->per_stream_events[i - 1]));
+                            CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1]));
                         }
 
                         is_concurrent_event_active = false;

From 05379d98b5edd20fecc13da75a2cff8491171f56 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Wed, 19 Nov 2025 22:58:32 +0800
Subject: [PATCH 11/18] ggml-cuda: fix condition for stream launch

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 2b27193f63d..d0fdd2da492 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3200,6 +3200,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     const auto try_launch_concurrent_event = [&](const ggml_tensor * node) {
         if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) {
             concurrent_event = &stream_ctx.concurrent_events[node];
+            is_concurrent_event_active = true;
 
             GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
 
@@ -3249,10 +3250,15 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                         GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
                     }
                 } else if (i - prev_i > 1) {
-
                     //the previous node was fused
                     const ggml_tensor * prev_node = cgraph->nodes[i - 1];
                     try_launch_concurrent_event(prev_node);
+
+                    if (is_concurrent_event_active) {
+                        const int stream_mapping = concurrent_event->stream_mapping[node];
+                        cuda_ctx->curr_stream_no = stream_mapping;
+                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
+                    }
                 }
                 prev_i = i;
 
@@ -3761,7 +3767,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         }
         for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
             const ggml_tensor * node = cgraph->nodes[node_idx]->src[src_idx];
-            //TODO: check why nrows > 1 fails, probably related to CUDA graphs
+            //TODO: check why nrows > 1 fails
             if (node && !is_empty(node) && ggml_nrows(node) <= 1) {
                 fan_out[node] += 1;
             }

From 537a08e470ed067c4388aa3e1e2a727f0908f4f5 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sat, 22 Nov 2025 13:22:25 +0800
Subject: [PATCH 12/18] ggml-cuda: address review comments, add destructor

---
 ggml/src/ggml-cuda/common.cuh   | 29 +++++++++++--
 ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++++++++++++------------------
 2 files changed, 59 insertions(+), 42 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 5d74c1cacfd..90ad8f2364b 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -975,7 +975,7 @@ struct ggml_cuda_graph {
 
 struct ggml_cuda_concurrent_event {
     std::vector<cudaEvent_t> join_events;
-    cudaEvent_t              fork_event;
+    cudaEvent_t              fork_event = nullptr;
 
     int                                          n_streams = 0;
     std::unordered_map<const ggml_tensor *, int> stream_mapping;
@@ -984,6 +984,9 @@ struct ggml_cuda_concurrent_event {
 
     ggml_cuda_concurrent_event() = default;
 
+    ggml_cuda_concurrent_event(const ggml_cuda_concurrent_event &) = delete;
+    ggml_cuda_concurrent_event & operator=(const ggml_cuda_concurrent_event &) = delete;
+
     explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) {
         join_events.resize(n_streams);
 
@@ -993,14 +996,34 @@ struct ggml_cuda_concurrent_event {
 
         CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming));
     }
+
+    ggml_cuda_concurrent_event(ggml_cuda_concurrent_event && other) noexcept
+    : join_events(std::move(other.join_events))
+    , fork_event(other.fork_event)
+    , n_streams(other.n_streams)
+    , stream_mapping(std::move(other.stream_mapping))
+    , join_node(other.join_node) {
+        other.fork_event = nullptr;
+    }
+
+    ~ggml_cuda_concurrent_event() {
+        if (fork_event != nullptr) {
+            CUDA_CHECK(cudaEventDestroy(fork_event));
+        }
+        for (cudaEvent_t e : join_events) {
+            if (e != nullptr) {
+                CUDA_CHECK(cudaEventDestroy(e));
+            }
+        }
+    }
 };
 
 struct ggml_cuda_stream_context {
-    std::vector<const ggml_tensor *> original_graph;
+    std::vector<const ggml_tensor *>                                    original_nodes;
     std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> concurrent_events;
 
     void reset() {
-        original_graph.clear();
+        original_nodes.clear();
         concurrent_events.clear();
     }
 };
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d0fdd2da492..62f438353dd 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3223,7 +3223,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
             if (stream_ctx.concurrent_events.size() > 0) {
                 //Restore the original graph to enable fusion within the streams
-                cgraph->nodes = const_cast<ggml_tensor **>(stream_ctx.original_graph.data());
+                cgraph->nodes   = const_cast<ggml_tensor **>(stream_ctx.original_nodes.data());
+                cgraph->n_nodes = (int) stream_ctx.original_nodes.size();
             }
 
             for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3242,12 +3243,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
                         is_concurrent_event_active = false;
                         concurrent_event           = nullptr;
-
                     } else {
                         GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end());
-                        const int stream_mapping = concurrent_event->stream_mapping[node];
-                        cuda_ctx->curr_stream_no = stream_mapping;
-                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
+                        cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
+                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
                     }
                 } else if (i - prev_i > 1) {
                     //the previous node was fused
@@ -3255,9 +3254,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                     try_launch_concurrent_event(prev_node);
 
                     if (is_concurrent_event_active) {
-                        const int stream_mapping = concurrent_event->stream_mapping[node];
-                        cuda_ctx->curr_stream_no = stream_mapping;
-                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name);
+                        cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node];
+                        GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name);
                     }
                 }
                 prev_i = i;
@@ -3730,20 +3728,22 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     }
 
     GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
-    GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes);
+    GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);
 
     ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
     stream_context.reset();
 
+    // number of out-degrees for a particular node
     std::unordered_map<const ggml_tensor *, int> fan_out;
+    // reverse mapping of node to index in the cgraph
     std::unordered_map<const ggml_tensor *, int> node_indices;
 
-    const auto & is_empty = [](const ggml_tensor * node) -> bool {
+    const auto & is_noop = [](const ggml_tensor * node) -> bool {
         return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE ||
                node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE;
     };
 
-    const auto & is_src_of = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool {
+    const auto & depends_on = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool {
         for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) {
             if (dst->src[s] == src) {
                 return true;
@@ -3762,14 +3762,14 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
         const ggml_tensor * node = cgraph->nodes[node_idx];
         node_indices[node]       = node_idx;
 
-        if (is_empty(node)) {
+        if (is_noop(node)) {
             continue;
         }
         for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) {
-            const ggml_tensor * node = cgraph->nodes[node_idx]->src[src_idx];
+            const ggml_tensor * src = cgraph->nodes[node_idx]->src[src_idx];
             //TODO: check why nrows > 1 fails
-            if (node && !is_empty(node) && ggml_nrows(node) <= 1) {
-                fan_out[node] += 1;
+            if (node && !is_noop(node) && ggml_nrows(node) <= 1) {
+                fan_out[src] += 1;
             }
         }
     }
@@ -3786,15 +3786,16 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
     const int min_fan_out = 3;
     const int max_fan_out = 3;
 
+    // store {fork_idx, join_idx}
     std::vector<std::pair<int, int>> concurrent_node_ranges;
 
-    //save the original graph
-    std::vector<const ggml_tensor *> original_graph;
-    original_graph.reserve(cgraph->n_nodes);
+    // save the original nodes
+    std::vector<const ggml_tensor *> original_nodes;
+    original_nodes.reserve(cgraph->n_nodes);
     for (int i = 0; i < cgraph->n_nodes; ++i) {
-        original_graph.push_back(cgraph->nodes[i]);
+        original_nodes.push_back(cgraph->nodes[i]);
     }
-    cuda_ctx->stream_context().original_graph = std::move(original_graph);
+    cuda_ctx->stream_context().original_nodes = std::move(original_nodes);
 
     for (const auto & [root_node, count] : fan_out) {
         if (count >= min_fan_out && count <= max_fan_out) {
@@ -3814,7 +3815,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             std::vector<std::vector<const ggml_tensor *>> nodes_per_branch;
             for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) {
                 const ggml_tensor * node = cgraph->nodes[i];
-                if (!is_empty(node) && is_src_of(node, root_node)) {
+                if (!is_noop(node) && depends_on(node, root_node)) {
                     nodes_per_branch.push_back({ node });
                 }
             }
@@ -3824,13 +3825,10 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
             //find the join point
             const ggml_tensor * join_node = nullptr;
 
-            const auto & belongs_to_branch = [&](const ggml_tensor * node, std::vector<const ggml_tensor *> & branch) -> bool {
+            const auto & belongs_to_branch = [&](const ggml_tensor *                      node,
+                                                 const std::vector<const ggml_tensor *> & branch) -> bool {
                 for (const ggml_tensor * n : branch) {
-                    if (n == node) {
-                        return false;
-                    }
-
-                    if (is_src_of(node, n)) {
+                    if (depends_on(node, n)) {
                         return true;
                     }
                 }
@@ -3856,21 +3854,17 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
                 for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
                     if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) {
                         //continue accumulating
-                        nodes_per_branch[branch_idx].push_back(curr_node);
-                        found_branch = true;
-                    } else {
                         if (std::find(nodes_per_branch[branch_idx].begin(), nodes_per_branch[branch_idx].end(),
-                                      curr_node) != nodes_per_branch[branch_idx].end()) {
-                            found_branch = true;
+                                      curr_node) == nodes_per_branch[branch_idx].end()) {
+                            nodes_per_branch[branch_idx].push_back(curr_node);
                         }
+                        found_branch = true;
                     }
                 }
 
-                if (!found_branch) {
-                    if (is_empty(curr_node)) {
-                        // we can put it in any branch because it will be ignored
-                        nodes_per_branch[0].push_back({ curr_node });
-                    }
+                if (!found_branch && is_noop(curr_node)) {
+                    // we can put it in any branch because it will be ignored
+                    nodes_per_branch[0].push_back({ curr_node });
                 }
             }
 
@@ -3909,7 +3903,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
 
                 std::unordered_map<const ggml_tensor *, ggml_cuda_concurrent_event> & concurrent_events = cuda_ctx->stream_context().concurrent_events;
                 GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end());
-                concurrent_events.emplace(root_node, concurrent_event);
+                concurrent_events.emplace(root_node, std::move(concurrent_event));
                 GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node);
                 concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx);
 
@@ -3937,7 +3931,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
                     branch_nodes.erase(branch_nodes.begin());
 
                     // append all empty nodes
-                    while (!branch_nodes.empty() && is_empty(branch_nodes.front())) {
+                    while (!branch_nodes.empty() && is_noop(branch_nodes.front())) {
                         cgraph->nodes[current_node_idx] = const_cast<ggml_tensor *>(branch_nodes.front());
                         current_node_idx++;
                         branch_nodes.erase(branch_nodes.begin());

From 25565c3811f73c047809f4a0a98157e024517890 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Tue, 25 Nov 2025 14:45:34 +0800
Subject: [PATCH 13/18] common.cuh: add is_valid for concurrent events

---
 ggml/src/ggml-cuda/common.cuh   | 73 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++--
 2 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 90ad8f2364b..60a32326b1d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -21,6 +21,7 @@
 #include "ggml-common.h"
 
 #include <array>
+#include <algorithm>
 #include <cassert>
 #include <cfloat>
 #include <cstdio>
@@ -1006,6 +1007,78 @@ struct ggml_cuda_concurrent_event {
         other.fork_event = nullptr;
     }
 
+    // check if all branches don't read to the overlapping buffers
+    // check all read_srcs are either within the branch or are at or before the node
+    // we assume all nodes have the same buffer
+    bool is_valid() const {
+        std::vector<std::vector<std::pair<int64_t, int64_t>>> write_ranges;
+        write_ranges.resize(n_streams);
+
+        for (const auto & [tensor, stream] : stream_mapping) {
+            const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
+            //concurrent streams begin from 1
+            write_ranges[stream - 1].emplace_back((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t));
+        }
+
+        for (int i = 0; i < n_streams; ++i) {
+            std::sort(write_ranges[i].begin(), write_ranges[i].end());
+        }
+
+        bool writes_overlap = false;
+        bool dependent_srcs = false;
+        for (const auto & [tensor, stream] : stream_mapping) {
+            const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
+
+            // multiple nodes can use join_node's buffer. That is fine because we synchronize on the join node.
+            if (t->data == join_node->data) {
+                continue;
+            }
+            // check if this buffer's write data overlaps with another stream's
+            std::pair<int64_t, int64_t> data_range =
+                std::make_pair((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t));
+            for (int i = 0; i < n_streams; ++i) {
+                if (i == stream - 1) {
+                    continue;
+                }
+                auto it = std::lower_bound(write_ranges[i].begin(), write_ranges[i].end(), data_range);
+
+                if (it != write_ranges[i].end()) {
+                    const std::pair<int64_t, int64_t> & other = *it;
+                    if ((other.first >= data_range.first && other.second <= data_range.second) ||
+                        (other.first <= data_range.first && other.second >= data_range.second)) {
+                        GGML_LOG_DEBUG("Writes overlap for %s", tensor->name);
+                        writes_overlap = true;
+                        break;
+                    }
+                }
+            }
+
+            //check if all srcs are either in branch or don't have a branch
+            for (int i = 0; i < GGML_MAX_SRC; ++i) {
+                if (!tensor->src[i]) {
+                    continue;
+                }
+
+                auto it = stream_mapping.find(tensor->src[i]);
+
+                if (it == stream_mapping.end()) {
+                    continue;
+                }
+
+                if (it->second != stream) {
+                    dependent_srcs = true;
+                    break;
+                }
+            }
+
+            if (dependent_srcs || writes_overlap) {
+                break;
+            }
+        }
+
+        return !writes_overlap && !dependent_srcs;
+    }
+
     ~ggml_cuda_concurrent_event() {
         if (fork_event != nullptr) {
             CUDA_CHECK(cudaEventDestroy(fork_event));
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 62f438353dd..4d3255f8af5 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3196,10 +3196,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context();
     bool                         is_concurrent_event_active = false;
     ggml_cuda_concurrent_event * concurrent_event           = nullptr;
+    bool                         should_launch_concurrent_events = false;
 
     const auto try_launch_concurrent_event = [&](const ggml_tensor * node) {
         if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) {
             concurrent_event = &stream_ctx.concurrent_events[node];
+
             is_concurrent_event_active = true;
 
             GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name);
@@ -3222,6 +3224,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             [[maybe_unused]] int prev_i = 0;
 
             if (stream_ctx.concurrent_events.size() > 0) {
+                bool should_launch_concurrent_events = true;
+                for (const auto & [tensor, event] : stream_ctx.concurrent_events) {
+                    should_launch_concurrent_events &= event.is_valid();
+                }
+            }
+            if (should_launch_concurrent_events) {
                 //Restore the original graph to enable fusion within the streams
                 cgraph->nodes   = const_cast<ggml_tensor **>(stream_ctx.original_nodes.data());
                 cgraph->n_nodes = (int) stream_ctx.original_nodes.size();
@@ -3852,11 +3860,11 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
 
                 bool found_branch = false;
                 for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) {
-                    if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) {
+                    std::vector<const ggml_tensor *> & branch_vec = nodes_per_branch[branch_idx];
+                    if (belongs_to_branch(curr_node, branch_vec)) {
                         //continue accumulating
-                        if (std::find(nodes_per_branch[branch_idx].begin(), nodes_per_branch[branch_idx].end(),
-                                      curr_node) == nodes_per_branch[branch_idx].end()) {
-                            nodes_per_branch[branch_idx].push_back(curr_node);
+                        if (std::find(branch_vec.begin(), branch_vec.end(), curr_node) == branch_vec.end()) {
+                            branch_vec.push_back(curr_node);
                         }
                         found_branch = true;
                     }

From 06b2ad91c7c013d3da4517a7a05f49d47fd1fae0 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Tue, 25 Nov 2025 15:18:04 +0800
Subject: [PATCH 14/18] common.cuh: make comment better

---
 ggml/src/ggml-cuda/common.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 60a32326b1d..5196772127e 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1007,8 +1007,8 @@ struct ggml_cuda_concurrent_event {
         other.fork_event = nullptr;
     }
 
-    // check if all branches don't read to the overlapping buffers
-    // check all read_srcs are either within the branch or are at or before the node
+    // 1. check if all branches don't write to the overlapping memory ranges (except the join node)
+    // 2. check all srcs are either within the branch or outside the event
     // we assume all nodes have the same buffer
     bool is_valid() const {
         std::vector<std::vector<std::pair<int64_t, int64_t>>> write_ranges;

From e7f3158e97a9873791e64c67b7845f4f7977618a Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 27 Nov 2025 22:10:14 +0800
Subject: [PATCH 15/18] update comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/common.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 5196772127e..d59e3f6ca00 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1007,7 +1007,7 @@ struct ggml_cuda_concurrent_event {
         other.fork_event = nullptr;
     }
 
-    // 1. check if all branches don't write to the overlapping memory ranges (except the join node)
+    // 1. check if any branches write to overlapping memory ranges (except the join node)
     // 2. check all srcs are either within the branch or outside the event
     // we assume all nodes have the same buffer
     bool is_valid() const {

From 2c9c3c20c8da3481c883a95df65f0a1bc52571ae Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 27 Nov 2025 22:10:47 +0800
Subject: [PATCH 16/18] update comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
---
 ggml/src/ggml-cuda/common.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index d59e3f6ca00..a6840ebc804 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1008,7 +1008,7 @@ struct ggml_cuda_concurrent_event {
     }
 
     // 1. check if any branches write to overlapping memory ranges (except the join node)
-    // 2. check all srcs are either within the branch or outside the event
+    // 2. check whether all srcs are either within the branch or outside the nodes covered by ggml_cuda_concurrent_event
     // we assume all nodes have the same buffer
     bool is_valid() const {
         std::vector<std::vector<std::pair<int64_t, int64_t>>> write_ranges;

From c1dca28af91a1952dd1a4c214e8d11501736b4fe Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Thu, 27 Nov 2025 23:07:04 +0800
Subject: [PATCH 17/18] common.cuh: fix lower_bound condition + remove
 join_node data from write_ranges

---
 ggml/src/ggml-cuda/common.cuh | 37 +++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a6840ebc804..a6fad94f5d6 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1014,13 +1014,28 @@ struct ggml_cuda_concurrent_event {
         std::vector<std::vector<std::pair<int64_t, int64_t>>> write_ranges;
         write_ranges.resize(n_streams);
 
+        // Get join_node's memory range to exclude from overlap checking.
+        // Multiple nodes can use join_node's buffer; we synchronize on the join node.
+        const ggml_tensor * join_t     = join_node->view_src ? join_node->view_src : join_node;
+        const int64_t       join_start = (int64_t) join_t->data;
+        const int64_t       join_end   = join_start + ggml_nbytes(join_t);
+
         for (const auto & [tensor, stream] : stream_mapping) {
             const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
-            //concurrent streams begin from 1
-            write_ranges[stream - 1].emplace_back((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t));
+            const int64_t       t_start = (int64_t) t->data;
+            const int64_t       t_end   = t_start + ggml_nbytes(t);
+
+            // Skip tensors that overlap with join_node's buffer.
+            if (t_start < join_end && join_start < t_end) {
+                continue;
+            }
+
+            // concurrent streams begin from 1
+            write_ranges[stream - 1].emplace_back(t_start, t_end);
         }
 
         for (int i = 0; i < n_streams; ++i) {
+            // sorts first by start then by end of write range
             std::sort(write_ranges[i].begin(), write_ranges[i].end());
         }
 
@@ -1028,14 +1043,16 @@ struct ggml_cuda_concurrent_event {
         bool dependent_srcs = false;
         for (const auto & [tensor, stream] : stream_mapping) {
             const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor;
+            const int64_t       t_start = (int64_t) t->data;
+            const int64_t       t_end   = t_start + ggml_nbytes(t);
 
-            // multiple nodes can use join_node's buffer. That is fine because we synchronize on the join node.
-            if (t->data == join_node->data) {
+            // Skip tensors that overlap with join_node's buffer (already excluded from write_ranges).
+            if (t_start < join_end && join_start < t_end) {
                 continue;
             }
+
             // check if this buffer's write data overlaps with another stream's
-            std::pair<int64_t, int64_t> data_range =
-                std::make_pair((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t));
+            std::pair<int64_t, int64_t> data_range = std::make_pair(t_start, t_end);
             for (int i = 0; i < n_streams; ++i) {
                 if (i == stream - 1) {
                     continue;
@@ -1044,8 +1061,12 @@ struct ggml_cuda_concurrent_event {
 
                 if (it != write_ranges[i].end()) {
                     const std::pair<int64_t, int64_t> & other = *it;
-                    if ((other.first >= data_range.first && other.second <= data_range.second) ||
-                        (other.first <= data_range.first && other.second >= data_range.second)) {
+
+                    // std::lower_bound returns the first element where other >= data_range (lexicographically).
+                    // This guarantees other.first >= data_range.first.
+                    // Therefore, overlap occurs iff other.first < data_range.second
+                    // (i.e., the other range starts before this range ends).
+                    if (other.first < data_range.second) {
                         GGML_LOG_DEBUG("Writes overlap for %s", tensor->name);
                         writes_overlap = true;
                         break;

From c57b750c30c651521f7ad6676db149a50925c947 Mon Sep 17 00:00:00 2001
From: Aman Gupta <amangupta052@gmail.com>
Date: Sat, 29 Nov 2025 08:46:29 +0800
Subject: [PATCH 18/18] ggml-cuda: fix overlap condition + shadowing parameter

---
 ggml/src/ggml-cuda/common.cuh   | 12 ++++++------
 ggml/src/ggml-cuda/ggml-cuda.cu |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a6fad94f5d6..978497ca523 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1014,8 +1014,8 @@ struct ggml_cuda_concurrent_event {
         std::vector<std::vector<std::pair<int64_t, int64_t>>> write_ranges;
         write_ranges.resize(n_streams);
 
-        // Get join_node's memory range to exclude from overlap checking.
-        // Multiple nodes can use join_node's buffer; we synchronize on the join node.
+        // get join_node's memory range to exclude from overlap checking.
+        // multiple nodes can use join_node's buffer; we synchronize on the join node.
         const ggml_tensor * join_t     = join_node->view_src ? join_node->view_src : join_node;
         const int64_t       join_start = (int64_t) join_t->data;
         const int64_t       join_end   = join_start + ggml_nbytes(join_t);
@@ -1025,8 +1025,8 @@ struct ggml_cuda_concurrent_event {
             const int64_t       t_start = (int64_t) t->data;
             const int64_t       t_end   = t_start + ggml_nbytes(t);
 
-            // Skip tensors that overlap with join_node's buffer.
-            if (t_start < join_end && join_start < t_end) {
+            // skip tensors that overlap with join_node's buffer.
+            if ((t_start <= join_start && join_start < t_end) || (join_start <= t_start && t_start < join_end)) {
                 continue;
             }
 
@@ -1046,8 +1046,8 @@ struct ggml_cuda_concurrent_event {
             const int64_t       t_start = (int64_t) t->data;
             const int64_t       t_end   = t_start + ggml_nbytes(t);
 
-            // Skip tensors that overlap with join_node's buffer (already excluded from write_ranges).
-            if (t_start < join_end && join_start < t_end) {
+            // skip tensors that overlap with join_node's buffer
+            if ((t_start <= join_start && join_start < t_end) || (join_start <= t_start && t_start < join_end)) {
                 continue;
             }
 
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 4d3255f8af5..ce98a161ac9 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3224,9 +3224,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             [[maybe_unused]] int prev_i = 0;
 
             if (stream_ctx.concurrent_events.size() > 0) {
-                bool should_launch_concurrent_events = true;
+                should_launch_concurrent_events = true;
                 for (const auto & [tensor, event] : stream_ctx.concurrent_events) {
-                    should_launch_concurrent_events &= event.is_valid();
+                    should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
                 }
             }
             if (should_launch_concurrent_events) {