From 773909b75cd83559f76d72fb72219001a8c71d4b Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 1 Nov 2025 14:30:00 +0800 Subject: [PATCH 01/18] CUDA: add stream-based concurrency --- ggml/src/ggml-cuda/common.cuh | 46 ++++- ggml/src/ggml-cuda/ggml-cuda.cu | 298 ++++++++++++++++++++++++++++++-- 2 files changed, 324 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 99ec96869a7..d3ab8b6c0cd 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -972,6 +972,32 @@ struct ggml_cuda_graph { #endif }; +struct ggml_cuda_concurrent_event { + std::vector per_stream_events; + cudaEvent_t fork_event; + cudaEvent_t join_event; + + int n_streams = 0; + std::unordered_map stream_mapping; + + const ggml_tensor * join_node; + + ggml_cuda_concurrent_event() = default; + + explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) { + per_stream_events.resize(n_streams); + + for (size_t i = 0; i < per_stream_events.size(); ++i) { + cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming); + } + + cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming); + cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming); + } +}; + +using ggml_cuda_stream_context = std::unordered_map; + struct ggml_backend_cuda_context { int device; std::string name; @@ -982,11 +1008,15 @@ struct ggml_backend_cuda_context { std::unique_ptr cuda_graph; + int curr_stream_no = 0; + explicit ggml_backend_cuda_context(int device) : device(device), name(GGML_CUDA_NAME + std::to_string(device)) { } + ggml_cuda_stream_context concurrent_stream_context; + ~ggml_backend_cuda_context(); cudaStream_t stream(int device, int stream) { @@ -997,9 +1027,9 @@ struct ggml_backend_cuda_context { return streams[device][stream]; } - cudaStream_t stream() { - return stream(device, 0); - } + cudaStream_t stream() { return stream(device, curr_stream_no); } + + ggml_cuda_stream_context & stream_context() { return concurrent_stream_context; } cublasHandle_t cublas_handle(int device) { if (cublas_handles[device] == nullptr) { @@ -1015,15 +1045,15 @@ struct ggml_backend_cuda_context { } // pool - std::unique_ptr pools[GGML_CUDA_MAX_DEVICES]; + std::unique_ptr pools[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; - static std::unique_ptr new_pool_for_device(int device); + static std::unique_ptr new_pool_for_device(int device, int stream_no); ggml_cuda_pool & pool(int device) { - if (pools[device] == nullptr) { - pools[device] = new_pool_for_device(device); + if (pools[device][curr_stream_no] == nullptr) { + pools[device][curr_stream_no] = new_pool_for_device(device, curr_stream_no); } - return *pools[device]; + return *pools[device][curr_stream_no]; } ggml_cuda_pool & pool() { diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 0b29074f33d..510388fa5ec 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -521,7 +521,8 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { }; #endif // defined(GGML_USE_VMM) -std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device) { +std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device, + [[maybe_unused]] int stream_no) { #if defined(GGML_USE_VMM) if (ggml_cuda_info().devices[device].vmm) { return std::unique_ptr(new ggml_cuda_pool_vmm(device)); @@ -3036,7 +3037,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, #ifndef NDEBUG const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY); GGML_ASSERT(unary_ops.size() == num_unary); -#endif +#endif; //TODO: remove special case once ggml_can_fuse can handle empty nodes std::initializer_list topk_moe_ops = @@ -3192,18 +3193,44 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; + bool is_concurrent_event_active = false; + ggml_cuda_concurrent_event * concurrent_event = nullptr; + while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. // With the use of CUDA graphs, the execution will be performed by the graph launch. if (!use_cuda_graph || cuda_graph_update_required) { - [[maybe_unused]] int prev_i = 0; + ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context(); + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + if (is_concurrent_event_active) { + GGML_ASSERT(concurrent_event); + + if (node == concurrent_event->join_node) { + cuda_ctx->curr_stream_no = 0; + for (int i = 1; i <= concurrent_event->n_streams; ++i) { + CUDA_CHECK(cudaEventRecord(concurrent_event->per_stream_events[i - 1], + cuda_ctx->stream(cuda_ctx->device, i))); + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->per_stream_events[i - 1])); + } + + is_concurrent_event_active = false; + concurrent_event = nullptr; + + } else { + GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end()); + const int stream_mapping = concurrent_event->stream_mapping[node]; + cuda_ctx->curr_stream_no = stream_mapping; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name); + } + } + prev_i = i; + #ifdef GGML_CUDA_DEBUG const int nodes_fused = i - prev_i - 1; - prev_i = i; if (nodes_fused > 0) { GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused); } @@ -3213,6 +3240,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } + + // start of fusion operations static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr); if (!disable_fusion) { @@ -3476,16 +3505,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } + //TODO: fix this + static const bool graph_opt = (getenv("GGML_CUDA_GRAPH_OPT") != nullptr) && atoi(getenv("GGML_CUDA_GRAPH_OPT")) == 1; + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { - ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); - i += 2; - continue; + if (strncmp(cgraph->nodes[i+2]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) { + ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + i += 2; + continue; + } } if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) { - ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); - i++; - continue; + if (strncmp(cgraph->nodes[i+1]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) { + ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); + i++; + continue; + } } if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { @@ -3505,13 +3541,35 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } #else GGML_UNUSED(integrated); -#endif // NDEBUG +#endif // NDEBUG bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); + + if (!is_concurrent_event_active) { + //const ggml_tensor * adjusted_node = node; + // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD), + // we can safely use the previous node to check if it can be forked + if (stream_ctx.find(node) != stream_ctx.end()) { + concurrent_event = &stream_ctx[node]; + + GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); + + cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0 + GGML_ASSERT(cuda_ctx->curr_stream_no == 0); + CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream)); + + for (int i = 1; i <= concurrent_event->n_streams; ++i) { + cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i); + CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event)); + } + + is_concurrent_event_active = true; + } + } } } @@ -3651,6 +3709,222 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev } } +static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context; + + static bool enable_graph_optimization = [] { + const char * env = getenv("GGML_CUDA_GRAPH_OPT"); + return env != nullptr && atoi(env) == 1; + }(); + + if (!enable_graph_optimization) { + return; + } + + GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "cuda graph optimization is only supported on single GPU"); + GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes); + + ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context(); + stream_context.clear(); + + std::unordered_map fan_out; + std::unordered_map node_indices; + + const auto & is_empty = [](const ggml_tensor * node) -> bool { + return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || + node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; + }; + + const auto & is_src_of = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool { + for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { + if (dst->src[s] == src) { + return true; + } + } + // implicit dependency if they view the same tensor + const ggml_tensor * dst2 = dst->view_src ? dst->view_src : dst; + const ggml_tensor * src2 = src->view_src ? src->view_src : src; + if (dst2 == src2) { + return true; + } + return false; + }; + + for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) { + const ggml_tensor * node = cgraph->nodes[node_idx]; + node_indices[node] = node_idx; + + if (is_empty(node)) { + continue; + } + for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { + const ggml_tensor * node = cgraph->nodes[node_idx]->src[src_idx]; + //TODO: check why nrows > 1 fails, probably related to CUDA graphs + if (node && !is_empty(node) && ggml_nrows(node) <= 1) { + fan_out[node] += 1; + } + } + } + + //Target Q, K, V + const int min_fan_out = 3; + const int max_fan_out = 3; + + std::vector> concurrent_node_ranges; + for (const auto & [root_node, count] : fan_out) { + if (count >= min_fan_out && count <= max_fan_out) { + const int root_node_idx = node_indices[root_node]; + + bool is_part_of_event = false; + for (const auto & [start, end] : concurrent_node_ranges) { + if (root_node_idx >= start && root_node_idx <= end) { + is_part_of_event = true; + } + } + + if (is_part_of_event) { + continue; + } + + std::vector> nodes_per_branch; + for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) { + const ggml_tensor * node = cgraph->nodes[i]; + if (!is_empty(node) && is_src_of(node, root_node)) { + nodes_per_branch.push_back({ node }); + } + } + + GGML_ASSERT(nodes_per_branch.size() == (size_t) count); + + //find the join point + const ggml_tensor * join_node = nullptr; + + auto belongs_to_branch = [&](const ggml_tensor * node, std::vector & branch) -> bool { + for (const ggml_tensor * n : branch) { + if (n == node) { + return false; + } + + if (is_src_of(node, n)) { + return true; + } + } + return false; + }; + + for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) { + const ggml_tensor * curr_node = cgraph->nodes[i]; + + int num_joins = 0; + for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) { + if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) { + num_joins++; + } + } + + if (num_joins >= 2) { + join_node = curr_node; + break; + } + + bool found_branch = false; + for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) { + if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) { + //continue accumulating + nodes_per_branch[branch_idx].push_back(curr_node); + found_branch = true; + } else { + if (std::find(nodes_per_branch[branch_idx].begin(), nodes_per_branch[branch_idx].end(), + curr_node) != nodes_per_branch[branch_idx].end()) { + found_branch = true; + } + } + } + + if (!found_branch) { + if (is_empty(curr_node)) { + // we can put it in any branch because it will be ignored + nodes_per_branch[0].push_back({ curr_node }); + } + } + } + + if (join_node) { + //Create ggml_cuda_concurrent_event + ggml_cuda_concurrent_event concurrent_event(nodes_per_branch.size()); + concurrent_event.join_node = join_node; + + for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) { + for (const ggml_tensor * n : nodes_per_branch[branch_idx]) { + concurrent_event.stream_mapping[n] = branch_idx + 1; + } + } + + int fork_node_idx = node_indices[root_node]; + int join_node_idx = node_indices[join_node]; + + int current_branch_idx = 0; + int current_node_idx = fork_node_idx + 1; + const int n_branches = nodes_per_branch.size(); + + int total_branch_nodes = 0; + for (std::vector branch_nodes : nodes_per_branch) { + total_branch_nodes += branch_nodes.size(); + } + + // there are other nodes in the middle which are unaccounted for + // usually (cpy) nodes, then ignore this fork + if (join_node_idx - fork_node_idx - 1 != total_branch_nodes) { + GGML_LOG_DEBUG( + "Skipping %s because the number of nodes in the middle is not equal to the total number of " + "branch nodes %d != %d\n", + root_node->name, join_node_idx - fork_node_idx - 1, total_branch_nodes); + continue; + } + + GGML_ASSERT(cuda_ctx->stream_context().find(root_node) == cuda_ctx->stream_context().end()); + cuda_ctx->stream_context().emplace(root_node, concurrent_event); + GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node); + concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx); + + // interleave tensors to extend lifetimes so that ggml graph doesn't recycle them + // example transformation: + // [attn-norm, QMul, QNorm, QRope, KMul, KNorm, KRope, VMul, attn] -> + // [attn-norm, QMul, KMul, VMul, QNorm, VNorm, QRope, KRope, attn] + // TODO: This breaks fusion within streams, how do we fix this? + while (current_node_idx < join_node_idx) { + std::vector & branch_nodes = nodes_per_branch[current_branch_idx]; + + bool has_node = false; + for (std::vector branch_node : nodes_per_branch) { + has_node |= branch_node.size() > 0; + } + + GGML_ASSERT(has_node); + + if (branch_nodes.empty()) { + current_branch_idx = (current_branch_idx + 1) % n_branches; + continue; + } + + cgraph->nodes[current_node_idx] = const_cast(branch_nodes.front()); + current_node_idx++; + branch_nodes.erase(branch_nodes.begin()); + + // append all empty nodes + while (!branch_nodes.empty() && is_empty(branch_nodes.front())) { + cgraph->nodes[current_node_idx] = const_cast(branch_nodes.front()); + current_node_idx++; + branch_nodes.erase(branch_nodes.begin()); + } + + current_branch_idx = (current_branch_idx + 1) % n_branches; + } + } + } + } +} + static const ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_get_name, /* .free = */ ggml_backend_cuda_free, @@ -3665,7 +3939,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = { /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .event_record = */ ggml_backend_cuda_event_record, /* .event_wait = */ ggml_backend_cuda_event_wait, - /* .graph_optimize = */ NULL, + /* .graph_optimize = */ ggml_backend_cuda_graph_optimize, }; static ggml_guid_t ggml_backend_cuda_guid() { From fb2e2fbf337c18b464f419781887a83f0091fdc6 Mon Sep 17 00:00:00 2001 From: Carl Philipp Klemm Date: Thu, 6 Nov 2025 13:35:19 +0100 Subject: [PATCH 02/18] HIP: fix hipStreamWaitEvent define and nodiscard warnings --- ggml/src/ggml-cuda/common.cuh | 7 ++++--- ggml/src/ggml-cuda/vendors/hip.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index d3ab8b6c0cd..78eaf9cdc1e 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #if defined(GGML_USE_HIP) @@ -988,11 +989,11 @@ struct ggml_cuda_concurrent_event { per_stream_events.resize(n_streams); for (size_t i = 0; i < per_stream_events.size(); ++i) { - cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming); + CUDA_CHECK(cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming)); } - cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming); - cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming); + CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming)); + CUDA_CHECK(cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming)); } }; diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index 890c1036498..b7d6edf7fcb 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -105,7 +105,7 @@ #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamPerThread hipStreamPerThread #define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStreamWaitEvent hipStreamWaitEvent #define cudaGraphExec_t hipGraphExec_t #define cudaGraphNode_t hipGraphNode_t #define cudaKernelNodeParams hipKernelNodeParams From 952526c61ca2aede276dc7553cae1f75f803ea17 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 13 Nov 2025 11:08:05 +0800 Subject: [PATCH 03/18] ggml-cuda: fix fusion inside stream --- ggml/src/ggml-cuda/common.cuh | 10 ++++- ggml/src/ggml-cuda/ggml-cuda.cu | 65 +++++++++++++++++++++++---------- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 78eaf9cdc1e..21d9b2126e3 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -997,7 +997,15 @@ struct ggml_cuda_concurrent_event { } }; -using ggml_cuda_stream_context = std::unordered_map; +struct ggml_cuda_stream_context { + std::vector original_graph; + std::unordered_map concurrent_events; + + void reset() { + original_graph.clear(); + concurrent_events.clear(); + } +}; struct ggml_backend_cuda_context { int device; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 510388fa5ec..90f5f618747 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3203,6 +3203,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx [[maybe_unused]] int prev_i = 0; ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context(); + if (stream_ctx.concurrent_events.size() > 0) { + cgraph->nodes = const_cast(stream_ctx.original_graph.data()); + } for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -3226,6 +3229,26 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx cuda_ctx->curr_stream_no = stream_mapping; GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name); } + } else if (i - prev_i > 1) { + + //the previous node was fused + const ggml_tensor * prev_node = cgraph->nodes[i - 1]; + if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) { + concurrent_event = &stream_ctx.concurrent_events[prev_node]; + + GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); + + cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0 + GGML_ASSERT(cuda_ctx->curr_stream_no == 0); + CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream)); + + for (int i = 1; i <= concurrent_event->n_streams; ++i) { + cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i); + CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event)); + } + + is_concurrent_event_active = true; + } } prev_i = i; @@ -3505,23 +3528,16 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx continue; } - //TODO: fix this - static const bool graph_opt = (getenv("GGML_CUDA_GRAPH_OPT") != nullptr) && atoi(getenv("GGML_CUDA_GRAPH_OPT")) == 1; - if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { - if (strncmp(cgraph->nodes[i+2]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) { - ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); - i += 2; - continue; - } + ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); + i += 2; + continue; } if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) { - if (strncmp(cgraph->nodes[i+1]->name, "attn_norm", strlen("attn_norm")) != 0 || !graph_opt) { - ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); - i++; - continue; - } + ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]); + i++; + continue; } if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) { @@ -3553,8 +3569,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx //const ggml_tensor * adjusted_node = node; // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD), // we can safely use the previous node to check if it can be forked - if (stream_ctx.find(node) != stream_ctx.end()) { - concurrent_event = &stream_ctx[node]; + if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) { + concurrent_event = &stream_ctx.concurrent_events[node]; GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); @@ -3725,7 +3741,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes); ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context(); - stream_context.clear(); + stream_context.reset(); std::unordered_map fan_out; std::unordered_map node_indices; @@ -3771,6 +3787,15 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph const int max_fan_out = 3; std::vector> concurrent_node_ranges; + + //save the original graph + std::vector original_graph; + original_graph.reserve(cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; ++i) { + original_graph.push_back(cgraph->nodes[i]); + } + cuda_ctx->stream_context().original_graph = std::move(original_graph); + for (const auto & [root_node, count] : fan_out) { if (count >= min_fan_out && count <= max_fan_out) { const int root_node_idx = node_indices[root_node]; @@ -3799,7 +3824,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph //find the join point const ggml_tensor * join_node = nullptr; - auto belongs_to_branch = [&](const ggml_tensor * node, std::vector & branch) -> bool { + const auto & belongs_to_branch = [&](const ggml_tensor * node, std::vector & branch) -> bool { for (const ggml_tensor * n : branch) { if (n == node) { return false; @@ -3882,8 +3907,9 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph continue; } - GGML_ASSERT(cuda_ctx->stream_context().find(root_node) == cuda_ctx->stream_context().end()); - cuda_ctx->stream_context().emplace(root_node, concurrent_event); + std::unordered_map & concurrent_events = cuda_ctx->stream_context().concurrent_events; + GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end()); + concurrent_events.emplace(root_node, concurrent_event); GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node); concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx); @@ -3891,7 +3917,6 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph // example transformation: // [attn-norm, QMul, QNorm, QRope, KMul, KNorm, KRope, VMul, attn] -> // [attn-norm, QMul, KMul, VMul, QNorm, VNorm, QRope, KRope, attn] - // TODO: This breaks fusion within streams, how do we fix this? while (current_node_idx < join_node_idx) { std::vector & branch_nodes = nodes_per_branch[current_branch_idx]; From b87eb3ea7a7df06e345642b55d81c9b8957c359d Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 13 Nov 2025 16:23:25 +0800 Subject: [PATCH 04/18] ggml-cuda: fix bug w.r.t first stream launch --- ggml/src/ggml-cuda/ggml-cuda.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 90f5f618747..e3578645149 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3203,6 +3203,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx [[maybe_unused]] int prev_i = 0; ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context(); + if (stream_ctx.concurrent_events.size() > 0) { cgraph->nodes = const_cast(stream_ctx.original_graph.data()); } @@ -3236,7 +3237,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) { concurrent_event = &stream_ctx.concurrent_events[prev_node]; - GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); + GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, prev_node->name); cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0 GGML_ASSERT(cuda_ctx->curr_stream_no == 0); @@ -3248,6 +3249,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } is_concurrent_event_active = true; + cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); } } prev_i = i; From 8b048e8b4c285131ac1737f66d186479eb732425 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 13 Nov 2025 20:49:49 +0800 Subject: [PATCH 05/18] ggml-cuda: format --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e3578645149..d015fbce152 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3037,7 +3037,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, #ifndef NDEBUG const size_t num_unary = std::count(ops.begin(), ops.end(), GGML_OP_UNARY); GGML_ASSERT(unary_ops.size() == num_unary); -#endif; +#endif //TODO: remove special case once ggml_can_fuse can handle empty nodes std::initializer_list topk_moe_ops = From 35ad678106bcfeacc9cbd9f9c7fbece2d21af020 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 14 Nov 2025 10:56:30 +0800 Subject: [PATCH 06/18] ggml-cuda: improve assert message --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d015fbce152..ecbd8886b1d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3740,7 +3740,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph return; } - GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "cuda graph optimization is only supported on single GPU"); + GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend"); GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes); ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context(); From 232c47fee63129359ed51041c8ee20a9583bf0f8 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 14 Nov 2025 23:26:51 +0800 Subject: [PATCH 07/18] ggml-cuda: use lambda instead of duplicating code --- ggml/src/ggml-cuda/ggml-cuda.cu | 57 ++++++++++++--------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index ecbd8886b1d..478dba13de0 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3196,6 +3196,23 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx bool is_concurrent_event_active = false; ggml_cuda_concurrent_event * concurrent_event = nullptr; + const auto try_launch_concurrent_event = [&](const ggml_tensor * node) { + if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) { + concurrent_event = &stream_ctx.concurrent_events[node]; + + GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); + + cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0 + GGML_ASSERT(cuda_ctx->curr_stream_no == 0); + CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream)); + + for (int i = 1; i <= concurrent_event->n_streams; ++i) { + cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i); + CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event)); + } + } + }; + while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. // With the use of CUDA graphs, the execution will be performed by the graph launch. @@ -3216,6 +3233,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (node == concurrent_event->join_node) { cuda_ctx->curr_stream_no = 0; for (int i = 1; i <= concurrent_event->n_streams; ++i) { + // Wait on join events of forked streams in the main stream CUDA_CHECK(cudaEventRecord(concurrent_event->per_stream_events[i - 1], cuda_ctx->stream(cuda_ctx->device, i))); CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->per_stream_events[i - 1])); @@ -3234,24 +3252,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx //the previous node was fused const ggml_tensor * prev_node = cgraph->nodes[i - 1]; - if (stream_ctx.concurrent_events.find(prev_node) != stream_ctx.concurrent_events.end()) { - concurrent_event = &stream_ctx.concurrent_events[prev_node]; - - GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, prev_node->name); - - cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0 - GGML_ASSERT(cuda_ctx->curr_stream_no == 0); - CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream)); - - for (int i = 1; i <= concurrent_event->n_streams; ++i) { - cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i); - CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event)); - } - - is_concurrent_event_active = true; - cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; - GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); - } + try_launch_concurrent_event(prev_node); } prev_i = i; @@ -3569,25 +3570,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx GGML_ASSERT(ok); if (!is_concurrent_event_active) { - //const ggml_tensor * adjusted_node = node; - // the forking node may have been fused, e.g (RMS_NORM_MUL + MUL + ADD), - // we can safely use the previous node to check if it can be forked - if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) { - concurrent_event = &stream_ctx.concurrent_events[node]; - - GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); - - cudaStream_t main_stream = cuda_ctx->stream(); // this should be stream 0 - GGML_ASSERT(cuda_ctx->curr_stream_no == 0); - CUDA_CHECK(cudaEventRecord(concurrent_event->fork_event, main_stream)); - - for (int i = 1; i <= concurrent_event->n_streams; ++i) { - cudaStream_t stream = cuda_ctx->stream(cuda_ctx->device, i); - CUDA_CHECK(cudaStreamWaitEvent(stream, concurrent_event->fork_event)); - } - - is_concurrent_event_active = true; - } + try_launch_concurrent_event(node); } } } From c70a9dbb7c703cea1c78e4859d9536e9e3801210 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 14 Nov 2025 23:38:01 +0800 Subject: [PATCH 08/18] ggml-cuda: add some more comments --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 478dba13de0..b6baef72d61 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3193,6 +3193,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx // flag used to determine whether it is an integrated_gpu const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; + ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context(); bool is_concurrent_event_active = false; ggml_cuda_concurrent_event * concurrent_event = nullptr; @@ -3219,9 +3220,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (!use_cuda_graph || cuda_graph_update_required) { [[maybe_unused]] int prev_i = 0; - ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context(); - if (stream_ctx.concurrent_events.size() > 0) { + //Restore the original graph to enable fusion within the streams cgraph->nodes = const_cast(stream_ctx.original_graph.data()); } From 790ddd4a3b0c3e163399912484d1de6abdb2781f Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Fri, 14 Nov 2025 23:53:15 +0800 Subject: [PATCH 09/18] ggml-cuda: add more detailed comments about concurrency --- ggml/src/ggml-cuda/ggml-cuda.cu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b6baef72d61..b89a445a1f7 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3768,7 +3768,15 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph } } - //Target Q, K, V + // Target Q, K, V for concurrency + // this is a more general way to find nodes which can be candidates for concurrency (although it has not been tested for anything else): + // 1. find fan-out (fork) nodes where the same input is used at least N times (in QKV, it would be "attn-norm") + // 2. find the join node, where 2 or more of the outputs are required (in QKV, this would "KQ" or "flash-attn") + // 3. account for all branches from the fork to the join + // 4. To extend lifetimes of the tensors, we interleave the branches (see below for more details) + // 5. save the original cgraph and restore it in graph_compute, to enable fusion within streams + // See discussion: https://github.com/ggml-org/llama.cpp/pull/16991#issuecomment-3522620030 + const int min_fan_out = 3; const int max_fan_out = 3; From 3c717272c8ededaa5f8dd9d88a25ecd4d80cb468 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 15 Nov 2025 09:23:58 +0800 Subject: [PATCH 10/18] ggml-cuda: rename + remove unused var --- ggml/src/ggml-cuda/common.cuh | 10 ++++------ ggml/src/ggml-cuda/ggml-cuda.cu | 6 +++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 21d9b2126e3..5d74c1cacfd 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -974,9 +974,8 @@ struct ggml_cuda_graph { }; struct ggml_cuda_concurrent_event { - std::vector per_stream_events; + std::vector join_events; cudaEvent_t fork_event; - cudaEvent_t join_event; int n_streams = 0; std::unordered_map stream_mapping; @@ -986,14 +985,13 @@ struct ggml_cuda_concurrent_event { ggml_cuda_concurrent_event() = default; explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) { - per_stream_events.resize(n_streams); + join_events.resize(n_streams); - for (size_t i = 0; i < per_stream_events.size(); ++i) { - CUDA_CHECK(cudaEventCreateWithFlags(&per_stream_events[i], cudaEventDisableTiming)); + for (size_t i = 0; i < join_events.size(); ++i) { + CUDA_CHECK(cudaEventCreateWithFlags(&join_events[i], cudaEventDisableTiming)); } CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming)); - CUDA_CHECK(cudaEventCreateWithFlags(&join_event, cudaEventDisableTiming)); } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index b89a445a1f7..2b27193f63d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3233,10 +3233,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (node == concurrent_event->join_node) { cuda_ctx->curr_stream_no = 0; for (int i = 1; i <= concurrent_event->n_streams; ++i) { - // Wait on join events of forked streams in the main stream - CUDA_CHECK(cudaEventRecord(concurrent_event->per_stream_events[i - 1], + // Wait on join events of forked streams in the main stream + CUDA_CHECK(cudaEventRecord(concurrent_event->join_events[i - 1], cuda_ctx->stream(cuda_ctx->device, i))); - CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->per_stream_events[i - 1])); + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), concurrent_event->join_events[i - 1])); } is_concurrent_event_active = false; From 05379d98b5edd20fecc13da75a2cff8491171f56 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Wed, 19 Nov 2025 22:58:32 +0800 Subject: [PATCH 11/18] ggml-cuda: fix condition for stream launch --- ggml/src/ggml-cuda/ggml-cuda.cu | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 2b27193f63d..d0fdd2da492 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3200,6 +3200,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx const auto try_launch_concurrent_event = [&](const ggml_tensor * node) { if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) { concurrent_event = &stream_ctx.concurrent_events[node]; + is_concurrent_event_active = true; GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); @@ -3249,10 +3250,15 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name); } } else if (i - prev_i > 1) { - //the previous node was fused const ggml_tensor * prev_node = cgraph->nodes[i - 1]; try_launch_concurrent_event(prev_node); + + if (is_concurrent_event_active) { + const int stream_mapping = concurrent_event->stream_mapping[node]; + cuda_ctx->curr_stream_no = stream_mapping; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name); + } } prev_i = i; @@ -3761,7 +3767,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph } for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { const ggml_tensor * node = cgraph->nodes[node_idx]->src[src_idx]; - //TODO: check why nrows > 1 fails, probably related to CUDA graphs + //TODO: check why nrows > 1 fails if (node && !is_empty(node) && ggml_nrows(node) <= 1) { fan_out[node] += 1; } From 537a08e470ed067c4388aa3e1e2a727f0908f4f5 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 22 Nov 2025 13:22:25 +0800 Subject: [PATCH 12/18] ggml-cuda: address review comments, add destructor --- ggml/src/ggml-cuda/common.cuh | 29 +++++++++++-- ggml/src/ggml-cuda/ggml-cuda.cu | 72 +++++++++++++++------------------ 2 files changed, 59 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 5d74c1cacfd..90ad8f2364b 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -975,7 +975,7 @@ struct ggml_cuda_graph { struct ggml_cuda_concurrent_event { std::vector join_events; - cudaEvent_t fork_event; + cudaEvent_t fork_event = nullptr; int n_streams = 0; std::unordered_map stream_mapping; @@ -984,6 +984,9 @@ struct ggml_cuda_concurrent_event { ggml_cuda_concurrent_event() = default; + ggml_cuda_concurrent_event(const ggml_cuda_concurrent_event &) = delete; + ggml_cuda_concurrent_event & operator=(const ggml_cuda_concurrent_event &) = delete; + explicit ggml_cuda_concurrent_event(int n_streams) : n_streams(n_streams) { join_events.resize(n_streams); @@ -993,14 +996,34 @@ struct ggml_cuda_concurrent_event { CUDA_CHECK(cudaEventCreateWithFlags(&fork_event, cudaEventDisableTiming)); } + + ggml_cuda_concurrent_event(ggml_cuda_concurrent_event && other) noexcept + : join_events(std::move(other.join_events)) + , fork_event(other.fork_event) + , n_streams(other.n_streams) + , stream_mapping(std::move(other.stream_mapping)) + , join_node(other.join_node) { + other.fork_event = nullptr; + } + + ~ggml_cuda_concurrent_event() { + if (fork_event != nullptr) { + CUDA_CHECK(cudaEventDestroy(fork_event)); + } + for (cudaEvent_t e : join_events) { + if (e != nullptr) { + CUDA_CHECK(cudaEventDestroy(e)); + } + } + } }; struct ggml_cuda_stream_context { - std::vector original_graph; + std::vector original_nodes; std::unordered_map concurrent_events; void reset() { - original_graph.clear(); + original_nodes.clear(); concurrent_events.clear(); } }; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d0fdd2da492..62f438353dd 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3223,7 +3223,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (stream_ctx.concurrent_events.size() > 0) { //Restore the original graph to enable fusion within the streams - cgraph->nodes = const_cast(stream_ctx.original_graph.data()); + cgraph->nodes = const_cast(stream_ctx.original_nodes.data()); + cgraph->n_nodes = (int) stream_ctx.original_nodes.size(); } for (int i = 0; i < cgraph->n_nodes; i++) { @@ -3242,12 +3243,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx is_concurrent_event_active = false; concurrent_event = nullptr; - } else { GGML_ASSERT (concurrent_event->stream_mapping.find(node) != concurrent_event->stream_mapping.end()); - const int stream_mapping = concurrent_event->stream_mapping[node]; - cuda_ctx->curr_stream_no = stream_mapping; - GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name); + cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); } } else if (i - prev_i > 1) { //the previous node was fused @@ -3255,9 +3254,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx try_launch_concurrent_event(prev_node); if (is_concurrent_event_active) { - const int stream_mapping = concurrent_event->stream_mapping[node]; - cuda_ctx->curr_stream_no = stream_mapping; - GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", stream_mapping, node->name); + cuda_ctx->curr_stream_no = concurrent_event->stream_mapping[node]; + GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); } } prev_i = i; @@ -3730,20 +3728,22 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph } GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend"); - GGML_LOG_DEBUG("Optimizing CUDA graph %p %d\n", cgraph->nodes, cgraph->n_nodes); + GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes); ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context(); stream_context.reset(); + // number of out-degrees for a particular node std::unordered_map fan_out; + // reverse mapping of node to index in the cgraph std::unordered_map node_indices; - const auto & is_empty = [](const ggml_tensor * node) -> bool { + const auto & is_noop = [](const ggml_tensor * node) -> bool { return ggml_is_empty(node) || node->op == GGML_OP_NONE || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE; }; - const auto & is_src_of = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool { + const auto & depends_on = [](const ggml_tensor * dst, const ggml_tensor * src) -> bool { for (uint32_t s = 0; s < GGML_MAX_SRC; ++s) { if (dst->src[s] == src) { return true; @@ -3762,14 +3762,14 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph const ggml_tensor * node = cgraph->nodes[node_idx]; node_indices[node] = node_idx; - if (is_empty(node)) { + if (is_noop(node)) { continue; } for (int src_idx = 0; src_idx < GGML_MAX_SRC; ++src_idx) { - const ggml_tensor * node = cgraph->nodes[node_idx]->src[src_idx]; + const ggml_tensor * src = cgraph->nodes[node_idx]->src[src_idx]; //TODO: check why nrows > 1 fails - if (node && !is_empty(node) && ggml_nrows(node) <= 1) { - fan_out[node] += 1; + if (node && !is_noop(node) && ggml_nrows(node) <= 1) { + fan_out[src] += 1; } } } @@ -3786,15 +3786,16 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph const int min_fan_out = 3; const int max_fan_out = 3; + // store {fork_idx, join_idx} std::vector> concurrent_node_ranges; - //save the original graph - std::vector original_graph; - original_graph.reserve(cgraph->n_nodes); + // save the original nodes + std::vector original_nodes; + original_nodes.reserve(cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; ++i) { - original_graph.push_back(cgraph->nodes[i]); + original_nodes.push_back(cgraph->nodes[i]); } - cuda_ctx->stream_context().original_graph = std::move(original_graph); + cuda_ctx->stream_context().original_nodes = std::move(original_nodes); for (const auto & [root_node, count] : fan_out) { if (count >= min_fan_out && count <= max_fan_out) { @@ -3814,7 +3815,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph std::vector> nodes_per_branch; for (int i = root_node_idx + 1; i < cgraph->n_nodes; ++i) { const ggml_tensor * node = cgraph->nodes[i]; - if (!is_empty(node) && is_src_of(node, root_node)) { + if (!is_noop(node) && depends_on(node, root_node)) { nodes_per_branch.push_back({ node }); } } @@ -3824,13 +3825,10 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph //find the join point const ggml_tensor * join_node = nullptr; - const auto & belongs_to_branch = [&](const ggml_tensor * node, std::vector & branch) -> bool { + const auto & belongs_to_branch = [&](const ggml_tensor * node, + const std::vector & branch) -> bool { for (const ggml_tensor * n : branch) { - if (n == node) { - return false; - } - - if (is_src_of(node, n)) { + if (depends_on(node, n)) { return true; } } @@ -3856,21 +3854,17 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) { if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) { //continue accumulating - nodes_per_branch[branch_idx].push_back(curr_node); - found_branch = true; - } else { if (std::find(nodes_per_branch[branch_idx].begin(), nodes_per_branch[branch_idx].end(), - curr_node) != nodes_per_branch[branch_idx].end()) { - found_branch = true; + curr_node) == nodes_per_branch[branch_idx].end()) { + nodes_per_branch[branch_idx].push_back(curr_node); } + found_branch = true; } } - if (!found_branch) { - if (is_empty(curr_node)) { - // we can put it in any branch because it will be ignored - nodes_per_branch[0].push_back({ curr_node }); - } + if (!found_branch && is_noop(curr_node)) { + // we can put it in any branch because it will be ignored + nodes_per_branch[0].push_back({ curr_node }); } } @@ -3909,7 +3903,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph std::unordered_map & concurrent_events = cuda_ctx->stream_context().concurrent_events; GGML_ASSERT(concurrent_events.find(root_node) == concurrent_events.end()); - concurrent_events.emplace(root_node, concurrent_event); + concurrent_events.emplace(root_node, std::move(concurrent_event)); GGML_LOG_DEBUG("Adding stream at node %s %p\n", root_node->name, root_node); concurrent_node_ranges.emplace_back(fork_node_idx, join_node_idx); @@ -3937,7 +3931,7 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph branch_nodes.erase(branch_nodes.begin()); // append all empty nodes - while (!branch_nodes.empty() && is_empty(branch_nodes.front())) { + while (!branch_nodes.empty() && is_noop(branch_nodes.front())) { cgraph->nodes[current_node_idx] = const_cast(branch_nodes.front()); current_node_idx++; branch_nodes.erase(branch_nodes.begin()); From 25565c3811f73c047809f4a0a98157e024517890 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 25 Nov 2025 14:45:34 +0800 Subject: [PATCH 13/18] common.cuh: add is_valid for concurrent events --- ggml/src/ggml-cuda/common.cuh | 73 +++++++++++++++++++++++++++++++++ ggml/src/ggml-cuda/ggml-cuda.cu | 16 ++++++-- 2 files changed, 85 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 90ad8f2364b..60a32326b1d 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -21,6 +21,7 @@ #include "ggml-common.h" #include +#include #include #include #include @@ -1006,6 +1007,78 @@ struct ggml_cuda_concurrent_event { other.fork_event = nullptr; } + // check if all branches don't read to the overlapping buffers + // check all read_srcs are either within the branch or are at or before the node + // we assume all nodes have the same buffer + bool is_valid() const { + std::vector>> write_ranges; + write_ranges.resize(n_streams); + + for (const auto & [tensor, stream] : stream_mapping) { + const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor; + //concurrent streams begin from 1 + write_ranges[stream - 1].emplace_back((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t)); + } + + for (int i = 0; i < n_streams; ++i) { + std::sort(write_ranges[i].begin(), write_ranges[i].end()); + } + + bool writes_overlap = false; + bool dependent_srcs = false; + for (const auto & [tensor, stream] : stream_mapping) { + const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor; + + // multiple nodes can use join_node's buffer. That is fine because we synchronize on the join node. + if (t->data == join_node->data) { + continue; + } + // check if this buffer's write data overlaps with another stream's + std::pair data_range = + std::make_pair((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t)); + for (int i = 0; i < n_streams; ++i) { + if (i == stream - 1) { + continue; + } + auto it = std::lower_bound(write_ranges[i].begin(), write_ranges[i].end(), data_range); + + if (it != write_ranges[i].end()) { + const std::pair & other = *it; + if ((other.first >= data_range.first && other.second <= data_range.second) || + (other.first <= data_range.first && other.second >= data_range.second)) { + GGML_LOG_DEBUG("Writes overlap for %s", tensor->name); + writes_overlap = true; + break; + } + } + } + + //check if all srcs are either in branch or don't have a branch + for (int i = 0; i < GGML_MAX_SRC; ++i) { + if (!tensor->src[i]) { + continue; + } + + auto it = stream_mapping.find(tensor->src[i]); + + if (it == stream_mapping.end()) { + continue; + } + + if (it->second != stream) { + dependent_srcs = true; + break; + } + } + + if (dependent_srcs || writes_overlap) { + break; + } + } + + return !writes_overlap && !dependent_srcs; + } + ~ggml_cuda_concurrent_event() { if (fork_event != nullptr) { CUDA_CHECK(cudaEventDestroy(fork_event)); diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 62f438353dd..4d3255f8af5 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3196,10 +3196,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx ggml_cuda_stream_context & stream_ctx = cuda_ctx->stream_context(); bool is_concurrent_event_active = false; ggml_cuda_concurrent_event * concurrent_event = nullptr; + bool should_launch_concurrent_events = false; const auto try_launch_concurrent_event = [&](const ggml_tensor * node) { if (stream_ctx.concurrent_events.find(node) != stream_ctx.concurrent_events.end()) { concurrent_event = &stream_ctx.concurrent_events[node]; + is_concurrent_event_active = true; GGML_LOG_DEBUG("Launching %d streams at %s\n", concurrent_event->n_streams, node->name); @@ -3222,6 +3224,12 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx [[maybe_unused]] int prev_i = 0; if (stream_ctx.concurrent_events.size() > 0) { + bool should_launch_concurrent_events = true; + for (const auto & [tensor, event] : stream_ctx.concurrent_events) { + should_launch_concurrent_events &= event.is_valid(); + } + } + if (should_launch_concurrent_events) { //Restore the original graph to enable fusion within the streams cgraph->nodes = const_cast(stream_ctx.original_nodes.data()); cgraph->n_nodes = (int) stream_ctx.original_nodes.size(); @@ -3852,11 +3860,11 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph bool found_branch = false; for (size_t branch_idx = 0; branch_idx < nodes_per_branch.size(); branch_idx++) { - if (belongs_to_branch(curr_node, nodes_per_branch[branch_idx])) { + std::vector & branch_vec = nodes_per_branch[branch_idx]; + if (belongs_to_branch(curr_node, branch_vec)) { //continue accumulating - if (std::find(nodes_per_branch[branch_idx].begin(), nodes_per_branch[branch_idx].end(), - curr_node) == nodes_per_branch[branch_idx].end()) { - nodes_per_branch[branch_idx].push_back(curr_node); + if (std::find(branch_vec.begin(), branch_vec.end(), curr_node) == branch_vec.end()) { + branch_vec.push_back(curr_node); } found_branch = true; } From 06b2ad91c7c013d3da4517a7a05f49d47fd1fae0 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Tue, 25 Nov 2025 15:18:04 +0800 Subject: [PATCH 14/18] common.cuh: make comment better --- ggml/src/ggml-cuda/common.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 60a32326b1d..5196772127e 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1007,8 +1007,8 @@ struct ggml_cuda_concurrent_event { other.fork_event = nullptr; } - // check if all branches don't read to the overlapping buffers - // check all read_srcs are either within the branch or are at or before the node + // 1. check if all branches don't write to the overlapping memory ranges (except the join node) + // 2. check all srcs are either within the branch or outside the event // we assume all nodes have the same buffer bool is_valid() const { std::vector>> write_ranges; From e7f3158e97a9873791e64c67b7845f4f7977618a Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 27 Nov 2025 22:10:14 +0800 Subject: [PATCH 15/18] update comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 5196772127e..d59e3f6ca00 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1007,7 +1007,7 @@ struct ggml_cuda_concurrent_event { other.fork_event = nullptr; } - // 1. check if all branches don't write to the overlapping memory ranges (except the join node) + // 1. check if any branches write to overlapping memory ranges (except the join node) // 2. check all srcs are either within the branch or outside the event // we assume all nodes have the same buffer bool is_valid() const { From 2c9c3c20c8da3481c883a95df65f0a1bc52571ae Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 27 Nov 2025 22:10:47 +0800 Subject: [PATCH 16/18] update comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index d59e3f6ca00..a6840ebc804 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1008,7 +1008,7 @@ struct ggml_cuda_concurrent_event { } // 1. check if any branches write to overlapping memory ranges (except the join node) - // 2. check all srcs are either within the branch or outside the event + // 2. check whether all srcs are either within the branch or outside the nodes covered by ggml_cuda_concurrent_event // we assume all nodes have the same buffer bool is_valid() const { std::vector>> write_ranges; From c1dca28af91a1952dd1a4c214e8d11501736b4fe Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Thu, 27 Nov 2025 23:07:04 +0800 Subject: [PATCH 17/18] common.cuh: fix lower_bound condition + remove join_node data from write_ranges --- ggml/src/ggml-cuda/common.cuh | 37 +++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a6840ebc804..a6fad94f5d6 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1014,13 +1014,28 @@ struct ggml_cuda_concurrent_event { std::vector>> write_ranges; write_ranges.resize(n_streams); + // Get join_node's memory range to exclude from overlap checking. + // Multiple nodes can use join_node's buffer; we synchronize on the join node. + const ggml_tensor * join_t = join_node->view_src ? join_node->view_src : join_node; + const int64_t join_start = (int64_t) join_t->data; + const int64_t join_end = join_start + ggml_nbytes(join_t); + for (const auto & [tensor, stream] : stream_mapping) { const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor; - //concurrent streams begin from 1 - write_ranges[stream - 1].emplace_back((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t)); + const int64_t t_start = (int64_t) t->data; + const int64_t t_end = t_start + ggml_nbytes(t); + + // Skip tensors that overlap with join_node's buffer. + if (t_start < join_end && join_start < t_end) { + continue; + } + + // concurrent streams begin from 1 + write_ranges[stream - 1].emplace_back(t_start, t_end); } for (int i = 0; i < n_streams; ++i) { + // sorts first by start then by end of write range std::sort(write_ranges[i].begin(), write_ranges[i].end()); } @@ -1028,14 +1043,16 @@ struct ggml_cuda_concurrent_event { bool dependent_srcs = false; for (const auto & [tensor, stream] : stream_mapping) { const ggml_tensor * t = tensor->view_src ? tensor->view_src : tensor; + const int64_t t_start = (int64_t) t->data; + const int64_t t_end = t_start + ggml_nbytes(t); - // multiple nodes can use join_node's buffer. That is fine because we synchronize on the join node. - if (t->data == join_node->data) { + // Skip tensors that overlap with join_node's buffer (already excluded from write_ranges). + if (t_start < join_end && join_start < t_end) { continue; } + // check if this buffer's write data overlaps with another stream's - std::pair data_range = - std::make_pair((int64_t) t->data, (int64_t) t->data + ggml_nbytes(t)); + std::pair data_range = std::make_pair(t_start, t_end); for (int i = 0; i < n_streams; ++i) { if (i == stream - 1) { continue; @@ -1044,8 +1061,12 @@ struct ggml_cuda_concurrent_event { if (it != write_ranges[i].end()) { const std::pair & other = *it; - if ((other.first >= data_range.first && other.second <= data_range.second) || - (other.first <= data_range.first && other.second >= data_range.second)) { + + // std::lower_bound returns the first element where other >= data_range (lexicographically). + // This guarantees other.first >= data_range.first. + // Therefore, overlap occurs iff other.first < data_range.second + // (i.e., the other range starts before this range ends). + if (other.first < data_range.second) { GGML_LOG_DEBUG("Writes overlap for %s", tensor->name); writes_overlap = true; break; From c57b750c30c651521f7ad6676db149a50925c947 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Sat, 29 Nov 2025 08:46:29 +0800 Subject: [PATCH 18/18] ggml-cuda: fix overlap condition + shadowing parameter --- ggml/src/ggml-cuda/common.cuh | 12 ++++++------ ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index a6fad94f5d6..978497ca523 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1014,8 +1014,8 @@ struct ggml_cuda_concurrent_event { std::vector>> write_ranges; write_ranges.resize(n_streams); - // Get join_node's memory range to exclude from overlap checking. - // Multiple nodes can use join_node's buffer; we synchronize on the join node. + // get join_node's memory range to exclude from overlap checking. + // multiple nodes can use join_node's buffer; we synchronize on the join node. const ggml_tensor * join_t = join_node->view_src ? join_node->view_src : join_node; const int64_t join_start = (int64_t) join_t->data; const int64_t join_end = join_start + ggml_nbytes(join_t); @@ -1025,8 +1025,8 @@ struct ggml_cuda_concurrent_event { const int64_t t_start = (int64_t) t->data; const int64_t t_end = t_start + ggml_nbytes(t); - // Skip tensors that overlap with join_node's buffer. - if (t_start < join_end && join_start < t_end) { + // skip tensors that overlap with join_node's buffer. + if ((t_start <= join_start && join_start < t_end) || (join_start <= t_start && t_start < join_end)) { continue; } @@ -1046,8 +1046,8 @@ struct ggml_cuda_concurrent_event { const int64_t t_start = (int64_t) t->data; const int64_t t_end = t_start + ggml_nbytes(t); - // Skip tensors that overlap with join_node's buffer (already excluded from write_ranges). - if (t_start < join_end && join_start < t_end) { + // skip tensors that overlap with join_node's buffer + if ((t_start <= join_start && join_start < t_end) || (join_start <= t_start && t_start < join_end)) { continue; } diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 4d3255f8af5..ce98a161ac9 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3224,9 +3224,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx [[maybe_unused]] int prev_i = 0; if (stream_ctx.concurrent_events.size() > 0) { - bool should_launch_concurrent_events = true; + should_launch_concurrent_events = true; for (const auto & [tensor, event] : stream_ctx.concurrent_events) { - should_launch_concurrent_events &= event.is_valid(); + should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid(); } } if (should_launch_concurrent_events) {