Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,7 @@ struct ggml_cuda_graph {
bool disable_due_to_too_many_updates = false;
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
bool cuda_graphs_enabled = false;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
#endif
};
Expand Down
56 changes: 41 additions & 15 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3263,6 +3263,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
should_launch_concurrent_events = should_launch_concurrent_events && event.is_valid();
}
}

if (should_launch_concurrent_events) {
// Restore original node order within each concurrent region to enable fusion within streams

Expand Down Expand Up @@ -3314,6 +3315,8 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
cgraph->nodes[start_pos + i] = const_cast<ggml_tensor *>(event.original_order[i]);
}
}
} else {
stream_ctx.concurrent_events.clear();
}

for (int i = 0; i < cgraph->n_nodes; i++) {
Expand Down Expand Up @@ -3702,11 +3705,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}

static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

ggml_cuda_set_device(cuda_ctx->device);

static bool ggml_cuda_set_cuda_graph_enabled(ggml_backend_cuda_context * cuda_ctx) {
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);

Expand All @@ -3716,7 +3715,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
}

bool use_cuda_graph = true;
bool cuda_graph_update_required = false;

if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < GGML_CUDA_CC_AMPERE) {
Expand All @@ -3737,6 +3735,29 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
use_cuda_graph = false;
}

cuda_ctx->cuda_graph->cuda_graphs_enabled = use_cuda_graph;
#else
bool use_cuda_graph = false;
#endif // USE_CUDA_GRAPH

return use_cuda_graph;
}

static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

bool use_cuda_graph = false;
bool cuda_graph_update_required = false;

// graph_optimize calls set_cuda_graph_enabled; in case it was not called
// (i.e. graph_compute was invoked directly), we call it here instead.
#ifdef USE_CUDA_GRAPH
if (!cuda_ctx->cuda_graph) {
use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);
} else {
use_cuda_graph = cuda_ctx->cuda_graph && cuda_ctx->cuda_graph->cuda_graphs_enabled;
}

if (use_cuda_graph) {
cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);

Expand All @@ -3756,6 +3777,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
#endif
}
}
#endif // USE_CUDA_GRAPH

if (use_cuda_graph && cuda_graph_update_required) {
// Start CUDA graph capture
Expand All @@ -3767,11 +3789,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}

#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
#endif // USE_CUDA_GRAPH

bool graph_evaluated_or_captured = false;

evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
Expand Down Expand Up @@ -3807,21 +3824,24 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;

const bool use_cuda_graph = ggml_cuda_set_cuda_graph_enabled(cuda_ctx);

static bool enable_graph_optimization = [] {
const char * env = getenv("GGML_CUDA_GRAPH_OPT");
const char * env = getenv("GGML_CUDA_GRAPH_OPT");
return env != nullptr && atoi(env) == 1;
}();

if (!enable_graph_optimization) {
return;
}

GGML_ASSERT(ggml_backend_cuda_get_device_count() == 1 && "compute graph optimization is only supported on single GPU in the CUDA backend");
GGML_LOG_DEBUG("Optimizing CUDA graph %p with %d nodes\n", cgraph->nodes, cgraph->n_nodes);

ggml_cuda_stream_context & stream_context = cuda_ctx->stream_context();
stream_context.reset();

if (!use_cuda_graph || ggml_backend_cuda_get_device_count() != 1) {
return;
}

// number of out-degrees for a particular node
std::unordered_map<const ggml_tensor *, int> fan_out;
// reverse mapping of node to index in the cgraph
Expand Down Expand Up @@ -3882,6 +3902,12 @@ static void ggml_backend_cuda_graph_optimize(ggml_backend_t backend, ggml_cgraph
if (count >= min_fan_out && count <= max_fan_out) {
const int root_node_idx = node_indices[root_node];

// only optimize for attn_norm
// TODO: make this more generic
if (!strstr(root_node->name, "attn_norm")) {
continue;
}

bool is_part_of_event = false;
for (const auto & [start, end] : concurrent_node_ranges) {
if (root_node_idx >= start && root_node_idx <= end) {
Expand Down
Loading