Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 39 additions & 7 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -630,10 +630,11 @@ extern "C" {

// this tensor...
enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
GGML_TENSOR_FLAG_COMPUTE = 16, // ...must be computed
};

enum ggml_tri_type {
Expand Down Expand Up @@ -2577,11 +2578,42 @@ extern "C" {
struct ggml_tensor * grad,
struct ggml_tensor * sgd_params); // alpha, weight decay

// build forward mutiple tensors and select one of them for computing
// this is useful for creating graphs that have constant topology but compute different things based on the input
// ref: https://github.com/ggml-org/llama.cpp/pull/18550
//
// automatic differentiation
// nodes:
// | - build forward into the graph but do not compute
// c - build forward into the graph and compute
//
// | | ... c ... |
// | | ... c ... |
// | | ... c ... |
// [0 1 ... idx ... n-1] <-- ggml_build_forward_select(..., n, idx)
// c
// c
//
// example:
// struct ggml_tensor * curs[3];
//
// curs[0] = compute0(...);
// curs[1] = compute1(...);
// curs[2] = compute2(...);
//
// int idx = select_branch(some_input);
//
// struct ggml_tensor * out = ggml_build_forward_select(cgraph, curs, 3, idx);
//
GGML_API struct ggml_tensor * ggml_build_forward_select(
struct ggml_cgraph * cgraph,
struct ggml_tensor ** tensors,
int n_tensors,
int idx);

GGML_API void ggml_build_forward_expand(
struct ggml_cgraph * cgraph,
struct ggml_tensor * tensor);

GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx, // context for gradient computation
struct ggml_cgraph * cgraph,
Expand Down Expand Up @@ -2613,7 +2645,7 @@ extern "C" {
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

// dump the graph into a file using the dot format
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * cgraph, const char * filename);

// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
Expand Down
5 changes: 3 additions & 2 deletions ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -874,9 +874,9 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
}
if (sched->debug > 1) {
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d,c=%d:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node),
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)]);
graph->use_counts[ggml_hash_find(&graph->visited_hash_set, node)], node->flags & GGML_TENSOR_FLAG_COMPUTE ? 1 : 0);
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
Expand Down Expand Up @@ -1922,6 +1922,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
dst->view_offs = src->view_offs;
}
dst->op = src->op;
dst->flags = src->flags;
memcpy(dst->op_params, src->op_params, sizeof(dst->op_params));
ggml_set_name(dst, src->name);

Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-blas/ggml-blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,10 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_backend_blas_mul_mat(ctx, node);
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-cann/ggml-cann.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2146,6 +2146,10 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
continue;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

bool ok = ggml_cann_compute_forward(*cann_ctx, node);
if (!ok) {
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-cpu/ggml-cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -2943,6 +2943,10 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
continue;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

ggml_compute_forward(&params, node);

if (state->ith == 0 && cplan->abort_callback &&
Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-cuda/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1123,6 +1123,7 @@ struct ggml_tensor_extra_gpu {
struct ggml_cuda_graph_node_properties {
void * node_address;
ggml_op node_op;
int32_t flags;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
void * src_address[GGML_MAX_SRC];
Expand Down
8 changes: 8 additions & 0 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2918,6 +2918,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
static void ggml_cuda_graph_node_set_properties(ggml_cuda_graph_node_properties * props, ggml_tensor * node) {
props->node_address = node->data;
props->node_op = node->op;
props->flags = node->flags;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
props->ne[i] = node->ne[i];
props->nb[i] = node->nb[i];
Expand Down Expand Up @@ -2961,6 +2962,10 @@ static bool ggml_cuda_graph_node_properties_match(ggml_tensor * node, ggml_cuda_
return false;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) != (props->flags & GGML_TENSOR_FLAG_COMPUTE)) {
return false;
}

return true;
}

Expand Down Expand Up @@ -3378,6 +3383,9 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
continue;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

// start of fusion operations
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-hexagon/ggml-hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2497,6 +2497,10 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
continue;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

uint32_t flags = 0;

// skip quantizer if src1 is reused
Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,9 @@ static inline bool ggml_can_fuse_ext(const struct ggml_cgraph * cgraph, const in
if (node->op != ops[i]) {
return false;
}
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
return false;
}
if (i < num_ops - 1 && !ggml_node_has_n_uses(cgraph, node_idxs[i], 1)) {
return false;
}
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-metal/ggml-metal-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
GGML_ABORT("unsupported op");
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
return 1;
}

int n_fuse = 1;

// check if the current node can run concurrently with other nodes before it
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-opencl/ggml-opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3058,6 +3058,10 @@ static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggm
continue;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

if (!backend_ctx->disable_fusion && ggml_opencl_can_fuse(cgraph, i, { GGML_OP_NORM, GGML_OP_MUL, GGML_OP_ADD })) {
ggml_opencl_op_norm_fused(backend, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
i += 2;
Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4109,6 +4109,9 @@ static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * syc
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}
#ifndef NDEBUG
assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
for (int j = 0; j < GGML_MAX_SRC; j++) {
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12191,6 +12191,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
if (ggml_is_empty(node) || ggml_op_is_empty(node->op) || !node->buffer) {
return false;
}
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
return false;
}

VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");
ctx->semaphore_idx = 0;
Expand Down Expand Up @@ -13645,7 +13648,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
int last_node = cgraph->n_nodes - 1;

// If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly
while (last_node > 0 && ggml_vk_is_empty(cgraph->nodes[last_node])) {
while (last_node > 0 && (ggml_vk_is_empty(cgraph->nodes[last_node]) || ((cgraph->nodes[last_node]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0))) {
last_node -= 1;
}

Expand Down
3 changes: 3 additions & 0 deletions ggml/src/ggml-webgpu/ggml-webgpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1982,6 +1982,9 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
if (ggml_is_empty(node)) {
return std::nullopt;
}
if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
return std::nullopt;
}
WEBGPU_LOG_DEBUG("ggml_webgpu_encode_node(" << node << ", " << ggml_op_name(node->op) << ")");

ggml_tensor * src0 = node->src[0];
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-zdnn/ggml-zdnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,10 @@ static enum ggml_status ggml_zdnn_graph_compute(ggml_backend_t backend, ggml_cgr
continue;
}

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

bool ok = ggml_zdnn_compute_forward(ctx, node);
if (!ok) {
GGML_LOG_ERROR("%s: unsupported op %s (%s)\n",
Expand Down
4 changes: 4 additions & 0 deletions ggml/src/ggml-zendnn/ggml-zendnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,10 @@ static ggml_status ggml_backend_zendnn_graph_compute(ggml_backend_t backend, ggm
for (int i = 0; i < cgraph->n_nodes; i++) {
struct ggml_tensor * node = cgraph->nodes[i];

if ((node->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
continue;
}

switch (node->op) {
case GGML_OP_MUL_MAT:
ggml_zendnn_compute_forward_mul_mat(ctx, node);
Expand Down
Loading
Loading