Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ extern "C" {

GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
GGML_API int ggml_backend_sched_get_backend_idx(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer);

// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
Expand Down
10 changes: 10 additions & 0 deletions ggml/src/ggml-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2664,6 +2664,16 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
return sched->backends[i];
}

// Return the index of the backend in `sched` whose default buffer type matches
// the buffer type of `buffer`.
//
// Returns -1 when `buffer` is NULL, has no buffer type, or when no backend in
// the scheduler uses a matching default buffer type.
int ggml_backend_sched_get_backend_idx(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
    if (!buffer || !buffer->buft) {
        return -1;
    }
    // NOTE: the early return above already guarantees buffer and buffer->buft
    // are non-null, so no further null-check is needed here (the original had
    // a redundant `if (buffer && buffer->buft)` guard that was always true).
    for (int i = 0; i < sched->n_backends; ++i) {
        if (ggml_backend_get_default_buffer_type(sched->backends[i]) == buffer->buft) {
            return i;
        }
    }
    return -1;
}

size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
Expand Down
2 changes: 0 additions & 2 deletions ggml/src/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3263,8 +3263,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
ggml_cuda_op_sum_rows_nc(ctx, cgraph->nodes[i+1]);
i += 2;
} else {
//auto src = dst->src[0];
//printf("cont(%s -> %s): %ld x %ld x %ld x %ld; %zu x %zu x %zu x %zu\n", src->name, dst->name, src->ne[0], src->ne[1], src->ne[2], src->ne[3], src->nb[0], src->nb[1], src->nb[2], src->nb[3]);
ggml_cuda_dup(ctx, dst);
}
break;
Expand Down
24 changes: 16 additions & 8 deletions src/llama-build-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1986,9 +1986,12 @@ static ggml_tensor * build_output(llama_context & lctx, ggml_context * ctx, ggml
}
}
} else {
if (cur->op == GGML_OP_REDUCE && cur->src[lctx.model.main_gpu]) {
int idx = lctx.model.default_layer_device[lctx.model.hparams.n_layer];
int idx_out = ggml_backend_sched_get_backend_idx(lctx.sched, lctx.model.output->buffer);
if (idx_out >= 0) idx = idx_out;
if (cur->op == GGML_OP_REDUCE && cur->src[idx]) {
// avoid copy to main GPU
cur->view_src = cur->src[lctx.model.main_gpu];
cur->view_src = cur->src[idx];
}
if (output_norm) {
cur = llm_build_context::llm_build_norm(ctx, cur, lctx.model.hparams, output_norm, NULL, LLM_NORM_RMS, cb, -1);
Expand Down Expand Up @@ -4458,12 +4461,18 @@ ggml_cgraph * llm_build_context::build_qwen3next() {


if (hparams.is_recurrent(il)) {
if (inpL->op == GGML_OP_REDUCE && inpL->src[model.default_layer_device[il]]) {
inpL->view_src = inpL->src[model.default_layer_device[il]];
//printf("Using reduce result on device %d\n", model.default_layer_device[il]);
//inpL = inpL->src[model.default_layer_device[il]];
int idx = model.default_layer_device[il];
if (inpL->op == GGML_OP_REDUCE) {
if (kv_self.s_l[il]) {
// This shouldn't be necessary, but just in case.
int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer);
if (idx_s_l >= 0) idx = idx_s_l;
}
if (inpL->src[idx]) {
inpL->view_src = inpL->src[idx];
}
}
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[model.default_layer_device[il]] : model.layers[il].attn_norm;
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
Expand All @@ -4474,7 +4483,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() {
cur = ggml_add(ctx0, cur, inpSA);
cb(cur, "attn_residual", il);
} else {
//cur = build_layer_attn(cur, inp_pos, KQ_mask, il);
cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr,
KQ_mask, nullptr, nullptr, KQ_scale, 0.0f, 0, il, true, false, true, false, false);
}
Expand Down
5 changes: 2 additions & 3 deletions src/llama-delta-net.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -636,9 +636,8 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_tok);

ggml_tensor * attn_out_norm = llm_build_context::llm_build_norm(ctx0, attn_out_2d, hparams, model.layers[il].ssm_norm, nullptr, LLM_NORM_RMS, cb, il);
ggml_tensor * gated_silu = ggml_silu(ctx0, z_2d);
cb(gated_silu, "gated_silu", il);
attn_out_norm = ggml_mul(ctx0, attn_out_norm, gated_silu);
cb(attn_out_norm, "attn_rms_norm", il);
attn_out_norm = ggml_fused_mul_unary(ctx0, z_2d, attn_out_norm, GGML_UNARY_OP_SILU);
cb(attn_out_norm, "attn_out_norm", il);

ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, value_dim, n_tok);
Expand Down
5 changes: 3 additions & 2 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2049,9 +2049,10 @@ static bool llm_load_tensors(
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
split_buft = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
}
auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
//auto buft_layer = llama_default_buffer_type_offload(model, model.devices[main_gpu]);
// assign the repeating layers
for (int i = i_gpu_start; i < n_layer; ++i) {
auto buft_layer = llama_default_buffer_type_offload(model, model.default_layer_device[i]);
if (split_mode == LLAMA_SPLIT_MODE_ATTN) {
int layer_gpu = std::upper_bound(model.splits.begin(), model.splits.begin() + device_count,
float(i - i_gpu_start)/act_gpu_layers) - model.splits.begin();
Expand All @@ -2065,7 +2066,7 @@ static bool llm_load_tensors(
if (n_gpu_layers > n_layer) {
model.buft_output = {
split_buft,
llama_default_buffer_type_offload(model, model.devices[main_gpu])
llama_default_buffer_type_offload(model, model.default_layer_device[n_layer])
};
} else {
model.buft_output = llama_default_buffer_type_cpu(true);
Expand Down