Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/llama-build-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1552,6 +1552,7 @@ static ggml_tensor * llm_build_kqv(

cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias,
hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f);
cb(cur, "fa", il);
ggml_flash_attn_ext_add_sinks(cur, sinks);
if (n_swa > 0) {
((int32_t *)cur->op_params)[4] = n_swa;
Expand Down Expand Up @@ -1815,7 +1816,9 @@ std::tuple<ggml_tensor*, ggml_tensor*, ggml_tensor*, ggml_tensor*> llm_build_con
auto row_size = ggml_row_size(Qaux->type, n_embd_head_k);
// TODO: check why CUDA performance suffers so much if we don't make these two tensors contiguous
auto Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, Qaux, n_embd_head_k, Qaux->ne[0]/(2*n_embd_head_k), n_tokens, 2*row_size, Qaux->nb[1], 0));
cb(Qcur, "Qcur_cont", il);
auto gate = ggml_cont_2d(ctx0, ggml_view_3d(ctx0, Qaux, n_embd_head_k, Qaux->ne[0]/(2*n_embd_head_k), n_tokens, 2*row_size, Qaux->nb[1], row_size), Qaux->ne[0]/2, n_tokens);
cb(gate, "gate_cont", il);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, Kcur->ne[0]/n_embd_head_k, n_tokens);
if (q_norm) {
Qcur = llm_build_norm(ctx0, Qcur, hparams, q_norm, NULL, LLM_NORM_RMS, cb, il);
Expand Down Expand Up @@ -10384,8 +10387,8 @@ ggml_tensor * llm_build_context::build_std_attention(ggml_cgraph * gf, ggml_tens
ext_factor, attn_factor, beta_fast, beta_slow);
}
}
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Qcur, "Qcur_roped", il);
cb(Kcur, "Kcur_roped", il);

if (inp_attn_scale) {
Qcur = ggml_mul(ctx0, Qcur, inp_attn_scale);
Expand Down
26 changes: 13 additions & 13 deletions src/llama-delta-net.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,18 +151,18 @@ std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_fused_delta_net(ggml_co
return {output_tokens, new_state};
}

std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb) const {
std::pair<ggml_tensor *, ggml_tensor *> delta_net::build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const {
auto & model = lctx.model;
const int64_t n_tok = input->ne[1];
if (model.layers[il].wqkv) {
ggml_tensor * qkv_mixed = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, input);
cb(qkv_mixed, "qkv_mixed", il);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_tok, 1);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);

ggml_tensor * z = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, input);
cb(z, "z", il);

ggml_build_forward_expand(gf, qkv_mixed);
ggml_build_forward_expand(gf, z);
qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_tok, 1);
cb(qkv_mixed, "linear_attn_qkv_mixed", il);
return { qkv_mixed, z };
}

Expand Down Expand Up @@ -246,9 +246,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
const int64_t n_seqs = 1;
const int64_t n_seq_tokens = n_tok;

auto qkvz = build_qkvz(ctx0, cur, il, cb);
ggml_tensor * qkv_mixed = qkvz.first;
ggml_tensor * z = qkvz.second;
auto [qkv_mixed, z] = build_qkvz(ctx0, cur, il, cb, gf);

ggml_tensor *alpha, *beta;
if (model.layers[il].ssm_beta_alpha) {
Expand Down Expand Up @@ -291,6 +289,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
ggml_build_forward_expand(gf, alpha);

ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
cb(alpha_biased, "alpha_biased", il);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a);
Expand Down Expand Up @@ -373,6 +372,8 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_

ggml_tensor * q_repeated = ggml_repeat_4d(ctx0, q_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1);
ggml_tensor * k_repeated = ggml_repeat_4d(ctx0, k_reshaped, head_k_dim, repeat_factor, num_k_heads * n_tok, 1);
cb(q_repeated, "q_repeated", il);
cb(k_repeated, "k_repeated", il);

q_conv = ggml_reshape_4d(ctx0, q_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1);
k_conv = ggml_reshape_4d(ctx0, k_repeated, head_k_dim, num_k_heads * repeat_factor, n_tok, 1);
Expand Down Expand Up @@ -403,12 +404,11 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_
ggml_tensor * new_conv_flat = ggml_reshape_2d(ctx0, new_conv_states_cont, conv_state_dim, 1);
ggml_tensor * new_ssm_flat = ggml_reshape_2d(ctx0, new_state, ssm_state_dim, 1);
ggml_tensor * new_state_flat = ggml_concat(ctx0, new_conv_flat, new_ssm_flat, 0);
cb(new_state_flat, "new_state_flat", il);

ggml_tensor * state_update = new_state_flat;
if (state_dst->type != GGML_TYPE_F32) {
state_update = ggml_cast(ctx0, state_update, state_dst->type);
}
ggml_build_forward_expand(gf, ggml_cpy(ctx0, state_update, state_dst));
auto state_cpy = ggml_cpy(ctx0, new_state_flat, state_dst);
cb(state_cpy, "state_cpy", il);
ggml_build_forward_expand(gf, state_cpy);

ggml_tensor * attn_out_2d = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_tok);
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_tok);
Expand Down
2 changes: 1 addition & 1 deletion src/llama-delta-net.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ struct delta_net {
ggml_tensor * g, ggml_tensor * beta, ggml_tensor * state,
int il, const llm_build_cb & cb);

std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb) const;
std::pair<ggml_tensor *, ggml_tensor *> build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const;

ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf,
ggml_tensor * cur, ggml_tensor * causal_mask, ggml_tensor * identity,
Expand Down