From d3c3dc9558995353c7c20140a6cbf5aea00b3687 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 19 Feb 2026 10:00:44 +0200 Subject: [PATCH 1/2] models : fix qwen3.5 beta/gate shapes --- src/models/qwen35.cpp | 3 +++ src/models/qwen35moe.cpp | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 7e1749b2c81..0105eb95083 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -232,6 +232,9 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus cb(gate, "gate", il); + beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); + gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); + // Get convolution states from cache ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index e12a5dea737..ebf8ae30fe1 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -232,6 +232,9 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus cb(gate, "gate", il); + beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); + gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); + // Get convolution states from cache ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); From a5cea28aaf9987ad9eff0e3007e63a88583ae33b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 19 Feb 2026 11:52:26 +0200 Subject: [PATCH 2/2] cont : avoid extra reshapes --- src/models/kimi-linear.cpp | 12 +++++------- src/models/qwen35.cpp | 3 +-- src/models/qwen35moe.cpp | 3 +-- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 8173d894ef2..4d6bb83c142 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -149,17 +149,19 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll g1 = ggml_mul(ctx0, g1, A); cb(g1, "kda_g1", il); + g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); + // Compute beta (mixing coefficient) ggml_tensor * beta = ggml_mul_mat(ctx0, layer.ssm_beta, cur); - beta = ggml_reshape_4d(ctx0, beta, n_head, 1, n_seq_tokens, n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, n_head, n_seq_tokens, n_seqs); cb(beta, "kda_beta", il); + beta = ggml_sigmoid(ctx0, beta); + // Reshape for KDA recurrence // {n_embd, n_tokens} -> {n_embd, n_seq_tokens, n_seqs} cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); - g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); - // Get SSM state and compute KDA recurrence using ggml_kda_scan ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il); ggml_tensor * state = build_rs(inp_rs, ssm_states_all, hparams.n_embd_s(), n_seqs); @@ -169,10 +171,6 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll Qcur = ggml_l2_norm(ctx0, Qcur, eps_norm); Kcur = ggml_l2_norm(ctx0, Kcur, eps_norm); - beta = ggml_sigmoid(ctx0, beta); - - beta = ggml_reshape_4d(ctx0, beta, 1, n_head, n_seq_tokens, n_seqs); - g1 = ggml_reshape_4d(ctx0, g1, head_dim, n_head, n_seq_tokens, n_seqs); // Choose between build_delta_net_chunking and build_delta_net_recurrent based on n_tokens std::pair attn_out = n_seq_tokens == 1 ? diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 0105eb95083..56eefd7de27 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -216,7 +216,7 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( ggml_tensor * z = qkvz.second; ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur); - beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); cb(beta, "beta", il); beta = ggml_sigmoid(ctx0, beta); @@ -232,7 +232,6 @@ ggml_tensor * llm_build_qwen35::build_layer_attn_linear( ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus cb(gate, "gate", il); - beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); // Get convolution states from cache diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index ebf8ae30fe1..c7295e3364f 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -216,7 +216,7 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( ggml_tensor * z = qkvz.second; ggml_tensor * beta = build_lora_mm(model.layers[il].ssm_beta, cur); - beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs); + beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); cb(beta, "beta", il); beta = ggml_sigmoid(ctx0, beta); @@ -232,7 +232,6 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn_linear( ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus cb(gate, "gate", il); - beta = ggml_reshape_4d(ctx0, beta, 1, num_v_heads, n_seq_tokens, n_seqs); gate = ggml_reshape_4d(ctx0, gate, 1, num_v_heads, n_seq_tokens, n_seqs); // Get convolution states from cache