Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion src/llama-build-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4649,7 +4649,19 @@ ggml_cgraph * llm_build_context::build_qwen35() {

if (hparams.is_recurrent(il)) {
ggml_tensor * inpSA = inpL;
cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, cb, il);
int idx = model.default_layer_device[il];
if (inpL->op == GGML_OP_REDUCE) {
if (kv_self.s_l[il]) {
// This shouldn't be necessary, but just in case.
int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer);
if (idx_s_l >= 0) idx = idx_s_l;
}
if (inpL->src[idx]) {
inpL->view_src = inpL->src[idx];
}
}
auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm;
cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il);
cb(cur, "attn_norm", il);
cur = delta.build_layer_attn_linear(ctx0, gf, cur, causal_mask, identity, diag_mask, il, cb);
if (il == n_layer - 1 && inp_out_ids) {
Expand Down
23 changes: 12 additions & 11 deletions src/llama-load-tensors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,7 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {

for (int i = 0; i < n_layer; ++i) {
ggml_context * ctx_split = ctx_for_layer_split(i);
ggml_context * ctx_layer = ctx_for_layer(i);

auto & layer = model.layers[i];

Expand All @@ -1510,15 +1511,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) {
} else {
// Linear attention (gated delta net) specific tensors
// Create tensors with calculated dimensions
layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ssm_conv1d = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
layer.ssm_dt = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
layer.ssm_a = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
layer.ssm_beta = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
layer.ssm_alpha = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
layer.ssm_norm = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
layer.ssm_out = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
layer.wqkv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.wqkv_gate = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED);
layer.ssm_conv1d = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
layer.ssm_dt = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
layer.ssm_a = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
layer.ssm_beta = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0);
layer.ssm_alpha = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0);
layer.ssm_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);
}

layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0);
Expand Down Expand Up @@ -3610,7 +3611,7 @@ bool create_tensors_helper::create_tensors() {
if (layer.wo && layer.wq && layer.wk && layer.wv) {
auto granularity_kq = hparams.n_embd_head_k * gqa_ratio;
int wq_ne1 = layer.wq->ne[1];
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) {
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) {
granularity_kq *= 2; wq_ne1 /= 2;
}
auto granularity_vo = hparams.n_embd_head_v * gqa_ratio;
Expand Down Expand Up @@ -3666,7 +3667,7 @@ bool create_tensors_helper::create_tensors() {
LLAMA_LOG_DEBUG("\n");
prepare_split_tensors(1, ctx_split, layer.wqkv_gate, layer.split_wqkv_gate, wqkv_gate_split, mem_used);
}
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE) {
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) {
for (auto & s : split_kq) s /= 2*gqa_ratio;
} else {
for (auto & s : split_kq) s /= gqa_ratio;
Expand Down
5 changes: 3 additions & 2 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -661,7 +661,7 @@ llama_context::~llama_context() {
// kv cache helpers
//

static inline bool llama_qwen3next_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) {
// Thin convenience wrapper over llama_hparams::is_recurrent(): returns true
// when layer `il` is a recurrent layer (in this PR's context, the linear
// attention / gated-delta-net layers) rather than a standard attention layer.
// Renamed from llama_qwen3next_is_recurrent_layer since it is no longer
// specific to the Qwen3-Next architecture.
static inline bool llama_is_recurrent_layer(const llama_hparams & hparams, uint32_t il) {
return hparams.is_recurrent(il);
}

Expand Down Expand Up @@ -836,7 +836,7 @@ static bool llama_kv_cache_init(

int n_mla = 0;
for (int i = 0; i < (int) n_layer; i++) {
const bool qnext_recurrent = llama_qwen3next_is_recurrent_layer(hparams, i);
const bool qnext_recurrent = llama_is_recurrent_layer(hparams, i);
const uint32_t n_embd_v_row = llama_kv_v_row_embd(model, hparams, i);
const uint32_t n_head_kv = hparams.n_head_kv(i);
const uint32_t n_embd_head_k= hparams.n_embd_head_k;
Expand Down Expand Up @@ -1937,6 +1937,7 @@ static bool is_model_split_supported(const llama_model & model) {
LLM_ARCH_SEED_OSS,
LLM_ARCH_STEP35,
LLM_ARCH_QWEN3NEXT,
LLM_ARCH_QWEN35,
};
auto it = k_supported.find(model.arch);
return it != k_supported.end();
Expand Down