diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 99d3091a5..3cd505a91 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -907,40 +907,70 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] else if (extra->split_dim == 0) { int n_interleave = 1; if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; - //if (tensor->type >= GGML_TYPE_Q4_0_R8) { - // GGML_ABORT("Dim 0 copy of row-interleaved quants is not supported yet"); - //} auto tt = ggml_internal_get_type_traits(tensor->type); std::vector host_buffer; GGML_ASSERT(ggml_is_contiguous(tensor)); int nrows = ggml_nrows(tensor); auto bs = tt.blck_size; auto ts = tt.type_size; - auto row_size = ggml_row_size(tensor->type, tensor->ne[0]); - int ne = 0; - for (int i = 0; i < extra->n_device; ++i) { - auto split = extra->splits[i]; - if (!split) continue; - GGML_ASSERT(split->ne[1]%n_interleave == 0); - ggml_cuda_set_device(i); - GGML_ASSERT(split->type == tensor->type); - GGML_ASSERT((int)ggml_nrows(split) == nrows); - GGML_ASSERT(split->ne[0] % bs == 0); - auto source_offset = n_interleave*(tt.row_meta_size + (ne / bs) * ts); - auto split_row_size = ggml_row_size(split->type, split->ne[0]); - if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size); - for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { + void * extra_ptr; + memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr)); + if (extra_ptr) { + auto & ranges = *(const std::vector>> *)extra_ptr; + GGML_ASSERT(extra->n_device == int(ranges.size())); + GGML_ASSERT(tensor->ne[2]*tensor->ne[3] == 1); + GGML_ASSERT(n_interleave == 1); + GGML_ASSERT(tt.row_meta_size == 0); + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) { + GGML_ASSERT(ranges[i].empty()); + continue; + } + GGML_ASSERT(!ranges[i].empty()); + GGML_ASSERT((int)ggml_nrows(split) == nrows); + auto split_row_size = ggml_row_size(split->type, split->ne[0]); + if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size); + auto dst = host_buffer.data(); for (int64_t i01 = 0; i01 < split->ne[1]; i01 += n_interleave) { - auto dst = host_buffer.data() + (i02*split->ne[1] + i01)*split_row_size; - auto src = (const char *)data + i02*tensor->nb[2] + i01*tensor->nb[1]; - if (tt.row_meta_size > 0) { - memcpy(dst, src, tt.row_meta_size*n_interleave); + for (auto & p : ranges[i]) { + GGML_ASSERT(p.first % bs == 0); + GGML_ASSERT(p.second % bs == 0); + auto src = (const char *)data + i01*tensor->nb[1] + (p.first/bs)*ts; + auto size = (p.second/bs)*ts; + memcpy(dst, src, size); + dst += size; + } + } + ggml_cuda_set_device(i); + CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + } + } else { + int ne = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->ne[1]%n_interleave == 0); + ggml_cuda_set_device(i); + GGML_ASSERT(split->type == tensor->type); + GGML_ASSERT((int)ggml_nrows(split) == nrows); + GGML_ASSERT(split->ne[0] % bs == 0); + auto source_offset = n_interleave*(tt.row_meta_size + (ne / bs) * ts); + auto split_row_size = ggml_row_size(split->type, split->ne[0]); + if (host_buffer.size() < nrows*split_row_size) host_buffer.resize(nrows*split_row_size); + for (int64_t i02 = 0; i02 < split->ne[2]; ++i02) { + for (int64_t i01 = 0; i01 < split->ne[1]; i01 += n_interleave) { + auto dst = 
host_buffer.data() + (i02*split->ne[1] + i01)*split_row_size; + auto src = (const char *)data + i02*tensor->nb[2] + i01*tensor->nb[1]; + if (tt.row_meta_size > 0) { + memcpy(dst, src, tt.row_meta_size*n_interleave); + } + memcpy(dst + tt.row_meta_size*n_interleave, src + source_offset, n_interleave*(split_row_size - tt.row_meta_size)); } - memcpy(dst + tt.row_meta_size*n_interleave, src + source_offset, n_interleave*(split_row_size - tt.row_meta_size)); } + CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + ne += split->ne[0]; } - CUDA_CHECK(cudaMemcpyAsync(split->data, host_buffer.data(), nrows*split_row_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); - ne += split->ne[0]; } } else if (extra->split_dim == 1) { @@ -965,16 +995,43 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor([[maybe_unused]] } else { int n_interleave = 1; if (auto it = k_map.find(tensor->type); it != k_map.end()) n_interleave = it->second; - size_t cur_offset = 0; - for (int i = 0; i < extra->n_device; ++i) { - auto split = extra->splits[i]; - if (!split) continue; - GGML_ASSERT(split->ne[1]%n_interleave == 0); - ggml_cuda_set_device(i); - auto size = ggml_nbytes(split); - const char * buf_host = (const char *)data + cur_offset; - CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); - cur_offset += size; + void * extra_ptr; + memcpy(&extra_ptr, tensor->op_params, sizeof(extra_ptr)); + if (extra_ptr) { + auto & ranges = *(const std::vector>> *)extra_ptr; + GGML_ASSERT(extra->n_device == int(ranges.size())); + GGML_ASSERT(tensor->ne[2]*tensor->ne[3] == 1); + GGML_ASSERT(n_interleave == 1); + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) { + GGML_ASSERT(ranges[i].empty()); + continue; + } + GGML_ASSERT(!ranges[i].empty()); + ggml_cuda_set_device(i); + auto dst = (char *)split->data; + for (auto & p : ranges[i]) { + GGML_ASSERT(p.first >= 0 && p.first < tensor->ne[1]); + GGML_ASSERT(p.second >= 0 && p.first + p.second <= tensor->ne[1]); + auto src = (const char *)data + p.first*tensor->nb[1]; + auto size = p.second*tensor->nb[1]; + CUDA_CHECK(cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + dst += size; + } + } + } else { + size_t cur_offset = 0; + for (int i = 0; i < extra->n_device; ++i) { + auto split = extra->splits[i]; + if (!split) continue; + GGML_ASSERT(split->ne[1]%n_interleave == 0); + ggml_cuda_set_device(i); + auto size = ggml_nbytes(split); + const char * buf_host = (const char *)data + cur_offset; + CUDA_CHECK(cudaMemcpyAsync(split->data, buf_host, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + cur_offset += size; + } } } } diff --git a/ggml/src/ggml-cuda/delta-net.cu b/ggml/src/ggml-cuda/delta-net.cu index f0bf36e91..cfab0ffca 100644 --- a/ggml/src/ggml-cuda/delta-net.cu +++ b/ggml/src/ggml-cuda/delta-net.cu @@ -157,6 +157,7 @@ __global__ void delta_net_recurrent_f32( } } + __syncthreads(); // Copy the final state to its destination for (int i = 0; i < HEAD_DIM/num_warps; ++i) { int col = num_warps*i + col_idx_0; diff --git a/src/llama-build-context.cpp b/src/llama-build-context.cpp index 163141c72..8557c8ef1 100644 --- a/src/llama-build-context.cpp +++ b/src/llama-build-context.cpp @@ -4437,7 +4437,6 @@ ggml_cgraph * llm_build_context::build_qwen3next() { ggml_tensor * cur = nullptr; for (int il = 0; il < n_layer; ++il) { - ggml_tensor * inpSA = inpL; 
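Note on the set_tensor changes above: the new split_dim == 0 and split_dim == 1 paths consult an optional per-device list of (offset, length) ranges whose address is smuggled through tensor->op_params; when the pointer is non-null, each device receives only its selected column ranges, gathered into a contiguous host buffer (dim 0) or copied range by range (dim 1), and the old contiguous copy is used otherwise. A minimal stand-alone sketch of the dim-0 gather follows, with an illustrative helper name and plain element offsets instead of the quant-block arithmetic used in the patch:

#include <cstdint>
#include <cstring>
#include <utility>
#include <vector>

// [illustrative sketch, not part of the diff]
// Pack the selected column ranges of every row into one contiguous host
// buffer, as the split_dim == 0 ranges path does per device before its
// cudaMemcpyAsync. Offsets and lengths are in elements here; the patch
// divides them by the quant block size and multiplies by the type size.
static std::vector<char> gather_ranges(const char * src, int64_t nrows, size_t src_row_size,
                                       const std::vector<std::pair<int64_t, int64_t>> & ranges,
                                       size_t elem_size) {
    size_t dst_row_size = 0;
    for (const auto & r : ranges) dst_row_size += r.second * elem_size;
    std::vector<char> dst(nrows * dst_row_size);
    char * out = dst.data();
    for (int64_t row = 0; row < nrows; ++row) {
        const char * in = src + row * src_row_size;
        for (const auto & r : ranges) {
            std::memcpy(out, in + r.first * elem_size, r.second * elem_size);
            out += r.second * elem_size;
        }
    }
    return dst;
}

Passing the ranges vector by address through op_params keeps the buffer-type API unchanged; the CUDA backend only dereferences it when present.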
GGML_ASSERT(model.layers[il].attn_norm != nullptr); GGML_ASSERT(model.layers[il].attn_post_norm != nullptr); @@ -4455,27 +4454,7 @@ ggml_cgraph * llm_build_context::build_qwen3next() { if (hparams.is_recurrent(il)) { - int idx = model.default_layer_device[il]; - if (inpL->op == GGML_OP_REDUCE) { - if (kv_self.s_l[il]) { - // This shouldn't be necessary, but just in case. - int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer); - if (idx_s_l >= 0) idx = idx_s_l; - } - if (inpL->src[idx]) { - inpL->view_src = inpL->src[idx]; - } - } - auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm; - cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb); - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "attn_residual", il); + cur = delta.build_layer_attn_linear(ctx0, gf, inpL, il == n_layer - 1 ? inp_out_ids : nullptr, il, cb); } else { cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask, nullptr, nullptr, KQ_scale, 0.0f, 0, il, true, false, true, false, false); @@ -4545,28 +4524,7 @@ ggml_cgraph * llm_build_context::build_qwen35moe() { for (int il = 0; il < n_layer; ++il) { if (hparams.is_recurrent(il)) { - ggml_tensor * inpSA = inpL; - int idx = model.default_layer_device[il]; - if (inpL->op == GGML_OP_REDUCE) { - if (kv_self.s_l[il]) { - // This shouldn't be necessary, but just in case. - int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer); - if (idx_s_l >= 0) idx = idx_s_l; - } - if (inpL->src[idx]) { - inpL->view_src = inpL->src[idx]; - } - } - auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm; - cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb); - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "attn_residual", il); + cur = delta.build_layer_attn_linear(ctx0, gf, inpL, il == n_layer - 1 ? inp_out_ids : nullptr, il, cb); } else { cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask, nullptr, nullptr, KQ_scale, 0.0f, 0, il, true, false, true, false, true); @@ -4625,28 +4583,7 @@ ggml_cgraph * llm_build_context::build_qwen35() { for (int il = 0; il < n_layer; ++il) { if (hparams.is_recurrent(il)) { - ggml_tensor * inpSA = inpL; - int idx = model.default_layer_device[il]; - if (inpL->op == GGML_OP_REDUCE) { - if (kv_self.s_l[il]) { - // This shouldn't be necessary, but just in case. - int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer); - if (idx_s_l >= 0) idx = idx_s_l; - } - if (inpL->src[idx]) { - inpL->view_src = inpL->src[idx]; - } - } - auto norm = model.layers[il].attn_norm->extra ? 
((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm; - cur = llm_build_norm(ctx0, inpL, hparams, norm, nullptr, LLM_NORM_RMS, cb, il); - cb(cur, "attn_norm", il); - cur = delta.build_layer_attn_linear(ctx0, gf, cur, il, cb); - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "attn_residual", il); + cur = delta.build_layer_attn_linear(ctx0, gf, inpL, il == n_layer - 1 ? inp_out_ids : nullptr, il, cb); } else { cur = build_std_attention(gf, model.layers[il].attn_norm, inpL, inp_pos, il == n_layer - 1 ? inp_out_ids : nullptr, nullptr, KQ_mask, nullptr, nullptr, KQ_scale, 0.0f, 0, il, true, false, true, false, true); diff --git a/src/llama-context.h b/src/llama-context.h index 4acee93f3..af694bd47 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -61,6 +61,7 @@ struct llama_kv_cache { std::vector split_k_l; std::vector split_v_l; + std::vector split_s_l; std::vector ctxs; std::vector bufs; diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp index 3d460a1f7..7fadddebd 100644 --- a/src/llama-delta-net.cpp +++ b/src/llama-delta-net.cpp @@ -153,28 +153,28 @@ std::pair delta_net::build_fused_delta_net(ggml_co return {output_tokens, new_state}; } -std::pair delta_net::build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const { - auto & model = lctx.model; +std::pair delta_net::build_qkvz(llama_context & lctx, ggml_context * ctx0, ggml_tensor * wqkv, ggml_tensor * wqkv_gate, + ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) { + const int64_t n_tok = input->ne[1]; - if (model.layers[il].wqkv) { - ggml_tensor * qkv_mixed = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv, input); - cb(qkv_mixed, "qkv_mixed", il); - ggml_tensor * z = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].wqkv_gate, input); - cb(z, "z", il); - ggml_build_forward_expand(gf, qkv_mixed); - ggml_build_forward_expand(gf, z); - qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_tok, 1); - cb(qkv_mixed, "linear_attn_qkv_mixed", il); - return { qkv_mixed, z }; - } + ggml_tensor * qkv_mixed = llm_build_context::llm_build_lora_mm(lctx, ctx0, wqkv, input); + cb(qkv_mixed, "qkv_mixed", il); + ggml_tensor * z = llm_build_context::llm_build_lora_mm(lctx, ctx0, wqkv_gate, input); + cb(z, "z", il); + ggml_build_forward_expand(gf, qkv_mixed); + ggml_build_forward_expand(gf, z); + qkv_mixed = ggml_reshape_3d(ctx0, qkv_mixed, qkv_mixed->ne[0], n_tok, 1); + cb(qkv_mixed, "linear_attn_qkv_mixed", il); + return { qkv_mixed, z }; +} - auto & hparams = model.hparams; - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t num_k_heads = hparams.ssm_n_group; - const int64_t num_v_heads = hparams.ssm_dt_rank; - const int64_t head_v_dim = hparams.ssm_d_inner / num_v_heads; +std::pair delta_net::build_qkvz(llama_context & lctx, ggml_context * ctx0, ggml_tensor * ssm_in, + int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, + ggml_tensor * input, int il, const llm_build_cb & cb) { + + const int64_t n_tok = input->ne[1]; - ggml_tensor * mixed_qkvz = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_in, input); + ggml_tensor * mixed_qkvz = llm_build_context::llm_build_lora_mm(lctx, ctx0, ssm_in, input); cb(mixed_qkvz, "linear_attn_mixed_qkvz", il); const int64_t 
qkvz_new_dim = 2 * head_k_dim + 2 * head_v_dim * (num_v_heads / num_k_heads); @@ -223,35 +223,24 @@ std::pair delta_net::build_qkvz(ggml_context * ctx return { qkv_mixed, z }; } -ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext, - uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const { - - auto & model = lctx.model; - auto & hparams = model.hparams; - auto & kv_self = lctx.kv_self; - const int64_t head_k_dim = hparams.ssm_d_state; - const int64_t num_k_heads = hparams.ssm_n_group; - const int64_t num_v_heads = hparams.ssm_dt_rank; - const int64_t head_v_dim = hparams.ssm_d_inner / num_v_heads; - const int64_t key_dim = head_k_dim * num_k_heads; - const int64_t value_dim = head_v_dim * num_v_heads; - const int64_t conv_dim = key_dim * 2 + value_dim; - const int64_t conv_state_dim = (hparams.ssm_d_conv - 1) * conv_dim; - const int64_t ssm_state_dim = head_v_dim * head_v_dim * num_v_heads; - const int64_t state_dim = conv_state_dim + ssm_state_dim; - const uint32_t qnext_state_slots = llm_build_context::llama_kv_qnext_state_slots(kv_self); - GGML_ASSERT(qnext_state_slots > 0); +std::pair delta_net::build_qkvz(llama_context & lctx, ggml_context * ctx0, ggml_tensor * wqkv, ggml_tensor * wqkv_gate, ggml_tensor * ssm_in, + int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) { + GGML_ASSERT((wqkv && wqkv_gate) || ssm_in); + return wqkv && wqkv_gate ? build_qkvz(lctx, ctx0, wqkv, wqkv_gate, input, il, cb, gf) + : build_qkvz(lctx, ctx0, ssm_in, head_k_dim, num_k_heads, head_v_dim, num_v_heads, input, il, cb); +} - const int64_t n_tok = cur->ne[1]; - const int64_t n_seqs = 1; - const int64_t n_seq_tokens = n_tok; +std::pair delta_net::build_beta_gate(llama_context & lctx, ggml_context * ctx0, + ggml_tensor * ssm_beta_alpha, ggml_tensor * ssm_beta, ggml_tensor * ssm_alpha, + ggml_tensor * ssm_dt, ggml_tensor * ssm_a, int64_t num_k_heads, int64_t num_v_heads, int64_t n_seqs, + ggml_tensor * cur, int il, const llm_build_cb & cb, ggml_cgraph * gf) { - auto [qkv_mixed, z] = build_qkvz(ctx0, cur, il, cb, gf); + auto n_tok = cur->ne[1]; + auto n_seq_tokens = n_tok / n_seqs; ggml_tensor *alpha, *beta; - if (model.layers[il].ssm_beta_alpha) { - ggml_tensor * mixed_ba = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_beta_alpha, cur); + if (ssm_beta_alpha) { + ggml_tensor * mixed_ba = llm_build_context::llm_build_lora_mm(lctx, ctx0, ssm_beta_alpha, cur); cb(mixed_ba, "linear_attn_mixed_ba", il); int64_t ba_new_dim = 2 * num_v_heads / num_k_heads; @@ -274,11 +263,11 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_tok, 1); alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_tok, 1); } else { - beta = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_beta, cur); + beta = llm_build_context::llm_build_lora_mm(lctx, ctx0, ssm_beta, cur); cb(beta, "beta", il); beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_tok, 1); cb(beta, "beta_reshaped", il); - alpha = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_alpha, cur); + alpha = llm_build_context::llm_build_lora_mm(lctx, ctx0, ssm_alpha, cur); cb(alpha, "alpha", il); // Why? Don't think this ggml_cont_3d is needed, but lets leave it in for now just in case. 
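Stripped of the tensor plumbing, build_beta_gate (started above, continued below) reduces to a per-head scalar formula: the learned bias ssm_dt is added to alpha, the result is passed through softplus, and the output is scaled by ssm_a. A scalar reference for orientation only, written with the plain softplus form (ggml's kernel may use a numerically stabilized variant):

#include <cmath>

// [illustrative sketch, not part of the diff]
// gate = ssm_a * softplus(alpha + ssm_dt), the decay gate assembled below.
static float delta_net_gate(float alpha, float dt_bias, float a) {
    const float softplus = std::log1p(std::exp(alpha + dt_bias)); // log(1 + e^x)
    return a * softplus;
}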
alpha = ggml_cont_3d(ctx0, alpha, num_v_heads, n_seq_tokens, n_seqs); @@ -289,22 +278,41 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ ggml_build_forward_expand(gf, beta); ggml_build_forward_expand(gf, alpha); - ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt); + ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, ssm_dt); cb(alpha_biased, "alpha_biased", il); ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased); cb(alpha_softplus, "a_softplus", il); - ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); + ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, ssm_a); cb(gate, "gate", il); + return {beta, gate}; +} + +ggml_tensor * delta_net::build_qkv(ggml_context * ctx0, ggml_tensor * state_storage, ggml_tensor * ssm_conv1d, + ggml_tensor * qkv_mixed, ggml_tensor * inp_s_seq_qnext, ggml_tensor * beta, ggml_tensor * gate, + int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, int64_t ssm_d_conv, + int64_t state_seq_id_local, uint32_t qnext_state_slots, bool reset_state_local, + float eps_norm, int repeat_type, int il, const llm_build_cb & cb, ggml_cgraph * gf) { + const int64_t key_dim = head_k_dim * num_k_heads; + const int64_t value_dim = head_v_dim * num_v_heads; + const int64_t conv_dim = key_dim * 2 + value_dim; + const int64_t conv_state_dim = (ssm_d_conv - 1) * conv_dim; + const int64_t ssm_state_dim = head_v_dim * head_v_dim * num_v_heads; + const int64_t state_dim = conv_state_dim + ssm_state_dim; + GGML_ASSERT(qnext_state_slots > 0); + + const int64_t n_seq_tokens = qkv_mixed->ne[1]; + const int64_t n_seqs = qkv_mixed->ne[2]; + const int64_t n_tok = n_seq_tokens * n_seqs; + size_t state_row_size = 0; ggml_tensor * state_all = nullptr; - GGML_ASSERT((size_t) il < kv_self.s_l.size() && kv_self.s_l[il] != nullptr); - ggml_tensor * state_storage = kv_self.s_l[il]; GGML_ASSERT(state_storage->type == GGML_TYPE_F32); GGML_ASSERT(state_storage->ne[0] >= state_dim); GGML_ASSERT((uint32_t) state_storage->ne[1] == qnext_state_slots); state_row_size = state_storage->nb[1]; GGML_ASSERT(ggml_nbytes(state_storage) >= state_row_size * qnext_state_slots); + state_all = ggml_view_2d(ctx0, state_storage, state_dim, qnext_state_slots, state_row_size, 0); ggml_tensor * state_dst = ggml_view_2d(ctx0, state_all, state_dim, 1, state_row_size, state_seq_id_local * state_row_size); @@ -321,13 +329,13 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ ggml_tensor * ssm_state_flat = ggml_view_2d(ctx0, state_f32, ssm_state_dim, 1, state_f32->nb[1], conv_state_dim * ggml_element_size(state_f32)); - ggml_tensor * conv_states = ggml_reshape_3d(ctx0, conv_state_flat, hparams.ssm_d_conv - 1, conv_dim, 1); + ggml_tensor * conv_states = ggml_reshape_3d(ctx0, conv_state_flat, ssm_d_conv - 1, conv_dim, 1); ggml_tensor * state = ggml_reshape_4d(ctx0, ssm_state_flat, head_v_dim, head_v_dim, num_v_heads, 1); cb(conv_states, "conv_states", il); cb(state, "state_predelta", il); ggml_build_forward_expand(gf, state); - ggml_tensor * conv_output_raw = ggml_ssm_conv(ctx0, conv_states, qkv_mixed, model.layers[il].ssm_conv1d, inp_s_seq_qnext); + ggml_tensor * conv_output_raw = ggml_ssm_conv(ctx0, conv_states, qkv_mixed, ssm_conv1d, inp_s_seq_qnext); cb(conv_output_raw, "conv_output_raw", il); ggml_tensor * conv_output = ggml_view_2d(ctx0, conv_output_raw, conv_dim, n_tok, conv_dim * ggml_element_size(conv_output_raw), 0); @@ -339,38 +347,33 @@ ggml_tensor * 
delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ int64_t nb1_qkv = ggml_row_size(conv_output_silu->type, qkv_dim); // Extract the convolved Q, K, V from conv_output - ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_output_silu, head_k_dim, num_k_heads, n_tok, 1, - ggml_row_size(conv_output_silu->type, head_k_dim), - nb1_qkv, nb1_qkv * n_tok, 0); + ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_output_silu, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, + ggml_row_size(conv_output_silu->type, head_k_dim), nb1_qkv, nb1_qkv * n_tok, 0); - ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_output_silu, head_k_dim, num_k_heads, n_tok, 1, - ggml_row_size(conv_output_silu->type, head_k_dim), - nb1_qkv, nb1_qkv * n_tok, + ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_output_silu, head_k_dim, num_k_heads, n_seq_tokens, n_seqs, + ggml_row_size(conv_output_silu->type, head_k_dim), nb1_qkv, nb1_qkv * n_tok, head_k_dim * num_k_heads * ggml_element_size(conv_output_silu)); - ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_output_silu, head_v_dim, num_v_heads, n_tok, 1, - ggml_row_size(conv_output_silu->type, head_v_dim), - nb1_qkv, nb1_qkv * n_tok, + ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_output_silu, head_v_dim, num_v_heads, n_seq_tokens, n_seqs, + ggml_row_size(conv_output_silu->type, head_v_dim), nb1_qkv, nb1_qkv * n_tok, ggml_row_size(conv_output_silu->type, 2 * head_k_dim * num_k_heads)); cb(q_conv, "q_conv", il); cb(k_conv, "k_conv", il); cb(v_conv, "v_conv", il); - const float eps_norm = hparams.f_norm_rms_eps; - q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm); k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm); cb(q_conv, "q_conv_normed", il); cb(k_conv, "k_conv_normed", il); - auto [output, new_state] = build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb, model.layers[il].ssm_beta_alpha ? 
0 : 1); + auto [output, new_state] = build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb, repeat_type); cb(output, "attn_output", il); cb(new_state, "new_state", il); - ggml_tensor * new_conv_states = ggml_view_2d(ctx0, conv_output_raw, hparams.ssm_d_conv - 1, conv_dim, - hparams.ssm_d_conv * ggml_element_size(conv_output_raw), + ggml_tensor * new_conv_states = ggml_view_2d(ctx0, conv_output_raw, ssm_d_conv - 1, conv_dim, + ssm_d_conv * ggml_element_size(conv_output_raw), (1 + conv_dim * n_tok) * ggml_element_size(conv_output_raw)); auto new_conv_states_cont = ggml_cont(ctx0, new_conv_states); cb(new_conv_states_cont, "new_conv_states_cont", il); @@ -383,26 +386,207 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ cb(state_cpy, "state_cpy", il); ggml_build_forward_expand(gf, state_cpy); + return output; +} + +ggml_tensor * delta_net::build_gated_output(llama_context & lctx, ggml_context * ctx0, ggml_tensor * ssm_norm, ggml_tensor * ssm_out, ggml_tensor * output, ggml_tensor * z, + int64_t head_v_dim, int64_t num_v_heads, int64_t n_tok, int il, const llm_build_cb & cb) { + ggml_tensor * attn_out_2d = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_tok); ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_tok); - ggml_tensor * attn_out_norm = llm_build_context::llm_build_norm(ctx0, attn_out_2d, hparams, model.layers[il].ssm_norm, nullptr, LLM_NORM_RMS, cb, il); + ggml_tensor * attn_out_norm = llm_build_context::llm_build_norm(ctx0, attn_out_2d, lctx.model.hparams, ssm_norm, nullptr, LLM_NORM_RMS, cb, il); cb(attn_out_norm, "attn_rms_norm", il); attn_out_norm = ggml_fused_mul_unary(ctx0, z_2d, attn_out_norm, GGML_UNARY_OP_SILU); cb(attn_out_norm, "attn_out_norm", il); - ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, value_dim, n_tok); + ggml_tensor * final_output = ggml_reshape_2d(ctx0, attn_out_norm, head_v_dim*num_v_heads, n_tok); cb(final_output, "final_output", il); - ggml_tensor * out = llm_build_context::llm_build_lora_mm(lctx, ctx0, model.layers[il].ssm_out, final_output); + ggml_tensor * out = llm_build_context::llm_build_lora_mm(lctx, ctx0, ssm_out, final_output); cb(out, "linear_attn_out", il); - return ggml_reshape_2d(ctx0, out, hparams.n_embd, n_tok); + return ggml_reshape_2d(ctx0, out, lctx.model.hparams.n_embd, n_tok); +} + +static ggml_tensor * get_input_tensor_sm_graph(ggml_context * ctx, ggml_tensor * input, int id) { + auto cur = input; + if (input->op == GGML_OP_REDUCE) { + auto view_src = input->view_src; + GGML_ASSERT(view_src); + cur = input->src[id]; + if (!cur) { + GGML_ASSERT((input->op_params[4] & (1u << id)) == 0); + cur = ggml_dup_tensor(ctx, input); + input->src[id] = cur; + input->op_params[4] |= (1u << id); + } + else if (cur == view_src) { + cur = input; + } + } + return cur; +} + +ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf, + ggml_tensor * delta_input, ggml_tensor * inp_s_seq_qnext, ggml_tensor * inp_out_ids, + uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const { + + const int64_t n_tok = delta_input->ne[1]; + const int64_t n_seqs = 1; + //const int64_t n_seq_tokens = n_tok; + + auto & model = lctx.model; + auto & hparams = model.hparams; + auto & kv_self = lctx.kv_self; + + int64_t head_k_dim = hparams.ssm_d_state; + int64_t num_k_heads = hparams.ssm_n_group; + int64_t num_v_heads = hparams.ssm_dt_rank; + int64_t head_v_dim = hparams.ssm_d_inner / num_v_heads; 
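In the split-mode-graph branch that follows, each device holds only a slice of the k/v heads, so the per-device head counts are recovered from the shapes of the split projection weights rather than from hparams. A small sketch of that arithmetic under the same layout assumptions as the patch (wqkv packs q + k + gqa_ratio·v per k-head, head_k_dim == head_v_dim is asserted in this branch, and ssm_in additionally packs a z gate of the same width as v); the helper names are illustrative:

#include <cassert>
#include <cstdint>

// [illustrative sketch, not part of the diff]
struct delta_head_split {
    int64_t num_k_heads;
    int64_t num_v_heads;
};

// Mirrors: num_k_heads_id = split_wqkv->splits[id]->ne[1] / (head_k_dim*(2 + gqa_ratio))
static delta_head_split heads_from_wqkv(int64_t wqkv_out_dim, int64_t head_k_dim, int64_t gqa_ratio) {
    const int64_t per_k_head = head_k_dim * (2 + gqa_ratio); // q + k + gqa_ratio * v
    assert(wqkv_out_dim % per_k_head == 0);
    const int64_t nk = wqkv_out_dim / per_k_head;
    return { nk, nk * gqa_ratio };
}

// Mirrors: num_k_heads_id = split_smm_in->splits[id]->ne[1] / (2*head_k_dim*(1 + gqa_ratio))
static delta_head_split heads_from_ssm_in(int64_t ssm_in_out_dim, int64_t head_k_dim, int64_t gqa_ratio) {
    const int64_t per_k_head = 2 * head_k_dim * (1 + gqa_ratio); // q + k + gqa_ratio * (v + z)
    assert(ssm_in_out_dim % per_k_head == 0);
    const int64_t nk = ssm_in_out_dim / per_k_head;
    return { nk, nk * gqa_ratio };
}

Each device then runs its own norm, qkvz, beta/gate, conv/delta-net and gated-output chain on its head slice, and the partial outputs are summed across devices with ggml_reduce at the end of the branch.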
+ GGML_ASSERT(num_v_heads % num_k_heads == 0); + int64_t gqa_ratio = num_v_heads / num_k_heads; + + if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH && kv_self.s_l[il]->extra) { + GGML_ASSERT(head_k_dim == head_v_dim); + auto split_s_l = (ggml_split_tensor_t *)kv_self.s_l[il]->extra; + GGML_ASSERT(split_s_l); + int n_device = split_s_l->n_device; + ggml_split_tensor_t *split_wqkv = nullptr, *split_wqkv_gate = nullptr, *split_smm_in = nullptr; + auto & l = model.layers[il]; + if (l.wqkv && l.wqkv_gate) { + split_wqkv = (ggml_split_tensor_t *)l.wqkv->extra; + split_wqkv_gate = (ggml_split_tensor_t *)l.wqkv_gate->extra; + GGML_ASSERT(split_wqkv && split_wqkv_gate); + GGML_ASSERT(split_wqkv->n_device == n_device); + GGML_ASSERT(split_wqkv_gate->n_device == n_device); + } else { + split_smm_in = (ggml_split_tensor_t *)l.ssm_in->extra; + GGML_ASSERT(split_smm_in); + GGML_ASSERT(split_smm_in->n_device == n_device); + } + GGML_ASSERT(n_device > 1); + std::vector results(n_device, nullptr); + bool input_added = false; + for (int id = 0; id < n_device; ++id) { + if (!split_s_l->splits[id]) continue; + auto input = get_input_tensor_sm_graph(ctx0, delta_input, id); + auto split_norm = (ggml_split_tensor_t *)l.attn_norm->extra; + GGML_ASSERT(split_norm && split_norm->splits[id]); + auto cur = llm_build_context::llm_build_norm(ctx0, input, hparams, split_norm->splits[id], nullptr, LLM_NORM_RMS, cb, il); + int qnext_state_slots = split_s_l->splits[id]->ne[1]; + int il_cb = 1000*il + id; + int64_t num_k_heads_id, num_v_heads_id; + ggml_tensor *qkv_mixed, *z; + if (split_wqkv && split_wqkv_gate) { + num_k_heads_id = split_wqkv->splits[id]->ne[1]/(head_k_dim*(2 + gqa_ratio)); + num_v_heads_id = num_k_heads_id * gqa_ratio; + auto p = build_qkvz(lctx, ctx0, split_wqkv->splits[id], split_wqkv_gate->splits[id], cur, il_cb, cb, gf); + qkv_mixed = p.first; + z = p.second; + } else { + num_k_heads_id = split_smm_in->splits[id]->ne[1]/(2*head_k_dim*(1 + gqa_ratio)); + num_v_heads_id = num_k_heads_id * gqa_ratio; + auto p = build_qkvz(lctx, ctx0, nullptr, nullptr, split_smm_in->splits[id], head_k_dim, num_k_heads_id, head_v_dim, num_v_heads_id, cur, il, cb, gf); + //auto p = build_qkvz(lctx, ctx0, split_smm_in->splits[id], head_k_dim, num_k_heads_id, head_v_dim, num_v_heads_id, cur, il_cb, cb); + qkv_mixed = p.first; + z = p.second; + } + auto split_ssm_dt = (ggml_split_tensor_t *)l.ssm_dt->extra; + GGML_ASSERT(split_ssm_dt && split_ssm_dt->splits[id] && split_ssm_dt->splits[id]->ne[0] == num_v_heads_id); + auto split_ssm_a = (ggml_split_tensor_t *)l.ssm_a->extra; + GGML_ASSERT(split_ssm_a && split_ssm_a->splits[id] && split_ssm_a->splits[id]->ne[0] == num_v_heads_id); + ggml_tensor *beta, *gate; + if (l.ssm_beta_alpha) { + auto split_ssm_beta_alpha = (ggml_split_tensor_t *)l.ssm_beta_alpha->extra; + GGML_ASSERT(split_ssm_beta_alpha && split_ssm_beta_alpha->splits[id]); + auto p = build_beta_gate(lctx, ctx0, split_ssm_beta_alpha->splits[id], nullptr, nullptr, split_ssm_dt->splits[id], split_ssm_a->splits[id], + num_k_heads_id, num_v_heads_id, n_seqs, cur, il, cb, gf); + beta = p.first; gate = p.second; + } else { + auto split_ssm_beta = (ggml_split_tensor_t *)l.ssm_beta->extra; + GGML_ASSERT(split_ssm_beta && split_ssm_beta->splits[id]); + auto split_ssm_alpha = (ggml_split_tensor_t *)l.ssm_alpha->extra; + GGML_ASSERT(split_ssm_alpha && split_ssm_alpha->splits[id]); + auto p = build_beta_gate(lctx, ctx0, nullptr, split_ssm_beta->splits[id], split_ssm_alpha->splits[id], split_ssm_dt->splits[id], 
split_ssm_a->splits[id], + num_k_heads_id, num_v_heads_id, n_seqs, cur, il, cb, gf); + beta = p.first; gate = p.second; + } + auto split_ssm_conv1d = (ggml_split_tensor_t *)l.ssm_conv1d->extra; + GGML_ASSERT(split_ssm_conv1d && split_ssm_conv1d->splits[id]); + auto output = build_qkv(ctx0, split_s_l->splits[id], split_ssm_conv1d->splits[id], qkv_mixed, inp_s_seq_qnext, beta, gate, + head_k_dim, num_k_heads_id, head_v_dim, num_v_heads_id, hparams.ssm_d_conv, + state_seq_id_local, qnext_state_slots, reset_state_local, hparams.f_norm_rms_eps, + l.ssm_beta_alpha ? 0 : 1, il, cb, gf); + split_norm = (ggml_split_tensor_t *)l.ssm_norm->extra; + GGML_ASSERT(split_norm && split_norm->splits[id]); + auto split_ssm_out = (ggml_split_tensor_t *)l.ssm_out->extra; + GGML_ASSERT(split_ssm_out && split_ssm_out->splits[id] && split_ssm_out->splits[id]->ne[0] == head_k_dim*num_v_heads_id); + auto gated_output = build_gated_output(lctx, ctx0, split_norm->splits[id], split_ssm_out->splits[id], output, z, head_v_dim, num_v_heads_id, n_tok, il_cb, cb); + if (inp_out_ids) { + gated_output = ggml_get_rows(ctx0, gated_output, inp_out_ids); + } + if (!input_added) { + if (inp_out_ids) { + input = ggml_get_rows(ctx0, input, inp_out_ids); + } + gated_output = ggml_add(ctx0, gated_output, input); + input_added = true; + } + if (gated_output->ne[1] > 32 && lctx.cparams.reduce_type != GGML_TYPE_F32) { + gated_output = ggml_cast(ctx0, gated_output, lctx.cparams.reduce_type); + } + ggml_build_forward_expand(gf, gated_output); + results[id] = gated_output; + } + auto cur = ggml_reduce(ctx0, results.data(), n_device, GGML_OP_ADD); + ggml_build_forward_expand(gf, cur); + return cur; + } + + const uint32_t qnext_state_slots = llm_build_context::llama_kv_qnext_state_slots(kv_self); + GGML_ASSERT(qnext_state_slots > 0); + + int idx = model.default_layer_device[il]; + auto input = delta_input; + if (input->op == GGML_OP_REDUCE) { + if (kv_self.s_l[il]) { + int idx_s_l = ggml_backend_sched_get_backend_idx(lctx.sched, kv_self.s_l[il]->buffer); + if (idx_s_l >= 0) idx = idx_s_l; + } + if (input->src[idx]) { + input->view_src = input->src[idx]; + } + } + auto norm = model.layers[il].attn_norm->extra ? ((ggml_split_tensor_t *)model.layers[il].attn_norm->extra)->splits[idx] : model.layers[il].attn_norm; + auto cur = llm_build_context::llm_build_norm(ctx0, input, hparams, norm, nullptr, LLM_NORM_RMS, cb, il); + + auto [qkv_mixed, z] = build_qkvz(lctx, ctx0, model.layers[il].wqkv, model.layers[il].wqkv_gate, model.layers[il].ssm_in, + head_k_dim, num_k_heads, head_v_dim, num_v_heads, cur, il, cb, gf); + + auto [beta, gate] = build_beta_gate(lctx, ctx0, model.layers[il].ssm_beta_alpha, model.layers[il].ssm_beta, model.layers[il].ssm_alpha, + model.layers[il].ssm_dt, model.layers[il].ssm_a, num_k_heads, num_v_heads, n_seqs, cur, il, cb, gf); + + auto output = build_qkv(ctx0, kv_self.s_l[il], model.layers[il].ssm_conv1d, + qkv_mixed, inp_s_seq_qnext, beta, gate, + head_k_dim, num_k_heads, head_v_dim, num_v_heads, hparams.ssm_d_conv, + state_seq_id_local, qnext_state_slots, reset_state_local, hparams.f_norm_rms_eps, + model.layers[il].ssm_beta_alpha ? 
0 : 1, il, cb, gf); + + auto gated_output = build_gated_output(lctx, ctx0, model.layers[il].ssm_norm, model.layers[il].ssm_out, output, z, head_v_dim, num_v_heads, n_tok, il, cb); + if (inp_out_ids) { + gated_output = ggml_get_rows(ctx0, gated_output, inp_out_ids); + input = ggml_get_rows(ctx0, input, inp_out_ids); + } + output = ggml_add(ctx0, gated_output, input); + cb(output, "ssm_output", il); + return output; + //return build_gated_output(lctx, ctx0, model.layers[il].ssm_norm, model.layers[il].ssm_out, output, z, head_v_dim, num_v_heads, n_tok, il, cb); } ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * cur, int il, const llm_build_cb & cb) const { + ggml_tensor * cur, ggml_tensor * inp_out_ids, int il, const llm_build_cb & cb) const { GGML_ASSERT(lctx.inp_s_seq_qnext != nullptr); auto & model = lctx.model; @@ -420,7 +604,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap if (all_same_seq) { bool reset_state = batch.pos != nullptr && batch.pos[0] == 0; - return build_layer_attn_linear_core(ctx0, gf, cur, lctx.inp_s_seq_qnext, token_seq_ids.front(), reset_state, il, cb); + return build_layer_attn_linear_core(ctx0, gf, cur, lctx.inp_s_seq_qnext, inp_out_ids, token_seq_ids.front(), reset_state, il, cb); } GGML_ASSERT(has_unique_seq_ids && "qwen3next mixed-sequence batches require unique sequence IDs per token"); @@ -432,7 +616,7 @@ ggml_tensor * delta_net::build_layer_attn_linear(ggml_context * ctx0, ggml_cgrap const bool reset_state_i = batch.pos != nullptr && batch.pos[i] == 0; const uint32_t state_seq_id_i = (uint32_t) token_seq_ids[i]; - ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, inp_s_seq_qnext_i, state_seq_id_i, reset_state_i, il, cb); + ggml_tensor * out_i = build_layer_attn_linear_core(ctx0, gf, cur_i, inp_s_seq_qnext_i, inp_out_ids, state_seq_id_i, reset_state_i, il, cb); out = out == nullptr ? 
out_i : ggml_concat(ctx0, out, out_i, 1); } diff --git a/src/llama-delta-net.h b/src/llama-delta-net.h index f34cb52cd..58259633f 100644 --- a/src/llama-delta-net.h +++ b/src/llama-delta-net.h @@ -13,14 +13,12 @@ struct delta_net { ggml_tensor * g, ggml_tensor * beta, ggml_tensor * state, int il, const llm_build_cb & cb, int repeat_type); - std::pair build_qkvz(ggml_context * ctx0, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf) const; - ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext, + ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext, ggml_tensor * inp_out_ids, uint32_t state_seq_id_local, bool reset_state_local, int il, const llm_build_cb & cb) const; ggml_tensor * build_layer_attn_linear(ggml_context * ctx0, ggml_cgraph * gf, - ggml_tensor * cur, int il, const llm_build_cb & cb) const; + ggml_tensor * cur, ggml_tensor * inp_out_ids, int il, const llm_build_cb & cb) const; private: @@ -30,4 +28,26 @@ struct delta_net { bool all_same_seq; bool has_unique_seq_ids; + static std::pair build_qkvz(llama_context & lctx, ggml_context * ctx0, + ggml_tensor * wqkv, ggml_tensor * wqkv_gate, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf); + + static std::pair build_qkvz(llama_context & lctx, ggml_context * ctx0, ggml_tensor * ssm_in, + int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, ggml_tensor * input, int il, const llm_build_cb & cb); + + static std::pair build_qkvz(llama_context & lctx, ggml_context * ctx0, ggml_tensor * wqkv, ggml_tensor * wqkv_gate, ggml_tensor * ssm_in, + int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, ggml_tensor * input, int il, const llm_build_cb & cb, ggml_cgraph * gf); + + static std::pair build_beta_gate(llama_context & lctx, ggml_context * ctx0, + ggml_tensor * ssm_beta_alpha, ggml_tensor * ssm_beta, ggml_tensor * ssm_alpha, + ggml_tensor * ssm_dt, ggml_tensor * ssm_a, int64_t num_k_heads, int64_t num_v_heads, int64_t n_seqs, + ggml_tensor * cur, int il, const llm_build_cb & cb, ggml_cgraph * gf); + + static ggml_tensor * build_qkv(ggml_context * ctx0, ggml_tensor * state_storage, ggml_tensor * ssm_conv1d, + ggml_tensor * qkv_mixed, ggml_tensor * inp_s_seq_qnext, ggml_tensor * beta, ggml_tensor * gate, + int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, int64_t ssm_d_conv, + int64_t state_seq_id_local, uint32_t qnext_state_slots, bool reset_state_local, + float eps_norm, int repeat_type, int il, const llm_build_cb & cb, ggml_cgraph * gf); + + static ggml_tensor * build_gated_output(llama_context & lctx, ggml_context * ctx0, ggml_tensor * ssm_norm, ggml_tensor * ssm_out, + ggml_tensor * output, ggml_tensor * z, int64_t head_v_dim, int64_t num_v_heads, int64_t n_tok, int il, const llm_build_cb & cb); }; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 1e6a05f2e..1f79557a1 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -279,6 +279,23 @@ struct llama_hparams { return ssm_d_state * ssm_d_inner; } + uint32_t n_embd_v_s_id(int nv) const { + if (ssm_n_group <= 0 || nv < 1 || ssm_dt_rank < 1) return 0; + int num_v_heads = ssm_dt_rank; + int num_k_heads = ssm_n_group; + int gqa_ratio = num_v_heads / num_k_heads; + GGML_ASSERT(nv <= num_v_heads); + GGML_ASSERT(nv % gqa_ratio == 0); + int nk = nv / gqa_ratio; + int head_k_dim = ssm_d_state; + int head_v_dim = ssm_d_inner / num_v_heads; + uint32_t conv_dim = 2 * nk * 
head_k_dim + nv * head_v_dim; + uint32_t conv_state_dim = conv_dim * (ssm_d_conv - 1); + uint32_t ssm_state_dim = head_v_dim * head_v_dim * nv; + return conv_state_dim + ssm_state_dim; + + } + bool is_recurrent(uint32_t il) const { return il < n_layer ? recurrent_layer_arr[il] : false; } diff --git a/src/llama-impl.h b/src/llama-impl.h index e773d2e8c..741c11df9 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -227,6 +227,7 @@ ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer); struct llama_split_tensor { std::vector tensor_splits; + std::vector>> ranges; ggml_split_tensor_t ggml; }; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index b39940847..1cb6afde2 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -1361,7 +1361,7 @@ bool create_tensors_helper::create_qwen3next_tensors(const LLM_TN & tn) { layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), {value_dim, n_embd}); } - auto ffn_ctx = model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; + auto ffn_ctx = ctx_split; //model.split_mode == LLAMA_SPLIT_MODE_GRAPH ? ctx_split : ctx_layer; // Dense FFN path (optional, e.g. mlp_only_layers) layer.ffn_gate = create_tensor(ffn_ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); @@ -1418,8 +1418,6 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) { for (int i = 0; i < n_layer; ++i) { auto ctx_split = ctx_for_layer_split(i); - auto ctx_layer = ctx_for_layer(i); - auto & layer = model.layers[i]; @@ -1440,15 +1438,15 @@ bool create_tensors_helper::create_qwen35moe_tensors(const LLM_TN & tn) { } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, 0); + layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, 0); + layer.ssm_conv1d = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); + 
layer.ssm_alpha = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_norm = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); } layer.ffn_gate_inp = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, 0); @@ -1492,7 +1490,6 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_split = ctx_for_layer_split(i); - ggml_context * ctx_layer = ctx_for_layer(i); auto & layer = model.layers[i]; @@ -1513,15 +1510,15 @@ bool create_tensors_helper::create_qwen35_tensors(const LLM_TN & tn) { } else { // Linear attention (gated delta net) specific tensors // Create tensors with calculated dimensions - layer.wqkv = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.wqkv_gate = create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); - layer.ssm_conv1d = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); - layer.ssm_dt = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); - layer.ssm_a = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); - layer.ssm_beta = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_alpha = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); - layer.ssm_norm = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); - layer.ssm_out = create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); + layer.wqkv = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, key_dim * 2 + value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.wqkv_gate = create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_GATE, "weight", i), { n_embd, value_dim }, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ssm_conv1d = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0); + layer.ssm_dt = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0); + layer.ssm_a = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0); + layer.ssm_beta = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_BETA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_alpha = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_ALPHA, "weight", i), { n_embd, n_v_heads }, 0); + layer.ssm_norm = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0); + layer.ssm_out = create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0); } layer.ffn_gate = create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, 0); @@ -3403,6 +3400,261 @@ static void adjust_split(std::vector & split, const std::vector & } } +static void check_delta_split(ggml_tensor * t, llama_split_tensor & l_split) { + auto extra = (ggml_split_tensor_t *)t->extra; + GGML_ASSERT(extra); + if (extra->split_dim < 0) return; + GGML_ASSERT(extra->n_device == int(l_split.ranges.size())); + for (int is = 0; is < extra->n_device; ++is) { + if (!extra->splits[is]) { + 
GGML_ASSERT(l_split.ranges[is].empty()); + continue; + } + int ntot = 0; + for (auto & p : l_split.ranges[is]) ntot += p.second; + GGML_ASSERT(ntot == extra->splits[is]->ne[extra->split_dim]); + //auto data = &l_split.ranges[is]; + //std::memcpy(extra->splits[is]->op_params, &data, sizeof(data)); + } + auto data = &l_split.ranges; + std::memcpy(t->op_params, &data, sizeof(data)); +} + +// ttype = 0 -> q, k, v, always multiplied with head_k_dim/head_v_dim +// ttype = 1 -> q, k, v, v, always multiplied with head_k_dim/head_v_dim +// ttype = 2 -> v +// ttype = 3 -> v, but multiplied with head_v_dim +// ttype = 4 -> v, v, never multiplied with head_v_dim +static void prepare_delta_split(int ttype, int repeat_type, int num_k_heads, int gqa_ratio, int head_k_dim, int head_v_dim, const std::vector & split, + ggml_tensor * t, llama_split_tensor & l_split) { + auto extra = (ggml_split_tensor_t *)t->extra; + GGML_ASSERT(extra && extra->n_device == int(split.size())); + l_split.ranges.resize(split.size()); + LLAMA_LOG_DEBUG("================= %s(%s, %d, %d)\n", __func__, t->name, ttype, repeat_type); + int first = 0; + for (int is = 0; is < int(split.size()); ++is) { + int s = split[is]; + if (!s) continue; + auto & ranges = l_split.ranges[is]; + if (ttype == 0 || ttype == 1) { + LLAMA_LOG_DEBUG("adding type 0/1 entry %d, %d for split %d\n", first*head_k_dim, s*head_k_dim, is); + ranges.push_back({first*head_k_dim, s*head_k_dim}); + } + else if (ttype == 2 || ttype == 3 || ttype == 4) { + int multiplier = ttype == 3 ? head_v_dim : ttype == 4 ? 2 : 1; + if (repeat_type == 0) { + LLAMA_LOG_DEBUG("adding type 2/3/4 entry %d, %d for split %d (repeat type is 0)\n", first*gqa_ratio*multiplier, s*gqa_ratio*multiplier, is); + ranges.push_back({first*gqa_ratio*multiplier, s*gqa_ratio*multiplier}); + } else { + for (int j = 0; j < gqa_ratio; ++j) { + LLAMA_LOG_DEBUG("adding type 2/3/4 entry %d, %d for split %d (repeat type is 1)\n", (first + j*num_k_heads)*multiplier, s*multiplier, is); + ranges.push_back({(first + j*num_k_heads)*multiplier, s*multiplier}); + } + } + } + else { + GGML_ABORT("Unknown tensor type for delta-net split"); + } + first += s; + } + if (ttype == 2 || ttype == 3 || ttype == 4) { + check_delta_split(t, l_split); + return; + } + //if (ttype == 4) { + // first = num_k_heads*gqa_ratio; + // for (int is = 0; is < int(split.size()); ++is) { + // int s = split[is]; + // if (!s) continue; + // auto & ranges = l_split.ranges[is]; + // int multiplier = 1; + // if (repeat_type == 0) { + // ranges.push_back({first*gqa_ratio*multiplier, s*gqa_ratio*multiplier}); + // LLAMA_LOG_DEBUG("adding type 4 entry %d, %d for split %d (repeat type is 0)\n", first*gqa_ratio*multiplier, s*gqa_ratio*multiplier, is); + // } else { + // for (int j = 0; j < gqa_ratio; ++j) { + // LLAMA_LOG_DEBUG("adding type 4 entry %d, %d for split %d (repeat type is 1)\n", (first + j*num_k_heads)*multiplier, s*multiplier, is); + // ranges.push_back({(first + j*num_k_heads)*multiplier, s*multiplier}); + // } + // } + // first += s; + // } + // check_delta_split(t, l_split); + // return; + //} + // ttype = 0, 1 + // First we need to add the ranges for k + first = num_k_heads; + for (int is = 0; is < int(split.size()); ++is) { + int s = split[is]; + if (!s) continue; + auto & ranges = l_split.ranges[is]; + LLAMA_LOG_DEBUG("adding type 0/1 entry %d, %d for split %d\n", first*head_k_dim, s*head_k_dim, is); + ranges.push_back({first*head_k_dim, s*head_k_dim}); + first += s; + } + // Then we need to add the ranges for v + first = 
2*num_k_heads; + for (int is = 0; is < int(split.size()); ++is) { + int s = split[is]; + if (!s) continue; + auto & ranges = l_split.ranges[is]; + int multiplier = ttype == 0 ? head_v_dim : 2*head_v_dim; + if (repeat_type == 0) { + LLAMA_LOG_DEBUG("adding type 0/1 entry %d, %d for split %d (repeat type is 0)\n", first*gqa_ratio*multiplier, s*gqa_ratio*multiplier, is); + ranges.push_back({first*multiplier, s*gqa_ratio*multiplier}); + first += gqa_ratio; + } else { + for (int j = 0; j < gqa_ratio; ++j) { + LLAMA_LOG_DEBUG("adding type 0/1 entry %d, %d for split %d (repeat type is 1)\n", (first + j*num_k_heads)*multiplier, s*multiplier, is); + ranges.push_back({(first + j*num_k_heads)*multiplier, s*multiplier}); + } + first += s; + } + } + //if (ttype == 0) { + check_delta_split(t, l_split); + return; + //} + first = (2 + gqa_ratio)*num_k_heads; + for (int is = 0; is < int(split.size()); ++is) { + int s = split[is]; + if (!s) continue; + auto & ranges = l_split.ranges[is]; + int multiplier = head_v_dim; + if (repeat_type == 0) { + LLAMA_LOG_DEBUG("adding type 1 entry %d, %d for split %d (repeat type is 0)\n", first*gqa_ratio*multiplier, s*gqa_ratio*multiplier, is); + ranges.push_back({first*gqa_ratio*multiplier, s*gqa_ratio*multiplier}); + } else { + for (int j = 0; j < gqa_ratio; ++j) { + LLAMA_LOG_DEBUG("adding type 1 entry %d, %d for split %d (repeat type is 1)\n", (first + j*num_k_heads)*multiplier, s*multiplier, is); + ranges.push_back({(first + j*num_k_heads)*multiplier, s*multiplier}); + } + } + first += s; + } + check_delta_split(t, l_split); +} + +static void split_recurrent_tensors(const llama_hparams & hparams, llama_layer & layer, const std::vector & cur_splits, std::vector & mem_used, + ggml_context * ctx_split, [[maybe_unused]] int il) { //, int repeat_type) { + int head_k_dim = hparams.ssm_d_state; + int num_k_heads = hparams.ssm_n_group; + int num_v_heads = hparams.ssm_dt_rank; + int head_v_dim = hparams.ssm_d_inner / num_v_heads; + int gqa_ratio = num_v_heads / num_k_heads; + + GGML_ASSERT(layer.ssm_in || (layer.wqkv && layer.wqkv_gate)); + //int repeat_type = layer.ssm_in ? 0 : 1; + int repeat_type = layer.ssm_beta_alpha ? 0 : 1; + + { + // We do not support quantized ssm_dt and ssm_a + auto tt = ggml_internal_get_type_traits(layer.ssm_dt->type); + GGML_ASSERT(tt.row_meta_size == 0 && tt.blck_size == 1); + tt = ggml_internal_get_type_traits(layer.ssm_a->type); + GGML_ASSERT(tt.row_meta_size == 0 && tt.blck_size == 1); + } + + int k_head_granularity = 1; + auto tt = ggml_internal_get_type_traits(layer.ssm_out->type); + auto eff_head_v_dim = repeat_type == 1 ? head_v_dim : head_v_dim * gqa_ratio; + if (tt.blck_size > eff_head_v_dim) { + GGML_ASSERT(tt.blck_size % eff_head_v_dim == 0); + k_head_granularity = tt.blck_size / eff_head_v_dim; + } else { + GGML_ASSERT(eff_head_v_dim % tt.blck_size == 0); + } + if (tt.row_meta_size > 0) { + GGML_ABORT("Quantization types with per row meta data are not supported for the ssm_out tensor when using split mode graph"); + } + + auto split = create_split(num_k_heads, k_head_granularity, cur_splits, mem_used); + LLAMA_LOG_DEBUG("================ %s(%d)", __func__, il); + int n_on = 0; + for (auto & s : split) { + if (s > 0) ++n_on; + LLAMA_LOG_DEBUG(" %d", s); + } + LLAMA_LOG_DEBUG("\n"); + if (n_on < 2) { + GGML_ABORT("The configuration results in a single GPU participating in the delta-net tensor split. 
This is not supported");
+    }
+
+    size_t orig_size = 0, split_size = 0;
+    auto add_size = [&orig_size, &split_size] (ggml_tensor * t) {
+        orig_size += ggml_nbytes(t);
+        auto extra = (ggml_split_tensor_t *)t->extra;
+        for (int i = 0; i < extra->n_device; ++i) if (extra->splits[i]) split_size += ggml_nbytes(extra->splits[i]);
+    };
+
+    // ttype = 0 -> q, k, v, always multiplied with head_k_dim/head_v_dim
+    // ttype = 1 -> q, k, v, v, always multiplied with head_k_dim/head_v_dim
+    // ttype = 2 -> v
+    // ttype = 3 -> v, but multiplied with head_v_dim
+    // ttype = 4 -> v, v, never multiplied with head_v_dim
+
+    prepare_split_tensors(-1, ctx_split, layer.ssm_norm, layer.split_ssm_norm, split, mem_used);
+    add_size(layer.ssm_norm);
+
+    auto split_k = split;
+    for (auto & k : split_k) k *= (head_k_dim*2 + head_v_dim*gqa_ratio);
+    prepare_split_tensors( 1, ctx_split, layer.ssm_conv1d, layer.split_ssm_conv1d, split_k, mem_used);
+    prepare_delta_split(0, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_conv1d, layer.split_ssm_conv1d);
+    add_size(layer.ssm_conv1d);
+
+    if (layer.wqkv) {
+        prepare_split_tensors( 1, ctx_split, layer.wqkv, layer.split_ssm_wqkv, split_k, mem_used);
+        prepare_delta_split(0, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.wqkv, layer.split_wqkv);
+        add_size(layer.wqkv);
+    }
+    if (layer.ssm_in) {
+        split_k = split;
+        for (auto & k : split_k) k *= (head_k_dim*2 + head_v_dim*gqa_ratio*2);
+        prepare_split_tensors( 1, ctx_split, layer.ssm_in, layer.split_ssm_in, split_k, mem_used);
+        prepare_delta_split(1, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_in, layer.split_ssm_in);
+        add_size(layer.ssm_in);
+    }
+
+    auto split_v = split;
+    for (auto & v : split_v) v *= gqa_ratio;
+
+    prepare_split_tensors( 0, ctx_split, layer.ssm_dt, layer.split_ssm_dt, split_v, mem_used);
+    prepare_delta_split(2, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_dt, layer.split_ssm_dt);
+    add_size(layer.ssm_dt);
+    prepare_split_tensors( 0, ctx_split, layer.ssm_a, layer.split_ssm_a, split_v, mem_used);
+    prepare_delta_split(2, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_a, layer.split_ssm_a);
+    add_size(layer.ssm_a);
+    if (layer.ssm_beta) {
+        prepare_split_tensors( 1, ctx_split, layer.ssm_beta, layer.split_ssm_beta, split_v, mem_used);
+        prepare_delta_split(2, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_beta, layer.split_ssm_beta);
+        add_size(layer.ssm_beta);
+    }
+    if (layer.ssm_alpha) {
+        prepare_split_tensors( 1, ctx_split, layer.ssm_alpha, layer.split_ssm_alpha, split_v, mem_used);
+        prepare_delta_split(2, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_alpha, layer.split_ssm_alpha);
+        add_size(layer.ssm_alpha);
+    }
+    if (layer.ssm_beta_alpha) {
+        auto split_v2 = split_v;
+        for (auto & v : split_v2) v *= 2;
+        prepare_split_tensors( 1, ctx_split, layer.ssm_beta_alpha, layer.split_ssm_beta_alpha, split_v2, mem_used);
+        prepare_delta_split(4, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_beta_alpha, layer.split_ssm_beta_alpha);
+        add_size(layer.ssm_beta_alpha);
+    }
+
+    for (auto & v : split_v) v *= head_v_dim;
+    prepare_split_tensors( 0, ctx_split, layer.ssm_out, layer.split_ssm_out, split_v, mem_used);
+    prepare_delta_split(3, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.ssm_out, layer.split_ssm_out);
+    add_size(layer.ssm_out);
+    if (layer.wqkv_gate) {
+        prepare_split_tensors( 1, ctx_split, layer.wqkv_gate, layer.split_ssm_wqkv_gate, split_v, mem_used);
+        prepare_delta_split(3, repeat_type, num_k_heads, gqa_ratio, head_k_dim, head_v_dim, split, layer.wqkv_gate, layer.split_wqkv_gate);
+        add_size(layer.wqkv_gate);
+    }
+    LLAMA_LOG_DEBUG(" original size: %g MiB, split size: %g MiB\n", orig_size/1024./1024., split_size/1024./1024.);
+}
+
 bool create_tensors_helper::create_tensors() {
     const auto tn = LLM_TN(model.arch);
     bool use_mmap_buffer = true;
@@ -3556,7 +3808,7 @@ bool create_tensors_helper::create_tensors() {
            throw std::runtime_error("unknown architecture");
    }
    if (model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) {
-        const int n_layer = model.mtp ? model.layers.size()
+        const int n_layer = model.mtp ? model.layers.size() : model.layers.size() - model.hparams.nextn_predict_layers;
        LLAMA_LOG_INFO("================================ max_gpu = %d\n", model.max_gpu);
        std::vector mem_used(model.splits.size(), 0);
 
@@ -3610,7 +3862,10 @@ bool create_tensors_helper::create_tensors() {
            auto split = create_split(ggml_nrows(layer.rope_freqs), -1, cur_splits, mem_used);
            prepare_split_tensors(-1, ctx_split, layer.rope_freqs, layer.split_rope_freqs, split, mem_used);
        }
-        if (layer.wo && layer.wq && layer.wk && layer.wv) {
+        if (hparams.is_recurrent(il)) {
+            split_recurrent_tensors(hparams, layer, cur_splits, mem_used, ctx_split, il); //, model.arch == LLM_ARCH_QWEN3NEXT ? 0 : 1);
+        }
+        else if (layer.wo && layer.wq && layer.wk && layer.wv) {
            auto granularity_kq = hparams.n_embd_head_k * gqa_ratio;
            int wq_ne1 = layer.wq->ne[1];
            if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_QWEN35MOE || model.arch == LLM_ARCH_QWEN35) {
diff --git a/src/llama-model.h b/src/llama-model.h
index 8d79b4bf4..78556b935 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -212,6 +212,18 @@ struct llama_layer {
     llama_split_tensor split_sinks;
     llama_split_tensor split_wqkv_gate;
 
+    llama_split_tensor split_ssm_wqkv;
+    llama_split_tensor split_ssm_wqkv_gate;
+    llama_split_tensor split_ssm_in;
+    llama_split_tensor split_ssm_conv1d;
+    llama_split_tensor split_ssm_dt;
+    llama_split_tensor split_ssm_a;
+    llama_split_tensor split_ssm_beta_alpha;
+    llama_split_tensor split_ssm_beta;
+    llama_split_tensor split_ssm_alpha;
+    llama_split_tensor split_ssm_norm;
+    llama_split_tensor split_ssm_out;
+
     // relative position bias
     struct ggml_tensor * attn_rel_b = nullptr;
     struct ggml_tensor * attn_rel_b_enc = nullptr;
diff --git a/src/llama.cpp b/src/llama.cpp
index 83d47466d..a0007c6c9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -759,6 +759,9 @@ static bool llama_kv_cache_init(
     if ((model.split_mode == LLAMA_SPLIT_MODE_GRAPH || model.split_mode == LLAMA_SPLIT_MODE_ATTN) && !is_mla_attn && offload) {
         cache.split_k_l.reserve(n_layer);
         cache.split_v_l.reserve(n_layer);
+        if (llama_model_has_recurrent(&model)) {
+            cache.split_s_l.reserve(n_layer);
+        }
         split_cache = true;
     }
 
@@ -766,7 +769,7 @@ static bool llama_kv_cache_init(
     std::map buft_layer_count;
     if (offload) {
         for (int64_t i = 0; i < n_layer; ++i) {
-            if (split_cache && !hparams.is_recurrent(i)) {
+            if (split_cache) {
                 buft_layer_count[model.buft_layer[i].buft_matrix]++;
             } else {
                 buft_layer_count[model.buft_layer[i].buft]++;
@@ -780,8 +783,8 @@ static bool llama_kv_cache_init(
     std::map ctx_map;
     for (auto & it : buft_layer_count) {
         int n_layers = it.second;
-        size_t ctx_mem_size = 5u*n_layers*ggml_tensor_overhead();
-        if (split_cache) ctx_mem_size += 2*model.splits.size()*n_layers*ggml_tensor_overhead();
+        size_t ctx_mem_size = 8u*n_layers*ggml_tensor_overhead();
+        if (split_cache) ctx_mem_size += 4*model.splits.size()*n_layers*ggml_tensor_overhead();
         struct ggml_init_params params = {
             /*.mem_size   =*/ ctx_mem_size,
             /*.mem_buffer =*/ NULL,
@@ -843,7 +846,8 @@ static bool llama_kv_cache_init(
         const uint32_t n_head_kv = hparams.n_head_kv(i);
         const uint32_t n_embd_head_k= hparams.n_embd_head_k;
 
-        struct ggml_context * ctx = split_cache && !qnext_recurrent ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+        //struct ggml_context * ctx = split_cache && !qnext_recurrent ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+        struct ggml_context * ctx = split_cache ? ctx_map.at(model.buft_layer[i].buft_matrix) : offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
         ggml_tensor * k = nullptr;
         ggml_tensor * v = nullptr;
         ggml_tensor * s = nullptr;
@@ -877,6 +881,31 @@ static bool llama_kv_cache_init(
             cache.s_l[i] = s;
             cache.k_l.push_back(nullptr);
             cache.v_l.push_back(nullptr);
+            LLAMA_LOG_DEBUG("=== Created recurrent cache %s as %ld x %ld x %ld x %ld\n", s->name, s->ne[0], s->ne[1], s->ne[2], s->ne[3]);
+            if (split_cache && model.layers[i].ssm_out->extra) {
+                auto split_ssm_out = (const ggml_split_tensor_t *)model.layers[i].ssm_out->extra;
+                GGML_ASSERT(split_ssm_out);
+                int num_v_heads = hparams.ssm_dt_rank;
+                int head_v_dim = hparams.ssm_d_inner / num_v_heads;
+                int n_device = split_ssm_out->n_device;
+                auto & split_s_l = cache.split_s_l.emplace_back();
+                split_s_l.tensor_splits.resize(n_device, nullptr);
+                for (int is = 0; is < n_device; ++is) {
+                    auto split = split_ssm_out->splits[is];
+                    if (!split) continue;
+                    GGML_ASSERT(split->ne[0] % head_v_dim == 0);
+                    int nv = split->ne[0] / head_v_dim;
+                    auto size = hparams.n_embd_v_s_id(nv);
+                    split_s_l.tensor_splits[is] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, size, qnext_state_slots);
+                    auto split_name = s_name + '.' + std::to_string(is);
+                    ggml_set_name(split_s_l.tensor_splits[is], split_name.c_str());
+                    mem_split[is] += ggml_nbytes(split_s_l.tensor_splits[is]);
+                }
+                split_s_l.ggml.n_device = n_device;
+                split_s_l.ggml.split_dim = 0;
+                split_s_l.ggml.splits = split_s_l.tensor_splits.data();
+                cache.s_l[i]->extra = (void *)&split_s_l.ggml;
+            }
             continue;
         }
         bool split_cache_i = split_cache;
@@ -1939,7 +1968,7 @@ static bool is_model_split_supported(const llama_model & model) {
         LLM_ARCH_MINIMAX_M2,
         LLM_ARCH_SEED_OSS,
         LLM_ARCH_STEP35,
-        LLM_ARCH_QWEN3NEXT,
+        //LLM_ARCH_QWEN3NEXT,
         LLM_ARCH_QWEN35,
         LLM_ARCH_QWEN35MOE,
     };
@@ -6256,22 +6285,24 @@ struct llama_data_read {
     void read_kv_cache_data_split(llama_context * ctx, ggml_tensor * tensor, const uint8_t * data, size_t head, size_t row_size, int nrows, int il) {
         GGML_ASSERT(il >= 0 && il < int(ctx->model.layers.size()));
         GGML_ASSERT(ggml_internal_get_type_traits(tensor->type).row_meta_size == 0);
-        auto kv = get_kv_cache_split_tensor(tensor, ctx->model.layers[il]);
+        bool is_recurrent = ctx->model.hparams.recurrent_layer_arr[il];
+        auto kv = is_recurrent ? nullptr : get_kv_cache_split_tensor(tensor, ctx->model.layers[il]);
         auto extra = (ggml_split_tensor_t *)tensor->extra;
-        auto kv_extra = (ggml_split_tensor_t *)kv->extra;
-        GGML_ASSERT(extra && kv_extra);
-        auto ne = kv->ne[1];
+        auto kv_extra = kv ? (ggml_split_tensor_t *)kv->extra : nullptr;
+        GGML_ASSERT(extra && (is_recurrent || kv_extra));
+        auto ne = kv ? kv->ne[1] : tensor->ne[0];
         size_t sum_ne = 0;
         size_t sum_split_row_size = 0;
         GGML_ASSERT(row_size == ggml_row_size(tensor->type, ne));
         std::vector aux;
         for (int id = 0; id < extra->n_device; ++id) {
             auto split = extra->splits[id];
-            auto kv_split = kv_extra->splits[id];
-            GGML_ASSERT((split && kv_split) || (!split && !kv_split));
+            auto kv_split = kv_extra ? kv_extra->splits[id] : nullptr;
+            GGML_ASSERT((split && (kv_split || is_recurrent)) || (!split && !kv_split));
             if (!split) continue;
             GGML_ASSERT(split->type == tensor->type);
-            auto split_row_size = ggml_row_size(tensor->type, kv_split->ne[1]);
+            auto ne_split = kv_split ? kv_split->ne[1] : split->ne[0];
+            auto split_row_size = ggml_row_size(tensor->type, ne_split);
             aux.resize(split_row_size*nrows);
             auto src = data + sum_split_row_size;
             auto dst = aux.data();
@@ -6281,7 +6312,7 @@ struct llama_data_read {
                 src += row_size;
             }
             ggml_backend_tensor_set(split, aux.data(), head*split_row_size, nrows*split_row_size);
-            sum_ne += kv_split->ne[1];
+            sum_ne += ne_split;
             sum_split_row_size += split_row_size;
         }
         GGML_ASSERT(sum_ne == ne);
@@ -6627,8 +6658,12 @@ struct llama_data_write_buffer : llama_data_write {
             throw std::runtime_error(std::string{"Split cache for type "} + ggml_type_name(tensor->type) + " is not supported");
         }
         GGML_ASSERT(il >= 0 && il < int(model.layers.size()));
-        auto kv = get_kv_cache_split_tensor(tensor, model.layers[il]);
-        get_tensor_data_split(ptr, tensor, kv, aux_buffer, offset, size);
+        if (model.hparams.recurrent_layer_arr[il]) {
+            get_tensor_data_split(ptr, tensor, aux_buffer, offset, size);
+        } else {
+            auto kv = get_kv_cache_split_tensor(tensor, model.layers[il]);
+            get_tensor_data_split(ptr, tensor, kv, aux_buffer, offset, size);
+        }
     }
 
     static void get_tensor_data_split(uint8_t * ptr, const ggml_tensor * tensor, const ggml_tensor * kv,
@@ -6666,6 +6701,38 @@ struct llama_data_write_buffer : llama_data_write {
         }
         GGML_ASSERT(total_size == size);
     }
+    static void get_tensor_data_split(uint8_t * ptr, const ggml_tensor * tensor,
+            std::vector & aux_buffer, size_t offset, size_t size) {
+        auto ne = tensor->ne[0];
+        auto full_row_size = ggml_row_size(tensor->type, ne);
+        GGML_ASSERT(offset % full_row_size == 0);
+        GGML_ASSERT(size % full_row_size == 0);
+        auto first_row = offset / full_row_size;
+        auto num_rows = size / full_row_size;
+        auto extra = (const ggml_split_tensor_t *)tensor->extra;
+        GGML_ASSERT(extra);
+        size_t split_offset = 0;
+        size_t total_size = 0;
+        for (int id = 0; id < extra->n_device; ++id) {
+            auto split = extra->splits[id];
+            if (!split) continue;
+            GGML_ASSERT(split->type == tensor->type);
+            auto split_row_size = ggml_row_size(tensor->type, split->ne[0]);
+            auto split_size = split_row_size * num_rows;
+            if (split_size > aux_buffer.size()) aux_buffer.resize(split_size);
+            ggml_backend_tensor_get(split, aux_buffer.data(), first_row*split_row_size, split_size);
+            auto dst = ptr + split_offset;
+            auto src = aux_buffer.data();
+            for (int row = 0; row < (int)num_rows; ++row) {
+                std::memcpy(dst, src, split_row_size);
+                dst += full_row_size;
+                src += split_row_size;
+            }
+            split_offset += split_row_size;
+            total_size += split_row_size * num_rows;
+        }
+        GGML_ASSERT(total_size == size);
+    }
     size_t get_size_written() override { return size_written;