From 978aaa9f68564354fe404bc122b5a7510338fd17 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 07:41:53 +0200 Subject: [PATCH 1/8] Do not allocate / report caches that are not used It is either the standard KV cache or MLA cache, not both. --- src/llama.cpp | 75 ++++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0817c53c5..b7665eed9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3178,33 +3178,30 @@ static bool llama_kv_cache_init( ggml_tensor * k; ggml_tensor * v; if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) { - k = ggml_new_tensor_1d(ctx, type_k, 1); - v = ggml_new_tensor_1d(ctx, type_v, 1); + // DeepSeek MLA + //k = ggml_new_tensor_1d(ctx, type_k, 1); + //v = ggml_new_tensor_1d(ctx, type_v, 1); + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); + ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_format_name(kr, "cache_kr_l%d", i); + ggml_format_name(kv, "cache_kv_l%d", i); + ggml_format_name(kvt, "cache_kvt_l%d", i); + cache.kr_l.push_back(kr); + cache.kv_l.push_back(kv); + cache.kvt_l.push_back(kvt); } else { - k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - } - - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - cache.k_l.push_back(k); - cache.v_l.push_back(v); - - - // DeepSeek MLA - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_format_name(kr, "cache_kr_l%d", i); - ggml_format_name(kv, "cache_kv_l%d", i); - ggml_format_name(kvt, "cache_kvt_l%d", i); - cache.kr_l.push_back(kr); - cache.kv_l.push_back(kv); - cache.kvt_l.push_back(kvt); + k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + cache.k_l.push_back(k); + cache.v_l.push_back(v); + } } // allocate tensors and initialize the buffers to avoid NaNs in the padding @@ -18054,15 +18051,18 @@ struct llama_context * llama_new_context_with_model( memory_size_v += ggml_nbytes(v); } - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + if (memory_size_k + memory_size_v > 0) { + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), 
(float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } } - { + { size_t memory_size_kr = 0; size_t memory_size_kv = 0; + size_t memory_size_kvt = 0; for (auto & kr : ctx->kv_self.kr_l) { memory_size_kr += ggml_nbytes(kr); @@ -18072,10 +18072,17 @@ struct llama_context * llama_new_context_with_model( memory_size_kv += ggml_nbytes(kv); } - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB\n", __func__, - (float)(memory_size_kr + memory_size_kv) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kv / (1024.0f * 1024.0f)); + for (auto & kvt : ctx->kv_self.kvt_l) { + memory_size_kvt += ggml_nbytes(kvt); + } + + if (memory_size_kr + memory_size_kv + memory_size_kvt > 0) { + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__, + (float)(memory_size_kr + memory_size_kv + memory_size_kvt) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_kv / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_kvt / (1024.0f * 1024.0f)); + } } // graph outputs buffer From 54252d0256c2ff1473691cdc8ac59c6a5eea5c98 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 07:49:16 +0200 Subject: [PATCH 2/8] Rename X_pe to X_rope Much easier to follow, at least for my brain, when we have X_rope : rotational position encoding X_nope : no position encoding instead of X_pe and X_nope, where I was wondering wtf is 'pe' and 'nope'. --- src/llama.cpp | 78 +++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index b7665eed9..4e580af75 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13419,30 +13419,30 @@ struct llm_build_context { cb(q_nope, "q_nope", il); // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + struct ggml_tensor * q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, hparams.n_embd_head_k), ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); + cb(q_rope, "q_rope", il); // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); + struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_rope_compresseed, "kv_rope_compresseed", il); // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens, + kv_rope_compresseed->nb[1], 0); cb(kv_compressed, "kv_compressed", il); if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, 
"k_pe", il); + struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compresseed->nb[1], + kv_rope_compresseed->nb[1], + ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); + cb(k_rope, "k_rope", il); //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, @@ -13476,28 +13476,26 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, + q_rope = ggml_rope_ext( + ctx0, q_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(q_pe, "q_pe", il); + cb(q_rope, "q_rope", il); // shared RoPE key - //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, + k_rope = ggml_rope_ext( + ctx0, k_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(k_pe, "k_pe", il); + cb(k_rope, "k_rope", il); struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); cb(kr_cache_view, "kr_cache_view", il); // note: storing RoPE-ed version of K^R in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_rope, kr_cache_view)); struct ggml_tensor * kr_cache = ggml_view_2d(ctx0, kv_self.kr_l[il], @@ -13528,18 +13526,18 @@ struct llm_build_context { } if (pp_opt) { - q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); - cb(q_pe, "q_pe_perm", il); + q_rope = ggml_permute(ctx0, q_rope, 0, 2, 1, 3); + cb(q_rope, "q_rope_perm", il); } - struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); - cb(kq_pe, "kq_pe", il); + struct ggml_tensor * kq_rope = ggml_mul_mat(ctx0, kr_cache, q_rope); + cb(kq_rope, "kq_rope", il); if (!pp_opt) { - kq_pe = ggml_permute(ctx0, kq_pe, 0, 2, 1, 3); - cb(kq_pe, "kq_pe_perm", il); + kq_rope = ggml_permute(ctx0, kq_rope, 0, 2, 1, 3); + cb(kq_rope, "kq_rope_perm", il); } - struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); + struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_rope); cb(kq, "kq", il); kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); @@ -13579,11 +13577,11 @@ struct llm_build_context { else { // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); + struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compresseed->nb[1], + kv_rope_compresseed->nb[1], + ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); + cb(k_rope, "k_pe", il); //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, @@ -13618,26 +13616,26 @@ struct llm_build_context { cb(v_states, "v_states", il); //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous 
RoPE - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, + q_rope = ggml_rope_ext( + ctx0, q_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(q_pe, "q_pe", il); + cb(q_rope, "q_rope", il); // shared RoPE key //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, + k_rope = ggml_rope_ext( + ctx0, k_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(k_pe, "k_pe", il); + cb(k_rope, "k_rope", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_rope, 0); cb(q_states, "q_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_rope, q_rope), 0); cb(k_states, "k_states", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, From 8438b1628181d6cb22f4bb85110fea97174bd356 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 10:04:33 +0200 Subject: [PATCH 3/8] WIP --- src/llama.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 4e580af75..0ee01f1e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,7 +2547,7 @@ struct llama_layer { struct ggml_tensor * wkv_a_mqa; struct ggml_tensor * wkv_b; struct ggml_tensor * wk_b; - struct ggml_tensor * wv_b; + struct ggml_tensor * wv_b; struct ggml_tensor * wq_cross; struct ggml_tensor * wk_cross; struct ggml_tensor * wv_cross; @@ -13504,12 +13504,21 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0); + printf("kv_lora_rank = %d, n_head = %d, n_embd_head_qk_nope = %d, n_embd_head_v = %d\n", kv_lora_rank, (int)n_head, n_embd_head_qk_nope, (int)n_embd_head_v); + printf("wk_b: %d x %d x %d x %d, wkv_b: %d x %d x %d x %d\n", + (int)model.layers[il].wk_b->ne[0], (int)model.layers[il].wk_b->ne[1], (int)model.layers[il].wk_b->ne[2], (int)model.layers[il].wk_b->ne[3], + (int)model.layers[il].wkv_b->ne[0], (int)model.layers[il].wkv_b->ne[1], (int)model.layers[il].wkv_b->ne[2], (int)model.layers[il].wkv_b->ne[3]); + struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0); cb(wk_b, "wk_b", il); q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope, "q_nope_perm", il); + //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head, + // ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0); + //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache); + //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, + struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); @@ -13519,6 +13528,10 @@ struct llm_build_context { } struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); + 
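// Note (descriptive comment, not part of the original patch): multiplying q_nope by wk_b
// above maps the no-position ("nope") part of Q into the kv_lora_rank latent space, so
// kq_nope can be computed directly against the compressed c^KV cache instead of
// materializing a full per-token K via wkv_b; the printf below is temporary WIP debug output.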
printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", + (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], + (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], + (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); if (!pp_opt) { kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); From cfee1a0b91ad2e90bf866b8f20bd46b3eae61a53 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 13:53:38 +0200 Subject: [PATCH 4/8] WIP --- src/llama.cpp | 100 +++++++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 67 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0ee01f1e0..010fc3580 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13425,30 +13425,45 @@ struct llm_build_context { ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_rope, "q_rope", il); + q_rope = ggml_rope_ext( + ctx0, q_rope, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_rope, "q_rope", il); + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); cb(kv_rope_compresseed, "kv_rope_compresseed", il); + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compresseed->nb[1], + kv_rope_compresseed->nb[1], + ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); + cb(k_rope, "k_rope", il); + + // shared RoPE key + k_rope = ggml_rope_ext( + ctx0, k_rope, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_rope, "k_rope", il); + // split into {kv_lora_rank, n_tokens} struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens, kv_rope_compresseed->nb[1], 0); cb(kv_compressed, "kv_compressed", il); - if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_rope_compresseed->nb[1], - kv_rope_compresseed->nb[1], - ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); - cb(k_rope, "k_rope", il); + //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed", il); - //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed", il); + if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); cb(kv_cache_view, "kv_cache_view", il); @@ -13476,21 +13491,6 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - q_rope = ggml_rope_ext( - ctx0, q_rope, inp_pos, nullptr, - n_rot, rope_type, 
n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_rope, "q_rope", il); - - // shared RoPE key - k_rope = ggml_rope_ext( - ctx0, k_rope, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_rope, "k_rope", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); cb(kr_cache_view, "kr_cache_view", il); @@ -13504,10 +13504,6 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - printf("kv_lora_rank = %d, n_head = %d, n_embd_head_qk_nope = %d, n_embd_head_v = %d\n", kv_lora_rank, (int)n_head, n_embd_head_qk_nope, (int)n_embd_head_v); - printf("wk_b: %d x %d x %d x %d, wkv_b: %d x %d x %d x %d\n", - (int)model.layers[il].wk_b->ne[0], (int)model.layers[il].wk_b->ne[1], (int)model.layers[il].wk_b->ne[2], (int)model.layers[il].wk_b->ne[3], - (int)model.layers[il].wkv_b->ne[0], (int)model.layers[il].wkv_b->ne[1], (int)model.layers[il].wkv_b->ne[2], (int)model.layers[il].wkv_b->ne[3]); struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0); cb(wk_b, "wk_b", il); @@ -13517,7 +13513,7 @@ struct llm_build_context { //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head, // ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0); //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache); - //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, + //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); @@ -13528,10 +13524,10 @@ struct llm_build_context { } struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); - printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", - (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], - (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], - (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); + //printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", + // (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], + // (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], + // (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); if (!pp_opt) { kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); @@ -13589,19 +13585,6 @@ struct llm_build_context { } else { - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_rope_compresseed->nb[1], - kv_rope_compresseed->nb[1], - ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); - cb(k_rope, "k_pe", il); - - //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed", il); - // 
{kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); cb(kv, "kv", il); @@ -13628,23 +13611,6 @@ struct llm_build_context { 0); cb(v_states, "v_states", il); - //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - q_rope = ggml_rope_ext( - ctx0, q_rope, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_rope, "q_rope", il); - - // shared RoPE key - //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - k_rope = ggml_rope_ext( - ctx0, k_rope, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_rope, "k_rope", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_rope, 0); cb(q_states, "q_states", il); From 00dcb0cfa76bc1e49c1357d61ba35de2cdd15aba Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 15:29:59 +0200 Subject: [PATCH 5/8] WIP --- src/llama.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 010fc3580..2bdbf2a07 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13465,13 +13465,15 @@ struct llm_build_context { if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); + struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); cb(kv_cache_view, "kv_cache_view", il); // note: storing c^KV in the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); + struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); cb(kv_cache_trans_view, "kv_cache_trans_view", il); // note: storing transposed c^KV in the transposed KV cache @@ -13491,7 +13493,8 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); + struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, + ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); cb(kr_cache_view, "kr_cache_view", il); // note: storing RoPE-ed version of K^R in the KV cache @@ -13504,17 +13507,14 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0); + struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, 
n_embd_head_qk_nope, kv_lora_rank, n_head, + ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), + ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank)*n_embd_head_qk_nope, 0); cb(wk_b, "wk_b", il); q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope, "q_nope_perm", il); - //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head, - // ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0); - //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache); - //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, - struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); @@ -13524,10 +13524,6 @@ struct llm_build_context { } struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); - //printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", - // (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], - // (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], - // (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); if (!pp_opt) { kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); From 00063b7d99de948fbc4ee02520dcdf92bbb92d0b Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 18:38:08 +0200 Subject: [PATCH 6/8] WIP --- src/llama.cpp | 92 ++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 67 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2bdbf2a07..f9a59c793 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2685,7 +2685,6 @@ struct llama_kv_cache { std::vector v_l; // DeepSeek MLA - std::vector kr_l; // per layer std::vector kv_l; std::vector kvt_l; @@ -3166,7 +3165,6 @@ static bool llama_kv_cache_init( cache.v_l.reserve(n_layer); // DeepSeek MLA - cache.kr_l.reserve(n_layer); cache.kv_l.reserve(n_layer); cache.kvt_l.reserve(n_layer); @@ -3179,18 +3177,13 @@ static bool llama_kv_cache_init( ggml_tensor * v; if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) { // DeepSeek MLA - //k = ggml_new_tensor_1d(ctx, type_k, 1); - //v = ggml_new_tensor_1d(ctx, type_v, 1); const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_format_name(kr, "cache_kr_l%d", i); ggml_format_name(kv, "cache_kv_l%d", i); ggml_format_name(kvt, "cache_kvt_l%d", i); - cache.kr_l.push_back(kr); cache.kv_l.push_back(kv); cache.kvt_l.push_back(kvt); } @@ -13457,7 +13450,6 @@ struct llm_build_context { 0); cb(kv_compressed, "kv_compressed", il); - //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, cb, il); @@ -13465,13 +13457,6 @@ struct llm_build_context { if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - struct 
ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); - cb(kv_cache_view, "kv_cache_view", il); - - // note: storing c^KV in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); cb(kv_cache_trans_view, "kv_cache_trans_view", il); @@ -13479,13 +13464,6 @@ struct llm_build_context { // note: storing transposed c^KV in the transposed KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view)); - struct ggml_tensor * kv_cache = - ggml_view_2d(ctx0, kv_self.kv_l[il], - kv_lora_rank, n_kv, - ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank), - 0); - cb(kv_cache, "kv_cache", il); - struct ggml_tensor * kv_cache_trans = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_kv, kv_lora_rank, @@ -13493,19 +13471,16 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, - ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); - cb(kr_cache_view, "kr_cache_view", il); - - // note: storing RoPE-ed version of K^R in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_rope, kr_cache_view)); + ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0); + cb(kvr, "kvr", il); - struct ggml_tensor * kr_cache = - ggml_view_2d(ctx0, kv_self.kr_l[il], - n_embd_head_qk_rope, n_kv, - ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope), - 0); - cb(kr_cache, "kr_cache", il); + ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*(kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope)*kv_head); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, kvr, kv_cache_view)); + ggml_tensor * kv_cache = ggml_view_2d(ctx0, kv_self.kv_l[il], + kv_lora_rank + n_embd_head_qk_rope, n_kv, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0); + cb(kv_cache, "kv_cache", il); struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), @@ -13518,33 +13493,20 @@ struct llm_build_context { struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); + ggml_tensor * q = ggml_concat(ctx0, q_nope2, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), 0); + cb(q, "q", il); if (!pp_opt) { - q_nope2 = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3); - cb(q_nope2, "q_nope2_perm", il); - } - struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); - cb(kq_nope, "kq_nope", il); - - if (!pp_opt) { - kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); - cb(kq_nope, "kq_nope_perm", il); - } - - if (pp_opt) { - q_rope = ggml_permute(ctx0, q_rope, 0, 2, 1, 3); - cb(q_rope, "q_rope_perm", il); + q = ggml_permute(ctx0, q, 0, 2, 1, 3); + cb(q, "q_perm", il); } - struct ggml_tensor * kq_rope = ggml_mul_mat(ctx0, kr_cache, q_rope); - cb(kq_rope, "kq_rope", il); + ggml_tensor * kq = ggml_mul_mat(ctx0, kv_cache, q); + cb(kq, "kq", il); - if (!pp_opt) { - kq_rope = ggml_permute(ctx0, kq_rope, 0, 2, 1, 3); - cb(kq_rope, "kq_rope_perm", il); + if 
(!pp_opt) { + kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3)); + cb(kq, "kq_perm", il); } - struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_rope); - cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); @@ -13561,7 +13523,9 @@ struct llm_build_context { cb(kqv_compressed, "kqv_compressed_perm", il); } - struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); + struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, + ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), + ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank)*n_embd_head_v, 0); cb(wv_b, "wv_b", il); struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed); @@ -18033,14 +17997,9 @@ struct llama_context * llama_new_context_with_model( } { - size_t memory_size_kr = 0; size_t memory_size_kv = 0; size_t memory_size_kvt = 0; - for (auto & kr : ctx->kv_self.kr_l) { - memory_size_kr += ggml_nbytes(kr); - } - for (auto & kv : ctx->kv_self.kv_l) { memory_size_kv += ggml_nbytes(kv); } @@ -18049,10 +18008,9 @@ struct llama_context * llama_new_context_with_model( memory_size_kvt += ggml_nbytes(kvt); } - if (memory_size_kr + memory_size_kv + memory_size_kvt > 0) { - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__, - (float)(memory_size_kr + memory_size_kv + memory_size_kvt) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), + if (memory_size_kv + memory_size_kvt > 0) { + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__, + (float)(memory_size_kv + memory_size_kvt) / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_kv / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_kvt / (1024.0f * 1024.0f)); } From 91db234fb5cac7195f517c5cd12abf2bd2c32e9a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 13 Feb 2025 08:40:24 +0200 Subject: [PATCH 7/8] Warn user when disabling MLA --- src/llama.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index f9a59c793..52291cabc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3168,6 +3168,8 @@ static bool llama_kv_cache_init( cache.kv_l.reserve(n_layer); cache.kvt_l.reserve(n_layer); + bool warn = true; + int n_mla = 0; for (int i = 0; i < (int) n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); @@ -3175,6 +3177,17 @@ static bool llama_kv_cache_init( struct ggml_context * ctx = offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
        ggml_tensor * k;
        ggml_tensor * v;
+        if (cparams.mla_attn) {
+            if (!model.layers[i].wk_b || !model.layers[i].wv_b) {
+                if (warn) {
+                    LLAMA_LOG_WARN("=======================================================================================\n");
+                    LLAMA_LOG_WARN("%s: missing MLA tensors => disabling MLA\n", __func__);
+                    LLAMA_LOG_WARN("%s: you need to reconvert your model in order to use MLA\n", __func__);
+                    LLAMA_LOG_WARN("=======================================================================================\n");
+                    warn = false;
+                }
+            }
+        }
        if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) {
            // DeepSeek MLA
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
@@ -3186,6 +3199,7 @@ static bool llama_kv_cache_init(
            ggml_format_name(kvt, "cache_kvt_l%d", i);
            cache.kv_l.push_back(kv);
            cache.kvt_l.push_back(kvt);
+            n_mla++;
        }
        else {
            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
@@ -3196,6 +3210,11 @@ static bool llama_kv_cache_init(
            cache.v_l.push_back(v);
        }
    }
+    if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
+        LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
+        LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
+        GGML_ABORT("fatal error");
+    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    for (auto it : ctx_map) {

From f875ed00e866465a2e0bba20707c5635df2e75d3 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 13 Feb 2025 10:54:42 +0200
Subject: [PATCH 8/8] MLA: compile-time option to not use transposed KV cache

Cuts the KV cache size nearly in half at the expense of slower TG
performance for long contexts (it becomes similar to no-MLA).
---
 src/llama.cpp | 60 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 52291cabc..498bb4370 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -108,6 +108,14 @@
 #define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 256 // DeepSeekV2
+//
+// === MLA cache
+// If you are desperate to reduce KV cache size, set MLA_USE_TRANSPOSED_CACHE to 0.
+// TG performance will be slower (similar to no-MLA), but KV cache size will be cut to ~half.
+// PP performance will be about the same as with MLA_USE_TRANSPOSED_CACHE = 1.
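// Illustrative sizes (typical DeepSeek-V2/V3 values, not taken from this patch): with
// kv_lora_rank = 512 and n_embd_head_qk_rope = 64, each cache slot per layer holds
// (512 + 64) + 512 = 1088 f16 values when the transposed c^KV copy is kept, and only
// 576 values without it, which is where the ~half figure above comes from.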
+// +#define MLA_USE_TRANSPOSED_CACHE 1 + // // helpers // @@ -2676,9 +2684,6 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - ggml_type type_kr = GGML_TYPE_F16; - ggml_type type_kv = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer @@ -2686,7 +2691,9 @@ struct llama_kv_cache { // DeepSeek MLA std::vector kv_l; +#if MLA_USE_TRANSPOSED_CACHE std::vector kvt_l; +#endif std::vector ctxs; std::vector bufs; @@ -3120,8 +3127,6 @@ static bool llama_kv_cache_init( cache.type_k = type_k; cache.type_v = type_v; - cache.type_kr = type_k; - cache.type_kv = type_v; cache.cells.clear(); cache.cells.resize(kv_size); @@ -3166,7 +3171,9 @@ static bool llama_kv_cache_init( // DeepSeek MLA cache.kv_l.reserve(n_layer); +#if MLA_USE_TRANSPOSED_CACHE cache.kvt_l.reserve(n_layer); +#endif bool warn = true; int n_mla = 0; @@ -3193,12 +3200,22 @@ static bool llama_kv_cache_init( const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); - ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); +#if MLA_USE_TRANSPOSED_CACHE + // TODO: The k-cache is contiguous and not permuted, so strictly speaking, it should be possible to quantize it. + // Sadly, at this point something goes wrong with quantized k-cache, so for now we set the k-cache + // type to type_v, which is guaranteed to be f16 or bf16 without FA. + //ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); +#else + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); +#endif ggml_format_name(kv, "cache_kv_l%d", i); - ggml_format_name(kvt, "cache_kvt_l%d", i); cache.kv_l.push_back(kv); +#if MLA_USE_TRANSPOSED_CACHE + ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_v, kv_lora_rank*kv_size); + ggml_format_name(kvt, "cache_kvt_l%d", i); cache.kvt_l.push_back(kvt); +#endif n_mla++; } else { @@ -13476,19 +13493,20 @@ struct llm_build_context { if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); +#if MLA_USE_TRANSPOSED_CACHE + ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); cb(kv_cache_trans_view, "kv_cache_trans_view", il); // note: storing transposed c^KV in the transposed KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view)); - struct ggml_tensor * kv_cache_trans = - ggml_view_2d(ctx0, kv_self.kvt_l[il], - n_kv, kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), - 0); + ggml_tensor * kv_cache_trans = ggml_view_2d(ctx0, kv_self.kvt_l[il], + n_kv, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), + 0); cb(kv_cache_trans, "kv_cache_trans", il); +#endif ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, 
ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0); cb(kvr, "kvr", il); @@ -13534,6 +13552,16 @@ struct llm_build_context { cb(kq, "kq_soft_max_ext_perm", il); } +#if !MLA_USE_TRANSPOSED_CACHE + ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, kv_self.kv_l[il], + kv_lora_rank, n_kv, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0); + cb(kv_cache, "kv_cache_lora", il); + + ggml_tensor * kv_cache_trans = ggml_cont(ctx0, ggml_transpose(ctx0, kv_cache_lora)); + cb(kv_cache_trans, "kv_cache_trans", il); +#endif + struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq); cb(kqv_compressed, "kqv_compressed", il); @@ -18023,9 +18051,11 @@ struct llama_context * llama_new_context_with_model( memory_size_kv += ggml_nbytes(kv); } +#if MLA_USE_TRANSPOSED_CACHE for (auto & kvt : ctx->kv_self.kvt_l) { memory_size_kvt += ggml_nbytes(kvt); } +#endif if (memory_size_kv + memory_size_kvt > 0) { LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__,
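
To put concrete numbers on the sizes this last log line reports, here is a standalone sketch
(not part of the patch series) that reproduces the MLA allocation arithmetic from
llama_kv_cache_init; the kv_lora_rank = 512, n_embd_head_qk_rope = 64, 61-layer, 32768-slot,
f16-cache values are illustrative DeepSeek-V2/V3-style assumptions, not values taken from this diff:

#include <cstdio>
#include <cstdint>

int main() {
    // Illustrative DeepSeek-V2/V3-style values; the real ones come from the model hparams.
    const uint32_t kv_lora_rank        = 512;    // hparams.n_lora_kv
    const uint32_t n_embd_head_qk_rope = 64;     // hparams.n_rot
    const uint32_t n_layer             = 61;
    const uint64_t kv_size             = 32768;  // number of cache slots (n_ctx)
    const double   bytes_per_elem      = 2.0;    // f16 cache entries

    // cache_kv_l[i]: c^KV concatenated with the RoPE-ed shared key K^R, per cache slot
    const double kv_bytes  = (kv_lora_rank + n_embd_head_qk_rope) * (double)kv_size * bytes_per_elem;
    // cache_kvt_l[i]: transposed c^KV copy, allocated only when MLA_USE_TRANSPOSED_CACHE is 1
    const double kvt_bytes = kv_lora_rank * (double)kv_size * bytes_per_elem;

    const double MiB = 1024.0 * 1024.0;
    printf("per layer: c^KV+K^R = %.2f MiB, kv^T = %.2f MiB\n", kv_bytes / MiB, kvt_bytes / MiB);
    printf("total with transposed cache:    %.2f MiB\n", n_layer * (kv_bytes + kvt_bytes) / MiB);
    printf("total without transposed cache: %.2f MiB\n", n_layer * kv_bytes / MiB);
    return 0;
}

With MLA_USE_TRANSPOSED_CACHE set to 0 only the c^KV+K^R buffer remains, so under these
assumptions the cache drops from about 68 MiB to 36 MiB per layer (roughly 4.0 GiB to 2.1 GiB
over 61 layers), matching the "cut to ~half" note in the commit message.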