From 978aaa9f68564354fe404bc122b5a7510338fd17 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 07:41:53 +0200 Subject: [PATCH 1/8] Do not allocate / report caches that are not used It is either the standard KV cache or MLA cache, not both. --- src/llama.cpp | 75 ++++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0817c53c5..b7665eed9 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3178,33 +3178,30 @@ static bool llama_kv_cache_init( ggml_tensor * k; ggml_tensor * v; if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) { - k = ggml_new_tensor_1d(ctx, type_k, 1); - v = ggml_new_tensor_1d(ctx, type_v, 1); + // DeepSeek MLA + //k = ggml_new_tensor_1d(ctx, type_k, 1); + //v = ggml_new_tensor_1d(ctx, type_v, 1); + const uint32_t n_embd_head_qk_rope = hparams.n_rot; + const uint32_t kv_lora_rank = hparams.n_lora_kv; + LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); + ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_format_name(kr, "cache_kr_l%d", i); + ggml_format_name(kv, "cache_kv_l%d", i); + ggml_format_name(kvt, "cache_kvt_l%d", i); + cache.kr_l.push_back(kr); + cache.kv_l.push_back(kv); + cache.kvt_l.push_back(kvt); } else { - k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); - v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); - } - - ggml_format_name(k, "cache_k_l%d", i); - ggml_format_name(v, "cache_v_l%d", i); - cache.k_l.push_back(k); - cache.v_l.push_back(v); - - - // DeepSeek MLA - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_format_name(kr, "cache_kr_l%d", i); - ggml_format_name(kv, "cache_kv_l%d", i); - ggml_format_name(kvt, "cache_kvt_l%d", i); - cache.kr_l.push_back(kr); - cache.kv_l.push_back(kv); - cache.kvt_l.push_back(kvt); + k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); + ggml_format_name(k, "cache_k_l%d", i); + ggml_format_name(v, "cache_v_l%d", i); + cache.k_l.push_back(k); + cache.v_l.push_back(v); + } } // allocate tensors and initialize the buffers to avoid NaNs in the padding @@ -18054,15 +18051,18 @@ struct llama_context * llama_new_context_with_model( memory_size_v += ggml_nbytes(v); } - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, - (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f), - ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + if (memory_size_k + memory_size_v > 0) { + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__, + (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), + ggml_type_name(type_k), 
(float)memory_size_k / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f)); + } } - { + { size_t memory_size_kr = 0; size_t memory_size_kv = 0; + size_t memory_size_kvt = 0; for (auto & kr : ctx->kv_self.kr_l) { memory_size_kr += ggml_nbytes(kr); @@ -18072,10 +18072,17 @@ struct llama_context * llama_new_context_with_model( memory_size_kv += ggml_nbytes(kv); } - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB\n", __func__, - (float)(memory_size_kr + memory_size_kv) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kv / (1024.0f * 1024.0f)); + for (auto & kvt : ctx->kv_self.kvt_l) { + memory_size_kvt += ggml_nbytes(kvt); + } + + if (memory_size_kr + memory_size_kv + memory_size_kvt > 0) { + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__, + (float)(memory_size_kr + memory_size_kv + memory_size_kvt) / (1024.0f * 1024.0f), + ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_kv / (1024.0f * 1024.0f), + ggml_type_name(type_v), (float)memory_size_kvt / (1024.0f * 1024.0f)); + } } // graph outputs buffer From 54252d0256c2ff1473691cdc8ac59c6a5eea5c98 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 07:49:16 +0200 Subject: [PATCH 2/8] Rename X_pe to X_rope Much easier to follow, at least for my brain, when we have X_rope : rotational position encoding X_nope : no position encoding instead of X_pe and X_nope, where I was wondering wtf is 'pe' and 'nope'. --- src/llama.cpp | 78 +++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index b7665eed9..4e580af75 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13419,30 +13419,30 @@ struct llm_build_context { cb(q_nope, "q_nope", il); // and {n_head * n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, + struct ggml_tensor * q_rope = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, hparams.n_embd_head_k), ggml_row_size(q->type, hparams.n_embd_head_k * n_head), ggml_row_size(q->type, n_embd_head_qk_nope)); - cb(q_pe, "q_pe", il); + cb(q_rope, "q_rope", il); // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); + struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_rope_compresseed, "kv_rope_compresseed", il); // split into {kv_lora_rank, n_tokens} - struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], + struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens, + kv_rope_compresseed->nb[1], 0); cb(kv_compressed, "kv_compressed", il); if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, 
"k_pe", il); + struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compresseed->nb[1], + kv_rope_compresseed->nb[1], + ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); + cb(k_rope, "k_rope", il); //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, @@ -13476,28 +13476,26 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, + q_rope = ggml_rope_ext( + ctx0, q_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(q_pe, "q_pe", il); + cb(q_rope, "q_rope", il); // shared RoPE key - //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, + k_rope = ggml_rope_ext( + ctx0, k_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(k_pe, "k_pe", il); + cb(k_rope, "k_rope", il); struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); cb(kr_cache_view, "kr_cache_view", il); // note: storing RoPE-ed version of K^R in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_pe, kr_cache_view)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_rope, kr_cache_view)); struct ggml_tensor * kr_cache = ggml_view_2d(ctx0, kv_self.kr_l[il], @@ -13528,18 +13526,18 @@ struct llm_build_context { } if (pp_opt) { - q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); - cb(q_pe, "q_pe_perm", il); + q_rope = ggml_permute(ctx0, q_rope, 0, 2, 1, 3); + cb(q_rope, "q_rope_perm", il); } - struct ggml_tensor * kq_pe = ggml_mul_mat(ctx0, kr_cache, q_pe); - cb(kq_pe, "kq_pe", il); + struct ggml_tensor * kq_rope = ggml_mul_mat(ctx0, kr_cache, q_rope); + cb(kq_rope, "kq_rope", il); if (!pp_opt) { - kq_pe = ggml_permute(ctx0, kq_pe, 0, 2, 1, 3); - cb(kq_pe, "kq_pe_perm", il); + kq_rope = ggml_permute(ctx0, kq_rope, 0, 2, 1, 3); + cb(kq_rope, "kq_rope_perm", il); } - struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_pe); + struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_rope); cb(kq, "kq", il); kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); @@ -13579,11 +13577,11 @@ struct llm_build_context { else { // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); - cb(k_pe, "k_pe", il); + struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compresseed->nb[1], + kv_rope_compresseed->nb[1], + ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); + cb(k_rope, "k_pe", il); //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, @@ -13618,26 +13616,26 @@ struct llm_build_context { cb(v_states, "v_states", il); //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous 
RoPE - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, + q_rope = ggml_rope_ext( + ctx0, q_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(q_pe, "q_pe", il); + cb(q_rope, "q_rope", il); // shared RoPE key //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, + k_rope = ggml_rope_ext( + ctx0, k_rope, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); - cb(k_pe, "k_pe", il); + cb(k_rope, "k_rope", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); + struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_rope, 0); cb(q_states, "q_states", il); - struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); + struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_rope, q_rope), 0); cb(k_states, "k_states", il); cur = llm_build_kv(ctx0, lctx, kv_self, gf, From 8438b1628181d6cb22f4bb85110fea97174bd356 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 10:04:33 +0200 Subject: [PATCH 3/8] WIP --- src/llama.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 4e580af75..0ee01f1e0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2547,7 +2547,7 @@ struct llama_layer { struct ggml_tensor * wkv_a_mqa; struct ggml_tensor * wkv_b; struct ggml_tensor * wk_b; - struct ggml_tensor * wv_b; + struct ggml_tensor * wv_b; struct ggml_tensor * wq_cross; struct ggml_tensor * wk_cross; struct ggml_tensor * wv_cross; @@ -13504,12 +13504,21 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank * n_embd_head_qk_nope), 0); + printf("kv_lora_rank = %d, n_head = %d, n_embd_head_qk_nope = %d, n_embd_head_v = %d\n", kv_lora_rank, (int)n_head, n_embd_head_qk_nope, (int)n_embd_head_v); + printf("wk_b: %d x %d x %d x %d, wkv_b: %d x %d x %d x %d\n", + (int)model.layers[il].wk_b->ne[0], (int)model.layers[il].wk_b->ne[1], (int)model.layers[il].wk_b->ne[2], (int)model.layers[il].wk_b->ne[3], + (int)model.layers[il].wkv_b->ne[0], (int)model.layers[il].wkv_b->ne[1], (int)model.layers[il].wkv_b->ne[2], (int)model.layers[il].wkv_b->ne[3]); + struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0); cb(wk_b, "wk_b", il); q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope, "q_nope_perm", il); + //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head, + // ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0); + //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache); + //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, + struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); @@ -13519,6 +13528,10 @@ struct llm_build_context { } struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); + 
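// Note (descriptive comment, not part of the original patch): multiplying q_nope by wk_b
// above maps the no-position ("nope") part of Q into the kv_lora_rank latent space, so
// kq_nope can be computed directly against the compressed c^KV cache instead of
// materializing a full per-token K via wkv_b; the printf below is temporary WIP debug output.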
printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", + (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], + (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], + (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); if (!pp_opt) { kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); From cfee1a0b91ad2e90bf866b8f20bd46b3eae61a53 Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 13:53:38 +0200 Subject: [PATCH 4/8] WIP --- src/llama.cpp | 100 +++++++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 67 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 0ee01f1e0..010fc3580 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13425,30 +13425,45 @@ struct llm_build_context { ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_rope, "q_rope", il); + q_rope = ggml_rope_ext( + ctx0, q_rope, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_rope, "q_rope", il); + // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} struct ggml_tensor * kv_rope_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); cb(kv_rope_compresseed, "kv_rope_compresseed", il); + // and {n_embd_head_qk_rope, n_tokens} + struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, + kv_rope_compresseed->nb[1], + kv_rope_compresseed->nb[1], + ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); + cb(k_rope, "k_rope", il); + + // shared RoPE key + k_rope = ggml_rope_ext( + ctx0, k_rope, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(k_rope, "k_rope", il); + // split into {kv_lora_rank, n_tokens} struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_rope_compresseed, kv_lora_rank, n_tokens, kv_rope_compresseed->nb[1], 0); cb(kv_compressed, "kv_compressed", il); - if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_rope_compresseed->nb[1], - kv_rope_compresseed->nb[1], - ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); - cb(k_rope, "k_rope", il); + //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm + kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, + model.layers[il].attn_kv_a_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(kv_compressed, "kv_compressed", il); - //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed", il); + if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); cb(kv_cache_view, "kv_cache_view", il); @@ -13476,21 +13491,6 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - q_rope = ggml_rope_ext( - ctx0, q_rope, inp_pos, nullptr, - n_rot, rope_type, 
n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_rope, "q_rope", il); - - // shared RoPE key - k_rope = ggml_rope_ext( - ctx0, k_rope, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_rope, "k_rope", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); cb(kr_cache_view, "kr_cache_view", il); @@ -13504,10 +13504,6 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - printf("kv_lora_rank = %d, n_head = %d, n_embd_head_qk_nope = %d, n_embd_head_v = %d\n", kv_lora_rank, (int)n_head, n_embd_head_qk_nope, (int)n_embd_head_v); - printf("wk_b: %d x %d x %d x %d, wkv_b: %d x %d x %d x %d\n", - (int)model.layers[il].wk_b->ne[0], (int)model.layers[il].wk_b->ne[1], (int)model.layers[il].wk_b->ne[2], (int)model.layers[il].wk_b->ne[3], - (int)model.layers[il].wkv_b->ne[0], (int)model.layers[il].wkv_b->ne[1], (int)model.layers[il].wkv_b->ne[2], (int)model.layers[il].wkv_b->ne[3]); struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0); cb(wk_b, "wk_b", il); @@ -13517,7 +13513,7 @@ struct llm_build_context { //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head, // ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0); //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache); - //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, + //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); @@ -13528,10 +13524,10 @@ struct llm_build_context { } struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); - printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", - (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], - (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], - (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); + //printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", + // (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], + // (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], + // (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); if (!pp_opt) { kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); @@ -13589,19 +13585,6 @@ struct llm_build_context { } else { - // and {n_embd_head_qk_rope, n_tokens} - struct ggml_tensor * k_rope = ggml_view_3d(ctx0, kv_rope_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_rope_compresseed->nb[1], - kv_rope_compresseed->nb[1], - ggml_row_size(kv_rope_compresseed->type, kv_lora_rank)); - cb(k_rope, "k_pe", il); - - //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm - kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, - model.layers[il].attn_kv_a_norm, NULL, - LLM_NORM_RMS, cb, il); - cb(kv_compressed, "kv_compressed", il); - // 
{kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); cb(kv, "kv", il); @@ -13628,23 +13611,6 @@ struct llm_build_context { 0); cb(v_states, "v_states", il); - //q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - q_rope = ggml_rope_ext( - ctx0, q_rope, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_rope, "q_rope", il); - - // shared RoPE key - //k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE - k_rope = ggml_rope_ext( - ctx0, k_rope, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_rope, "k_rope", il); - struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_rope, 0); cb(q_states, "q_states", il); From 00dcb0cfa76bc1e49c1357d61ba35de2cdd15aba Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 15:29:59 +0200 Subject: [PATCH 5/8] WIP --- src/llama.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 010fc3580..2bdbf2a07 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -13465,13 +13465,15 @@ struct llm_build_context { if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); + struct ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); cb(kv_cache_view, "kv_cache_view", il); // note: storing c^KV in the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); + struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); cb(kv_cache_trans_view, "kv_cache_trans_view", il); // note: storing transposed c^KV in the transposed KV cache @@ -13491,7 +13493,8 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); + struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, + ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); cb(kr_cache_view, "kr_cache_view", il); // note: storing RoPE-ed version of K^R in the KV cache @@ -13504,17 +13507,14 @@ struct llm_build_context { 0); cb(kr_cache, "kr_cache", il); - struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank) * n_embd_head_qk_nope, 0); + struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, 
n_embd_head_qk_nope, kv_lora_rank, n_head, + ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), + ggml_row_size(model.layers[il].wk_b->type, kv_lora_rank)*n_embd_head_qk_nope, 0); cb(wk_b, "wk_b", il); q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope, "q_nope_perm", il); - //ggml_tensor * wkv_b = ggml_view_2d(ctx0, model.layers[il].wkv_b, kv_lora_rank, n_embd_head_qk_nope*n_head, - // ggml_row_size(model.layers[il].wkv_b->type, kv_lora_rank), 0); - //ggml_tensor * ik1 = ggml_mul_mat(ctx0, wkv_b, kv_cache); - //ggml_tensor * ik2 = ggml_view_3d(ctx0, ik1, n_embd_head_qk_nope, - struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); @@ -13524,10 +13524,6 @@ struct llm_build_context { } struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); cb(kq_nope, "kq_nope", il); - //printf("kq_nope = kv_cache(%d x %d x %d x %d) * [wk_b (%d x %d x %d x %d) * q_nope (%d x %d x %d x %d)]\n", - // (int)kv_cache->ne[0], (int)kv_cache->ne[1], (int)kv_cache->ne[2], (int)kv_cache->ne[3], - // (int)wk_b->ne[0], (int)wk_b->ne[1], (int)wk_b->ne[2], (int)wk_b->ne[3], - // (int)q_nope->ne[0], (int)q_nope->ne[1], (int)q_nope->ne[2], (int)q_nope->ne[3]); if (!pp_opt) { kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); From 00063b7d99de948fbc4ee02520dcdf92bbb92d0b Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Wed, 12 Feb 2025 18:38:08 +0200 Subject: [PATCH 6/8] WIP --- src/llama.cpp | 92 ++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 67 deletions(-) diff --git a/src/llama.cpp b/src/llama.cpp index 2bdbf2a07..f9a59c793 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -2685,7 +2685,6 @@ struct llama_kv_cache { std::vector v_l; // DeepSeek MLA - std::vector kr_l; // per layer std::vector kv_l; std::vector kvt_l; @@ -3166,7 +3165,6 @@ static bool llama_kv_cache_init( cache.v_l.reserve(n_layer); // DeepSeek MLA - cache.kr_l.reserve(n_layer); cache.kv_l.reserve(n_layer); cache.kvt_l.reserve(n_layer); @@ -3179,18 +3177,13 @@ static bool llama_kv_cache_init( ggml_tensor * v; if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) { // DeepSeek MLA - //k = ggml_new_tensor_1d(ctx, type_k, 1); - //v = ggml_new_tensor_1d(ctx, type_v, 1); const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kr = ggml_new_tensor_1d(ctx, cache.type_kr, n_embd_head_qk_rope*kv_size); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); - ggml_format_name(kr, "cache_kr_l%d", i); ggml_format_name(kv, "cache_kv_l%d", i); ggml_format_name(kvt, "cache_kvt_l%d", i); - cache.kr_l.push_back(kr); cache.kv_l.push_back(kv); cache.kvt_l.push_back(kvt); } @@ -13457,7 +13450,6 @@ struct llm_build_context { 0); cb(kv_compressed, "kv_compressed", il); - //kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams, model.layers[il].attn_kv_a_norm, NULL, LLM_NORM_RMS, cb, il); @@ -13465,13 +13457,6 @@ struct llm_build_context { if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - struct 
ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank)*kv_head); - cb(kv_cache_view, "kv_cache_view", il); - - // note: storing c^KV in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, kv_compressed, kv_cache_view)); - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); cb(kv_cache_trans_view, "kv_cache_trans_view", il); @@ -13479,13 +13464,6 @@ struct llm_build_context { // note: storing transposed c^KV in the transposed KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view)); - struct ggml_tensor * kv_cache = - ggml_view_2d(ctx0, kv_self.kv_l[il], - kv_lora_rank, n_kv, - ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank), - 0); - cb(kv_cache, "kv_cache", il); - struct ggml_tensor * kv_cache_trans = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_kv, kv_lora_rank, @@ -13493,19 +13471,16 @@ struct llm_build_context { 0); cb(kv_cache_trans, "kv_cache_trans", il); - struct ggml_tensor * kr_cache_view = ggml_view_1d(ctx0, kv_self.kr_l[il], n_tokens*n_embd_head_qk_rope, - ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope)*kv_head); - cb(kr_cache_view, "kr_cache_view", il); - - // note: storing RoPE-ed version of K^R in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_rope, kr_cache_view)); + ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0); + cb(kvr, "kvr", il); - struct ggml_tensor * kr_cache = - ggml_view_2d(ctx0, kv_self.kr_l[il], - n_embd_head_qk_rope, n_kv, - ggml_row_size(kv_self.kr_l[il]->type, n_embd_head_qk_rope), - 0); - cb(kr_cache, "kr_cache", il); + ggml_tensor * kv_cache_view = ggml_view_1d(ctx0, kv_self.kv_l[il], n_tokens*(kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope)*kv_head); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, kvr, kv_cache_view)); + ggml_tensor * kv_cache = ggml_view_2d(ctx0, kv_self.kv_l[il], + kv_lora_rank + n_embd_head_qk_rope, n_kv, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0); + cb(kv_cache, "kv_cache", il); struct ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, n_embd_head_qk_nope, kv_lora_rank, n_head, ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), @@ -13518,33 +13493,20 @@ struct llm_build_context { struct ggml_tensor * q_nope2 = ggml_mul_mat(ctx0, wk_b, q_nope); cb(q_nope2, "q_nope2", il); + ggml_tensor * q = ggml_concat(ctx0, q_nope2, ggml_permute(ctx0, q_rope, 0, 2, 1, 3), 0); + cb(q, "q", il); if (!pp_opt) { - q_nope2 = ggml_permute(ctx0, q_nope2, 0, 2, 1, 3); - cb(q_nope2, "q_nope2_perm", il); - } - struct ggml_tensor * kq_nope = ggml_mul_mat(ctx0, kv_cache, q_nope2); - cb(kq_nope, "kq_nope", il); - - if (!pp_opt) { - kq_nope = ggml_permute(ctx0, kq_nope, 0, 2, 1, 3); - cb(kq_nope, "kq_nope_perm", il); - } - - if (pp_opt) { - q_rope = ggml_permute(ctx0, q_rope, 0, 2, 1, 3); - cb(q_rope, "q_rope_perm", il); + q = ggml_permute(ctx0, q, 0, 2, 1, 3); + cb(q, "q_perm", il); } - struct ggml_tensor * kq_rope = ggml_mul_mat(ctx0, kr_cache, q_rope); - cb(kq_rope, "kq_rope", il); + ggml_tensor * kq = ggml_mul_mat(ctx0, kv_cache, q); + cb(kq, "kq", il); - if (!pp_opt) { - kq_rope = ggml_permute(ctx0, kq_rope, 0, 2, 1, 3); - cb(kq_rope, "kq_rope_perm", il); + if 
(!pp_opt) { + kq = ggml_cont(ctx0, ggml_permute(ctx0, kq, 0, 2, 1, 3)); + cb(kq, "kq_perm", il); } - struct ggml_tensor * kq = ggml_add(ctx0, kq_nope, kq_rope); - cb(kq, "kq", il); - kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, kq_scale, hparams.f_max_alibi_bias); cb(kq, "kq_soft_max_ext", il); @@ -13561,7 +13523,9 @@ struct llm_build_context { cb(kqv_compressed, "kqv_compressed_perm", il); } - struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank * n_embd_head_v), 0); + struct ggml_tensor * wv_b = ggml_view_3d(ctx0, model.layers[il].wv_b, kv_lora_rank, n_embd_head_v, n_head, + ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), + ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank)*n_embd_head_v, 0); cb(wv_b, "wv_b", il); struct ggml_tensor * kqv = ggml_mul_mat(ctx0, wv_b, kqv_compressed); @@ -18033,14 +17997,9 @@ struct llama_context * llama_new_context_with_model( } { - size_t memory_size_kr = 0; size_t memory_size_kv = 0; size_t memory_size_kvt = 0; - for (auto & kr : ctx->kv_self.kr_l) { - memory_size_kr += ggml_nbytes(kr); - } - for (auto & kv : ctx->kv_self.kv_l) { memory_size_kv += ggml_nbytes(kv); } @@ -18049,10 +18008,9 @@ struct llama_context * llama_new_context_with_model( memory_size_kvt += ggml_nbytes(kvt); } - if (memory_size_kr + memory_size_kv + memory_size_kvt > 0) { - LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K^R (%s): %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__, - (float)(memory_size_kr + memory_size_kv + memory_size_kvt) / (1024.0f * 1024.0f), - ggml_type_name(type_k), (float)memory_size_kr / (1024.0f * 1024.0f), + if (memory_size_kv + memory_size_kvt > 0) { + LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__, + (float)(memory_size_kv + memory_size_kvt) / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_kv / (1024.0f * 1024.0f), ggml_type_name(type_v), (float)memory_size_kvt / (1024.0f * 1024.0f)); } From 91db234fb5cac7195f517c5cd12abf2bd2c32e9a Mon Sep 17 00:00:00 2001 From: Iwan Kawrakow Date: Thu, 13 Feb 2025 08:40:24 +0200 Subject: [PATCH 7/8] Warn user when disabling MLA --- src/llama.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/llama.cpp b/src/llama.cpp index f9a59c793..52291cabc 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3168,6 +3168,8 @@ static bool llama_kv_cache_init( cache.kv_l.reserve(n_layer); cache.kvt_l.reserve(n_layer); + bool warn = true; + int n_mla = 0; for (int i = 0; i < (int) n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); @@ -3175,6 +3177,17 @@ static bool llama_kv_cache_init( struct ggml_context * ctx = offload ? 
ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
        ggml_tensor * k;
        ggml_tensor * v;
+        if (cparams.mla_attn) {
+            if (!model.layers[i].wk_b || !model.layers[i].wv_b) {
+                if (warn) {
+                    LLAMA_LOG_WARN("=======================================================================================\n");
+                    LLAMA_LOG_WARN("%s: missing MLA tensors => disabling MLA\n", __func__);
+                    LLAMA_LOG_WARN("%s: you need to reconvert your model in order to use MLA\n", __func__);
+                    LLAMA_LOG_WARN("=======================================================================================\n");
+                    warn = false;
+                }
+            }
+        }
        if (cparams.mla_attn && model.layers[i].wk_b && model.layers[i].wv_b) {
            // DeepSeek MLA
            const uint32_t n_embd_head_qk_rope = hparams.n_rot;
@@ -3186,6 +3199,7 @@ static bool llama_kv_cache_init(
            ggml_format_name(kvt, "cache_kvt_l%d", i);
            cache.kv_l.push_back(kv);
            cache.kvt_l.push_back(kvt);
+            n_mla++;
        }
        else {
            k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
@@ -3196,6 +3210,11 @@ static bool llama_kv_cache_init(
            cache.v_l.push_back(v);
        }
    }
+    if (cparams.mla_attn && n_mla < n_layer && n_mla > 0) {
+        LLAMA_LOG_ERROR("%s: unexpected situation with %d out of %d layers having MLA enabled\n", __func__, n_mla, int(n_layer));
+        LLAMA_LOG_ERROR("%s: bailing out\n", __func__);
+        GGML_ABORT("fatal error");
+    }

    // allocate tensors and initialize the buffers to avoid NaNs in the padding
    for (auto it : ctx_map) {

From f875ed00e866465a2e0bba20707c5635df2e75d3 Mon Sep 17 00:00:00 2001
From: Iwan Kawrakow
Date: Thu, 13 Feb 2025 10:54:42 +0200
Subject: [PATCH 8/8] MLA: compile-time option to not use transposed KV cache

Cuts the KV cache size nearly in half at the expense of slower TG
performance for long contexts (it becomes similar to no-MLA).
---
 src/llama.cpp | 60 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 52291cabc..498bb4370 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -108,6 +108,14 @@
 #define LLAMA_MAX_LAYERS 512
 #define LLAMA_MAX_EXPERTS 256 // DeepSeekV2
+//
+// === MLA cache
+// If you are desperate to reduce KV cache size, set MLA_USE_TRANSPOSED_CACHE to 0.
+// TG performance will be slower (similar to no-MLA), but KV cache size will be cut to ~half.
+// PP performance will be about the same as with MLA_USE_TRANSPOSED_CACHE = 1.
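// Illustrative sizes (typical DeepSeek-V2/V3 values, not taken from this patch): with
// kv_lora_rank = 512 and n_embd_head_qk_rope = 64, each cache slot per layer holds
// (512 + 64) + 512 = 1088 f16 values when the transposed c^KV copy is kept, and only
// 576 values without it, which is where the ~half figure above comes from.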
+// +#define MLA_USE_TRANSPOSED_CACHE 1 + // // helpers // @@ -2676,9 +2684,6 @@ struct llama_kv_cache { ggml_type type_k = GGML_TYPE_F16; ggml_type type_v = GGML_TYPE_F16; - ggml_type type_kr = GGML_TYPE_F16; - ggml_type type_kv = GGML_TYPE_F16; - std::vector cells; std::vector k_l; // per layer @@ -2686,7 +2691,9 @@ struct llama_kv_cache { // DeepSeek MLA std::vector kv_l; +#if MLA_USE_TRANSPOSED_CACHE std::vector kvt_l; +#endif std::vector ctxs; std::vector bufs; @@ -3120,8 +3127,6 @@ static bool llama_kv_cache_init( cache.type_k = type_k; cache.type_v = type_v; - cache.type_kr = type_k; - cache.type_kv = type_v; cache.cells.clear(); cache.cells.resize(kv_size); @@ -3166,7 +3171,9 @@ static bool llama_kv_cache_init( // DeepSeek MLA cache.kv_l.reserve(n_layer); +#if MLA_USE_TRANSPOSED_CACHE cache.kvt_l.reserve(n_layer); +#endif bool warn = true; int n_mla = 0; @@ -3193,12 +3200,22 @@ static bool llama_kv_cache_init( const uint32_t n_embd_head_qk_rope = hparams.n_rot; const uint32_t kv_lora_rank = hparams.n_lora_kv; LLAMA_LOG_INFO("%s: layer %d: n_embd_head_qk_rope = %d, kv_lora_rank = %d\n", __func__, i, n_embd_head_qk_rope, kv_lora_rank); - ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_kv, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); - ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_kv, kv_lora_rank*kv_size); +#if MLA_USE_TRANSPOSED_CACHE + // TODO: The k-cache is contiguous and not permuted, so strictly speaking, it should be possible to quantize it. + // Sadly, at this point something goes wrong with quantized k-cache, so for now we set the k-cache + // type to type_v, which is guaranteed to be f16 or bf16 without FA. + //ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_k, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); +#else + ggml_tensor * kv = ggml_new_tensor_1d(ctx, cache.type_v, (kv_lora_rank + n_embd_head_qk_rope)*kv_size); +#endif ggml_format_name(kv, "cache_kv_l%d", i); - ggml_format_name(kvt, "cache_kvt_l%d", i); cache.kv_l.push_back(kv); +#if MLA_USE_TRANSPOSED_CACHE + ggml_tensor * kvt = ggml_new_tensor_1d(ctx, cache.type_v, kv_lora_rank*kv_size); + ggml_format_name(kvt, "cache_kvt_l%d", i); cache.kvt_l.push_back(kvt); +#endif n_mla++; } else { @@ -13476,19 +13493,20 @@ struct llm_build_context { if (lctx.cparams.mla_attn && model.layers[il].wk_b && model.layers[il].wv_b) { - struct ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); +#if MLA_USE_TRANSPOSED_CACHE + ggml_tensor * kv_cache_trans_view = ggml_view_2d(ctx0, kv_self.kvt_l[il], n_tokens, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), ggml_row_size(kv_self.kv_l[il]->type, kv_head)); cb(kv_cache_trans_view, "kv_cache_trans_view", il); // note: storing transposed c^KV in the transposed KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_transpose(ctx0, kv_compressed), kv_cache_trans_view)); - struct ggml_tensor * kv_cache_trans = - ggml_view_2d(ctx0, kv_self.kvt_l[il], - n_kv, kv_lora_rank, - ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), - 0); + ggml_tensor * kv_cache_trans = ggml_view_2d(ctx0, kv_self.kvt_l[il], + n_kv, kv_lora_rank, + ggml_row_size(kv_self.kv_l[il]->type, kv_self.size), + 0); cb(kv_cache_trans, "kv_cache_trans", il); +#endif ggml_tensor * kvr = ggml_concat(ctx0, kv_compressed, 
ggml_permute(ctx0, k_rope, 0, 2, 1, 3), 0); cb(kvr, "kvr", il); @@ -13534,6 +13552,16 @@ struct llm_build_context { cb(kq, "kq_soft_max_ext_perm", il); } +#if !MLA_USE_TRANSPOSED_CACHE + ggml_tensor * kv_cache_lora = ggml_view_2d(ctx0, kv_self.kv_l[il], + kv_lora_rank, n_kv, + ggml_row_size(kv_self.kv_l[il]->type, kv_lora_rank + n_embd_head_qk_rope), 0); + cb(kv_cache, "kv_cache_lora", il); + + ggml_tensor * kv_cache_trans = ggml_cont(ctx0, ggml_transpose(ctx0, kv_cache_lora)); + cb(kv_cache_trans, "kv_cache_trans", il); +#endif + struct ggml_tensor * kqv_compressed = ggml_mul_mat(ctx0, kv_cache_trans, kq); cb(kqv_compressed, "kqv_compressed", il); @@ -18023,9 +18051,11 @@ struct llama_context * llama_new_context_with_model( memory_size_kv += ggml_nbytes(kv); } +#if MLA_USE_TRANSPOSED_CACHE for (auto & kvt : ctx->kv_self.kvt_l) { memory_size_kvt += ggml_nbytes(kvt); } +#endif if (memory_size_kv + memory_size_kvt > 0) { LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, c^KV (%s): %7.2f MiB, kv^T (%s): %7.2f MiB\n", __func__,
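
To put concrete numbers on the sizes this last log line reports, here is a standalone sketch
(not part of the patch series) that reproduces the MLA allocation arithmetic from
llama_kv_cache_init; the kv_lora_rank = 512, n_embd_head_qk_rope = 64, 61-layer, 32768-slot,
f16-cache values are illustrative DeepSeek-V2/V3-style assumptions, not values taken from this diff:

#include <cstdio>
#include <cstdint>

int main() {
    // Illustrative DeepSeek-V2/V3-style values; the real ones come from the model hparams.
    const uint32_t kv_lora_rank        = 512;    // hparams.n_lora_kv
    const uint32_t n_embd_head_qk_rope = 64;     // hparams.n_rot
    const uint32_t n_layer             = 61;
    const uint64_t kv_size             = 32768;  // number of cache slots (n_ctx)
    const double   bytes_per_elem      = 2.0;    // f16 cache entries

    // cache_kv_l[i]: c^KV concatenated with the RoPE-ed shared key K^R, per cache slot
    const double kv_bytes  = (kv_lora_rank + n_embd_head_qk_rope) * (double)kv_size * bytes_per_elem;
    // cache_kvt_l[i]: transposed c^KV copy, allocated only when MLA_USE_TRANSPOSED_CACHE is 1
    const double kvt_bytes = kv_lora_rank * (double)kv_size * bytes_per_elem;

    const double MiB = 1024.0 * 1024.0;
    printf("per layer: c^KV+K^R = %.2f MiB, kv^T = %.2f MiB\n", kv_bytes / MiB, kvt_bytes / MiB);
    printf("total with transposed cache:    %.2f MiB\n", n_layer * (kv_bytes + kvt_bytes) / MiB);
    printf("total without transposed cache: %.2f MiB\n", n_layer * kv_bytes / MiB);
    return 0;
}

With MLA_USE_TRANSPOSED_CACHE set to 0 only the c^KV+K^R buffer remains, so under these
assumptions the cache drops from about 68 MiB to 36 MiB per layer (roughly 4.0 GiB to 2.1 GiB
over 61 layers), matching the "cut to ~half" note in the commit message.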