
Commit 6fb54c1

comment out or hide all debug prints behind LLAMA_SPARSE_DEBUG
1 parent b284bd1 commit 6fb54c1
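
The sparse-attention debug prints are either commented out or gated on the LLAMA_SPARSE_DEBUG environment variable, using the same two-line check in each file: read the variable with getenv and treat any nonzero integer value as "on". A minimal standalone sketch of that pattern, assuming nothing beyond the C standard library (the dbg_enabled helper and the main driver are illustrative, not part of this commit):

    #include <stdio.h>
    #include <stdlib.h>

    // Illustrative helper mirroring the gate used in this commit:
    // debug output is enabled only when LLAMA_SPARSE_DEBUG is set to a nonzero value.
    static int dbg_enabled(void) {
        const char * env = getenv("LLAMA_SPARSE_DEBUG");
        return env != NULL && atoi(env) != 0;
    }

    int main(void) {
        if (dbg_enabled()) {
            // Gated diagnostic, analogous to the [SPARSE-*] printf calls below.
            printf("[SPARSE-DBG] debug prints enabled\n");
            fflush(stdout);
        }
        return 0;
    }

Running with LLAMA_SPARSE_DEBUG unset (or set to 0) keeps the gated prints silent; exporting LLAMA_SPARSE_DEBUG=1 before running restores them. The reshape prints in ggml.c and the set_input prints in llama-graph.cpp are commented out entirely rather than gated.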

6 files changed: 97 additions, 75 deletions


ggml/src/ggml.c

Lines changed: 4 additions & 0 deletions
@@ -3401,10 +3401,12 @@ struct ggml_tensor * ggml_reshape_2d(
         int64_t               ne0,
         int64_t               ne1) {
     GGML_ASSERT(ggml_is_contiguous(a));
+    /*
     printf("ggml_reshape_2d: a=[%5" PRId64 ", %5" PRId64 "], ne0=%5" PRId64 ", ne1=%5" PRId64 "\n",
         a->ne[0], a->ne[1],
         ne0, ne1);
     fflush(stdout);
+    */
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1);

     const int64_t ne[2] = { ne0, ne1 };
@@ -3424,10 +3426,12 @@ struct ggml_tensor * ggml_reshape_3d(
         int64_t               ne1,
         int64_t               ne2) {
     GGML_ASSERT(ggml_is_contiguous(a));
+    /*
     printf("ggml_reshape_3d: a=[%5" PRId64 ", %5" PRId64 ", %5" PRId64 "], ne0=%5" PRId64 ", ne1=%5" PRId64 ", ne2=%5" PRId64 "\n",
         a->ne[0], a->ne[1], a->ne[2],
         ne0, ne1, ne2);
     fflush(stdout);
+    */
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);

     const int64_t ne[3] = { ne0, ne1, ne2 };

src/llama-graph.cpp

Lines changed: 6 additions & 0 deletions
@@ -348,17 +348,21 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {

     mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);

+    /*
     printf("[SET_INPUT_KV][base] micro=[%lld,%lld,%lld,%lld] full2d=%p\n",
         (long long) self_kq_mask->ne[0], (long long) self_kq_mask->ne[1],
         (long long) self_kq_mask->ne[2], (long long) self_kq_mask->ne[3],
         (void*) self_kq_mask_full_2d);
     fflush(stdout);
+    */

     if (self_kq_mask_full_2d) {
+        /*
         printf("[SET_INPUT_KV][base] full2d=[%lld,%lld,%lld,%lld]\n",
             (long long) self_kq_mask_full_2d->ne[0], (long long) self_kq_mask_full_2d->ne[1],
             (long long) self_kq_mask_full_2d->ne[2], (long long) self_kq_mask_full_2d->ne[3]);
         fflush(stdout);
+        */
         mctx->set_input_kq_mask_full_2d(self_kq_mask_full_2d, ubatch, cparams.causal_attn);
     }

@@ -371,10 +375,12 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {

     bool res = true;

+    /*
     printf("[SET_INPUT_KV][iswa] micro(base)=[%lld,%lld,%lld,%lld]\n",
         (long long) self_kq_mask->ne[0], (long long) self_kq_mask->ne[1],
         (long long) self_kq_mask->ne[2], (long long) self_kq_mask->ne[3]);
     fflush(stdout);
+    */


     res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;

src/llama-model.cpp

Lines changed: 10 additions & 7 deletions
@@ -6578,7 +6578,7 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp = llama_add_dbg(ctx0, cur, inpSA, "ffn_inp", il);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network (non-MoE)
@@ -6753,7 +6753,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp = llama_add_dbg(ctx0, cur, inpSA, "ffn_inp", il);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network (non-MoE)
@@ -13730,6 +13730,9 @@ struct llm_build_deepseek2 : public llm_graph_context {

 struct llm_build_deepseek3_2 : public llm_graph_context {
     llm_build_deepseek3_2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+        const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
         bool is_lite = (hparams.n_layer == 27);

         const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
@@ -13749,7 +13752,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
         const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
         const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

-        printf("[deepseek3_2] layer init: attn_factor=%g mscale=%g dense_kq_scale=%g (n_embd_head_k=%lld)\n", attn_factor, mscale, kq_scale, (long long) n_embd_head_k);
+        if (dbg) printf("[deepseek3_2] layer init: attn_factor=%g mscale=%g dense_kq_scale=%g (n_embd_head_k=%lld)\n", attn_factor, mscale, kq_scale, (long long) n_embd_head_k);

         ggml_tensor * cur;
         ggml_tensor * inpL;
@@ -13932,7 +13935,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                     if (cur->ne[0] != (int64_t) kv_lora_rank) {
                         printf("[SPARSE-DBG-MLA] L%d: sparse attn out Dv=%" PRId64 " but kv_lora_rank=%u (mismatch)\n", il, cur->ne[0], kv_lora_rank);
                     }
-                    if (model.layers[il].wv_b) {
+                    if (dbg && model.layers[il].wv_b) {
                         printf("[SPARSE-DBG-MLA] L%d: wv_b dims=[%" PRId64 ", %" PRId64 "] expected=[%u, %" PRId64 "]\n",
                             il, (int64_t) model.layers[il].wv_b->ne[0], (int64_t) model.layers[il].wv_b->ne[1], kv_lora_rank, (int64_t) n_embd_head_v);
                     }
@@ -13967,7 +13970,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                     cb(cur, "sparse_attn_out", il);

                     // Log that we're using sparse attention
-                    LLAMA_LOG_INFO("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
+                    LLAMA_LOG_DEBUG("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
                         (int)top_k, il);
                 } else {
                     // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
@@ -14066,7 +14069,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                     cur = ggml_cont(ctx0, cur);

                     // Log that we're using sparse attention
-                    LLAMA_LOG_INFO("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
+                    LLAMA_LOG_DEBUG("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
                         (int)top_k, il);
                 } else {
                     // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
@@ -14083,7 +14086,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp = llama_add_dbg(ctx0, cur, inpSA, "ffn_inp", il);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

             cur = build_norm(ffn_inp,

src/llama-sparse-indexer.cpp

Lines changed: 46 additions & 29 deletions
@@ -49,6 +49,10 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
     float beta_slow,
     const function<void(ggml_tensor *, const char *, int)> & cb,
     ggml_cgraph * gf) {
+
+    const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+    const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
     // Compute Indexer K for current tokens and (optionally) write to cache
     ggml_tensor * Kindexer_cur = ggml_mul_mat(ctx, model.layers[layer_idx].attn_indexer_wk, cur);

@@ -117,8 +121,10 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
         ggml_tensor * gamma_q = model.layers[layer_idx].attn_q_a_norm;
         ggml_tensor * gamma_q_r = ggml_repeat(ctx, gamma_q, qsrc);
         qsrc = ggml_mul(ctx, qsrc, gamma_q_r);
-        printf("[SPARSE-IDX-Q] L%d: applied attn_q_a_norm to indexer qsrc\n", layer_idx);
-        fflush(stdout);
+        if (dbg) {
+            printf("[SPARSE-IDX-Q] L%d: applied attn_q_a_norm to indexer qsrc\n", layer_idx);
+            fflush(stdout);
+        }
     } else {
         printf("[SPARSE-IDX-Q][WARN] L%d: attn_q_a_norm not found; using plain RMSNorm for indexer qsrc\n", layer_idx);
         fflush(stdout);
@@ -127,19 +133,21 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
         qsrc = ggml_norm(ctx, cur, 1e-5f);
     }

-    // Logging and sanity checks for potential lite-config mismatch
-    const int64_t qsrc_in_dim = qsrc ? qsrc->ne[0] : -1;
-    const int64_t wq_b_in_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[0] : -1;
-    const int64_t wq_b_out_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[1] : -1;
-    printf("[SPARSE-IDX-Q] L%d: has_wq_a=%d qsrc_in=%lld wq_b_in=%lld wq_b_out=%lld\n",
-        layer_idx, (int)has_wq_a, (long long)qsrc_in_dim, (long long)wq_b_in_dim, (long long)wq_b_out_dim);
-    fflush(stdout);
+    if (dbg) {
+        // Logging and sanity checks for potential lite-config mismatch
+        const int64_t qsrc_in_dim = qsrc ? qsrc->ne[0] : -1;
+        const int64_t wq_b_in_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[0] : -1;
+        const int64_t wq_b_out_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[1] : -1;
+        printf("[SPARSE-IDX-Q] L%d: has_wq_a=%d qsrc_in=%lld wq_b_in=%lld wq_b_out=%lld\n",
+            layer_idx, (int)has_wq_a, (long long)qsrc_in_dim, (long long)wq_b_in_dim, (long long)wq_b_out_dim);
+        fflush(stdout);

-    if (model.layers[layer_idx].attn_indexer_wq_b && qsrc) {
-        if (wq_b_in_dim != qsrc_in_dim) {
-            printf("[SPARSE-IDX-Q][WARN] L%d: attn_indexer_wq_b input dim (%lld) != qsrc dim (%lld). Lite config?\n",
-                layer_idx, (long long) wq_b_in_dim, (long long) qsrc_in_dim);
-            fflush(stdout);
+        if (model.layers[layer_idx].attn_indexer_wq_b && qsrc) {
+            if (wq_b_in_dim != qsrc_in_dim) {
+                printf("[SPARSE-IDX-Q][WARN] L%d: attn_indexer_wq_b input dim (%lld) != qsrc dim (%lld). Lite config?\n",
+                    layer_idx, (long long) wq_b_in_dim, (long long) qsrc_in_dim);
+                fflush(stdout);
+            }
         }
     }

@@ -196,7 +204,7 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
     ggml_tensor * q_sum = ggml_sum_rows(ctx, q_sqr); // [1, H, T]
     ggml_tensor * q_mean= ggml_scale(ctx, q_sum, 1.0f / (float) D_index); // [1, H, T]
     ggml_tensor * q_rms = ggml_sqrt(ctx, q_mean); // [1, H, T]
-    printf("[SPARSE-IDX-QRMS] L%d: computed q_rms over D_index; D_index=%" PRId64 " H=%" PRId64 " T=%" PRId64 "\n",
+    if (dbg) printf("[SPARSE-IDX-QRMS] L%d: computed q_rms over D_index; D_index=%" PRId64 " H=%" PRId64 " T=%" PRId64 "\n",
         layer_idx, D_index, H_index, n_tokens);

     // Build base weights from projection on cur
@@ -269,15 +277,22 @@ ggml_tensor * sparse_attn_indexer::build_kvaware_topk_indices(
     ggml_backend_sched_t sched,
     ggml_backend_t backend_cpu)
 {
-    printf("=== SPARSE INDEXER: build_kvaware_topk_indices L%d ===\n", layer_idx);
-    size_t initial_mem = ggml_used_mem(ctx);
-    printf("Initial memory usage: %s\n", format_memory_size(initial_mem).c_str());
-    fflush(stdout);
-    // Dump indexer dims and sanity-check shapes
-    const int64_t D_index_dbg = model.layers[layer_idx].attn_indexer_wk->ne[1];
-    const int64_t H_index_dbg = model.layers[layer_idx].attn_indexer_wq_b->ne[1] / D_index_dbg;
-    printf("[SPARSE-DBG-IDX] L%d: D_index=%" PRId64 " H_index=%" PRId64 " n_tokens=%" PRId64 "\n", layer_idx, (int64_t) D_index_dbg, (int64_t) H_index_dbg, (int64_t) n_tokens);
-    fflush(stdout);
+    const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+    const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
+    size_t initial_mem = 0;
+    if (dbg) {
+        printf("=== SPARSE INDEXER: build_kvaware_topk_indices L%d ===\n", layer_idx);
+        initial_mem = ggml_used_mem(ctx);
+        printf("Initial memory usage: %s\n", format_memory_size(initial_mem).c_str());
+        fflush(stdout);
+
+        // Dump indexer dims and sanity-check shapes
+        const int64_t D_index_dbg = model.layers[layer_idx].attn_indexer_wk->ne[1];
+        const int64_t H_index_dbg = model.layers[layer_idx].attn_indexer_wq_b->ne[1] / D_index_dbg;
+        printf("[SPARSE-DBG-IDX] L%d: D_index=%" PRId64 " H_index=%" PRId64 " n_tokens=%" PRId64 "\n", layer_idx, (int64_t) D_index_dbg, (int64_t) H_index_dbg, (int64_t) n_tokens);
+        fflush(stdout);
+    }
     IndexerKVTriplet trip = compute_indexer_triplet(ctx, model, layer_idx, cur, n_tokens, mctx, k_idxs,
         inp_pos, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
         cb, gf);
@@ -295,11 +310,13 @@ ggml_tensor * sparse_attn_indexer::build_kvaware_topk_indices(
     }
     ggml_tensor * kvaware_indices = llama::sparse_attn_topk::select_topk_tokens_indexer_kvaware(
         ctx, trip.q_indexer, Kindexer_cache, trip.idx_weights, kq_mask, top_k, cb, gf, sched, backend_cpu);
-    printf("SPARSE INDEXER: Final topk_indices [k,T]=[%" PRId64 ", %" PRId64 "]\n",
-        kvaware_indices->ne[0], kvaware_indices->ne[1]);
-    printf("Final memory usage: %s (total delta: %s)\n", format_memory_size(ggml_used_mem(ctx)).c_str(),
-        format_memory_size(ggml_used_mem(ctx) - initial_mem).c_str());
-    fflush(stdout);
+    if (dbg) {
+        printf("SPARSE INDEXER: Final topk_indices [k,T]=[%" PRId64 ", %" PRId64 "]\n",
+            kvaware_indices->ne[0], kvaware_indices->ne[1]);
+        printf("Final memory usage: %s (total delta: %s)\n", format_memory_size(ggml_used_mem(ctx)).c_str(),
+            format_memory_size(ggml_used_mem(ctx) - initial_mem).c_str());
+        fflush(stdout);
+    }
     return kvaware_indices;
 }

src/llama-sparse-mla-fwd.cpp

Lines changed: 24 additions & 34 deletions
@@ -5,21 +5,6 @@
 #include <cstdio>
 #include <cinttypes>

-// Helper function to get memory usage in human-readable format
-static std::string format_memory_size(size_t bytes) {
-    const char* units[] = {"B", "KB", "MB", "GB", "TB"};
-    size_t unit_idx = 0;
-    double size = bytes;
-
-    while (size >= 1024.0 && unit_idx < 4) {
-        size /= 1024.0;
-        unit_idx++;
-    }
-
-    char buffer[32];
-    snprintf(buffer, sizeof(buffer), "%.2f %s", size, units[unit_idx]);
-    return std::string(buffer);
-}

 namespace llama {

@@ -47,6 +32,9 @@ using std::function;
     int64_t Hkv_v= v_cache->ne[1];
     int64_t N_kv_v= v_cache->ne[2];

+    const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+    const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
     // Normalize V layout: expected effective layout is [Dv, Hkv_v, N_kv]
     // Some builds return V cache with transposed layout [N_kv, Hkv_v, Dv, ns].
     ggml_tensor * V_gather_src = nullptr;
@@ -82,20 +70,22 @@ using std::function;
     const int64_t Hq = q_cur->ne[1];
     const int64_t T = q_cur->ne[2];

-    cb(k_cache, "kvaware_k_cache", -1);
-    cb(v_cache, "kvaware_v_cache", -1);
-    cb(q_cur, "kvaware_q_cur", -1);
-    cb(topk_indices, "kvaware_topk_indices", -1);
-    printf("[SPARSE-MLA] Dq=%lld Hq=%lld T=%lld Dk=%lld Hkv=%lld N_kv=%lld Dv=%lld Hkv_v=%lld\n",
-        (long long) Dq, (long long) Hq, (long long) T,
-        (long long) Dk, (long long) Hkv, (long long) N_kv,
-        (long long) Dv, (long long) Hkv_v);
-    fflush(stdout);
-
-    printf("SPARSE MLA KV-AWARE DBG: Q=[%" PRId64 ",%" PRId64 ",%" PRId64 "] K=[%" PRId64 ",%" PRId64 ",%" PRId64 "] V=[%" PRId64 ",%" PRId64 ",%" PRId64 "] topk=[%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64 "]\n",
-        Dq, Hq, T, Dk, Hkv, N_kv, Dv, Hkv_v, N_kv_v,
-        topk_indices->ne[0], topk_indices->ne[1], topk_indices->ne[2], topk_indices->ne[3]);
-    fflush(stdout);
+    if (dbg) {
+        cb(k_cache, "kvaware_k_cache", -1);
+        cb(v_cache, "kvaware_v_cache", -1);
+        cb(q_cur, "kvaware_q_cur", -1);
+        cb(topk_indices, "kvaware_topk_indices", -1);
+        printf("[SPARSE-MLA] Dq=%lld Hq=%lld T=%lld Dk=%lld Hkv=%lld N_kv=%lld Dv=%lld Hkv_v=%lld\n",
+            (long long) Dq, (long long) Hq, (long long) T,
+            (long long) Dk, (long long) Hkv, (long long) N_kv,
+            (long long) Dv, (long long) Hkv_v);
+        fflush(stdout);
+
+        printf("SPARSE MLA KV-AWARE DBG: Q=[%" PRId64 ",%" PRId64 ",%" PRId64 "] K=[%" PRId64 ",%" PRId64 ",%" PRId64 "] V=[%" PRId64 ",%" PRId64 ",%" PRId64 "] topk=[%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64 "]\n",
+            Dq, Hq, T, Dk, Hkv, N_kv, Dv, Hkv_v, N_kv_v,
+            topk_indices->ne[0], topk_indices->ne[1], topk_indices->ne[2], topk_indices->ne[3]);
+        fflush(stdout);
+    }

     ggml_tensor * K4d = ggml_reshape_4d(ctx, k_cache, Dk*Hkv, N_kv, 1, 1);
     ggml_tensor * V4d = ggml_reshape_4d(ctx, V_gather_src, Dv*Hkv_v, N_kv_v, 1, 1);
@@ -119,7 +109,7 @@ using std::function;
         // ensure contiguous [Hkv_v*top_k, Dv] with a real transpose to avoid stride/view aliasing
         v_sel_2d = ggml_cont(ctx, ggml_transpose(ctx, v_sel_2d));
         // Note: cannot read tensor->data during graph build; only log shapes here to avoid invalid dereference
-        if (t < 2) {
+        if (dbg && t < 2) {
            printf("[SPARSE-DBG-INDICES] t=%lld: top_k=%lld N_kv=%lld\n",
                (long long) t, (long long) top_k, (long long) N_kv);
        }
@@ -131,10 +121,10 @@ using std::function;
        q_t_2d = ggml_cont(ctx, q_t_2d);
        ggml_tensor * scores_t = ggml_mul_mat(ctx, k_sel_2d, q_t_2d); // [Hkv*top_k, Hq]
        // debug marker: scores computed pre-scale
-        printf("[SPARSE-DBG-MLA] t=%lld scores pre-scale\n", (long long) t);
+        if (dbg) printf("[SPARSE-DBG-MLA] t=%lld scores pre-scale\n", (long long) t);
        scores_t = ggml_scale(ctx, scores_t, kq_scale);
        // add mask/alibi bias if provided: gather kq_mask rows by indices and add to scores
-        if (t == 0 || t == T - 1) {
+        if (dbg && (t == 0 || t == T - 1)) {
            cb(scores_t, "mla_scores_pre_mask", -1);
        }

@@ -188,15 +178,15 @@ using std::function;
            scores_t = ggml_scale(ctx, scores_t, attn_softcap);
        }
        // debug marker: scores post-mask/softcap
-        printf("[SPARSE-DBG-MLA] t=%lld scores post-mask/softcap\n", (long long) t);
+        if (dbg) printf("[SPARSE-DBG-MLA] t=%lld scores post-mask/softcap\n", (long long) t);
        // Clamp infinities to large finite values to avoid NaNs in softmax when all entries are masked
        scores_t = ggml_clamp(ctx, scores_t, -1e30f, 1e30f);
        ggml_tensor * weights_t = ggml_soft_max(ctx, scores_t);
        // Be conservative: ensure operands of second matmul are contiguous
        weights_t = ggml_cont(ctx, weights_t);
        v_sel_2d = ggml_cont(ctx, v_sel_2d);

-        if (t == 0 || t == T - 1) {
+        if (dbg && (t == 0 || t == T - 1)) {
            cb(weights_t, "mla_weights_sample", -1);
        }
