#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <string>
77
// Helper function to get memory usage in human-readable format,
// e.g. 1536 -> "1.50 KB". Values are divided down by 1024 until they
// fit the current unit; anything >= 1024 TB is still reported in TB.
static std::string format_memory_size(size_t bytes) {
    static const char * units[] = {"B", "KB", "MB", "GB", "TB"};
    // Derive the last valid index from the array itself so the loop
    // bound cannot drift out of sync with the units list.
    const size_t max_unit_idx = sizeof(units)/sizeof(units[0]) - 1;

    size_t unit_idx = 0;
    double size = (double) bytes; // explicit conversion; size_t may exceed exact double range for huge values

    while (size >= 1024.0 && unit_idx < max_unit_idx) {
        size /= 1024.0;
        unit_idx++;
    }

    char buffer[32];
    snprintf(buffer, sizeof(buffer), "%.2f %s", size, units[unit_idx]);
    return std::string(buffer);
}
238
249namespace llama {
2510
@@ -47,6 +32,9 @@ using std::function;
4732 int64_t Hkv_v= v_cache->ne [1 ];
4833 int64_t N_kv_v= v_cache->ne [2 ];
4934
35+ const char * ENV_SPARSE_DEBUG = getenv (" LLAMA_SPARSE_DEBUG" );
36+ const bool dbg = (ENV_SPARSE_DEBUG && atoi (ENV_SPARSE_DEBUG) != 0 );
37+
5038 // Normalize V layout: expected effective layout is [Dv, Hkv_v, N_kv]
5139 // Some builds return V cache with transposed layout [N_kv, Hkv_v, Dv, ns].
5240 ggml_tensor * V_gather_src = nullptr ;
@@ -82,20 +70,22 @@ using std::function;
8270 const int64_t Hq = q_cur->ne [1 ];
8371 const int64_t T = q_cur->ne [2 ];
8472
85- cb (k_cache, " kvaware_k_cache" , -1 );
86- cb (v_cache, " kvaware_v_cache" , -1 );
87- cb (q_cur, " kvaware_q_cur" , -1 );
88- cb (topk_indices, " kvaware_topk_indices" , -1 );
89- printf (" [SPARSE-MLA] Dq=%lld Hq=%lld T=%lld Dk=%lld Hkv=%lld N_kv=%lld Dv=%lld Hkv_v=%lld\n " ,
90- (long long ) Dq, (long long ) Hq, (long long ) T,
91- (long long ) Dk, (long long ) Hkv, (long long ) N_kv,
92- (long long ) Dv, (long long ) Hkv_v);
93- fflush (stdout);
94-
95- printf (" SPARSE MLA KV-AWARE DBG: Q=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ] K=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ] V=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ] topk=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ,%" PRId64 " ]\n " ,
96- Dq, Hq, T, Dk, Hkv, N_kv, Dv, Hkv_v, N_kv_v,
97- topk_indices->ne [0 ], topk_indices->ne [1 ], topk_indices->ne [2 ], topk_indices->ne [3 ]);
98- fflush (stdout);
73+ if (dbg) {
74+ cb (k_cache, " kvaware_k_cache" , -1 );
75+ cb (v_cache, " kvaware_v_cache" , -1 );
76+ cb (q_cur, " kvaware_q_cur" , -1 );
77+ cb (topk_indices, " kvaware_topk_indices" , -1 );
78+ printf (" [SPARSE-MLA] Dq=%lld Hq=%lld T=%lld Dk=%lld Hkv=%lld N_kv=%lld Dv=%lld Hkv_v=%lld\n " ,
79+ (long long ) Dq, (long long ) Hq, (long long ) T,
80+ (long long ) Dk, (long long ) Hkv, (long long ) N_kv,
81+ (long long ) Dv, (long long ) Hkv_v);
82+ fflush (stdout);
83+
84+ printf (" SPARSE MLA KV-AWARE DBG: Q=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ] K=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ] V=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ] topk=[%" PRId64 " ,%" PRId64 " ,%" PRId64 " ,%" PRId64 " ]\n " ,
85+ Dq, Hq, T, Dk, Hkv, N_kv, Dv, Hkv_v, N_kv_v,
86+ topk_indices->ne [0 ], topk_indices->ne [1 ], topk_indices->ne [2 ], topk_indices->ne [3 ]);
87+ fflush (stdout);
88+ }
9989
10090 ggml_tensor * K4d = ggml_reshape_4d (ctx, k_cache, Dk*Hkv, N_kv, 1 , 1 );
10191 ggml_tensor * V4d = ggml_reshape_4d (ctx, V_gather_src, Dv*Hkv_v, N_kv_v, 1 , 1 );
@@ -119,7 +109,7 @@ using std::function;
119109 // ensure contiguous [Hkv_v*top_k, Dv] with a real transpose to avoid stride/view aliasing
120110 v_sel_2d = ggml_cont (ctx, ggml_transpose (ctx, v_sel_2d));
121111 // Note: cannot read tensor->data during graph build; only log shapes here to avoid invalid dereference
122- if (t < 2 ) {
112+ if (dbg && t < 2 ) {
123113 printf (" [SPARSE-DBG-INDICES] t=%lld: top_k=%lld N_kv=%lld\n " ,
124114 (long long ) t, (long long ) top_k, (long long ) N_kv);
125115 }
@@ -131,10 +121,10 @@ using std::function;
131121 q_t_2d = ggml_cont (ctx, q_t_2d);
132122 ggml_tensor * scores_t = ggml_mul_mat (ctx, k_sel_2d, q_t_2d); // [Hkv*top_k, Hq]
133123 // debug marker: scores computed pre-scale
134- printf (" [SPARSE-DBG-MLA] t=%lld scores pre-scale\n " , (long long ) t);
124+ if (dbg) printf (" [SPARSE-DBG-MLA] t=%lld scores pre-scale\n " , (long long ) t);
135125 scores_t = ggml_scale (ctx, scores_t , kq_scale);
136126 // add mask/alibi bias if provided: gather kq_mask rows by indices and add to scores
137- if (t == 0 || t == T - 1 ) {
127+ if (dbg && ( t == 0 || t == T - 1 ) ) {
138128 cb (scores_t , " mla_scores_pre_mask" , -1 );
139129 }
140130
@@ -188,15 +178,15 @@ using std::function;
188178 scores_t = ggml_scale (ctx, scores_t , attn_softcap);
189179 }
190180 // debug marker: scores post-mask/softcap
191- printf (" [SPARSE-DBG-MLA] t=%lld scores post-mask/softcap\n " , (long long ) t);
181+ if (dbg) printf (" [SPARSE-DBG-MLA] t=%lld scores post-mask/softcap\n " , (long long ) t);
192182 // Clamp infinities to large finite values to avoid NaNs in softmax when all entries are masked
193183 scores_t = ggml_clamp (ctx, scores_t , -1e30f, 1e30f);
194184 ggml_tensor * weights_t = ggml_soft_max (ctx, scores_t );
195185 // Be conservative: ensure operands of second matmul are contiguous
196186 weights_t = ggml_cont (ctx, weights_t );
197187 v_sel_2d = ggml_cont (ctx, v_sel_2d);
198188
199- if (t == 0 || t == T - 1 ) {
189+ if (dbg && ( t == 0 || t == T - 1 ) ) {
200190 cb (weights_t , " mla_weights_sample" , -1 );
201191 }
202192
0 commit comments