
Commit 6fb54c1

comment out or hide all debug prints behind LLAMA_SPARSE_DEBUG
1 parent b284bd1 commit 6fb54c1
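
The sparse-attention debug prints are either commented out or gated on the LLAMA_SPARSE_DEBUG environment variable, using the same two-line check in each file: read the variable with getenv and treat any nonzero integer value as "on". A minimal standalone sketch of that pattern, assuming nothing beyond the C standard library (the dbg_enabled helper and the main driver are illustrative, not part of this commit):

    #include <stdio.h>
    #include <stdlib.h>

    // Illustrative helper mirroring the gate used in this commit:
    // debug output is enabled only when LLAMA_SPARSE_DEBUG is set to a nonzero value.
    static int dbg_enabled(void) {
        const char * env = getenv("LLAMA_SPARSE_DEBUG");
        return env != NULL && atoi(env) != 0;
    }

    int main(void) {
        if (dbg_enabled()) {
            // Gated diagnostic, analogous to the [SPARSE-*] printf calls below.
            printf("[SPARSE-DBG] debug prints enabled\n");
            fflush(stdout);
        }
        return 0;
    }

Running with LLAMA_SPARSE_DEBUG unset (or set to 0) keeps the gated prints silent; exporting LLAMA_SPARSE_DEBUG=1 before running restores them. The reshape prints in ggml.c and the set_input prints in llama-graph.cpp are commented out entirely rather than gated.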

6 files changed: 97 additions, 75 deletions


ggml/src/ggml.c

Lines changed: 4 additions & 0 deletions
@@ -3401,10 +3401,12 @@ struct ggml_tensor * ggml_reshape_2d(
         int64_t               ne0,
         int64_t               ne1) {
     GGML_ASSERT(ggml_is_contiguous(a));
+    /*
     printf("ggml_reshape_2d: a=[%5" PRId64 ", %5" PRId64 "], ne0=%5" PRId64 ", ne1=%5" PRId64 "\n",
         a->ne[0], a->ne[1],
         ne0, ne1);
     fflush(stdout);
+    */
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1);

     const int64_t ne[2] = { ne0, ne1 };
@@ -3424,10 +3426,12 @@ struct ggml_tensor * ggml_reshape_3d(
         int64_t               ne1,
         int64_t               ne2) {
     GGML_ASSERT(ggml_is_contiguous(a));
+    /*
     printf("ggml_reshape_3d: a=[%5" PRId64 ", %5" PRId64 ", %5" PRId64 "], ne0=%5" PRId64 ", ne1=%5" PRId64 ", ne2=%5" PRId64 "\n",
         a->ne[0], a->ne[1], a->ne[2],
         ne0, ne1, ne2);
     fflush(stdout);
+    */
     GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);

     const int64_t ne[3] = { ne0, ne1, ne2 };

src/llama-graph.cpp

Lines changed: 6 additions & 0 deletions
@@ -348,17 +348,21 @@ void llm_graph_input_attn_kv::set_input(const llama_ubatch * ubatch) {

     mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);

+    /*
     printf("[SET_INPUT_KV][base] micro=[%lld,%lld,%lld,%lld] full2d=%p\n",
         (long long) self_kq_mask->ne[0], (long long) self_kq_mask->ne[1],
         (long long) self_kq_mask->ne[2], (long long) self_kq_mask->ne[3],
         (void*) self_kq_mask_full_2d);
     fflush(stdout);
+    */

     if (self_kq_mask_full_2d) {
+        /*
         printf("[SET_INPUT_KV][base] full2d=[%lld,%lld,%lld,%lld]\n",
             (long long) self_kq_mask_full_2d->ne[0], (long long) self_kq_mask_full_2d->ne[1],
             (long long) self_kq_mask_full_2d->ne[2], (long long) self_kq_mask_full_2d->ne[3]);
         fflush(stdout);
+        */
         mctx->set_input_kq_mask_full_2d(self_kq_mask_full_2d, ubatch, cparams.causal_attn);
     }

@@ -371,10 +375,12 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {

     bool res = true;

+    /*
     printf("[SET_INPUT_KV][iswa] micro(base)=[%lld,%lld,%lld,%lld]\n",
         (long long) self_kq_mask->ne[0], (long long) self_kq_mask->ne[1],
         (long long) self_kq_mask->ne[2], (long long) self_kq_mask->ne[3]);
     fflush(stdout);
+    */


     res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;

src/llama-model.cpp

Lines changed: 10 additions & 7 deletions
@@ -6578,7 +6578,7 @@ struct llm_build_llama : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp = llama_add_dbg(ctx0, cur, inpSA, "ffn_inp", il);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network (non-MoE)
@@ -6753,7 +6753,7 @@ struct llm_build_llama_iswa : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp = llama_add_dbg(ctx0, cur, inpSA, "ffn_inp", il);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

             // feed-forward network (non-MoE)
@@ -13730,6 +13730,9 @@ struct llm_build_deepseek2 : public llm_graph_context {

 struct llm_build_deepseek3_2 : public llm_graph_context {
     llm_build_deepseek3_2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
+        const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+        const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
         bool is_lite = (hparams.n_layer == 27);

         const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0);
@@ -13749,7 +13752,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
         const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
         const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

-        printf("[deepseek3_2] layer init: attn_factor=%g mscale=%g dense_kq_scale=%g (n_embd_head_k=%lld)\n", attn_factor, mscale, kq_scale, (long long) n_embd_head_k);
+        if (dbg) printf("[deepseek3_2] layer init: attn_factor=%g mscale=%g dense_kq_scale=%g (n_embd_head_k=%lld)\n", attn_factor, mscale, kq_scale, (long long) n_embd_head_k);

         ggml_tensor * cur;
         ggml_tensor * inpL;
@@ -13932,7 +13935,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                     if (cur->ne[0] != (int64_t) kv_lora_rank) {
                         printf("[SPARSE-DBG-MLA] L%d: sparse attn out Dv=%" PRId64 " but kv_lora_rank=%u (mismatch)\n", il, cur->ne[0], kv_lora_rank);
                     }
-                    if (model.layers[il].wv_b) {
+                    if (dbg && model.layers[il].wv_b) {
                         printf("[SPARSE-DBG-MLA] L%d: wv_b dims=[%" PRId64 ", %" PRId64 "] expected=[%u, %" PRId64 "]\n",
                             il, (int64_t) model.layers[il].wv_b->ne[0], (int64_t) model.layers[il].wv_b->ne[1], kv_lora_rank, (int64_t) n_embd_head_v);
                     }
@@ -13967,7 +13970,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                     cb(cur, "sparse_attn_out", il);

                     // Log that we're using sparse attention
-                    LLAMA_LOG_INFO("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
+                    LLAMA_LOG_DEBUG("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
                         (int)top_k, il);
                 } else {
                     // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
@@ -14066,7 +14069,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                     cur = ggml_cont(ctx0, cur);

                     // Log that we're using sparse attention
-                    LLAMA_LOG_INFO("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
+                    LLAMA_LOG_DEBUG("DeepSeek V3.2: Using sparse attention with top-%d tokens for layer %d\n",
                         (int)top_k, il);
                 } else {
                     // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups)
@@ -14083,7 +14086,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }

-            ggml_tensor * ffn_inp = llama_add_dbg(ctx0, cur, inpSA, "ffn_inp", il);
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
             cb(ffn_inp, "ffn_inp", il);

             cur = build_norm(ffn_inp,

src/llama-sparse-indexer.cpp

Lines changed: 46 additions & 29 deletions
@@ -49,6 +49,10 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
     float beta_slow,
     const function<void(ggml_tensor *, const char *, int)> & cb,
     ggml_cgraph * gf) {
+
+    const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+    const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
     // Compute Indexer K for current tokens and (optionally) write to cache
     ggml_tensor * Kindexer_cur = ggml_mul_mat(ctx, model.layers[layer_idx].attn_indexer_wk, cur);

@@ -117,8 +121,10 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
         ggml_tensor * gamma_q = model.layers[layer_idx].attn_q_a_norm;
         ggml_tensor * gamma_q_r = ggml_repeat(ctx, gamma_q, qsrc);
         qsrc = ggml_mul(ctx, qsrc, gamma_q_r);
-        printf("[SPARSE-IDX-Q] L%d: applied attn_q_a_norm to indexer qsrc\n", layer_idx);
-        fflush(stdout);
+        if (dbg) {
+            printf("[SPARSE-IDX-Q] L%d: applied attn_q_a_norm to indexer qsrc\n", layer_idx);
+            fflush(stdout);
+        }
     } else {
         printf("[SPARSE-IDX-Q][WARN] L%d: attn_q_a_norm not found; using plain RMSNorm for indexer qsrc\n", layer_idx);
         fflush(stdout);
@@ -127,19 +133,21 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
         qsrc = ggml_norm(ctx, cur, 1e-5f);
     }

-    // Logging and sanity checks for potential lite-config mismatch
-    const int64_t qsrc_in_dim = qsrc ? qsrc->ne[0] : -1;
-    const int64_t wq_b_in_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[0] : -1;
-    const int64_t wq_b_out_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[1] : -1;
-    printf("[SPARSE-IDX-Q] L%d: has_wq_a=%d qsrc_in=%lld wq_b_in=%lld wq_b_out=%lld\n",
-        layer_idx, (int)has_wq_a, (long long)qsrc_in_dim, (long long)wq_b_in_dim, (long long)wq_b_out_dim);
-    fflush(stdout);
+    if (dbg) {
+        // Logging and sanity checks for potential lite-config mismatch
+        const int64_t qsrc_in_dim = qsrc ? qsrc->ne[0] : -1;
+        const int64_t wq_b_in_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[0] : -1;
+        const int64_t wq_b_out_dim = model.layers[layer_idx].attn_indexer_wq_b ? model.layers[layer_idx].attn_indexer_wq_b->ne[1] : -1;
+        printf("[SPARSE-IDX-Q] L%d: has_wq_a=%d qsrc_in=%lld wq_b_in=%lld wq_b_out=%lld\n",
+            layer_idx, (int)has_wq_a, (long long)qsrc_in_dim, (long long)wq_b_in_dim, (long long)wq_b_out_dim);
+        fflush(stdout);

-    if (model.layers[layer_idx].attn_indexer_wq_b && qsrc) {
-        if (wq_b_in_dim != qsrc_in_dim) {
-            printf("[SPARSE-IDX-Q][WARN] L%d: attn_indexer_wq_b input dim (%lld) != qsrc dim (%lld). Lite config?\n",
-                layer_idx, (long long) wq_b_in_dim, (long long) qsrc_in_dim);
-            fflush(stdout);
+        if (model.layers[layer_idx].attn_indexer_wq_b && qsrc) {
+            if (wq_b_in_dim != qsrc_in_dim) {
+                printf("[SPARSE-IDX-Q][WARN] L%d: attn_indexer_wq_b input dim (%lld) != qsrc dim (%lld). Lite config?\n",
+                    layer_idx, (long long) wq_b_in_dim, (long long) qsrc_in_dim);
+                fflush(stdout);
+            }
         }
     }

@@ -196,7 +204,7 @@ IndexerKVTriplet sparse_attn_indexer::compute_indexer_triplet(
     ggml_tensor * q_sum = ggml_sum_rows(ctx, q_sqr); // [1, H, T]
     ggml_tensor * q_mean= ggml_scale(ctx, q_sum, 1.0f / (float) D_index); // [1, H, T]
     ggml_tensor * q_rms = ggml_sqrt(ctx, q_mean); // [1, H, T]
-    printf("[SPARSE-IDX-QRMS] L%d: computed q_rms over D_index; D_index=%" PRId64 " H=%" PRId64 " T=%" PRId64 "\n",
+    if (dbg) printf("[SPARSE-IDX-QRMS] L%d: computed q_rms over D_index; D_index=%" PRId64 " H=%" PRId64 " T=%" PRId64 "\n",
         layer_idx, D_index, H_index, n_tokens);

     // Build base weights from projection on cur
@@ -269,15 +277,22 @@ ggml_tensor * sparse_attn_indexer::build_kvaware_topk_indices(
     ggml_backend_sched_t sched,
     ggml_backend_t backend_cpu)
 {
-    printf("=== SPARSE INDEXER: build_kvaware_topk_indices L%d ===\n", layer_idx);
-    size_t initial_mem = ggml_used_mem(ctx);
-    printf("Initial memory usage: %s\n", format_memory_size(initial_mem).c_str());
-    fflush(stdout);
-    // Dump indexer dims and sanity-check shapes
-    const int64_t D_index_dbg = model.layers[layer_idx].attn_indexer_wk->ne[1];
-    const int64_t H_index_dbg = model.layers[layer_idx].attn_indexer_wq_b->ne[1] / D_index_dbg;
-    printf("[SPARSE-DBG-IDX] L%d: D_index=%" PRId64 " H_index=%" PRId64 " n_tokens=%" PRId64 "\n", layer_idx, (int64_t) D_index_dbg, (int64_t) H_index_dbg, (int64_t) n_tokens);
-    fflush(stdout);
+    const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+    const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
+    size_t initial_mem = 0;
+    if (dbg) {
+        printf("=== SPARSE INDEXER: build_kvaware_topk_indices L%d ===\n", layer_idx);
+        initial_mem = ggml_used_mem(ctx);
+        printf("Initial memory usage: %s\n", format_memory_size(initial_mem).c_str());
+        fflush(stdout);
+
+        // Dump indexer dims and sanity-check shapes
+        const int64_t D_index_dbg = model.layers[layer_idx].attn_indexer_wk->ne[1];
+        const int64_t H_index_dbg = model.layers[layer_idx].attn_indexer_wq_b->ne[1] / D_index_dbg;
+        printf("[SPARSE-DBG-IDX] L%d: D_index=%" PRId64 " H_index=%" PRId64 " n_tokens=%" PRId64 "\n", layer_idx, (int64_t) D_index_dbg, (int64_t) H_index_dbg, (int64_t) n_tokens);
+        fflush(stdout);
+    }
     IndexerKVTriplet trip = compute_indexer_triplet(ctx, model, layer_idx, cur, n_tokens, mctx, k_idxs,
         inp_pos, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
         cb, gf);
@@ -295,11 +310,13 @@ ggml_tensor * sparse_attn_indexer::build_kvaware_topk_indices(
     }
     ggml_tensor * kvaware_indices = llama::sparse_attn_topk::select_topk_tokens_indexer_kvaware(
         ctx, trip.q_indexer, Kindexer_cache, trip.idx_weights, kq_mask, top_k, cb, gf, sched, backend_cpu);
-    printf("SPARSE INDEXER: Final topk_indices [k,T]=[%" PRId64 ", %" PRId64 "]\n",
-        kvaware_indices->ne[0], kvaware_indices->ne[1]);
-    printf("Final memory usage: %s (total delta: %s)\n", format_memory_size(ggml_used_mem(ctx)).c_str(),
-        format_memory_size(ggml_used_mem(ctx) - initial_mem).c_str());
-    fflush(stdout);
+    if (dbg) {
+        printf("SPARSE INDEXER: Final topk_indices [k,T]=[%" PRId64 ", %" PRId64 "]\n",
+            kvaware_indices->ne[0], kvaware_indices->ne[1]);
+        printf("Final memory usage: %s (total delta: %s)\n", format_memory_size(ggml_used_mem(ctx)).c_str(),
+            format_memory_size(ggml_used_mem(ctx) - initial_mem).c_str());
+        fflush(stdout);
+    }
     return kvaware_indices;
 }

src/llama-sparse-mla-fwd.cpp

Lines changed: 24 additions & 34 deletions
@@ -5,21 +5,6 @@
 #include <cstdio>
 #include <cinttypes>

-// Helper function to get memory usage in human-readable format
-static std::string format_memory_size(size_t bytes) {
-    const char* units[] = {"B", "KB", "MB", "GB", "TB"};
-    size_t unit_idx = 0;
-    double size = bytes;
-
-    while (size >= 1024.0 && unit_idx < 4) {
-        size /= 1024.0;
-        unit_idx++;
-    }
-
-    char buffer[32];
-    snprintf(buffer, sizeof(buffer), "%.2f %s", size, units[unit_idx]);
-    return std::string(buffer);
-}

 namespace llama {

@@ -47,6 +32,9 @@ using std::function;
     int64_t Hkv_v= v_cache->ne[1];
     int64_t N_kv_v= v_cache->ne[2];

+    const char * ENV_SPARSE_DEBUG = getenv("LLAMA_SPARSE_DEBUG");
+    const bool dbg = (ENV_SPARSE_DEBUG && atoi(ENV_SPARSE_DEBUG) != 0);
+
     // Normalize V layout: expected effective layout is [Dv, Hkv_v, N_kv]
     // Some builds return V cache with transposed layout [N_kv, Hkv_v, Dv, ns].
     ggml_tensor * V_gather_src = nullptr;
@@ -82,20 +70,22 @@ using std::function;
     const int64_t Hq = q_cur->ne[1];
     const int64_t T = q_cur->ne[2];

-    cb(k_cache, "kvaware_k_cache", -1);
-    cb(v_cache, "kvaware_v_cache", -1);
-    cb(q_cur, "kvaware_q_cur", -1);
-    cb(topk_indices, "kvaware_topk_indices", -1);
-    printf("[SPARSE-MLA] Dq=%lld Hq=%lld T=%lld Dk=%lld Hkv=%lld N_kv=%lld Dv=%lld Hkv_v=%lld\n",
-        (long long) Dq, (long long) Hq, (long long) T,
-        (long long) Dk, (long long) Hkv, (long long) N_kv,
-        (long long) Dv, (long long) Hkv_v);
-    fflush(stdout);
-
-    printf("SPARSE MLA KV-AWARE DBG: Q=[%" PRId64 ",%" PRId64 ",%" PRId64 "] K=[%" PRId64 ",%" PRId64 ",%" PRId64 "] V=[%" PRId64 ",%" PRId64 ",%" PRId64 "] topk=[%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64 "]\n",
-        Dq, Hq, T, Dk, Hkv, N_kv, Dv, Hkv_v, N_kv_v,
-        topk_indices->ne[0], topk_indices->ne[1], topk_indices->ne[2], topk_indices->ne[3]);
-    fflush(stdout);
+    if (dbg) {
+        cb(k_cache, "kvaware_k_cache", -1);
+        cb(v_cache, "kvaware_v_cache", -1);
+        cb(q_cur, "kvaware_q_cur", -1);
+        cb(topk_indices, "kvaware_topk_indices", -1);
+        printf("[SPARSE-MLA] Dq=%lld Hq=%lld T=%lld Dk=%lld Hkv=%lld N_kv=%lld Dv=%lld Hkv_v=%lld\n",
+            (long long) Dq, (long long) Hq, (long long) T,
+            (long long) Dk, (long long) Hkv, (long long) N_kv,
+            (long long) Dv, (long long) Hkv_v);
+        fflush(stdout);
+
+        printf("SPARSE MLA KV-AWARE DBG: Q=[%" PRId64 ",%" PRId64 ",%" PRId64 "] K=[%" PRId64 ",%" PRId64 ",%" PRId64 "] V=[%" PRId64 ",%" PRId64 ",%" PRId64 "] topk=[%" PRId64 ",%" PRId64 ",%" PRId64 ",%" PRId64 "]\n",
+            Dq, Hq, T, Dk, Hkv, N_kv, Dv, Hkv_v, N_kv_v,
+            topk_indices->ne[0], topk_indices->ne[1], topk_indices->ne[2], topk_indices->ne[3]);
+        fflush(stdout);
+    }

     ggml_tensor * K4d = ggml_reshape_4d(ctx, k_cache, Dk*Hkv, N_kv, 1, 1);
     ggml_tensor * V4d = ggml_reshape_4d(ctx, V_gather_src, Dv*Hkv_v, N_kv_v, 1, 1);
@@ -119,7 +109,7 @@ using std::function;
         // ensure contiguous [Hkv_v*top_k, Dv] with a real transpose to avoid stride/view aliasing
         v_sel_2d = ggml_cont(ctx, ggml_transpose(ctx, v_sel_2d));
         // Note: cannot read tensor->data during graph build; only log shapes here to avoid invalid dereference
-        if (t < 2) {
+        if (dbg && t < 2) {
            printf("[SPARSE-DBG-INDICES] t=%lld: top_k=%lld N_kv=%lld\n",
                (long long) t, (long long) top_k, (long long) N_kv);
        }
@@ -131,10 +121,10 @@ using std::function;
        q_t_2d = ggml_cont(ctx, q_t_2d);
        ggml_tensor * scores_t = ggml_mul_mat(ctx, k_sel_2d, q_t_2d); // [Hkv*top_k, Hq]
        // debug marker: scores computed pre-scale
-        printf("[SPARSE-DBG-MLA] t=%lld scores pre-scale\n", (long long) t);
+        if (dbg) printf("[SPARSE-DBG-MLA] t=%lld scores pre-scale\n", (long long) t);
        scores_t = ggml_scale(ctx, scores_t, kq_scale);
        // add mask/alibi bias if provided: gather kq_mask rows by indices and add to scores
-        if (t == 0 || t == T - 1) {
+        if (dbg && (t == 0 || t == T - 1)) {
            cb(scores_t, "mla_scores_pre_mask", -1);
        }

@@ -188,15 +178,15 @@ using std::function;
            scores_t = ggml_scale(ctx, scores_t, attn_softcap);
        }
        // debug marker: scores post-mask/softcap
-        printf("[SPARSE-DBG-MLA] t=%lld scores post-mask/softcap\n", (long long) t);
+        if (dbg) printf("[SPARSE-DBG-MLA] t=%lld scores post-mask/softcap\n", (long long) t);
        // Clamp infinities to large finite values to avoid NaNs in softmax when all entries are masked
        scores_t = ggml_clamp(ctx, scores_t, -1e30f, 1e30f);
        ggml_tensor * weights_t = ggml_soft_max(ctx, scores_t);
        // Be conservative: ensure operands of second matmul are contiguous
        weights_t = ggml_cont(ctx, weights_t);
        v_sel_2d = ggml_cont(ctx, v_sel_2d);

-        if (t == 0 || t == T - 1) {
+        if (dbg && (t == 0 || t == T - 1)) {
            cb(weights_t, "mla_weights_sample", -1);
        }
