Skip to content

Commit b284bd1

Browse files
committed
Fix top-k clamp. Sparse attention generation is working!
1 parent 0555126 commit b284bd1

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

src/llama-model.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13914,7 +13914,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
13914 13914   ggml_build_forward_expand(gf, KQmask2);
13915 13915   {
13916 13916   int64_t used_kv = mctx_cur2->get_n_kv();
13917       - int64_t n_kv_cache = (int64_t) Kcache->ne[1];
      13917 + int64_t n_kv_cache = (int64_t) Kcache->ne[2];
13918 13918   ggml_tensor * Kindexer_full = mctx_cur2->get_k_indexer_full(ctx0, il);
13919 13919   int64_t n_kv_indexer = Kindexer_full ? (int64_t) Kindexer_full->ne[1] : n_kv_cache;
13920 13920   int64_t available_kv = std::min(used_kv, std::min(n_kv_cache, n_kv_indexer));
@@ -14026,7 +14026,7 @@ struct llm_build_deepseek3_2 : public llm_graph_context {
14026 14026   ggml_build_forward_expand(gf, KQmask2);
14027 14027   {
14028 14028   int64_t used_kv = mctx_cur2->get_n_kv();
14029       - int64_t n_kv_cache = (int64_t) Kcache->ne[1];
      14029 + int64_t n_kv_cache = (int64_t) Kcache->ne[2];
14030 14030   ggml_tensor * Kindexer_full = mctx_cur2->get_k_indexer_full(ctx0, il);
14031 14031   int64_t n_kv_indexer = Kindexer_full ? (int64_t) Kindexer_full->ne[1] : n_kv_cache;
14032 14032   int64_t available_kv = std::min(used_kv, std::min(n_kv_cache, n_kv_indexer));

0 commit comments

Comments
 (0)