From 9e4459600390bb891d7f94a5ba46758b973620e2 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Wed, 22 Oct 2025 16:03:54 -0400
Subject: [PATCH 1/2] fix num_q_tokens_per_head_k

Signed-off-by: Matthew Bonanni
---
 vllm/v1/attention/backends/mla/flashmla.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index 3e481af29544..e77569c353d8 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -120,9 +120,12 @@ def _build_decode(
         num_decode_tokens: int,
         dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> FlashMLADecodeMetadata:
+        query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        max_query_len = query_lens_cpu.max().item()
+        num_q_tokens_per_head_k = max_query_len * self.num_q_heads // 1
         tile_scheduler_metadata, num_splits = get_mla_metadata(
             seq_lens_device,
-            self.num_q_heads,
+            num_q_tokens_per_head_k,
             1,  # MQA for the decode path
             is_fp8_kvcache=self.is_fp8_kvcache,
         )

From 4fc3912c4e3706bfaac5d9d8e13b6c4ee129ecb3 Mon Sep 17 00:00:00 2001
From: Matthew Bonanni
Date: Wed, 22 Oct 2025 17:34:29 -0400
Subject: [PATCH 2/2] add note

Signed-off-by: Matthew Bonanni
---
 vllm/v1/attention/backends/mla/flashmla.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/v1/attention/backends/mla/flashmla.py b/vllm/v1/attention/backends/mla/flashmla.py
index e77569c353d8..1f98204031ed 100644
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -121,6 +121,7 @@ def _build_decode(
         dcp_tot_seq_lens_device: torch.Tensor | None,
     ) -> FlashMLADecodeMetadata:
         query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]
+        # we use the max but all should be the same due to uniform length requirement
         max_query_len = query_lens_cpu.max().item()
         num_q_tokens_per_head_k = max_query_len * self.num_q_heads // 1
         tile_scheduler_metadata, num_splits = get_mla_metadata(
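
Note: as the new variable name indicates, the second argument to get_mla_metadata
is the number of query tokens per KV head, not the raw query-head count; passing
self.num_q_heads directly is only correct when each decode request contributes a
single query token. Below is a minimal standalone sketch of the corrected
computation under illustrative assumptions (num_q_heads = 16 and three
single-token decode requests are made up for the example; the tensor names
mirror the diff):

    import torch

    # Cumulative query start offsets for 3 decode requests, as in
    # query_start_loc_cpu from the diff (values are illustrative).
    query_start_loc_cpu = torch.tensor([0, 1, 2, 3], dtype=torch.int32)

    # Per-request query lengths: difference of consecutive start offsets.
    query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]

    # All decode queries should have the same length (uniform length
    # requirement), so max() is a safe reduction over identical values.
    max_query_len = query_lens_cpu.max().item()

    num_q_heads = 16  # illustrative stand-in for self.num_q_heads
    num_heads_k = 1   # MQA decode path, matching the literal 1 in the diff

    # Total query tokens per KV head; with num_heads_k == 1 the division
    # is a no-op, which is why the diff writes `// 1`.
    num_q_tokens_per_head_k = max_query_len * num_q_heads // num_heads_k

    print(num_q_tokens_per_head_k)  # 16 when max_query_len == 1

With single-token decode the old and new values coincide (16 here), so the fix
only changes behavior when max_query_len exceeds 1, i.e. when a decode step
carries more than one query token per request.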