Add GQA limit: take the custom path only when num_queries_per_kv <= 16

fsx950223 · fsx950223 · commit 60a931e8feec · 2025-07-02T08:23:46.000Z
Signed-off-by: fsx950223 <fsx950223@outlook.com>
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -286,7 +286,7 @@ def chunked_prefill_paged_decode(
                                                  num_queries_per_kv,
                                                  max_seq_len, sliding_window,
                                                  kv_cache_dtype, alibi_slopes)
-    if use_custom and head_size <= 128:
+    if use_custom and head_size <= 128 and num_queries_per_kv <= 16:
         _PARTITION_SIZE_ROCM = 256
         max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) //
                               _PARTITION_SIZE_ROCM)