3 files changed: +6 −4 lines.

@@ -913,7 +913,8 @@ def forward(
             )
             max_logits = torch.empty_like(exp_sums)

-            torch.ops.aiter.paged_attention_rocm(
+            query_start_loc = None
+            ops.paged_attention_rocm(
                 output[num_prefill_tokens:],
                 exp_sums,
                 max_logits,
@@ -929,6 +930,7 @@ def forward(
                 decode_meta.seq_lens_tensor
                 if self.attn_type != AttentionType.ENCODER_DECODER else
                 decode_meta.encoder_seq_lens_tensor,
+                query_start_loc,
                 block_size,
                 max_seq_len,
                 self.alibi_slopes,
@@ -286,7 +286,7 @@ def chunked_prefill_paged_decode(
                                           num_queries_per_kv,
                                           max_seq_len, sliding_window,
                                           kv_cache_dtype, alibi_slopes)
-    if use_custom and head_size <= 128 and num_queries_per_kv <= 16:
+    if use_custom:
        _PARTITION_SIZE_ROCM = 256
        max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) //
                              _PARTITION_SIZE_ROCM)
@@ -138,9 +138,9 @@ def use_rocm_custom_paged_attention(
    return ((not envs.VLLM_USE_V1 or sliding_window == 0
             or sliding_window == (-1, -1))
            and (qtype == torch.half or qtype == torch.bfloat16)
-           and (head_size in [64, 128, 256])
+           and (head_size == 64 or head_size == 128)
            and (block_size == 16 or block_size == 32)
-           and (gqa_ratio >= 1 and gqa_ratio <= 32)
+           and (gqa_ratio >= 1 and gqa_ratio <= 16)
            and max_seq_len <= 128 * 1024
            and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
            and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
You can’t perform that action at this time.
0 commit comments