diff --git a/vllm/v1/attention/backends/mla/indexer.py b/vllm/v1/attention/backends/mla/indexer.py
index 7c0715a9e8b6..ed8c073e81e2 100644
--- a/vllm/v1/attention/backends/mla/indexer.py
+++ b/vllm/v1/attention/backends/mla/indexer.py
@@ -122,7 +122,10 @@ def get_name() -> str:
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
-        return [1, 64] if current_platform.is_rocm() else [64]
+        # Must not advertise 1 on ROCm: select_common_block_size picks
+        # min([1, 64]) = 1, which keeps block_size=1 and leaves the gluon
+        # preshuffle path (added in #41217) permanently disabled.
+        return [64]
 
     @classmethod
     def get_supported_head_sizes(cls) -> list[int]:
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
index dc343b639f6c..095761704a15 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla_sparse.py
@@ -273,7 +273,7 @@ class ROCMAiterMLASparseBackend(AttentionBackend):
 
     @staticmethod
     def get_supported_kernel_block_sizes() -> list[int | MultipleOf]:
-        return [1, 64]
+        return [64]
 
     @staticmethod
     def get_name() -> str: