diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 81d62629d85e..0ad211bda217 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -35,7 +35,6 @@ ) from vllm.config import ( VllmConfig, - get_current_vllm_config, get_current_vllm_config_or_none, get_layers_from_vllm_config, ) @@ -67,22 +66,6 @@ class FlashAttentionBackend(AttentionBackend): @staticmethod def get_supported_kernel_block_sizes() -> list[int | MultipleOf]: - vllm_config = get_current_vllm_config() - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - if ( - model_config - and model_config.is_hybrid - and ( - cache_config.mamba_ssm_cache_dtype == "float32" - or cache_config.mamba_cache_dtype == "float32" - ) - ): - # NOTE(tdoublep): while in principle, FA supports - # MultipleOf(16), these are the block sizes that do not - # suffer from the NaN propagation problem described here: - # https://github.com/Dao-AILab/flash-attention/issues/1974 - return [16, 32, 64] return [MultipleOf(16)] forward_includes_kv_cache_update: bool = False