Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions vllm/v1/attention/backends/fa_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any

from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant
from vllm.platforms import current_platform

logger = init_logger(__name__)
Expand Down Expand Up @@ -111,6 +112,16 @@ def get_flash_attn_version(
)
fa_version = 2

# FA4 currently uses batch-shape-dependent scheduling heuristics on SM100+,
# which breaks batch invariance, so fall back to FA2 when it is enabled.
if vllm_is_batch_invariant() and fa_version == 4:
logger.warning_once(
"Cannot use FA version 4 with batch invariance, "
"defaulting to FA version 2.",
scope="local",
)
fa_version = 2

# FA4 on SM100 (Blackwell) has TMEM capacity limits that restrict
# supported head dimensions.
# See: https://github.com/Dao-AILab/flash-attention/issues/1959
Expand Down