From e82beeb81d78d0ce0d01d8a986cb55fefab516bc Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Wed, 26 Nov 2025 20:34:31 +0000
Subject: [PATCH 01/36] initial implementation

---
 conftest.py                                   |   1 +
 pyproject.toml                                |   1 +
 src/transformers/masking_utils.py             |   1 +
 .../modeling_flash_attention_utils.py         |  43 ++++--
 src/transformers/modeling_utils.py            | 122 +++++++++++++++---
 src/transformers/testing_utils.py             |  10 ++
 src/transformers/utils/__init__.py            |   1 +
 src/transformers/utils/import_utils.py        |   9 ++
 tests/utils/test_modeling_utils.py            |   7 +
 9 files changed, 170 insertions(+), 25 deletions(-)

diff --git a/conftest.py b/conftest.py
index 910e9fcc1766..d8c7481d8a5e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -89,6 +89,7 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
     config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
     config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")
+    config.addinivalue_line("markers", "flash_attn_4_test: mark test which tests flash attention 4 functionality")
 
     os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"
 
diff --git a/pyproject.toml b/pyproject.toml
index 54ec1618e384..354b2ea1bac5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,7 @@ line-ending = "auto"
 addopts = "--doctest-glob='**/*.md'"
 doctest_optionflags="NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
 markers = [
+    "flash_attn_4_test: marks tests related to flash attention 4 (deselect with '-m \"not flash_attn_4_test\"')",
     "flash_attn_3_test: marks tests related to flash attention 3 (deselect with '-m \"not flash_attn_3_test\"')",
     "flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')",
     "bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests",
diff --git a/src/transformers/masking_utils.py b/src/transformers/masking_utils.py
index 6ae8eab54144..f057f590b65e 100644
--- a/src/transformers/masking_utils.py
+++ b/src/transformers/masking_utils.py
@@ -636,6 +636,7 @@ class AttentionMaskInterface(GeneralInterface):
         "eager": eager_mask,
         "flash_attention_2": flash_attention_mask,
         "flash_attention_3": flash_attention_mask,
+        "flash_attention_4": flash_attention_mask,
         "flex_attention": flex_attention_mask,
     }
 
diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index 5e31bdae01eb..1f6d6fe4b1cc 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -22,6 +22,7 @@
 from .utils import (
     is_flash_attn_2_available,
     is_flash_attn_3_available,
+    is_flash_attn_4_available,
     is_flash_attn_greater_or_equal_2_10,
     is_torch_npu_available,
     is_torch_xpu_available,
@@ -34,7 +35,7 @@
 
 # TODO Deprecate when all models have the attention interface
 def flash_attn_supports_top_left_mask():
-    if is_flash_attn_3_available():
+    if is_flash_attn_3_available() or is_flash_attn_4_available():
         return False
     if is_flash_attn_2_available():
         return not is_flash_attn_greater_or_equal_2_10()
@@ -47,7 +48,8 @@ def flash_attn_supports_top_left_mask():
 # TODO Deprecate when all models have the attention interface
 def is_flash_attn_available():
     return (
-        is_flash_attn_3_available()
+        is_flash_attn_4_available()
+        or is_flash_attn_3_available()
         or is_flash_attn_2_available()
         or is_torch_npu_available()
         or is_torch_xpu_available()
@@ -82,10 +84,13 @@ def _lazy_imports(implementation: Optional[str]):
     """
     is_fa2 = is_flash_attn_2_available()
     is_fa3 = is_flash_attn_3_available()
+    is_fa4 = is_flash_attn_4_available()
 
     pad_input, unpad_input = _pad_input, _unpad_input
 
-    if (implementation == "flash_attention_2" and is_fa2) or (implementation is None and is_fa2 and not is_fa3):
+    if (implementation == "flash_attention_2" and is_fa2) or (
+        implementation is None and is_fa2 and not is_fa3 and not is_fa4
+    ):
         from flash_attn import flash_attn_func, flash_attn_varlen_func
         from flash_attn.bert_padding import pad_input, unpad_input
     elif is_torch_npu_available():
@@ -94,8 +99,10 @@ def _lazy_imports(implementation: Optional[str]):
         from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_func
         from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_varlen_func
     else:
-        if implementation == "flash_attention_3" or (implementation is None and is_fa3):
+        if implementation == "flash_attention_3" or (implementation is None and is_fa3 and not is_fa4):
             from flash_attn_interface import flash_attn_func, flash_attn_varlen_func
+        elif implementation == "flash_attention_4" or (implementation is None and is_fa4):
+            from flash_attn.cute import flash_attn_func, flash_attn_varlen_func
         # Kernels fallback
         else:
             flash_attn_func = getattr(implementation, "flash_attn_func", None)
@@ -467,6 +474,8 @@ def _process_flash_attention_kwargs(
     softcap: Optional[float] = None,
     deterministic: Optional[bool] = None,
     s_aux: Optional[torch.Tensor] = None,
+    max_seqlen_q: Optional[int] = None,
+    max_seqlen_k: Optional[int] = None,
     supports_mapping: Optional[dict[str, bool]] = None,
     **kwargs,
 ):
@@ -497,6 +506,10 @@ def _process_flash_attention_kwargs(
             Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
         s_aux (`torch.Tensor`, *optional*):
             Attention sink auxiliary that adds a `bias` to the attention calculation via an additional head.
+        max_seqlen_q (`int`, *optional*):
+            The maximum sequence length in the query tensor during a varlen forward.
+        max_seqlen_k (`int`, *optional*):
+            The maximum sequence length in the key/value tensor during a varlen forward.
     Return:
         flash_kwargs (`dict`):
             A dict of kwargs that are requested and supported.
@@ -529,6 +542,12 @@ def _process_flash_attention_kwargs(
     if supports_mapping["s_aux"] and s_aux is not None:
         flash_kwargs["s_aux"] = s_aux
 
+    if supports_mapping["max_seqlen_q"] and max_seqlen_q is not None:
+        flash_kwargs["max_seqlen_q"] = max_seqlen_q
+
+    if supports_mapping["max_seqlen_k"] and max_seqlen_k is not None:
+        flash_kwargs["max_seqlen_k"] = max_seqlen_k
+
     return flash_kwargs
 
 
@@ -583,7 +602,8 @@ def _flash_attention_forward(
     )
 
     # Extract the flash attention kwargs that have been requested (and are supported by the implementation)
-    flash_kwargs = process_flash_kwargs_fn(
+    flash_kwargs = partial(
+        process_flash_kwargs_fn,
         query_length=query_length,
         key_length=key_states.size(1),
         is_causal=is_causal,
@@ -619,15 +639,15 @@ def _flash_attention_forward(
         if "mps" in str(q.device):
             cu_seq_lens_k = cu_seq_lens_k.clone()
 
+        # Newer fa versions no longer accept `max_seqlen_(q|k)`
+        final_flash_kwargs = flash_kwargs(max_seqlen_q=max_length_q, max_seqlen_k=max_length_k)
         out_unpad = flash_varlen_fn(
             q,
             k,
             v,
             cu_seqlens_q=cu_seq_lens_q,
             cu_seqlens_k=cu_seq_lens_k,
-            max_seqlen_q=max_length_q,
-            max_seqlen_k=max_length_k,
-            **flash_kwargs,
+            **final_flash_kwargs,
         )
         if isinstance(out_unpad, tuple):
             out_unpad = out_unpad[0]
@@ -650,6 +670,8 @@ def _flash_attention_forward(
         if "mps" in str(q.device):
             cu_seq_lens_k = cu_seq_lens_k.clone()
 
+        # Newer fa versions no longer accept `max_seqlen_(q|k)`
+        final_flash_kwargs = flash_kwargs(max_seqlen_q=max_length_q, max_seqlen_k=max_length_k)
         out = flash_varlen_fn(
             q,
             k,
@@ -658,7 +680,7 @@ def _flash_attention_forward(
             cu_seqlens_k=cu_seq_lens_k,
             max_seqlen_q=max_length_q,
             max_seqlen_k=max_length_k,
-            **flash_kwargs,
+            **final_flash_kwargs,
         )
         if isinstance(out, tuple):
             out = out[0]
@@ -667,7 +689,8 @@ def _flash_attention_forward(
 
     # No padding
     else:
-        out = flash_fn(query_states, key_states, value_states, **flash_kwargs)
+        final_flash_kwargs = flash_kwargs()
+        out = flash_fn(query_states, key_states, value_states, **final_flash_kwargs)
         if isinstance(out, tuple):
             out = out[0]
 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 748d7af639af..dd51131e9aca 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -70,7 +70,7 @@
 from .integrations.flash_attention import flash_attention_forward
 from .integrations.flash_paged import paged_attention_forward
 from .integrations.flex_attention import flex_attention_forward
-from .integrations.hub_kernels import is_kernel, load_and_register_attn_kernel
+from .integrations.hub_kernels import is_kernel
 from .integrations.peft import maybe_load_adapters
 from .integrations.sdpa_attention import sdpa_attention_forward
 from .integrations.sdpa_paged import sdpa_attention_paged_forward
@@ -110,6 +110,7 @@
     is_accelerate_available,
     is_flash_attn_2_available,
     is_flash_attn_3_available,
+    is_flash_attn_4_available,
     is_kernels_available,
     is_offline_mode,
     is_remote_url,
@@ -123,6 +124,7 @@
 from .utils.generic import _CAN_RECORD_REGISTRY, GeneralInterface, OutputRecorder
 from .utils.hub import DownloadKwargs, create_and_tag_model_card, get_checkpoint_shard_files
 from .utils.import_utils import (
+    PACKAGE_DISTRIBUTION_MAPPING,
     is_huggingface_hub_greater_or_equal,
     is_sagemaker_mp_enabled,
     is_tracing,
@@ -1666,9 +1668,9 @@ def _flash_attn_3_can_dispatch(self, is_init_check: bool = False) -> bool:
 
             if torch.cuda.is_available():
                 major, _ = torch.cuda.get_device_capability()
-                if major < 9:
+                if major < 8:
                     raise ValueError(
-                        f"{preface} Flash Attention 3 requires compute capability >= 9.0, but found {torch.cuda.get_device_capability()} with compute capability {major}.0."
+                        f"{preface} Flash Attention 3 requires compute capability >= 8.0, but found {torch.cuda.get_device_capability()} with compute capability {major}.0."
                     )
                 else:
                     raise ImportError(f"{preface} Flash Attention 3 is not available.")
@@ -1715,6 +1717,86 @@ def _flash_attn_3_can_dispatch(self, is_init_check: bool = False) -> bool:
 
         return True
 
+    def _flash_attn_4_can_dispatch(self, is_init_check: bool = False) -> bool:
+        """
+        Check the availability of Flash Attention 4 for a given model.
+
+        Args:
+            is_init_check (`bool`, *optional*):
+                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
+                fully instantiated. This is needed as we also check the devices of the weights, which are only available
+                later after __init__. This allows to raise proper exceptions early before instantiating the full models
+                if we know that the model does not support the requested attention.
+        """
+        dtype = self.config.dtype
+
+        if not self._supports_flash_attn:
+            raise ValueError(
+                f"{self.__class__.__name__} does not support Flash Attention 4 yet. Please request to add support where"
+                f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
+                " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
+            )
+
+        if not is_flash_attn_4_available():
+            preface = "FlashAttention4 has been toggled on, but it cannot be used due to the following error:"
+
+            if (
+                importlib.util.find_spec("flash_attn") is None
+                or "flash-attn-cute" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+            ):
+                raise ImportError(f"{preface} the package flash_attn (under cute) seems to be not installed.")
+
+            if torch.cuda.is_available():
+                major, _ = torch.cuda.get_device_capability()
+                if major < 9:
+                    raise ValueError(
+                        f"{preface} Flash Attention 4 requires compute capability >= 9.0, but found {torch.cuda.get_device_capability()} with compute capability {major}.0."
+                    )
+                else:
+                    raise ImportError(f"{preface} Flash Attention 4 is not available.")
+            else:
+                raise ValueError(
+                    f"{preface} Flash Attention 4 is not available on CPU. Please make sure torch can access a CUDA device."
+                )
+
+        if dtype is None:
+            logger.warning_once(
+                "You are attempting to use Flash Attention 4 without specifying a torch dtype. This might lead to unexpected behaviour"
+            )
+        elif dtype is not None and dtype not in [torch.float16, torch.bfloat16]:
+            logger.warning_once(
+                "Flash Attention 4 only supports torch.float16 and torch.bfloat16 dtypes, but"
+                f" the current dype in {self.__class__.__name__} is {dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
+                ' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_4", dtype=torch.float16)`'
+            )
+
+        if getattr(self.config, "alibi", False) or getattr(self.config, "use_alibi", False):
+            raise ValueError("Model is configured to use ALiBi, which is not supported by Flash Attention 4.")
+
+        # Check for attention dropout, which is incompatible with FA4
+        if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
+            raise ValueError(
+                f"Model has attention_dropout={self.config.attention_dropout}, which is not supported by Flash Attention 4."
+            )
+
+        # With the early check, the parameters are not yet initialized correctly
+        if not is_init_check:
+            param_devices = list({param.device for param in self.parameters()})
+            if len(param_devices) == 1 and param_devices[0].type == "cpu":
+                if torch.cuda.is_available():
+                    logger.warning_once(
+                        "You are attempting to use Flash Attention 4 with a model not initialized on GPU. Make sure to move the model to GPU"
+                        " after initializing it on CPU with `model.to('cuda')`."
+                    )
+                else:
+                    raise ValueError(
+                        "You are attempting to use Flash Attention 4 with a model not initialized on GPU and with no GPU available. "
+                        "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
+                        "or initialising the model on CPU and then moving it to GPU."
+                    )
+
+        return True
+
     def _sdpa_can_dispatch(self, is_init_check: bool = False) -> bool:
         """
         Check the availability of SDPA for a given model.
@@ -1794,10 +1876,12 @@ def _check_and_adjust_attn_implementation(
         """
         applicable_attn_implementation = attn_implementation
 
-        # If FA not installed, do not fail but use kernels instead
+        # If FA not installed, do not fail but use kernels instead (except for FA4 for now)
+        requested_original_flash_attn = attn_implementation is not None and (
+            attn_implementation == "flash_attention_2" or attn_implementation == "flash_attention_3"
+        )
         if (
-            attn_implementation is not None
-            and "flash" in attn_implementation
+            requested_original_flash_attn
             and self._supports_flash_attn
             and not (is_flash_attn_2_available() or is_flash_attn_3_available())
             and is_kernels_available()
@@ -1806,23 +1890,26 @@ def _check_and_adjust_attn_implementation(
             if attn_implementation.endswith("2"):
                 applicable_attn_implementation = "kernels-community/flash-attn2"
                 if is_torch_xpu_available():
-                    # On XPU, kernels library is the native implementation. Rename variable to avoid "fallback" warning and irrelevant checks.
-                    attn_implementation = "kernels-community/flash-attn2"
+                    # On XPU, kernels library is the native implementation
+                    # Disabling this flag to avoid giving wrong fallbacks on errors and warnings
+                    requested_original_flash_attn = False
             else:
                 applicable_attn_implementation = "kernels-community/vllm-flash-attn3"
 
         if is_kernel(applicable_attn_implementation):
             try:
-                load_and_register_attn_kernel(applicable_attn_implementation)
+                # preload flash attention here to allow compile with fullgraph
+                lazy_import_flash_attention(applicable_attn_implementation)
+
                 # log that we used kernel fallback if successful
-                if "flash_" in attn_implementation:
+                if requested_original_flash_attn:
                     logger.warning_once(
                         f"You do not have `flash_attn` installed, using `{applicable_attn_implementation}` "
                         "from the `kernels` library instead!"
                     )
             except Exception as e:
                 # raise the proper exception for requested flash attention
-                if attn_implementation.startswith("flash_"):
+                if requested_original_flash_attn:
                     if attn_implementation.endswith("2"):
                         self._flash_attn_2_can_dispatch()
                     else:
@@ -1834,9 +1921,11 @@ def _check_and_adjust_attn_implementation(
             applicable_attn_implementation = self.get_correct_attn_implementation(
                 applicable_attn_implementation, is_init_check
             )
+
             # preload flash attention here to allow compile with fullgraph
-            if applicable_attn_implementation.startswith("flash_"):
-                lazy_import_flash_attention(applicable_attn_implementation, force_import=True)
+            if "flash" in applicable_attn_implementation:
+                lazy_import_flash_attention(applicable_attn_implementation)
+
         return applicable_attn_implementation
 
     def get_correct_attn_implementation(self, requested_attention: Optional[str], is_init_check: bool = False) -> str:
@@ -1848,7 +1937,7 @@ def get_correct_attn_implementation(self, requested_attention: Optional[str], is
             )
             # check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
             if self._supports_flash_attn or getattr(self, "_supports_flash_attn_2", False):
-                message += ', `"attn_implementation=flash_attention_3"`, `"attn_implementation=flash_attention_2"`, `"attn_implementation=paged|flash_attention_2"`'
+                message += ', `"attn_implementation=flash_attention_4"`, `"attn_implementation=flash_attention_3"`, `"attn_implementation=flash_attention_2"`, `"attn_implementation=paged|flash_attention_2"`'
             if self._supports_sdpa:
                 message += ', `"attn_implementation=sdpa"`, `"attn_implementation=paged|sdpa"`'
             if self._supports_flex_attn:
@@ -1860,6 +1949,8 @@ def get_correct_attn_implementation(self, requested_attention: Optional[str], is
             self._flash_attn_2_can_dispatch(is_init_check)
         elif "flash_attention_3" in applicable_attention:
             self._flash_attn_3_can_dispatch(is_init_check)
+        elif "flash_attention_4" in applicable_attention:
+            self._flash_attn_4_can_dispatch(is_init_check)
         elif "flex_attention" in applicable_attention:
             self._flex_attn_can_dispatch(is_init_check)
         elif "sdpa" in applicable_attention:
@@ -3631,7 +3722,7 @@ def from_pretrained(
 
                 </Tip>
             attn_implementation (`str`, *optional*):
-                The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)), or `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
+                The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)), or `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper)), or `"flash_attention_4"` (using [Dao-AILab/flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
 
                 Accept HF kernel references in the form:
                   <namespace>/<repo_name>[@<revision>][:<kernel_name>]
@@ -4650,6 +4741,7 @@ class AttentionInterface(GeneralInterface):
     # Class instance object, so that a call to `register` can be reflected into all other files correctly, even if
     # a new instance is created (in order to locally override a given function)
     _global_mapping = {
+        "flash_attention_4": flash_attention_forward,
         "flash_attention_3": flash_attention_forward,
         "flash_attention_2": flash_attention_forward,
         "flex_attention": flex_attention_forward,
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index bdaf22e81478..a41e8da10ef7 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -89,6 +89,7 @@
     is_fbgemm_gpu_available,
     is_flash_attn_2_available,
     is_flash_attn_3_available,
+    is_flash_attn_4_available,
     is_flute_available,
     is_fp_quant_available,
     is_fsdp_available,
@@ -619,6 +620,15 @@ def require_flash_attn_3(test_case):
     return unittest.skipUnless(is_flash_attn_3_available(), "test requires Flash Attention 3")(test_case)
 
 
+def require_flash_attn_4(test_case):
+    """
+    Decorator marking a test that requires Flash Attention 4.
+
+    These tests are skipped when Flash Attention 4 isn't installed.
+    """
+    return unittest.skipUnless(is_flash_attn_4_available(), "test requires Flash Attention 4")(test_case)
+
+
 def require_read_token(test_case):
     """
     A decorator that loads the HF token for tests that require to load gated models.
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 38b5db8f4893..19e14933c4b3 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -143,6 +143,7 @@
     is_fbgemm_gpu_available,
     is_flash_attn_2_available,
     is_flash_attn_3_available,
+    is_flash_attn_4_available,
     is_flash_attn_greater_or_equal,
     is_flash_attn_greater_or_equal_2_10,
     is_flute_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index bf2fba35fd0e..199cdf6110f4 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -857,6 +857,15 @@ def is_flash_attn_3_available() -> bool:
     return is_torch_cuda_available() and _is_package_available("flash_attn_3")
 
 
+@lru_cache
+def is_flash_attn_4_available() -> bool:
+    if not (is_torch_cuda_available() and _is_package_available("flash_attn")):
+        return False
+
+    # FA4 is distributed to just "flash_attn" but its mapping is properly mapped to cute
+    return "flash-attn-cute" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+
+
 @lru_cache
 def is_flash_attn_greater_or_equal_2_10() -> bool:
     _, flash_attn_version = _is_package_available("flash_attn", return_version=True)
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 69273e4c1ddd..d661dd6d5f17 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -87,6 +87,7 @@
 from transformers.utils.import_utils import (
     is_flash_attn_2_available,
     is_flash_attn_3_available,
+    is_flash_attn_4_available,
     is_kernels_available,
     is_torch_npu_available,
 )
@@ -741,6 +742,9 @@ def test_model_from_pretrained_attn_implementation(self):
         if is_flash_attn_3_available():
             attn_implementation_available.append("flash_attention_3")
 
+        if is_flash_attn_4_available():
+            attn_implementation_available.append("flash_attention_4")
+
         for requested_attn_implementation in attn_implementation_available:
             model = AutoModelForCausalLM.from_pretrained(
                 TINY_MISTRAL, attn_implementation=requested_attn_implementation
@@ -766,6 +770,9 @@ def test_model_from_config_attn_implementation(self):
         if is_flash_attn_3_available():
             attn_implementation_available.append("flash_attention_3")
 
+        if is_flash_attn_4_available():
+            attn_implementation_available.append("flash_attention_4")
+
         for requested_attn_implementation in attn_implementation_available:
             config = AutoConfig.from_pretrained(TINY_MISTRAL, attn_implementation=requested_attn_implementation)
             # Ensure the config was set correctly

From 30c6682281d099ad2a6bac8d823e49794ef9ce6d Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 14:43:19 +0000
Subject: [PATCH 02/36] CB support

---
 src/transformers/modeling_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 86e06fc48c0b..1310311f5fc4 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -4745,6 +4745,7 @@ class AttentionInterface(GeneralInterface):
         "flash_attention_2": flash_attention_forward,
         "flex_attention": flex_attention_forward,
         "sdpa": sdpa_attention_forward,
+        "paged|flash_attention_4": paged_attention_forward,
         "paged|flash_attention_3": paged_attention_forward,
         "paged|flash_attention_2": paged_attention_forward,
         "paged|sdpa": sdpa_attention_paged_forward,

From e9cdeea283ee3b9bdfed4077a43f9c63dfc9d585 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 14:58:53 +0000
Subject: [PATCH 03/36] change how we call item on max_seq_len_q/k

---
 .../modeling_flash_attention_utils.py         | 34 ++++++++++---------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index 5d243c51ae76..f681b4896e00 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -29,6 +29,7 @@
     is_torch_xpu_available,
     logging,
 )
+from .utils.import_utils import is_tracing
 
 
 logger = logging.get_logger(__name__)
@@ -219,7 +220,7 @@ def _unpad_input(hidden_states, attention_mask, unused_mask=None):
     seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
     used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    max_seqlen_in_batch = seqlens_in_batch.max()
     cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
 
     return (
@@ -268,9 +269,7 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.T
     """
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    # NOTE: Similar to the `.item()` in prepare_fa2_from_position_ids, with torch compile,
-    # this might cause a graph break
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    max_seqlen_in_batch = seqlens_in_batch.max()
     cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
     return (
         indices,
@@ -391,12 +390,6 @@ def prepare_fa_kwargs_from_position_ids(position_ids):
     # We should use cu_seq_lens instead of position_ids to get the max length since position_ids is not always increasing
     # for some models (e.g. qwen2-vl).
     max_length_q = cu_seq_lens_q.diff().max()
-    # NOTE: With torch compile, this will cause a graph break if you don't set
-    # `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` in the environment or call
-    # `torch._dynamo.config.capture_scalar_outputs = True` before doing the forward pass.
-    # This is a limitation of flash attention API, as the function `flash_attn_varlen_func`
-    # requires `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
-    max_length_q = max_length_q.item()
     max_length_k = max_length_q
 
     return (cu_seq_lens_q, cu_seq_lens_k), (max_length_q, max_length_k)
@@ -506,8 +499,8 @@ def _process_flash_attention_kwargs(
     softcap: Optional[float] = None,
     deterministic: Optional[bool] = None,
     s_aux: Optional[torch.Tensor] = None,
-    max_seqlen_q: Optional[int] = None,
-    max_seqlen_k: Optional[int] = None,
+    max_seqlen_q: Optional[int | torch.IntTensor] = None,
+    max_seqlen_k: Optional[int | torch.IntTensor] = None,
     supports_mapping: Optional[dict[str, bool]] = None,
     **kwargs,
 ):
@@ -538,9 +531,9 @@ def _process_flash_attention_kwargs(
             Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
         s_aux (`torch.Tensor`, *optional*):
             Attention sink auxiliary that adds a `bias` to the attention calculation via an additional head.
-        max_seqlen_q (`int`, *optional*):
+        max_seqlen_q (`Union[int, torch.IntTensor]`, *optional*):
             The maximum sequence length in the query tensor during a varlen forward.
-        max_seqlen_k (`int`, *optional*):
+        max_seqlen_k (`Union[int, torch.IntTensor]`, *optional*):
             The maximum sequence length in the key/value tensor during a varlen forward.
     Return:
         flash_kwargs (`dict`):
@@ -574,10 +567,21 @@ def _process_flash_attention_kwargs(
     if supports_mapping["s_aux"] and s_aux is not None:
         flash_kwargs["s_aux"] = s_aux
 
+    # There is a limitation of the flash attention API, as the function `flash_attn_varlen_func`
+    # may require `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
+    #
+    # You can either set
+    #   - Env: `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1`
+    #   - Before compiling: `torch._dynamo.config.capture_scalar_outputs = True`
+    # to allow torch compile to handle scalar outputs in those cases.
     if supports_mapping["max_seqlen_q"] and max_seqlen_q is not None:
+        if not isinstance(max_seqlen_q, int) and is_tracing(max_seqlen_q):
+            max_seqlen_q = max_seqlen_q.item()
         flash_kwargs["max_seqlen_q"] = max_seqlen_q
 
     if supports_mapping["max_seqlen_k"] and max_seqlen_k is not None:
+        if not isinstance(max_seqlen_k, int) and is_tracing(max_seqlen_k):
+            max_seqlen_k = max_seqlen_k.item()
         flash_kwargs["max_seqlen_k"] = max_seqlen_k
 
     return flash_kwargs
@@ -710,8 +714,6 @@ def _flash_attention_forward(
             v,
             cu_seqlens_q=cu_seq_lens_q,
             cu_seqlens_k=cu_seq_lens_k,
-            max_seqlen_q=max_length_q,
-            max_seqlen_k=max_length_k,
             **final_flash_kwargs,
         )
         if isinstance(out, tuple):

From 40168b4f36b637b41d8b0ca93cf4caddc14e1b6f Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 15:03:08 +0000
Subject: [PATCH 04/36] fix

---
 src/transformers/modeling_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 1310311f5fc4..9a30ed2effab 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1737,7 +1737,7 @@ def _flash_attn_4_can_dispatch(self, is_init_check: bool = False) -> bool:
 
             if (
                 importlib.util.find_spec("flash_attn") is None
-                or "flash-attn-cute" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+                or "flash-attn-cute" not in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
             ):
                 raise ImportError(f"{preface} the package flash_attn (under cute) seems to be not installed.")
 

From 91a1b3beedc10b34a0bac6afdbe20a31283ec677 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 15:14:18 +0000
Subject: [PATCH 05/36] tests

---
 .../self-scheduled-flash-attn-caller.yml      |  2 +-
 tests/generation/test_utils.py                | 25 ++++++++++++++++
 tests/models/edgetam/test_modeling_edgetam.py |  4 +--
 tests/models/gemma3/test_modeling_gemma3.py   |  8 +++++
 tests/models/sam2/test_modeling_sam2.py       |  4 +--
 tests/models/sam3/test_modeling_sam3.py       | 14 +++++++++
 .../test_modeling_sam3_tracker.py             |  4 +--
 tests/test_modeling_common.py                 | 30 +++++++++++++++++++
 8 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/self-scheduled-flash-attn-caller.yml b/.github/workflows/self-scheduled-flash-attn-caller.yml
index ff990f2808b7..a3ba8b189c3d 100644
--- a/.github/workflows/self-scheduled-flash-attn-caller.yml
+++ b/.github/workflows/self-scheduled-flash-attn-caller.yml
@@ -56,5 +56,5 @@ jobs:
       runner_type: "a10"
       report_repo_id: hf-internal-testing/transformers_flash_attn_ci
       commit_sha: ${{ github.sha }}
-      pytest_marker: "flash_attn_test or flash_attn_3_test"
+      pytest_marker: "flash_attn_test or flash_attn_3_test or flash_attn_4_test"
     secrets: inherit
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index d1f14ad0c9c8..1b53c8802cb2 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -43,6 +43,7 @@
     require_accelerate,
     require_flash_attn,
     require_flash_attn_3,
+    require_flash_attn_4,
     require_optimum_quanto,
     require_read_token,
     require_torch,
@@ -1862,6 +1863,14 @@ def test_eager_matches_fa3_generate(self):
         """Tests that generate has equivalent outputs with FA3 and eager attention implementations."""
         self._test_attention_implementation("flash_attention_3")
 
+    @pytest.mark.flash_attn_4_test
+    @require_flash_attn_4
+    @require_torch_gpu
+    @slow
+    def test_eager_matches_fa4_generate(self):
+        """Tests that generate has equivalent outputs with FA4 and eager attention implementations."""
+        self._test_attention_implementation("flash_attention_4")
+
     @require_flash_attn
     @require_torch_accelerator
     @pytest.mark.flash_attn_test
@@ -2096,6 +2105,22 @@ def test_flash_attention_3_padding_matches_padding_free_with_position_ids_and_fa
             attn_implementation="flash_attention_3", fa_kwargs=True
         )
 
+    @require_flash_attn_4
+    @require_torch_gpu
+    @pytest.mark.flash_attn_4_test
+    @slow
+    def test_flash_attention_4_padding_matches_padding_free_with_position_ids(self):
+        self.attention_mask_padding_matches_padding_free_with_position_ids(attn_implementation="flash_attention_4")
+
+    @require_flash_attn_4
+    @require_torch_gpu
+    @pytest.mark.flash_attn_4_test
+    @slow
+    def test_flash_attention_4_padding_matches_padding_free_with_position_ids_and_fa_kwargs(self):
+        self.attention_mask_padding_matches_padding_free_with_position_ids(
+            attn_implementation="flash_attention_4", fa_kwargs=True
+        )
+
     def _get_custom_4d_mask_test_data(self):
         # Sequence in which all but the last token is the same
         input_ids = torch.tensor(
diff --git a/tests/models/edgetam/test_modeling_edgetam.py b/tests/models/edgetam/test_modeling_edgetam.py
index 4ec9e16c4db2..0c3cbe331ce8 100644
--- a/tests/models/edgetam/test_modeling_edgetam.py
+++ b/tests/models/edgetam/test_modeling_edgetam.py
@@ -270,9 +270,7 @@ def flash_attn_inference_equivalence(self, attn_implementation: str, padding_sid
             self.skipTest(reason="Model architecture does not support attentions")
 
         for model_class in self.all_model_classes:
-            if (attn_implementation == "flash_attention_2" and not model_class._supports_flash_attn_2) or (
-                attn_implementation == "flash_attention_3" and not model_class._supports_flash_attn_3
-            ):
+            if attn_implementation.startswith("flash_attention") and not model_class._supports_flash_attn:
                 self.skipTest(f"{model_class.__name__} does not support {attn_implementation}")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py
index 17e9d9991bd4..6aa0e6aff140 100644
--- a/tests/models/gemma3/test_modeling_gemma3.py
+++ b/tests/models/gemma3/test_modeling_gemma3.py
@@ -35,6 +35,7 @@
     require_deterministic_for_xpu,
     require_flash_attn,
     require_flash_attn_3,
+    require_flash_attn_4,
     require_read_token,
     require_torch,
     require_torch_accelerator,
@@ -453,6 +454,13 @@ def test_flash_attn_2_from_config(self):
     def test_flash_attn_3_from_config(self):
         self.flash_attn_from_config(attn_implementation="flash_attention_3", test_fwd_in_train=False)
 
+    @require_flash_attn_4
+    @require_torch_gpu
+    @mark.flash_attn_4_test
+    @slow
+    def test_flash_attn_4_from_config(self):
+        self.flash_attn_from_config(attn_implementation="flash_attention_4", test_fwd_in_train=False)
+
 
 @slow
 @require_torch_accelerator
diff --git a/tests/models/sam2/test_modeling_sam2.py b/tests/models/sam2/test_modeling_sam2.py
index 15dd3039e76a..82aa7ef7615c 100644
--- a/tests/models/sam2/test_modeling_sam2.py
+++ b/tests/models/sam2/test_modeling_sam2.py
@@ -612,9 +612,7 @@ def flash_attn_inference_equivalence(
             self.skipTest(reason="Model architecture does not support attentions")
 
         for model_class in self.all_model_classes:
-            if (attn_implementation == "flash_attention_2" and not model_class._supports_flash_attn_2) or (
-                attn_implementation == "flash_attention_3" and not model_class._supports_flash_attn_3
-            ):
+            if attn_implementation.startswith("flash_attention") and not model_class._supports_flash_attn:
                 self.skipTest(f"{model_class.__name__} does not support {attn_implementation}")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/sam3/test_modeling_sam3.py b/tests/models/sam3/test_modeling_sam3.py
index be8e7e600420..7748c2a91ff7 100644
--- a/tests/models/sam3/test_modeling_sam3.py
+++ b/tests/models/sam3/test_modeling_sam3.py
@@ -662,6 +662,20 @@ def test_flash_attn_3_inference_equivalence(self):
     def test_flash_attn_3_inference_equivalence_right_padding(self):
         pass
 
+    @unittest.skip(
+        reason="Sam3Model creates attention masks from features (with gradients), "
+        "which is incompatible with flash attention's expectation of binary masks"
+    )
+    def test_flash_attn_4_inference_equivalence(self):
+        pass
+
+    @unittest.skip(
+        reason="Sam3Model creates attention masks from features (with gradients), "
+        "which is incompatible with flash attention's expectation of binary masks"
+    )
+    def test_flash_attn_4_inference_equivalence_right_padding(self):
+        pass
+
     @unittest.skip(
         reason="Sam3Model creates attention masks from features (with gradients), "
         "which is incompatible with flash attention's expectation of binary masks"
diff --git a/tests/models/sam3_tracker/test_modeling_sam3_tracker.py b/tests/models/sam3_tracker/test_modeling_sam3_tracker.py
index 919116a4ac19..498f6c0e4256 100644
--- a/tests/models/sam3_tracker/test_modeling_sam3_tracker.py
+++ b/tests/models/sam3_tracker/test_modeling_sam3_tracker.py
@@ -381,9 +381,7 @@ def flash_attn_inference_equivalence(
             self.skipTest(reason="Model architecture does not support attentions")
 
         for model_class in self.all_model_classes:
-            if (attn_implementation == "flash_attention_2" and not model_class._supports_flash_attn_2) or (
-                attn_implementation == "flash_attention_3" and not model_class._supports_flash_attn_3
-            ):
+            if attn_implementation.startswith("flash_attention") and not model_class._supports_flash_attn:
                 self.skipTest(f"{model_class.__name__} does not support {attn_implementation}")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 4359368a4fdb..91fdb89dec4a 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -85,6 +85,7 @@
     require_deepspeed,
     require_flash_attn,
     require_flash_attn_3,
+    require_flash_attn_4,
     require_kernels,
     require_non_hpu,
     require_torch,
@@ -2910,6 +2911,22 @@ def test_flash_attn_3_inference_equivalence(self):
     def test_flash_attn_3_inference_equivalence_right_padding(self):
         self.flash_attn_inference_equivalence(attn_implementation="flash_attention_3", padding_side="right")
 
+    @require_flash_attn_4
+    @require_torch_gpu
+    @mark.flash_attn_4_test
+    @slow
+    @is_flaky()
+    def test_flash_attn_4_inference_equivalence(self):
+        self.flash_attn_inference_equivalence(attn_implementation="flash_attention_4", padding_side="left")
+
+    @require_flash_attn_4
+    @require_torch_gpu
+    @mark.flash_attn_4_test
+    @slow
+    @is_flaky()
+    def test_flash_attn_4_inference_equivalence_right_padding(self):
+        self.flash_attn_inference_equivalence(attn_implementation="flash_attention_4", padding_side="right")
+
     def test_attn_implementation_composite_models(self):
         """
         Tests if composite models can receive a dict object as attn_implementation, where each key should be
@@ -3290,6 +3307,12 @@ def test_flash_attn_2_can_dispatch_composite_models(self):
     def test_flash_attn_3_can_dispatch_composite_models(self):
         self.flash_attn_can_dispatch_composite_models(attn_implementation="flash_attention_3")
 
+    @require_flash_attn_4
+    @require_torch_gpu
+    @mark.flash_attn_4_test
+    def test_flash_attn_4_can_dispatch_composite_models(self):
+        self.flash_attn_can_dispatch_composite_models(attn_implementation="flash_attention_4")
+
     @require_flash_attn
     @require_torch_accelerator
     @require_bitsandbytes
@@ -3447,6 +3470,13 @@ def test_flash_attn_2_from_config(self):
     def test_flash_attn_3_from_config(self):
         self.flash_attn_from_config(attn_implementation="flash_attention_3")
 
+    @require_flash_attn_4
+    @require_torch_gpu
+    @mark.flash_attn_4_test
+    @slow
+    def test_flash_attn_4_from_config(self):
+        self.flash_attn_from_config(attn_implementation="flash_attention_4")
+
     def test_sliding_window_mask(self):
         """Tests that we can control the sliding window attention behavior of a model."""
         config, inputs = self.model_tester.prepare_config_and_inputs_for_common()

From 8d3dc6c8d3dde05fd853fa6f6587048230724813 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 15:49:23 +0000
Subject: [PATCH 06/36] fix fa2 clash

---
 src/transformers/modeling_utils.py     | 2 +-
 src/transformers/utils/import_utils.py | 3 +++
 tests/generation/test_utils.py         | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 9a30ed2effab..5accb7a63c64 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1576,7 +1576,7 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool:
                 logger.info("Detect using FlashAttention2 (via kernel `kernels-community/flash-attn2`) on XPU.")
                 return True
 
-            if importlib.util.find_spec("flash_attn") is None:
+            if importlib.util.find_spec("flash_attn") is None or "flash_attn" not in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]:
                 raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
             else:
                 # Check FA2 installed version compatibility
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 73850bc5161b..9fd049c8dba2 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -841,6 +841,9 @@ def is_bitsandbytes_available(min_version: str = BITSANDBYTES_MIN_VERSION) -> bo
 @lru_cache
 def is_flash_attn_2_available() -> bool:
     is_available, flash_attn_version = _is_package_available("flash_attn", return_version=True)
+    # FA4 is also distributed under "flash_attn", hence we need to check the naming here
+    is_available = is_available and "flash_attn" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+
     if not is_available or not (is_torch_cuda_available() or is_torch_mlu_available()):
         return False
 
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 1b53c8802cb2..097f4bb8bf58 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -1764,6 +1764,7 @@ def _test_attention_implementation(self, attn_implementation):
             "sdpa": "_supports_sdpa",
             "flash_attention_2": "_supports_flash_attn",
             "flash_attention_3": "_supports_flash_attn",
+            "flash_attention_4": "_supports_flash_attn",
         }
 
         for model_class in self.all_generative_model_classes:
@@ -1974,6 +1975,7 @@ def attention_mask_padding_matches_padding_free_with_position_ids(
             "sdpa": "_supports_sdpa",
             "flash_attention_2": "_supports_flash_attn",
             "flash_attention_3": "_supports_flash_attn",
+            "flash_attention_4": "_supports_flash_attn",
         }
 
         for model_class in self.all_generative_model_classes:

From bf1d5896ae43d63f4f0f109a05db34c343e9edd4 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 17:00:26 +0000
Subject: [PATCH 07/36] unify the fa dispatch

---
 src/transformers/modeling_utils.py | 240 ++++++++++-------------------
 1 file changed, 81 insertions(+), 159 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 5accb7a63c64..d132f7f80a33 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1542,27 +1542,7 @@ def can_generate(cls) -> bool:
         # Otherwise, can't generate
         return False
 
-    def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool:
-        """
-        Check the availability of Flash Attention 2 for a given model.
-
-        Args:
-            is_init_check (`bool`, *optional*):
-                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
-                fully instantiated. This is needed as we also check the devices of the weights, which are only available
-                later after __init__. This allows to raise proper exceptions early before instantiating the full models
-                if we know that the model does not support the requested attention.
-        """
-        dtype = self.config.dtype
-
-        # check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
-        if not (self._supports_flash_attn or getattr(self, "_supports_flash_attn_2", False)):
-            raise ValueError(
-                f"{self.__class__.__name__} does not support Flash Attention 2.0 yet. Please request to add support where"
-                f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
-                " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
-            )
-
+    def _flash_attn_2_import_error(self):
         if not is_flash_attn_2_available():
             preface = "FlashAttention2 has been toggled on, but it cannot be used due to the following error:"
             install_message = "Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2."
@@ -1570,13 +1550,16 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool:
             # package `flash-attn` can not be installed on Ascend NPU, following validation logics can be ignored.
             if is_torch_npu_available():
                 logger.info("Detect using FlashAttention2 on Ascend NPU.")
-                return True
+                return
 
             if is_torch_xpu_available():
                 logger.info("Detect using FlashAttention2 (via kernel `kernels-community/flash-attn2`) on XPU.")
-                return True
+                return
 
-            if importlib.util.find_spec("flash_attn") is None or "flash_attn" not in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]:
+            if (
+                importlib.util.find_spec("flash_attn") is None
+                or "flash_attn" not in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+            ):
                 raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
             else:
                 # Check FA2 installed version compatibility
@@ -1600,61 +1583,7 @@ def _flash_attn_2_can_dispatch(self, is_init_check: bool = False) -> bool:
                     else:
                         raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
 
-        if dtype is None:
-            logger.warning_once(
-                "You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour"
-            )
-        elif dtype is not None and dtype not in [torch.float16, torch.bfloat16]:
-            logger.warning_once(
-                "Flash Attention 2 only supports torch.float16 and torch.bfloat16 dtypes, but"
-                f" the current dype in {self.__class__.__name__} is {dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
-                ' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`'
-            )
-
-        # With the early check, the parameters are not yet initialized correctly
-        if not is_init_check:
-            param_devices = list({param.device for param in self.parameters()})
-            if len(param_devices) == 1 and param_devices[0].type == "cpu":
-                if torch.cuda.is_available():
-                    logger.warning_once(
-                        "You are attempting to use Flash Attention 2 with a model not initialized on GPU. Make sure to move the model to GPU"
-                        " after initializing it on CPU with `model.to('cuda')`."
-                    )
-                elif is_torch_mlu_available():
-                    logger.warning_once(
-                        "You are attempting to use Flash Attention 2 with a model not initialized on MLU. Make sure to move the model to MLU"
-                        " after initializing it on CPU with `model.to('mlu')`."
-                    )
-                else:
-                    raise ValueError(
-                        "You are attempting to use Flash Attention 2 with a model not initialized on GPU and with no GPU available. "
-                        "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
-                        "or initialising the model on CPU and then moving it to GPU."
-                    )
-
-        # If no error raise by this point, we can return `True`
-        return True
-
-    def _flash_attn_3_can_dispatch(self, is_init_check: bool = False) -> bool:
-        """
-        Check the availability of Flash Attention 3 for a given model.
-
-        Args:
-            is_init_check (`bool`, *optional*):
-                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
-                fully instantiated. This is needed as we also check the devices of the weights, which are only available
-                later after __init__. This allows to raise proper exceptions early before instantiating the full models
-                if we know that the model does not support the requested attention.
-        """
-        dtype = self.config.dtype
-
-        if not self._supports_flash_attn:
-            raise ValueError(
-                f"{self.__class__.__name__} does not support Flash Attention 3 yet. Please request to add support where"
-                f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
-                " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
-            )
-
+    def _flash_attn_3_import_error(self):
         if not is_flash_attn_3_available():
             preface = "FlashAttention3 has been toggled on, but it cannot be used due to the following error:"
 
@@ -1674,64 +1603,7 @@ def _flash_attn_3_can_dispatch(self, is_init_check: bool = False) -> bool:
                     f"{preface} Flash Attention 3 is not available on CPU. Please make sure torch can access a CUDA device."
                 )
 
-        if dtype is None:
-            logger.warning_once(
-                "You are attempting to use Flash Attention 3 without specifying a torch dtype. This might lead to unexpected behaviour"
-            )
-        elif dtype is not None and dtype not in [torch.float16, torch.bfloat16]:
-            logger.warning_once(
-                "Flash Attention 3 only supports torch.float16 and torch.bfloat16 dtypes, but"
-                f" the current dype in {self.__class__.__name__} is {dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
-                ' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_3", dtype=torch.float16)`'
-            )
-
-        if getattr(self.config, "alibi", False) or getattr(self.config, "use_alibi", False):
-            raise ValueError("Model is configured to use ALiBi, which is not supported by Flash Attention 3.")
-
-        # Check for attention dropout, which is incompatible with FA3
-        if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
-            raise ValueError(
-                f"Model has attention_dropout={self.config.attention_dropout}, which is not supported by Flash Attention 3."
-            )
-
-        # With the early check, the parameters are not yet initialized correctly
-        if not is_init_check:
-            param_devices = list({param.device for param in self.parameters()})
-            if len(param_devices) == 1 and param_devices[0].type == "cpu":
-                if torch.cuda.is_available():
-                    logger.warning_once(
-                        "You are attempting to use Flash Attention 3 with a model not initialized on GPU. Make sure to move the model to GPU"
-                        " after initializing it on CPU with `model.to('cuda')`."
-                    )
-                else:
-                    raise ValueError(
-                        "You are attempting to use Flash Attention 3 with a model not initialized on GPU and with no GPU available. "
-                        "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
-                        "or initialising the model on CPU and then moving it to GPU."
-                    )
-
-        return True
-
-    def _flash_attn_4_can_dispatch(self, is_init_check: bool = False) -> bool:
-        """
-        Check the availability of Flash Attention 4 for a given model.
-
-        Args:
-            is_init_check (`bool`, *optional*):
-                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
-                fully instantiated. This is needed as we also check the devices of the weights, which are only available
-                later after __init__. This allows to raise proper exceptions early before instantiating the full models
-                if we know that the model does not support the requested attention.
-        """
-        dtype = self.config.dtype
-
-        if not self._supports_flash_attn:
-            raise ValueError(
-                f"{self.__class__.__name__} does not support Flash Attention 4 yet. Please request to add support where"
-                f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
-                " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
-            )
-
+    def _flash_attn_4_import_error(self):
         if not is_flash_attn_4_available():
             preface = "FlashAttention4 has been toggled on, but it cannot be used due to the following error:"
 
@@ -1754,25 +1626,58 @@ def _flash_attn_4_can_dispatch(self, is_init_check: bool = False) -> bool:
                     f"{preface} Flash Attention 4 is not available on CPU. Please make sure torch can access a CUDA device."
                 )
 
+    def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool = False) -> bool:
+        """
+        Check the availability of Flash Attention for a given model.
+
+        Args:
+            flash_attn_version (`int`):
+                The requested version of Flash Attention.
+            is_init_check (`bool`, *optional*):
+                Whether this check is performed early, i.e. at __init__ time, or later when the model and its weights are
+                fully instantiated. This is needed as we also check the devices of the weights, which are only available
+                later after __init__. This allows to raise proper exceptions early before instantiating the full models
+                if we know that the model does not support the requested attention.
+        """
+        if flash_attn_version not in [2, 3, 4]:
+            raise ValueError(f"Requested Flash Attention {flash_attn_version} which is not supported.")
+
+        if not self._supports_flash_attn:
+            raise ValueError(
+                f"{self.__class__.__name__} does not support Flash Attention {flash_attn_version} yet. Please request to add support where"
+                f" the model is hosted, on its model hub page: https://huggingface.co/{self.config._name_or_path}/discussions/new"
+                " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
+            )
+
+        # Check if we can even use the FA version based on the env of the user
+        if flash_attn_version == 2:
+            self._flash_attn_2_import_error()
+        elif flash_attn_version == 3:
+            self._flash_attn_3_import_error()
+        else:
+            self._flash_attn_4_import_error()
+
+        dtype = self.config.dtype
         if dtype is None:
             logger.warning_once(
-                "You are attempting to use Flash Attention 4 without specifying a torch dtype. This might lead to unexpected behaviour"
+                f"You are attempting to use Flash Attention {flash_attn_version} without specifying a torch dtype. This might lead to unexpected behaviour"
             )
         elif dtype is not None and dtype not in [torch.float16, torch.bfloat16]:
             logger.warning_once(
-                "Flash Attention 4 only supports torch.float16 and torch.bfloat16 dtypes, but"
+                f"Flash Attention {flash_attn_version} only supports torch.float16 and torch.bfloat16 dtypes, but"
                 f" the current dype in {self.__class__.__name__} is {dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
-                ' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_4", dtype=torch.float16)`'
+                ' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`'
             )
 
-        if getattr(self.config, "alibi", False) or getattr(self.config, "use_alibi", False):
-            raise ValueError("Model is configured to use ALiBi, which is not supported by Flash Attention 4.")
+        # FA2 has broader support for some features and devices
+        is_fa2 = flash_attn_version == 2
 
-        # Check for attention dropout, which is incompatible with FA4
-        if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
-            raise ValueError(
-                f"Model has attention_dropout={self.config.attention_dropout}, which is not supported by Flash Attention 4."
-            )
+        if not is_fa2:
+            # Check for attention dropout, which is incompatible with newer FA versions
+            if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
+                raise ValueError(
+                    f"Model has attention_dropout={self.config.attention_dropout}, which is not supported by Flash Attention {flash_attn_version}."
+                )
 
         # With the early check, the parameters are not yet initialized correctly
         if not is_init_check:
@@ -1780,16 +1685,33 @@ def _flash_attn_4_can_dispatch(self, is_init_check: bool = False) -> bool:
             if len(param_devices) == 1 and param_devices[0].type == "cpu":
                 if torch.cuda.is_available():
                     logger.warning_once(
-                        "You are attempting to use Flash Attention 4 with a model not initialized on GPU. Make sure to move the model to GPU"
+                        f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU. Make sure to move the model to GPU"
                         " after initializing it on CPU with `model.to('cuda')`."
                     )
-                else:
-                    raise ValueError(
-                        "You are attempting to use Flash Attention 4 with a model not initialized on GPU and with no GPU available. "
-                        "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
-                        "or initialising the model on CPU and then moving it to GPU."
-                    )
+                elif is_fa2:
+                    if is_torch_mlu_available():
+                        logger.warning_once(
+                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on MLU. Make sure to move the model to MLU"
+                            " after initializing it on CPU with `model.to('mlu')`."
+                        )
+                    elif is_torch_npu_available():
+                        logger.warning_once(
+                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on NPU. Make sure to move the model to NPU"
+                            " after initializing it on CPU with `model.to('npu')`."
+                        )
+                    elif is_torch_xpu_available():
+                        logger.warning_once(
+                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on XPU. Make sure to move the model to XPU"
+                            " after initializing it on CPU with `model.to('xpu')`."
+                        )
 
+                raise ValueError(
+                    f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU and with no GPU available. "
+                    "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
+                    "or initialising the model on CPU and then moving it to GPU."
+                )
+
+        # If no error raise by this point, we can return `True`
         return True
 
     def _sdpa_can_dispatch(self, is_init_check: bool = False) -> bool:
@@ -1906,9 +1828,9 @@ def _check_and_adjust_attn_implementation(
                 # raise the proper exception for requested flash attention
                 if requested_original_flash_attn:
                     if attn_implementation.endswith("2"):
-                        self._flash_attn_2_can_dispatch()
+                        self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
                     else:
-                        self._flash_attn_3_can_dispatch()
+                        self._flash_attn_can_dispatch(flash_attn_version=3, is_init_check=is_init_check)
 
                 # error properly out if a kernel was specifically requested
                 raise e
@@ -1941,11 +1863,11 @@ def get_correct_attn_implementation(self, requested_attention: Optional[str], is
 
         # Perform relevant checks
         if "flash_attention_2" in applicable_attention:
-            self._flash_attn_2_can_dispatch(is_init_check)
+            self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
         elif "flash_attention_3" in applicable_attention:
-            self._flash_attn_3_can_dispatch(is_init_check)
+            self._flash_attn_can_dispatch(flash_attn_version=4, is_init_check=is_init_check)
         elif "flash_attention_4" in applicable_attention:
-            self._flash_attn_4_can_dispatch(is_init_check)
+            self._flash_attn_can_dispatch(flash_attn_version=4, is_init_check=is_init_check)
         elif "flex_attention" in applicable_attention:
             self._flex_attn_can_dispatch(is_init_check)
         elif "sdpa" in applicable_attention:

From f5b7f9c69fdaf0b7773d30181d9b30720de468b7 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 17:03:20 +0000
Subject: [PATCH 08/36] fix

---
 src/transformers/modeling_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index d132f7f80a33..d6c4a97a3c6a 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1666,7 +1666,7 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
             logger.warning_once(
                 f"Flash Attention {flash_attn_version} only supports torch.float16 and torch.bfloat16 dtypes, but"
                 f" the current dype in {self.__class__.__name__} is {dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator,"
-                ' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", dtype=torch.float16)`'
+                f' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_{flash_attn_version}", dtype=torch.float16)`'
             )
 
         # FA2 has broader support for some features and devices

From 6288f44a5205f8119f6121c41a582d2b80f26664 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 17:16:19 +0000
Subject: [PATCH 09/36] modernbert...

---
 src/transformers/models/modernbert/modeling_modernbert.py | 3 ++-
 src/transformers/models/modernbert/modular_modernbert.py  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/modernbert/modeling_modernbert.py b/src/transformers/models/modernbert/modeling_modernbert.py
index 8069f2bec2ff..2ba92098f6dc 100644
--- a/src/transformers/models/modernbert/modeling_modernbert.py
+++ b/src/transformers/models/modernbert/modeling_modernbert.py
@@ -689,7 +689,8 @@ def _check_and_adjust_attn_implementation(
         try:
             attn_implementation = (
                 "flash_attention_2"
-                if attn_implementation is None and self._flash_attn_2_can_dispatch()
+                if attn_implementation is None
+                and self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
                 else attn_implementation
             )
         except (ValueError, ImportError):
diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py
index 8fc375671583..f251c8ee7e97 100644
--- a/src/transformers/models/modernbert/modular_modernbert.py
+++ b/src/transformers/models/modernbert/modular_modernbert.py
@@ -870,7 +870,8 @@ def _check_and_adjust_attn_implementation(
         try:
             attn_implementation = (
                 "flash_attention_2"
-                if attn_implementation is None and self._flash_attn_2_can_dispatch()
+                if attn_implementation is None
+                and self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
                 else attn_implementation
             )
         except (ValueError, ImportError):

From 15ed2eb9e15d31a9538b10a2c75acad68f8aaf18 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Thu, 27 Nov 2025 19:05:22 +0000
Subject: [PATCH 10/36] oops

---
 src/transformers/modeling_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index d6c4a97a3c6a..6b291ce512af 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1865,7 +1865,7 @@ def get_correct_attn_implementation(self, requested_attention: Optional[str], is
         if "flash_attention_2" in applicable_attention:
             self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
         elif "flash_attention_3" in applicable_attention:
-            self._flash_attn_can_dispatch(flash_attn_version=4, is_init_check=is_init_check)
+            self._flash_attn_can_dispatch(flash_attn_version=3, is_init_check=is_init_check)
         elif "flash_attention_4" in applicable_attention:
             self._flash_attn_can_dispatch(flash_attn_version=4, is_init_check=is_init_check)
         elif "flex_attention" in applicable_attention:

From 6be5bbeea9af2af00a5c7119753a4ffc54ac8232 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 28 Nov 2025 16:28:45 +0000
Subject: [PATCH 11/36] parity test

---
 .../generation/test_flash_attention_parity.py | 115 ++++++++++++------
 1 file changed, 76 insertions(+), 39 deletions(-)

diff --git a/tests/generation/test_flash_attention_parity.py b/tests/generation/test_flash_attention_parity.py
index 969cdddcd38d..55f0da845746 100644
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -1,4 +1,4 @@
-# Copyright 2025 Eduard Durech and SGLang team.
+# Copyright 2025 Eduard Durech, SGLang, and HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,12 +16,19 @@
 # RUN_SLOW=1 pytest -s tests/generation/test_flash_attention_parity.py
 
 import unittest
+from collections import defaultdict
 
 import pytest
 import torch
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.testing_utils import require_flash_attn, require_flash_attn_3, require_torch_gpu, slow
+from transformers.testing_utils import (
+    require_flash_attn,
+    require_flash_attn_3,
+    require_flash_attn_4,
+    require_torch_gpu,
+    slow,
+)
 
 
 class FlashAttentionParityTest(unittest.TestCase):
@@ -74,12 +81,17 @@ def _benchmark_generation(self, model, inputs, n_warmup=3, n_runs=5):
 
         return start_time.elapsed_time(end_time) / n_runs
 
-    @pytest.mark.flash_attn_3_test
     @require_torch_gpu
+    @pytest.mark.flash_attn_test
+    @pytest.mark.flash_attn_3_test
+    @pytest.mark.flash_attn_4_test
     @require_flash_attn
     @require_flash_attn_3
+    @require_flash_attn_4
     @slow
-    def test_flash_attention_2_3_parity(self):
+    def test_flash_attention_parity(self):
+        flash_attn_versions = [2, 3, 4]
+
         model_id = "meta-llama/Llama-3.2-1B-Instruct"
         prompt = ["The ETH AI Center is", "What is life?"]
 
@@ -87,55 +99,80 @@ def test_flash_attention_2_3_parity(self):
         model = AutoModelForCausalLM.from_pretrained(
             model_id,
             dtype=torch.bfloat16,
+            device_map="auto",
             attn_implementation="flash_attention_2",
-        ).to("cuda")
+        )
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         tokenizer.pad_token_id = tokenizer.eos_token_id
 
         # 2. Generate with both models
         inputs = tokenizer(prompt, padding=True, padding_side="left", return_tensors="pt").to("cuda")
 
+        logits = {}
+        logprobs = {}
+        outputs = defaultdict(list)
         with torch.no_grad():
-            output_2 = model.generate(
-                **inputs, max_new_tokens=20, do_sample=False, output_scores=True, return_dict_in_generate=True
-            )
-            model.set_attn_implementation("flash_attention_3")
-            output_3 = model.generate(
-                **inputs, max_new_tokens=20, do_sample=False, output_scores=True, return_dict_in_generate=True
-            )
+            def generate(model, version, outputs, logits, logprobs):
+                model.set_attn_implementation(f"flash_attention_{version}")
+                output = model.generate(
+                    **inputs, max_new_tokens=20, do_sample=False, output_scores=True, return_dict_in_generate=True
+                )
+                logit = torch.stack(output.scores)
+                logprob = torch.nn.functional.log_softmax(logit, dim=-1)
+
+                for i in range(len(prompt)):
+                    outputs[version].append(tokenizer.decode(output.sequences[i], skip_special_tokens=True))
+                logits[version] = logit
+                logprobs[version] = logprob
+
+            for version in flash_attn_versions:
+                generate(model, version, outputs, logits, logprobs)
 
         # 3. Correctness check
         # 3a. Logits
-        logits_2 = torch.stack(output_2.scores)
-        logits_3 = torch.stack(output_3.scores)
-        torch.testing.assert_close(logits_2, logits_3, atol=1e-3, rtol=1e-3)
-        logprobs_2 = torch.nn.functional.log_softmax(logits_2, dim=-1)
-        logprobs_3 = torch.nn.functional.log_softmax(logits_3, dim=-1)
-        max_logprob_diff = torch.max(torch.abs(logprobs_2 - logprobs_3)).item()
+        # FA2 as base to compare against
+        logits_1 = logits[2]
+        logprobs_1 = logprobs[2]
+        max_logprob_diffs = []
+        for version in range(1, len(flash_attn_versions)):
+            logits_x = logits[flash_attn_versions[version]]
+            logprobs_x = logprobs[flash_attn_versions[version]]
+            # TODO: logits significantly differ between FA2 and FA4
+            #torch.testing.assert_close(logits_1, logits_x, atol=1e-3, rtol=1e-3)
+            max_logprob_diffs.append(torch.max(torch.abs(logprobs_1 - logprobs_x)).item())
 
         # 3b. Generated text
-        text_2s, text_3s = [], []
-        for i in range(len(prompt)):
-            text_2s.append(tokenizer.decode(output_2.sequences[i], skip_special_tokens=True))
-            text_3s.append(tokenizer.decode(output_3.sequences[i], skip_special_tokens=True))
-
-        rouge_scores = self._calculate_rouge_l(text_2s, text_3s)
-        for i in range(len(rouge_scores)):
-            assert rouge_scores[i] > 0.99, f"Generated texts at prompt {i} do not match (ROUGE-L: {rouge_scores[i]})"
+        # FA2 as base to compare against
+        texts_1 = outputs[2]
+        rouge_scores = []
+        for version in range(1, len(flash_attn_versions)):
+            fa_version = flash_attn_versions[version]
+            texts_x = outputs[fa_version]
+            rouge_score = self._calculate_rouge_l(texts_1, texts_x)
+            for idx, score in enumerate(rouge_score):
+                assert score > 0.99, f"Generated texts at prompt {idx} do not match (ROUGE-L: {score}) comparing FA2 vs FA{fa_version}"
+            rouge_scores.append(self._calculate_rouge_l(texts_1, texts_x))
 
         # 4. Performance check
+        times = []
         with torch.no_grad():
-            time_3 = self._benchmark_generation(model, inputs)
-            model.set_attn_implementation("flash_attention_2")
-            time_2 = self._benchmark_generation(model, inputs)
-
-        print(f"\n--- Flash Attention {2, 3} Parity Test on {model_id} ---")
-        print(f"Prompt: '{prompt}'")
-        print(f"Generated text with Flash Attention 2: {text_2s}")
-        print(f"Generated text with Flash Attention 3: {text_3s}")
-        print(f"ROUGE-L: {rouge_scores}")
-        print(f"Max absolute difference in logprobs: {max_logprob_diff:.5e}")
-        print(f"Flash Attention 2 latency: {time_2:.2f} ms")
-        print(f"Flash Attention 3 latency: {time_3:.2f} ms")
-        print(f"Speed-up: {time_2 / time_3:.2f}x")
+            for version in range(len(flash_attn_versions)):
+                model.set_attn_implementation(f"flash_attention_{flash_attn_versions[version]}")
+                times.append(self._benchmark_generation(model, inputs))
+
+        # Summary
+        print(f"\n--- Flash Attention Parity Test on {model_id} ---")
+        print(f"Prompts: '{prompt}'")
+        print("\nGenerated texts:")
+        for version in flash_attn_versions:
+            print(f"    With FA{version}: {outputs[version]}")
+        print("\nROUGE-L scores:")
+        for idx, version in enumerate(range(1, len(flash_attn_versions))):
+            print(f"    Between FA2 and FA{flash_attn_versions[version]}: {rouge_scores[idx]}")
+        print("\nMax absolute difference in logprobs:")
+        for idx, version in enumerate(range(1, len(flash_attn_versions))):
+            print(f"    Between FA2 and FA{flash_attn_versions[version]}: {max_logprob_diffs[idx]:.5e}")
+        print("\nLatency:")
+        for idx, version in enumerate(flash_attn_versions):
+            print(f"    With FA{version}: {times[idx]}")
         print("---")

From dad1b04c3937b134983628d0002313f381093639 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 28 Nov 2025 16:52:41 +0000
Subject: [PATCH 12/36] style

---
 tests/generation/test_flash_attention_parity.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/generation/test_flash_attention_parity.py b/tests/generation/test_flash_attention_parity.py
index 55f0da845746..2f14561c0772 100644
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -112,6 +112,7 @@ def test_flash_attention_parity(self):
         logprobs = {}
         outputs = defaultdict(list)
         with torch.no_grad():
+
             def generate(model, version, outputs, logits, logprobs):
                 model.set_attn_implementation(f"flash_attention_{version}")
                 output = model.generate(
@@ -131,14 +132,14 @@ def generate(model, version, outputs, logits, logprobs):
         # 3. Correctness check
         # 3a. Logits
         # FA2 as base to compare against
-        logits_1 = logits[2]
+        # logits_1 = logits[2]
         logprobs_1 = logprobs[2]
         max_logprob_diffs = []
         for version in range(1, len(flash_attn_versions)):
-            logits_x = logits[flash_attn_versions[version]]
-            logprobs_x = logprobs[flash_attn_versions[version]]
             # TODO: logits significantly differ between FA2 and FA4
-            #torch.testing.assert_close(logits_1, logits_x, atol=1e-3, rtol=1e-3)
+            # logits_x = logits[flash_attn_versions[version]]
+            # torch.testing.assert_close(logits_1, logits_x, atol=1e-3, rtol=1e-3)
+            logprobs_x = logprobs[flash_attn_versions[version]]
             max_logprob_diffs.append(torch.max(torch.abs(logprobs_1 - logprobs_x)).item())
 
         # 3b. Generated text
@@ -150,7 +151,9 @@ def generate(model, version, outputs, logits, logprobs):
             texts_x = outputs[fa_version]
             rouge_score = self._calculate_rouge_l(texts_1, texts_x)
             for idx, score in enumerate(rouge_score):
-                assert score > 0.99, f"Generated texts at prompt {idx} do not match (ROUGE-L: {score}) comparing FA2 vs FA{fa_version}"
+                assert score > 0.99, (
+                    f"Generated texts at prompt {idx} do not match (ROUGE-L: {score}) comparing FA2 vs FA{fa_version}"
+                )
             rouge_scores.append(self._calculate_rouge_l(texts_1, texts_x))
 
         # 4. Performance check

From 34c15c2ab3e21c17600b9765b6665e5ffc9c37e6 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 28 Nov 2025 17:00:27 +0000
Subject: [PATCH 13/36] nit

---
 tests/generation/test_flash_attention_parity.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/generation/test_flash_attention_parity.py b/tests/generation/test_flash_attention_parity.py
index 2f14561c0772..84cc73d68001 100644
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -159,8 +159,8 @@ def generate(model, version, outputs, logits, logprobs):
         # 4. Performance check
         times = []
         with torch.no_grad():
-            for version in range(len(flash_attn_versions)):
-                model.set_attn_implementation(f"flash_attention_{flash_attn_versions[version]}")
+            for version in flash_attn_versions:
+                model.set_attn_implementation(f"flash_attention_{version}")
                 times.append(self._benchmark_generation(model, inputs))
 
         # Summary

From 776a1af358ce1d187e927caee228c7a16f393d8c Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 6 Mar 2026 16:46:45 +0000
Subject: [PATCH 14/36] fixup imports for fa4

---
 src/transformers/modeling_utils.py     | 5 +----
 src/transformers/utils/import_utils.py | 7 +++----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 88d2bb70bc9e..3cf1c64bd28e 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1616,10 +1616,7 @@ def _flash_attn_4_import_error(self):
         if not is_flash_attn_4_available():
             preface = "FlashAttention4 has been toggled on, but it cannot be used due to the following error:"
 
-            if (
-                importlib.util.find_spec("flash_attn") is None
-                or "flash-attn-cute" not in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
-            ):
+            if importlib.util.find_spec("flash_attn") is None or importlib.util.find_spec("flash_attn.cute") is None:
                 raise ImportError(f"{preface} the package flash_attn (under cute) seems to be not installed.")
 
             if torch.cuda.is_available():
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index effe3902c76d..eb5479324884 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -943,11 +943,10 @@ def is_flash_attn_3_available() -> bool:
 
 @lru_cache
 def is_flash_attn_4_available() -> bool:
-    if not (is_torch_cuda_available() and _is_package_available("flash_attn")):
+    # Check first under base flash then cute
+    if not (is_torch_cuda_available() and _is_package_available("flash_attn")[0]):
         return False
-
-    # FA4 is distributed to just "flash_attn" but its mapping is properly mapped to cute
-    return "flash-attn-cute" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+    return _is_package_available("flash_attn.cute")[0]
 
 
 @lru_cache

From cc7a1b73eef5d92bc72a1c36df6187a42e83a28f Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 6 Mar 2026 23:41:21 +0000
Subject: [PATCH 15/36] enable attention sinks, fixup logits checks in parity
 test

---
 .../modeling_flash_attention_utils.py         | 26 +++++++++++--------
 .../models/gpt_oss/modeling_gpt_oss.py        |  2 +-
 .../models/gpt_oss/modular_gpt_oss.py         |  2 +-
 .../generation/test_flash_attention_parity.py | 14 ++++++----
 4 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index 1172762cbe96..926fce55b3d5 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -72,6 +72,10 @@ def is_flash_attn_available():
     "dropout": "dropout_p",
     "sliding_window": "window_size",
 }
+# alternative names within the different flash attention APIs, e.g. for attention sinks
+_flash_api_alternative_names = {
+    "s_aux": "learnable_sink"
+}
 
 
 def _lazy_imports(
@@ -156,6 +160,9 @@ def _lazy_define_process_function(flash_function):
         fa_param = _hf_api_to_flash_mapping.get(param, param)
         supports_mapping[fa_param] = fa_param in flash_parameters
 
+        if (fa_alternative_name := _flash_api_alternative_names.get(param, param)) != fa_param:
+            supports_mapping[fa_alternative_name] = fa_alternative_name in flash_parameters
+
     return partial(_process_flash_attention_kwargs, supports_mapping=supports_mapping)
 
 
@@ -570,9 +577,11 @@ def _process_flash_attention_kwargs(
     if supports_mapping["softcap"] and softcap is not None:
         flash_kwargs["softcap"] = softcap
 
-    # Only within kernel implementation atm
-    if supports_mapping["s_aux"] and s_aux is not None:
-        flash_kwargs["s_aux"] = s_aux
+    if ((legacy_sink_param := supports_mapping["s_aux"]) or supports_mapping["learnable_sink"]) and s_aux is not None:
+        if legacy_sink_param:
+            flash_kwargs["s_aux"] = s_aux  # e.g. FA3 (vllm)
+        else:
+            flash_kwargs["learnable_sink"] = s_aux  # FA4
 
     # There is a limitation of the flash attention API, as the function `flash_attn_varlen_func`
     # may require `max_length_q`, `max_length_k` to be passed as `int` and not `torch.Tensor`.
@@ -682,15 +691,13 @@ def _flash_attention_forward(
         if "mps" in str(q.device):
             cu_seq_lens_k = cu_seq_lens_k.clone()
 
-        # Newer fa versions no longer accept `max_seqlen_(q|k)`
-        final_flash_kwargs = flash_kwargs(max_seqlen_q=max_length_q, max_seqlen_k=max_length_k)
         out_unpad = flash_varlen_fn(
             q,
             k,
             v,
             cu_seqlens_q=cu_seq_lens_q,
             cu_seqlens_k=cu_seq_lens_k,
-            **final_flash_kwargs,
+            **flash_kwargs(max_seqlen_q=max_length_q, max_seqlen_k=max_length_k),
         )
         if isinstance(out_unpad, tuple):
             out_unpad = out_unpad[0]
@@ -713,15 +720,13 @@ def _flash_attention_forward(
         if "mps" in str(q.device):
             cu_seq_lens_k = cu_seq_lens_k.clone()
 
-        # Newer fa versions no longer accept `max_seqlen_(q|k)`
-        final_flash_kwargs = flash_kwargs(max_seqlen_q=max_length_q, max_seqlen_k=max_length_k)
         out = flash_varlen_fn(
             q,
             k,
             v,
             cu_seqlens_q=cu_seq_lens_q,
             cu_seqlens_k=cu_seq_lens_k,
-            **final_flash_kwargs,
+            **flash_kwargs(max_seqlen_q=max_length_q, max_seqlen_k=max_length_k),
         )
         if isinstance(out, tuple):
             out = out[0]
@@ -730,8 +735,7 @@ def _flash_attention_forward(
 
     # No padding
     else:
-        final_flash_kwargs = flash_kwargs()
-        out = flash_fn(query_states, key_states, value_states, **final_flash_kwargs)
+        out = flash_fn(query_states, key_states, value_states, **flash_kwargs())
         if isinstance(out, tuple):
             out = out[0]
 
diff --git a/src/transformers/models/gpt_oss/modeling_gpt_oss.py b/src/transformers/models/gpt_oss/modeling_gpt_oss.py
index 891860848763..1cf9fca69f15 100644
--- a/src/transformers/models/gpt_oss/modeling_gpt_oss.py
+++ b/src/transformers/models/gpt_oss/modeling_gpt_oss.py
@@ -410,7 +410,7 @@ class GptOssPreTrainedModel(PreTrainedModel):
         "attentions": GptOssAttention,
     }
     _keep_in_fp32_modules = ["post_attention_layernorm", "input_layernorm", "norm"]
-    _compatible_flash_implementations = ["kernels-community/vllm-flash-attn3"]
+    _compatible_flash_implementations = ["kernels-community/vllm-flash-attn3", "flash_attention_4"]
 
     @torch.no_grad()
     def _init_weights(self, module):
diff --git a/src/transformers/models/gpt_oss/modular_gpt_oss.py b/src/transformers/models/gpt_oss/modular_gpt_oss.py
index b360e3739b70..a4b6d8a64881 100644
--- a/src/transformers/models/gpt_oss/modular_gpt_oss.py
+++ b/src/transformers/models/gpt_oss/modular_gpt_oss.py
@@ -319,7 +319,7 @@ def forward(
 class GptOssPreTrainedModel(LlamaPreTrainedModel):
     _keep_in_fp32_modules = ["post_attention_layernorm", "input_layernorm", "norm"]
     _supports_sdpa = False
-    _compatible_flash_implementations = ["kernels-community/vllm-flash-attn3"]
+    _compatible_flash_implementations = ["kernels-community/vllm-flash-attn3", "flash_attention_4"]
     _can_record_outputs = {
         "router_logits": OutputRecorder(GptOssTopKRouter, index=0),
         "hidden_states": GptOssDecoderLayer,
diff --git a/tests/generation/test_flash_attention_parity.py b/tests/generation/test_flash_attention_parity.py
index 84cc73d68001..f64b8c6fe88a 100644
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -112,7 +112,6 @@ def test_flash_attention_parity(self):
         logprobs = {}
         outputs = defaultdict(list)
         with torch.no_grad():
-
             def generate(model, version, outputs, logits, logprobs):
                 model.set_attn_implementation(f"flash_attention_{version}")
                 output = model.generate(
@@ -132,16 +131,21 @@ def generate(model, version, outputs, logits, logprobs):
         # 3. Correctness check
         # 3a. Logits
         # FA2 as base to compare against
-        # logits_1 = logits[2]
+        logits_1 = logits[2]
         logprobs_1 = logprobs[2]
         max_logprob_diffs = []
         for version in range(1, len(flash_attn_versions)):
-            # TODO: logits significantly differ between FA2 and FA4
-            # logits_x = logits[flash_attn_versions[version]]
-            # torch.testing.assert_close(logits_1, logits_x, atol=1e-3, rtol=1e-3)
+            logits_x = logits[flash_attn_versions[version]]
             logprobs_x = logprobs[flash_attn_versions[version]]
             max_logprob_diffs.append(torch.max(torch.abs(logprobs_1 - logprobs_x)).item())
 
+            # Only 80% need to pass the tolerance (big model with several steps)
+            atol, fraction = 4e-2, 0.8
+            logits_ok = (torch.abs(logits_1 - logits_x) <= atol).float().mean().item()
+            assert logits_ok >= fraction, (
+                f"FA{flash_attn_versions[version]} logits pass fraction {logits_ok:.6f} < {fraction:.6f}"
+            )
+
         # 3b. Generated text
         # FA2 as base to compare against
         texts_1 = outputs[2]

From 65912b2c02fe2f0653eb9710dc32b84aa209ae56 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 6 Mar 2026 23:52:58 +0000
Subject: [PATCH 16/36] style

---
 src/transformers/modeling_flash_attention_utils.py | 4 +---
 tests/generation/test_flash_attention_parity.py    | 1 +
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index 926fce55b3d5..ed04e578a14f 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -73,9 +73,7 @@ def is_flash_attn_available():
     "sliding_window": "window_size",
 }
 # alternative names within the different flash attention APIs, e.g. for attention sinks
-_flash_api_alternative_names = {
-    "s_aux": "learnable_sink"
-}
+_flash_api_alternative_names = {"s_aux": "learnable_sink"}
 
 
 def _lazy_imports(
diff --git a/tests/generation/test_flash_attention_parity.py b/tests/generation/test_flash_attention_parity.py
index f64b8c6fe88a..28327dca04cb 100644
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -112,6 +112,7 @@ def test_flash_attention_parity(self):
         logprobs = {}
         outputs = defaultdict(list)
         with torch.no_grad():
+
             def generate(model, version, outputs, logits, logprobs):
                 model.set_attn_implementation(f"flash_attention_{version}")
                 output = model.generate(

From d07749f4c6c11d0d4726d5d6a9ca272e76424d34 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Mon, 9 Mar 2026 20:10:39 +0000
Subject: [PATCH 17/36] change dispatch logic and introduce lower bound for FA

---
 .../modeling_flash_attention_utils.py         |   5 +-
 src/transformers/modeling_utils.py            | 238 +++++++++---------
 src/transformers/utils/__init__.py            |   1 -
 src/transformers/utils/import_utils.py        |  19 +-
 4 files changed, 120 insertions(+), 143 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index ed04e578a14f..c4078ebf36c0 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -24,7 +24,6 @@
     is_flash_attn_2_available,
     is_flash_attn_3_available,
     is_flash_attn_4_available,
-    is_flash_attn_greater_or_equal_2_10,
     is_torch_npu_available,
     is_torch_xpu_available,
     logging,
@@ -37,10 +36,8 @@
 
 # TODO Deprecate when all models have the attention interface
 def flash_attn_supports_top_left_mask():
-    if is_flash_attn_3_available() or is_flash_attn_4_available():
+    if is_flash_attn_2_available() or is_flash_attn_3_available() or is_flash_attn_4_available():
         return False
-    if is_flash_attn_2_available():
-        return not is_flash_attn_greater_or_equal_2_10()
 
     from .integrations.npu_flash_attention import is_npu_fa2_top_left_aligned_causal_mask
 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 3cf1c64bd28e..a3160669c4c8 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -125,6 +125,7 @@
     PACKAGE_DISTRIBUTION_MAPPING,
     is_huggingface_hub_greater_or_equal,
     is_sagemaker_mp_enabled,
+    is_torch_cuda_available,
     is_tracing,
 )
 from .utils.loading_report import LoadStateDictInfo, log_state_dict_report
@@ -1549,88 +1550,68 @@ def can_generate(cls) -> bool:
         # Otherwise, can't generate
         return False
 
-    def _flash_attn_2_import_error(self):
-        if not is_flash_attn_2_available():
-            preface = "FlashAttention2 has been toggled on, but it cannot be used due to the following error:"
-            install_message = "Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2."
+    def _flash_attn_import_error(
+        self,
+        flash_attn_version: int,
+        general_availability_check: Callable,
+        pkg_availability_check: Callable,
+        supported_devices: tuple[tuple[Callable, str]],
+        custom_supported_devices: tuple[tuple[Callable, str]]=(),
+        cuda_major_versions: tuple[int] | None = None,
+    ):
+        """
+        Checks whether the specified Flash Attention version is supported and if not, searches for the specific reason
+        on why it failed - package import and/or device incompatibility issues.
 
-            # package `flash-attn` can not be installed on Ascend NPU, following validation logics can be ignored.
-            if is_torch_npu_available():
-                logger.info("Detect using FlashAttention2 on Ascend NPU.")
+        Args:
+            flash_attn_version (`int`):
+                The requested version of Flash Attention.
+            general_availability_check (`Callable`):
+                Checks whether our `is_available` function detects the specific FA version. Failing reasons
+                are then checked for one-by-one.
+            pkg_availability_check (`Callable`):
+                Checks whether the package could theoretically be detected in the environment by the init structures.
+                This is not a sure-fire check as device compatibility with FA is just as important.
+            supported_devices (`tuple[tuple[Callable, str]]`):
+                Essentially a list (for mutable kwargs reasons a tuple) of the supported devices in the format of
+                `(device_availability_check, device_name)`, i.e. a pair of the associated device's name and whether
+                it is available in the environment.
+            custom_supported_devices (`tuple[tuple[Callable, str]]`, *optional*, defaults to `()`):
+                Essentially a list (for mutable kwargs reasons a tuple) of the custom supported devices in the format of
+                `(device_availability_check, info_message)`. These custom devices have custom logic outside the torch
+                ecosystem either via kernels or other packages and hence have early checks for availability.
+            cuda_major_versions (`tuple[int]`, *optional*):
+                A potential list of major cuda versions supported for this version of Flash Attention. This is mostly
+                affecting more recent versions which are more specialized to the features of new hardware.
+        """
+        # Certain devices have custom workarounds e.g. with their own package distribution (NPU) or via kernels (XPU)
+        for device_availability_check, info_message in custom_supported_devices:
+            if device_availability_check():
+                logger.info(info_message)
                 return
 
-            if is_torch_xpu_available():
-                logger.info(
-                    f"Detect using FlashAttention2 (via kernel `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}`) on XPU."
-                )
-                return True
+        if not general_availability_check():
+            preface = f"FlashAttention{flash_attn_version} has been toggled on, but it cannot be used due to the following error:"
 
-            if (
-                importlib.util.find_spec("flash_attn") is None
-                or "flash_attn" not in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
-            ):
-                raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
+            # Can the package be seen in the import structure
+            if not pkg_availability_check():
+                raise ImportError(
+                    f"{preface} the package for FlashAttention{flash_attn_version} seems to be not installed."
+                )
             else:
-                # Check FA2 installed version compatibility
-                flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
-                if torch.version.cuda:
-                    if flash_attention_version < version.parse("2.1.0"):
-                        raise ImportError(
-                            f"{preface} you need flash_attn package version to be greater or equal than 2.1.0. Detected version {flash_attention_version}. {install_message}"
-                        )
-                    elif not torch.cuda.is_available():
-                        raise ValueError(
-                            f"{preface} Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device."
-                        )
-                    else:
-                        raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
-                elif torch.version.hip:
-                    if flash_attention_version < version.parse("2.0.4"):
+                # Supported devices availability
+                device_availability_checks, device_names = zip(*supported_devices)
+                if not any(device_availability_check() for device_availability_check in device_availability_checks):
+                    raise ImportError(
+                        f"{preface} FlashAttention{flash_attn_version} is not available on CPU. Please make sure you are on any of the supported devices: {device_names}."
+                    )
+                # Cuda major versions (more recent FA versions are specialized to newer cuda devices)
+                elif cuda_major_versions is not None and is_torch_cuda_available():
+                    major, _ = torch.cuda.get_device_capability()
+                    if major not in cuda_major_versions:
                         raise ImportError(
-                            f"{preface} you need flash_attn package version to be greater or equal than 2.0.4. Detected version {flash_attention_version}. {install_message}"
+                            f"{preface} FlashAttention{flash_attn_version} requires compute capability >= {min(cuda_major_versions)}, but found {torch.cuda.get_device_capability()} with compute capability {major}.x"
                         )
-                    else:
-                        raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
-
-    def _flash_attn_3_import_error(self):
-        if not is_flash_attn_3_available():
-            preface = "FlashAttention3 has been toggled on, but it cannot be used due to the following error:"
-
-            if importlib.util.find_spec("flash_attn_3") is None:
-                raise ImportError(f"{preface} the package flash_attn_3 seems to be not installed.")
-
-            if torch.cuda.is_available():
-                major, _ = torch.cuda.get_device_capability()
-                if major < 8:
-                    raise ValueError(
-                        f"{preface} Flash Attention 3 requires compute capability >= 8.0, but found {torch.cuda.get_device_capability()} with compute capability {major}.0."
-                    )
-                else:
-                    raise ImportError(f"{preface} Flash Attention 3 is not available.")
-            else:
-                raise ValueError(
-                    f"{preface} Flash Attention 3 is not available on CPU. Please make sure torch can access a CUDA device."
-                )
-
-    def _flash_attn_4_import_error(self):
-        if not is_flash_attn_4_available():
-            preface = "FlashAttention4 has been toggled on, but it cannot be used due to the following error:"
-
-            if importlib.util.find_spec("flash_attn") is None or importlib.util.find_spec("flash_attn.cute") is None:
-                raise ImportError(f"{preface} the package flash_attn (under cute) seems to be not installed.")
-
-            if torch.cuda.is_available():
-                major, _ = torch.cuda.get_device_capability()
-                if major < 9:
-                    raise ValueError(
-                        f"{preface} Flash Attention 4 requires compute capability >= 9.0, but found {torch.cuda.get_device_capability()} with compute capability {major}.0."
-                    )
-                else:
-                    raise ImportError(f"{preface} Flash Attention 4 is not available.")
-            else:
-                raise ValueError(
-                    f"{preface} Flash Attention 4 is not available on CPU. Please make sure torch can access a CUDA device."
-                )
 
     def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool = False) -> bool:
         """
@@ -1645,9 +1626,6 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
                 later after __init__. This allows to raise proper exceptions early before instantiating the full models
                 if we know that the model does not support the requested attention.
         """
-        if flash_attn_version not in [2, 3, 4]:
-            raise ValueError(f"Requested Flash Attention {flash_attn_version} which is not supported.")
-
         if not self._supports_flash_attn:
             raise ValueError(
                 f"{self.__class__.__name__} does not support Flash Attention {flash_attn_version} yet. Please request to add support where"
@@ -1655,18 +1633,64 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
                 " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new"
             )
 
+        if flash_attn_version not in [2, 3, 4]:
+            raise ValueError(f"Requested Flash Attention {flash_attn_version} which is not supported.")
+
+        # Big matrix that shows each FA versions limitations by device and major cuda versions
+        flash_attention_compatibility_matrix = {
+            2: {
+                "flash_attn_version": 2,
+                "general_availability_check": is_flash_attn_2_available,
+                "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
+                and "flash_attn" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"],
+                "supported_devices": (
+                    (is_torch_cuda_available, "cuda"),
+                    (is_torch_mlu_available, "mlu"),
+                    (is_torch_npu_available, "npu"),
+                    (is_torch_xpu_available, "xpu"),
+                ),
+                "custom_supported_devices": (
+                    (is_torch_npu_available, "Detect using FlashAttention2 on Ascend NPU."),
+                    (
+                        is_torch_xpu_available,
+                        f"Detect using FlashAttention2 (via kernel `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}`) on XPU.",
+                    ),
+                ),
+            },
+            3: {
+                "flash_attn_version": 3,
+                "general_availability_check": is_flash_attn_3_available,
+                "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn_3") is not None,
+                "supported_devices": ((is_torch_cuda_available, "cuda"),),
+                "cuda_major_versions": (8, 9),  # Ampere and Hopper
+            },
+            4: {
+                "flash_attn_version": 4,
+                "general_availability_check": is_flash_attn_4_available,
+                "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
+                and importlib.util.find_spec("flash_attn.cute") is not None,
+                "supported_devices": ((is_torch_cuda_available, "cuda"),),
+                "cuda_major_versions": (9, 10),  # Hopper and Blackwell
+            },
+        }
+
         # Check if we can even use the FA version based on the env of the user
-        if flash_attn_version == 2:
-            self._flash_attn_2_import_error()
-        elif flash_attn_version == 3:
-            self._flash_attn_3_import_error()
-        else:
-            self._flash_attn_4_import_error()
+        self._flash_attn_import_error(**flash_attention_compatibility_matrix[flash_attn_version])
+
+        # Check for attention dropout, which is incompatible with newer FA versions
+        # (many should not really care about dropout as it is not super effective, hence warning for now)
+        if flash_attn_version > 2:
+            if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
+                logger.warning_once(
+                    f"You are attempting to use Flash Attention {flash_attn_version} with dropout. "
+                    "This might lead to unexpected behaviour as this is not supported on recent versions of Flash Attention."
+                )
 
+        # People often move dtypes after init so we only warn in those cases
         dtype = self.config.dtype
         if dtype is None:
             logger.warning_once(
-                f"You are attempting to use Flash Attention {flash_attn_version} without specifying a torch dtype. This might lead to unexpected behaviour"
+                f"You are attempting to use Flash Attention {flash_attn_version} without specifying a dtype. This might lead to unexpected behaviour"
             )
         elif dtype is not None and dtype not in [torch.float16, torch.bfloat16]:
             logger.warning_once(
@@ -1675,48 +1699,20 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
                 f' or load the model with the `dtype` argument. Example: `model = AutoModel.from_pretrained("meta-llama/Llama-3.2-1B", attn_implementation="flash_attention_{flash_attn_version}", dtype=torch.float16)`'
             )
 
-        # FA2 has broader support for some features and devices
-        is_fa2 = flash_attn_version == 2
-
-        if not is_fa2:
-            # Check for attention dropout, which is incompatible with newer FA versions
-            if hasattr(self.config, "attention_dropout") and self.config.attention_dropout > 0:
-                raise ValueError(
-                    f"Model has attention_dropout={self.config.attention_dropout}, which is not supported by Flash Attention {flash_attn_version}."
-                )
-
         # With the early check, the parameters are not yet initialized correctly
         if not is_init_check:
             param_devices = list({param.device for param in self.parameters()})
             if len(param_devices) == 1 and param_devices[0].type == "cpu":
-                if torch.cuda.is_available():
-                    logger.warning_once(
-                        f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU. Make sure to move the model to GPU"
-                        " after initializing it on CPU with `model.to('cuda')`."
-                    )
-                elif is_fa2:
-                    if is_torch_mlu_available():
-                        logger.warning_once(
-                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on MLU. Make sure to move the model to MLU"
-                            " after initializing it on CPU with `model.to('mlu')`."
-                        )
-                    elif is_torch_npu_available():
-                        logger.warning_once(
-                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on NPU. Make sure to move the model to NPU"
-                            " after initializing it on CPU with `model.to('npu')`."
-                        )
-                    elif is_torch_xpu_available():
-                        logger.warning_once(
-                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on XPU. Make sure to move the model to XPU"
-                            " after initializing it on CPU with `model.to('xpu')`."
+                for device_availability_check, device_name in flash_attention_compatibility_matrix[flash_attn_version][
+                    "supported_devices"
+                ]:
+                    if device_availability_check():
+                        raise ValueError(
+                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU. Make sure to move the model to GPU "
+                            "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
+                            f"or initialising the model on CPU and then moving it to GPU, e.g. with `model.to('{device_name}')`."
                         )
 
-                raise ValueError(
-                    f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU and with no GPU available. "
-                    "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
-                    "or initialising the model on CPU and then moving it to GPU."
-                )
-
         # If no error raise by this point, we can return `True`
         return True
 
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index 7b8bfb80ec19..699d28c7ff04 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -134,7 +134,6 @@
     is_flash_attn_3_available,
     is_flash_attn_4_available,
     is_flash_attn_greater_or_equal,
-    is_flash_attn_greater_or_equal_2_10,
     is_flute_available,
     is_fouroversix_available,
     is_fp_quant_available,
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index b31686acb986..8c7ecd419c72 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -920,18 +920,9 @@ def is_flash_attn_2_available() -> bool:
     if not is_available or not (is_torch_cuda_available() or is_torch_mlu_available()):
         return False
 
-    import torch
-
+    # Only allow versions >= 2.3.3 to avoid very old legacy workarounds that are now 2+ years old
     try:
-        if torch.version.cuda:
-            return version.parse(flash_attn_version) >= version.parse("2.1.0")
-        elif torch.version.hip:
-            # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention
-            return version.parse(flash_attn_version) >= version.parse("2.0.4")
-        elif is_torch_mlu_available():
-            return version.parse(flash_attn_version) >= version.parse("2.3.3")
-        else:
-            return False
+        return version.parse(flash_attn_version) >= version.parse("2.3.3")
     except packaging.version.InvalidVersion:
         return False
 
@@ -949,12 +940,6 @@ def is_flash_attn_4_available() -> bool:
     return _is_package_available("flash_attn.cute")[0]
 
 
-@lru_cache
-def is_flash_attn_greater_or_equal_2_10() -> bool:
-    _, flash_attn_version = _is_package_available("flash_attn", return_version=True)
-    return is_flash_attn_2_available() and version.parse(flash_attn_version) >= version.parse("2.1.0")
-
-
 @lru_cache
 def is_flash_attn_greater_or_equal(library_version: str) -> bool:
     is_available, flash_attn_version = _is_package_available("flash_attn", return_version=True)

From ed88dcce5ca6987c879002ffae3dfc0bc539ca33 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Mon, 9 Mar 2026 20:15:10 +0000
Subject: [PATCH 18/36] style

---
 src/transformers/modeling_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 596b08f61cf2..9134d7380b15 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1556,7 +1556,7 @@ def _flash_attn_import_error(
         general_availability_check: Callable,
         pkg_availability_check: Callable,
         supported_devices: tuple[tuple[Callable, str]],
-        custom_supported_devices: tuple[tuple[Callable, str]]=(),
+        custom_supported_devices: tuple[tuple[Callable, str]] = (),
         cuda_major_versions: tuple[int] | None = None,
     ):
         """

From 7fba6df67fc048be473594bc68690abff6c44124 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Mon, 9 Mar 2026 20:40:25 +0000
Subject: [PATCH 19/36] fix test

---
 src/transformers/modeling_utils.py | 2 +-
 tests/utils/test_modeling_utils.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 9134d7380b15..017e47dbce93 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1596,7 +1596,7 @@ def _flash_attn_import_error(
             # Can the package be seen in the import structure
             if not pkg_availability_check():
                 raise ImportError(
-                    f"{preface} the package for FlashAttention{flash_attn_version} seems to be not installed."
+                    f"{preface} the package for FlashAttention{flash_attn_version} doesn't seem to be not installed."
                 )
             else:
                 # Supported devices availability
diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 341bf30fec33..e971af43f064 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -2948,7 +2948,7 @@ def test_not_available_flash(self):
             _ = AutoModel.from_pretrained(
                 "hf-internal-testing/tiny-random-GPTBigCodeModel", attn_implementation="flash_attention_2"
             )
-        self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception))
+        self.assertTrue("the package for FlashAttention2 doesn't seem to be not installed." in str(cm.exception))
 
     def test_not_available_flash_with_config(self):
         if is_flash_attn_2_available():
@@ -2971,7 +2971,7 @@ def test_not_available_flash_with_config(self):
                 attn_implementation="flash_attention_2",
             )
 
-        self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception))
+        self.assertTrue("the package for FlashAttention2 doesn't seem to be not installed." in str(cm.exception))
 
     def test_kernels_fallback(self):
         if not is_kernels_available():

From 27acafe059baca9ed8a4dced9822f8ad877e9483 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Mon, 9 Mar 2026 22:31:43 +0000
Subject: [PATCH 20/36] min fa2, avoid 2x device sync

---
 src/transformers/modeling_flash_attention_utils.py | 5 ++++-
 src/transformers/modeling_utils.py                 | 6 ++++++
 src/transformers/utils/import_utils.py             | 3 +++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index c4078ebf36c0..16519dc34246 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -585,13 +585,16 @@ def _process_flash_attention_kwargs(
     #   - Env: `TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1`
     #   - Before compiling: `torch._dynamo.config.capture_scalar_outputs = True`
     # to allow torch compile to handle scalar outputs in those cases.
+    same_max_seqlen = max_seqlen_q is max_seqlen_k  # to avoid 2x device syncs
     if supports_mapping["max_seqlen_q"] and max_seqlen_q is not None:
         if not isinstance(max_seqlen_q, int) and is_tracing(max_seqlen_q):
             max_seqlen_q = max_seqlen_q.item()
         flash_kwargs["max_seqlen_q"] = max_seqlen_q
 
     if supports_mapping["max_seqlen_k"] and max_seqlen_k is not None:
-        if not isinstance(max_seqlen_k, int) and is_tracing(max_seqlen_k):
+        if same_max_seqlen and flash_kwargs["max_seqlen_q"] is not None:
+            max_seqlen_k = flash_kwargs["max_seqlen_q"]
+        elif not isinstance(max_seqlen_k, int) and is_tracing(max_seqlen_k):
             max_seqlen_k = max_seqlen_k.item()
         flash_kwargs["max_seqlen_k"] = max_seqlen_k
 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 017e47dbce93..c7e79ac84d1d 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -123,6 +123,7 @@
 from .utils.hub import DownloadKwargs, create_and_tag_model_card, get_checkpoint_shard_files
 from .utils.import_utils import (
     PACKAGE_DISTRIBUTION_MAPPING,
+    is_flash_attn_greater_or_equal,
     is_huggingface_hub_greater_or_equal,
     is_sagemaker_mp_enabled,
     is_torch_cuda_available,
@@ -1598,6 +1599,11 @@ def _flash_attn_import_error(
                 raise ImportError(
                     f"{preface} the package for FlashAttention{flash_attn_version} doesn't seem to be not installed."
                 )
+            # Minimum version (FA2 only)
+            elif flash_attn_version == 2 and not is_flash_attn_greater_or_equal("2.3.3"):
+                raise ImportError(
+                    f"{preface} FlashAttention{flash_attn_version} requires at least version `2.3.3`."
+                )
             else:
                 # Supported devices availability
                 device_availability_checks, device_names = zip(*supported_devices)
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index e7954f6994c5..34a6ccc94c56 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -944,6 +944,9 @@ def is_flash_attn_4_available() -> bool:
 @lru_cache
 def is_flash_attn_greater_or_equal(library_version: str) -> bool:
     is_available, flash_attn_version = _is_package_available("flash_attn", return_version=True)
+    # FA4 is also distributed under "flash_attn", hence we need to check the naming here
+    is_available = is_available and "flash_attn" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"]
+
     if not is_available:
         return False
     try:

From afa0940ae09e053e22301d93da54c59c5ffb0e0f Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Mon, 9 Mar 2026 22:37:18 +0000
Subject: [PATCH 21/36] style

---
 src/transformers/modeling_flash_attention_utils.py | 1 +
 src/transformers/modeling_utils.py                 | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index ab03eb5b281f..e24ef822e569 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -112,6 +112,7 @@ def _lazy_imports(
             from flash_attn_interface import flash_attn_func, flash_attn_varlen_func, flash_attn_with_kvcache
         elif implementation == "flash_attention_4" or (implementation is None and is_fa4):
             from flash_attn.cute import flash_attn_func, flash_attn_varlen_func
+
             flash_attn_with_kvcache = None  # not supported yet
         # Kernels fallback
         else:
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index c7e79ac84d1d..c95e0d541c44 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1601,9 +1601,7 @@ def _flash_attn_import_error(
                 )
             # Minimum version (FA2 only)
             elif flash_attn_version == 2 and not is_flash_attn_greater_or_equal("2.3.3"):
-                raise ImportError(
-                    f"{preface} FlashAttention{flash_attn_version} requires at least version `2.3.3`."
-                )
+                raise ImportError(f"{preface} FlashAttention{flash_attn_version} requires at least version `2.3.3`.")
             else:
                 # Supported devices availability
                 device_availability_checks, device_names = zip(*supported_devices)

From 7223fe6d2bc7066ce465e18c9adaf49d4379c7f1 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Tue, 10 Mar 2026 12:04:47 +0000
Subject: [PATCH 22/36] simple min version instead of list

---
 src/transformers/modeling_utils.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index c95e0d541c44..0b89791b800b 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1558,7 +1558,7 @@ def _flash_attn_import_error(
         pkg_availability_check: Callable,
         supported_devices: tuple[tuple[Callable, str]],
         custom_supported_devices: tuple[tuple[Callable, str]] = (),
-        cuda_major_versions: tuple[int] | None = None,
+        cuda_min_major_version: int | None = None,
     ):
         """
         Checks whether the specified Flash Attention version is supported and if not, searches for the specific reason
@@ -1581,8 +1581,8 @@ def _flash_attn_import_error(
                 Essentially a list (for mutable kwargs reasons a tuple) of the custom supported devices in the format of
                 `(device_availability_check, info_message)`. These custom devices have custom logic outside the torch
                 ecosystem either via kernels or other packages and hence have early checks for availability.
-            cuda_major_versions (`tuple[int]`, *optional*):
-                A potential list of major cuda versions supported for this version of Flash Attention. This is mostly
+            cuda_min_major_version (`int`, *optional*):
+                The minimum major cuda version supported for this version of Flash Attention. This is mostly
                 affecting more recent versions which are more specialized to the features of new hardware.
         """
         # Certain devices have custom workarounds e.g. with their own package distribution (NPU) or via kernels (XPU)
@@ -1610,11 +1610,11 @@ def _flash_attn_import_error(
                         f"{preface} FlashAttention{flash_attn_version} is not available on CPU. Please make sure you are on any of the supported devices: {device_names}."
                     )
                 # Cuda major versions (more recent FA versions are specialized to newer cuda devices)
-                elif cuda_major_versions is not None and is_torch_cuda_available():
+                elif cuda_min_major_version is not None and is_torch_cuda_available():
                     major, _ = torch.cuda.get_device_capability()
-                    if major not in cuda_major_versions:
+                    if major < cuda_min_major_version:
                         raise ImportError(
-                            f"{preface} FlashAttention{flash_attn_version} requires compute capability >= {min(cuda_major_versions)}, but found {torch.cuda.get_device_capability()} with compute capability {major}.x"
+                            f"{preface} FlashAttention{flash_attn_version} requires compute capability >= {cuda_min_major_version}, but found {torch.cuda.get_device_capability()} with compute capability {major}.x"
                         )
 
     def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool = False) -> bool:
@@ -1666,7 +1666,7 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
                 "general_availability_check": is_flash_attn_3_available,
                 "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn_3") is not None,
                 "supported_devices": ((is_torch_cuda_available, "cuda"),),
-                "cuda_major_versions": (8, 9),  # Ampere and Hopper
+                "cuda_min_major_version": 8,  # Ampere
             },
             4: {
                 "flash_attn_version": 4,
@@ -1674,7 +1674,7 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
                 "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
                 and importlib.util.find_spec("flash_attn.cute") is not None,
                 "supported_devices": ((is_torch_cuda_available, "cuda"),),
-                "cuda_major_versions": (9, 10),  # Hopper and Blackwell
+                "cuda_min_major_version": 9,  # Hopper
             },
         }
 

From da88dcf9aab07a3065326dd9feb30bec1dcc058a Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Tue, 10 Mar 2026 16:49:26 +0000
Subject: [PATCH 23/36] fixup error message on non init check

---
 src/transformers/modeling_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 0b89791b800b..30e6e926748e 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1712,9 +1712,9 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
                 ]:
                     if device_availability_check():
                         raise ValueError(
-                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU. Make sure to move the model to GPU "
-                            "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
-                            f"or initialising the model on CPU and then moving it to GPU, e.g. with `model.to('{device_name}')`."
+                            f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU. Please make sure to have "
+                            "access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then "
+                            f"moving it to GPU, e.g. with `model.to('{device_name}')`."
                         )
 
         # If no error raise by this point, we can return `True`

From 654db436d7acc8ea0aa8a40a463c82cd4080ff34 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Tue, 10 Mar 2026 16:58:03 +0000
Subject: [PATCH 24/36] fixup up non init check a tad more

---
 src/transformers/modeling_utils.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 30e6e926748e..59bfc9cac003 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1707,15 +1707,25 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
         if not is_init_check:
             param_devices = list({param.device for param in self.parameters()})
             if len(param_devices) == 1 and param_devices[0].type == "cpu":
+                found_device = False
                 for device_availability_check, device_name in flash_attention_compatibility_matrix[flash_attn_version][
                     "supported_devices"
                 ]:
                     if device_availability_check():
-                        raise ValueError(
+                        found_device = True
+                        logger.warning_once(
                             f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU. Please make sure to have "
                             "access to a GPU and either initialise the model on a GPU by passing a device_map or initialising the model on CPU and then "
                             f"moving it to GPU, e.g. with `model.to('{device_name}')`."
                         )
+                        break
+
+                if not found_device:
+                    raise ValueError(
+                        f"You are attempting to use Flash Attention {flash_attn_version} with a model not initialized on GPU and with no GPU available. "
+                        "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
+                        "or initialising the model on CPU and then moving it to GPU."
+                    )
 
         # If no error raise by this point, we can return `True`
         return True

From d3485dace4a00e9687664960ecc1a04015200b07 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 16:54:41 +0000
Subject: [PATCH 25/36] refactor some FA constants out to main fa utils

---
 .../modeling_flash_attention_utils.py         | 54 ++++++++++++++++-
 src/transformers/modeling_utils.py            | 59 +++----------------
 2 files changed, 61 insertions(+), 52 deletions(-)

diff --git a/src/transformers/modeling_flash_attention_utils.py b/src/transformers/modeling_flash_attention_utils.py
index e24ef822e569..72d432c06eea 100644
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import inspect
 import os
 from collections.abc import Callable
@@ -24,11 +25,13 @@
     is_flash_attn_2_available,
     is_flash_attn_3_available,
     is_flash_attn_4_available,
+    is_torch_cuda_available,
+    is_torch_mlu_available,
     is_torch_npu_available,
     is_torch_xpu_available,
     logging,
 )
-from .utils.import_utils import is_tracing
+from .utils.import_utils import PACKAGE_DISTRIBUTION_MAPPING, is_tracing
 
 
 logger = logging.get_logger(__name__)
@@ -55,6 +58,55 @@ def is_flash_attn_available():
     )
 
 
+# Mapping from flash attention implementations to their kernel fallback repositories
+FLASH_ATTN_KERNEL_FALLBACK = {
+    "flash_attention_2": "kernels-community/flash-attn2",
+    "flash_attention_3": "kernels-community/vllm-flash-attn3",
+}
+
+
+# Meta information on each mainline FA compatibility:
+#   1. The import structure and availability
+#   2. Device support (with custom ones that use other workarounds, e.g. kernels)
+#   3. Supported major cuda devices, e.g. Hopper, Blackwell. Mostly found in the newest FA versions
+FLASH_ATTENTION_COMPATIBILITY_MATRIX = {
+    2: {
+        "flash_attn_version": 2,
+        "general_availability_check": is_flash_attn_2_available,
+        "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
+        and "flash_attn" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"],
+        "supported_devices": (
+            (is_torch_cuda_available, "cuda"),
+            (is_torch_mlu_available, "mlu"),
+            (is_torch_npu_available, "npu"),
+            (is_torch_xpu_available, "xpu"),
+        ),
+        "custom_supported_devices": (
+            (is_torch_npu_available, "Detect using FlashAttention2 on Ascend NPU."),
+            (
+                is_torch_xpu_available,
+                f"Detect using FlashAttention2 (via kernel `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}`) on XPU.",
+            ),
+        ),
+    },
+    3: {
+        "flash_attn_version": 3,
+        "general_availability_check": is_flash_attn_3_available,
+        "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn_3") is not None,
+        "supported_devices": ((is_torch_cuda_available, "cuda"),),
+        "cuda_min_major_version": 8,  # Ampere
+    },
+    4: {
+        "flash_attn_version": 4,
+        "general_availability_check": is_flash_attn_4_available,
+        "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
+        and importlib.util.find_spec("flash_attn.cute") is not None,
+        "supported_devices": ((is_torch_cuda_available, "cuda"),),
+        "cuda_min_major_version": 9,  # Hopper
+    },
+}
+
+
 # `globals()` is not compatible with dynamo, hence we have do define them in global scope ourselves
 _loaded_implementation = None
 _flash_fn = None
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index c6d213d4e84f..b55910b77470 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -15,7 +15,6 @@
 import collections
 import copy
 import functools
-import importlib.metadata
 import inspect
 import json
 import os
@@ -84,7 +83,12 @@
     verify_tp_plan,
 )
 from .loss.loss_utils import LOSS_MAPPING
-from .modeling_flash_attention_utils import lazy_import_flash_attention, lazy_import_paged_flash_attention
+from .modeling_flash_attention_utils import (
+    FLASH_ATTENTION_COMPATIBILITY_MATRIX,
+    FLASH_ATTN_KERNEL_FALLBACK,
+    lazy_import_flash_attention,
+    lazy_import_paged_flash_attention,
+)
 from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from .monkey_patching import apply_patches, patch_output_recorders
 from .pytorch_utils import id_tensor_storage
@@ -111,10 +115,8 @@
     is_env_variable_true,
     is_flash_attn_2_available,
     is_flash_attn_3_available,
-    is_flash_attn_4_available,
     is_kernels_available,
     is_torch_flex_attn_available,
-    is_torch_mlu_available,
     is_torch_npu_available,
     is_torch_xpu_available,
     logging,
@@ -122,7 +124,6 @@
 from .utils.generic import GeneralInterface, is_flash_attention_requested
 from .utils.hub import DownloadKwargs, create_and_tag_model_card, get_checkpoint_shard_files
 from .utils.import_utils import (
-    PACKAGE_DISTRIBUTION_MAPPING,
     is_flash_attn_greater_or_equal,
     is_huggingface_hub_greater_or_equal,
     is_sagemaker_mp_enabled,
@@ -158,12 +159,6 @@
 _is_quantized = False
 _is_ds_init_called = False
 
-# Mapping from flash attention implementations to their kernel fallback repositories
-FLASH_ATTN_KERNEL_FALLBACK = {
-    "flash_attention_2": "kernels-community/flash-attn2",
-    "flash_attention_3": "kernels-community/vllm-flash-attn3",
-}
-
 
 @dataclass(frozen=True)
 class LoadStateDictConfig:
@@ -1646,46 +1641,8 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
         if flash_attn_version not in [2, 3, 4]:
             raise ValueError(f"Requested Flash Attention {flash_attn_version} which is not supported.")
 
-        # Big matrix that shows each FA versions limitations by device and major cuda versions
-        flash_attention_compatibility_matrix = {
-            2: {
-                "flash_attn_version": 2,
-                "general_availability_check": is_flash_attn_2_available,
-                "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
-                and "flash_attn" in PACKAGE_DISTRIBUTION_MAPPING["flash_attn"],
-                "supported_devices": (
-                    (is_torch_cuda_available, "cuda"),
-                    (is_torch_mlu_available, "mlu"),
-                    (is_torch_npu_available, "npu"),
-                    (is_torch_xpu_available, "xpu"),
-                ),
-                "custom_supported_devices": (
-                    (is_torch_npu_available, "Detect using FlashAttention2 on Ascend NPU."),
-                    (
-                        is_torch_xpu_available,
-                        f"Detect using FlashAttention2 (via kernel `{FLASH_ATTN_KERNEL_FALLBACK['flash_attention_2']}`) on XPU.",
-                    ),
-                ),
-            },
-            3: {
-                "flash_attn_version": 3,
-                "general_availability_check": is_flash_attn_3_available,
-                "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn_3") is not None,
-                "supported_devices": ((is_torch_cuda_available, "cuda"),),
-                "cuda_min_major_version": 8,  # Ampere
-            },
-            4: {
-                "flash_attn_version": 4,
-                "general_availability_check": is_flash_attn_4_available,
-                "pkg_availability_check": lambda *args, **kwargs: importlib.util.find_spec("flash_attn") is not None
-                and importlib.util.find_spec("flash_attn.cute") is not None,
-                "supported_devices": ((is_torch_cuda_available, "cuda"),),
-                "cuda_min_major_version": 9,  # Hopper
-            },
-        }
-
         # Check if we can even use the FA version based on the env of the user
-        self._flash_attn_import_error(**flash_attention_compatibility_matrix[flash_attn_version])
+        self._flash_attn_import_error(**FLASH_ATTENTION_COMPATIBILITY_MATRIX[flash_attn_version])
 
         # Check for attention dropout, which is incompatible with newer FA versions
         # (many should not really care about dropout as it is not super effective, hence warning for now)
@@ -1714,7 +1671,7 @@ def _flash_attn_can_dispatch(self, flash_attn_version: int, is_init_check: bool
             param_devices = list({param.device for param in self.parameters()})
             if len(param_devices) == 1 and param_devices[0].type == "cpu":
                 found_device = False
-                for device_availability_check, device_name in flash_attention_compatibility_matrix[flash_attn_version][
+                for device_availability_check, device_name in FLASH_ATTENTION_COMPATIBILITY_MATRIX[flash_attn_version][
                     "supported_devices"
                 ]:
                     if device_availability_check():

From 476789fc9ad773867f12a5779a98cf1e555bed61 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 17:10:41 +0000
Subject: [PATCH 26/36] new marker for all fas needed

---
 .../self-scheduled-flash-attn-caller.yml       |  2 +-
 conftest.py                                    |  1 +
 pyproject.toml                                 |  1 +
 src/transformers/testing_utils.py              | 16 ++++++++++++++++
 .../generation/test_flash_attention_parity.py  | 18 ++++--------------
 5 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/self-scheduled-flash-attn-caller.yml b/.github/workflows/self-scheduled-flash-attn-caller.yml
index a3ba8b189c3d..ccaf205456e1 100644
--- a/.github/workflows/self-scheduled-flash-attn-caller.yml
+++ b/.github/workflows/self-scheduled-flash-attn-caller.yml
@@ -56,5 +56,5 @@ jobs:
       runner_type: "a10"
       report_repo_id: hf-internal-testing/transformers_flash_attn_ci
       commit_sha: ${{ github.sha }}
-      pytest_marker: "flash_attn_test or flash_attn_3_test or flash_attn_4_test"
+      pytest_marker: "flash_attn_test or flash_attn_3_test or flash_attn_4_test or all_flash_attn_test"
     secrets: inherit
diff --git a/conftest.py b/conftest.py
index 3a91af851434..96f8b74dd40c 100644
--- a/conftest.py
+++ b/conftest.py
@@ -91,6 +91,7 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
     config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")
     config.addinivalue_line("markers", "flash_attn_4_test: mark test which tests flash attention 4 functionality")
+    config.addinivalue_line("markers", "all_flash_attn_test: mark test which tests all mainline flash attentions' functionality")
     config.addinivalue_line("markers", "training_ci: mark test for training CI validation")
     config.addinivalue_line("markers", "tensor_parallel_ci: mark test for tensor parallel CI validation")
 
diff --git a/pyproject.toml b/pyproject.toml
index 13ec38f0e542..f7e72facf021 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,6 +70,7 @@ line-ending = "auto"
 addopts = "--doctest-glob='**/*.md'"
 doctest_optionflags="NUMBER NORMALIZE_WHITESPACE ELLIPSIS"
 markers = [
+    "all_flash_attn_test: marks tests related to all mainline flash attentions at once (deselect with '-m \"not all_flash_attn_test\"')",
     "flash_attn_4_test: marks tests related to flash attention 4 (deselect with '-m \"not flash_attn_4_test\"')",
     "flash_attn_3_test: marks tests related to flash attention 3 (deselect with '-m \"not flash_attn_3_test\"')",
     "flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')",
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index a2093749dda7..da2ea20144df 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -681,6 +681,22 @@ def require_flash_attn_4(test_case):
     return unittest.skipUnless(is_flash_attn_4_available(), "test requires Flash Attention 4")(test_case)
 
 
+def require_all_flash_attn(test_case):
+    flash_attn_available = is_flash_attn_2_available()
+    kernels_available = is_kernels_available()
+    try:
+        from kernels import get_kernel
+
+        get_kernel(FLASH_ATTN_KERNEL_FALLBACK["flash_attention_2"])
+    except Exception as _:
+        kernels_available = False
+
+    return unittest.skipUnless(
+        all(flash_attn_available | kernels_available, is_flash_attn_3_available(), is_flash_attn_4_available()),
+        "test requires all mainline Flash Attention packages",
+    )(test_case)
+
+
 def require_peft(test_case):
     """
     Decorator marking a test that requires PEFT.
diff --git a/tests/generation/test_flash_attention_parity.py b/tests/generation/test_flash_attention_parity.py
index 28327dca04cb..7b71ed02dc36 100644
--- a/tests/generation/test_flash_attention_parity.py
+++ b/tests/generation/test_flash_attention_parity.py
@@ -22,13 +22,7 @@
 import torch
 
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.testing_utils import (
-    require_flash_attn,
-    require_flash_attn_3,
-    require_flash_attn_4,
-    require_torch_gpu,
-    slow,
-)
+from transformers.testing_utils import require_all_flash_attn, require_torch_gpu, slow
 
 
 class FlashAttentionParityTest(unittest.TestCase):
@@ -81,14 +75,10 @@ def _benchmark_generation(self, model, inputs, n_warmup=3, n_runs=5):
 
         return start_time.elapsed_time(end_time) / n_runs
 
-    @require_torch_gpu
-    @pytest.mark.flash_attn_test
-    @pytest.mark.flash_attn_3_test
-    @pytest.mark.flash_attn_4_test
-    @require_flash_attn
-    @require_flash_attn_3
-    @require_flash_attn_4
     @slow
+    @require_torch_gpu
+    @require_all_flash_attn
+    @pytest.mark.all_flash_attn_test
     def test_flash_attention_parity(self):
         flash_attn_versions = [2, 3, 4]
 

From 19e4c44f25eff0a134d1ee538e6557a1186abe1e Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 17:12:04 +0000
Subject: [PATCH 27/36] oops

---
 src/transformers/testing_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index da2ea20144df..4cbd8d8b2d72 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -692,7 +692,7 @@ def require_all_flash_attn(test_case):
         kernels_available = False
 
     return unittest.skipUnless(
-        all(flash_attn_available | kernels_available, is_flash_attn_3_available(), is_flash_attn_4_available()),
+        all((flash_attn_available | kernels_available, is_flash_attn_3_available(), is_flash_attn_4_available(),)),
         "test requires all mainline Flash Attention packages",
     )(test_case)
 

From 08445b61c1879cee097a15c81df1fca2b277a485 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 17:43:12 +0000
Subject: [PATCH 28/36] style and make the fa kernel fallback generalized

---
 conftest.py                        |  4 +++-
 src/transformers/modeling_utils.py | 36 +++++++++++++++++-------------
 src/transformers/testing_utils.py  |  8 ++++++-
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/conftest.py b/conftest.py
index 96f8b74dd40c..a7d2341505a2 100644
--- a/conftest.py
+++ b/conftest.py
@@ -91,7 +91,9 @@ def pytest_configure(config):
     config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
     config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")
     config.addinivalue_line("markers", "flash_attn_4_test: mark test which tests flash attention 4 functionality")
-    config.addinivalue_line("markers", "all_flash_attn_test: mark test which tests all mainline flash attentions' functionality")
+    config.addinivalue_line(
+        "markers", "all_flash_attn_test: mark test which tests all mainline flash attentions' functionality"
+    )
     config.addinivalue_line("markers", "training_ci: mark test for training CI validation")
     config.addinivalue_line("markers", "tensor_parallel_ci: mark test for tensor parallel CI validation")
 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index b55910b77470..e694800be126 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -113,8 +113,6 @@
     is_accelerate_available,
     is_bitsandbytes_available,
     is_env_variable_true,
-    is_flash_attn_2_available,
-    is_flash_attn_3_available,
     is_kernels_available,
     is_torch_flex_attn_available,
     is_torch_npu_available,
@@ -1598,7 +1596,7 @@ def _flash_attn_import_error(
             # Can the package be seen in the import structure
             if not pkg_availability_check():
                 raise ImportError(
-                    f"{preface} the package for FlashAttention{flash_attn_version} doesn't seem to be not installed."
+                    f"{preface} the package for FlashAttention{flash_attn_version} doesn't seem to be installed."
                 )
             # Minimum version (FA2 only)
             elif flash_attn_version == 2 and not is_flash_attn_greater_or_equal("2.3.3"):
@@ -1806,18 +1804,26 @@ def _check_and_adjust_attn_implementation(
                 attn_implementation = default_flash_implementation
 
         applicable_attn_implementation = attn_implementation
-
         is_paged = attn_implementation is not None and attn_implementation.startswith("paged|")
 
-        # If FA not installed, do not fail but use kernels instead
-        requested_original_flash_attn = attn_implementation is not None and (
-            attn_implementation.removeprefix("paged|") == "flash_attention_2"
-            or attn_implementation.removeprefix("paged|") == "flash_attention_3"
-        )
+        requested_original_flash_attn = False
+        if is_flash_attention_requested(requested_attention_implementation=attn_implementation):
+            # If FA not installed, do not fail but use kernels instead if possible
+            for fa_version in FLASH_ATTENTION_COMPATIBILITY_MATRIX.keys():
+                # No kernels support for FA4 for now
+                if fa_version == 4:
+                    continue
+
+                # Check whether we have an original FA requested but not available in the env
+                if requested_original_flash_attn := (
+                    attn_implementation.removeprefix("paged|") == f"flash_attention_{fa_version}"
+                    and not FLASH_ATTENTION_COMPATIBILITY_MATRIX[fa_version]["general_availability_check"]()
+                ):
+                    break
+
         if (
-            requested_original_flash_attn
-            and self._supports_flash_attn
-            and not (is_flash_attn_2_available() or is_flash_attn_3_available())
+            self._supports_flash_attn
+            and requested_original_flash_attn
             and is_kernels_available()
             and not is_torch_npu_available()
         ):
@@ -1850,10 +1856,8 @@ def _check_and_adjust_attn_implementation(
             except Exception as e:
                 # raise the proper exception for requested flash attention
                 if requested_original_flash_attn:
-                    if attn_implementation.endswith("2"):
-                        self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
-                    else:
-                        self._flash_attn_can_dispatch(flash_attn_version=3, is_init_check=is_init_check)
+                    fa_version = int(attn_implementation[-1])  # "flash_attention_(2|3|...)"
+                    self._flash_attn_can_dispatch(flash_attn_version=fa_version, is_init_check=is_init_check)
 
                 # error properly out if a kernel was specifically requested
                 raise e
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 4cbd8d8b2d72..f9e371318872 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -692,7 +692,13 @@ def require_all_flash_attn(test_case):
         kernels_available = False
 
     return unittest.skipUnless(
-        all((flash_attn_available | kernels_available, is_flash_attn_3_available(), is_flash_attn_4_available(),)),
+        all(
+            (
+                flash_attn_available | kernels_available,
+                is_flash_attn_3_available(),
+                is_flash_attn_4_available(),
+            )
+        ),
         "test requires all mainline Flash Attention packages",
     )(test_case)
 

From 920bef73a1d2a3ccfb0d228aba23229cf5a795c9 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 17:46:51 +0000
Subject: [PATCH 29/36] default none...

---
 src/transformers/modeling_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index e694800be126..ed2b4a645fe8 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1807,7 +1807,9 @@ def _check_and_adjust_attn_implementation(
         is_paged = attn_implementation is not None and attn_implementation.startswith("paged|")
 
         requested_original_flash_attn = False
-        if is_flash_attention_requested(requested_attention_implementation=attn_implementation):
+        if attn_implementation is not None and is_flash_attention_requested(
+            requested_attention_implementation=attn_implementation
+        ):
             # If FA not installed, do not fail but use kernels instead if possible
             for fa_version in FLASH_ATTENTION_COMPATIBILITY_MATRIX.keys():
                 # No kernels support for FA4 for now

From 8ee8c567e239f63ad22c0af01219d5a6864ff0b9 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:05:08 +0000
Subject: [PATCH 30/36] more refactors

---
 src/transformers/modeling_utils.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index ed2b4a645fe8..cc0d6d519563 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1896,7 +1896,10 @@ def get_correct_attn_implementation(self, requested_attention: str | None, is_in
             )
             # check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
             if self._supports_flash_attn or getattr(self, "_supports_flash_attn_2", False):
-                message += ', `"attn_implementation=flash_attention_4"`, `"attn_implementation=flash_attention_3"`, `"attn_implementation=flash_attention_2"`, `"attn_implementation=paged|flash_attention_2"`'
+                message += ', '
+                for fa_version in FLASH_ATTENTION_COMPATIBILITY_MATRIX.keys():
+                    message += f'`"attn_implementation=flash_attention_{fa_version}"`, `"attn_implementation=paged|flash_attention_{fa_version}"`, '
+                message = message[:-2]  # remove trailing comma
             if self._supports_sdpa:
                 message += ', `"attn_implementation=sdpa"`, `"attn_implementation=paged|sdpa"`'
             if self._supports_flex_attn:
@@ -1904,12 +1907,11 @@ def get_correct_attn_implementation(self, requested_attention: str | None, is_in
             raise ValueError(message + ".")
 
         # Perform relevant checks
-        if "flash_attention_2" in applicable_attention:
-            self._flash_attn_can_dispatch(flash_attn_version=2, is_init_check=is_init_check)
-        elif "flash_attention_3" in applicable_attention:
-            self._flash_attn_can_dispatch(flash_attn_version=3, is_init_check=is_init_check)
-        elif "flash_attention_4" in applicable_attention:
-            self._flash_attn_can_dispatch(flash_attn_version=4, is_init_check=is_init_check)
+        if is_flash_attention_requested(requested_attention_implementation=applicable_attention) and (
+            fa_matched := re.search(r"^flash_attention_(\d)$", applicable_attention)
+        ):
+            fa_version = int(fa_matched.group(1))  # last digit
+            self._flash_attn_can_dispatch(flash_attn_version=fa_version, is_init_check=is_init_check)
         elif "flex_attention" in applicable_attention:
             self._flex_attn_can_dispatch(is_init_check)
         elif "sdpa" in applicable_attention:
@@ -3767,7 +3769,13 @@ def from_pretrained(
 
                 </Tip>
             attn_implementation (`str`, *optional*):
-                The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)), or `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper)), or `"flash_attention_4"` (using [Dao-AILab/flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
+                The attention implementation to use in the model (if relevant). Can be any of
+                    - `"eager"` (manual implementation of the attention)
+                    - `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html))
+                    - `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention))
+                    - `"flash_attention_3"` (using [Dao-AILab/flash-attention/hopper](https://github.com/Dao-AILab/flash-attention/tree/main/hopper))
+                    - `"flash_attention_4"` (using [Dao-AILab/flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)).
+                By default, if available, SDPA will be used. The default is otherwise the manual `"eager"` implementation.
 
                 Accept HF kernel references in the form:
                   <namespace>/<repo_name>[@<revision>][:<kernel_name>]

From cd2a9b34ff99a8393d1ac8f2f59dc89d7e9138e4 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:05:36 +0000
Subject: [PATCH 31/36] style

---
 src/transformers/modeling_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index cc0d6d519563..524a8605059a 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1896,7 +1896,7 @@ def get_correct_attn_implementation(self, requested_attention: str | None, is_in
             )
             # check `supports_flash_attn_2` for BC with custom code. TODO: remove after a few releases
             if self._supports_flash_attn or getattr(self, "_supports_flash_attn_2", False):
-                message += ', '
+                message += ", "
                 for fa_version in FLASH_ATTENTION_COMPATIBILITY_MATRIX.keys():
                     message += f'`"attn_implementation=flash_attention_{fa_version}"`, `"attn_implementation=paged|flash_attention_{fa_version}"`, '
                 message = message[:-2]  # remove trailing comma

From 27e0d58b39043f5e213802ac88b08a10763de221 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:21:14 +0000
Subject: [PATCH 32/36] fix

---
 tests/utils/test_modeling_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 3c33e50ad6e5..31f1a07953eb 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -2836,7 +2836,7 @@ def test_not_available_flash(self):
             _ = AutoModel.from_pretrained(
                 "hf-internal-testing/tiny-random-GPTBigCodeModel", attn_implementation="flash_attention_2"
             )
-        self.assertTrue("the package for FlashAttention2 doesn't seem to be not installed." in str(cm.exception))
+        self.assertTrue("the package for FlashAttention2 doesn't seem to be installed." in str(cm.exception))
 
     def test_not_available_flash_with_config(self):
         if is_flash_attn_2_available():
@@ -2859,7 +2859,7 @@ def test_not_available_flash_with_config(self):
                 attn_implementation="flash_attention_2",
             )
 
-        self.assertTrue("the package for FlashAttention2 doesn't seem to be not installed." in str(cm.exception))
+        self.assertTrue("the package for FlashAttention2 doesn't seem to be installed." in str(cm.exception))
 
     def test_kernels_fallback(self):
         if not is_kernels_available():

From 043f11f3b75aa01c54857246fdf9002c011955dc Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:33:20 +0000
Subject: [PATCH 33/36] this test faulty even on main, xformers can handle any
 shape apparently yikes

---
 tests/utils/test_modeling_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 31f1a07953eb..7366845c4d78 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -2316,6 +2316,9 @@ def test_decoder_only_model_can_be_used_as_encoder(self, attn_implementation: st
 
         config = LlamaConfig(
             num_hidden_layers=2,
+            num_attention_heads=2,
+            num_key_value_heads=1,
+            head_dim=16,
             hidden_size=32,
             intermediate_size=64,
             vocab_size=100,

From b0485b5ba2bb1e249d34e7175a0d5c401454bcae Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:41:42 +0000
Subject: [PATCH 34/36] lets make this more robust, we should check for none
 within...

---
 src/transformers/modeling_utils.py | 7 ++-----
 src/transformers/utils/generic.py  | 4 ++--
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 524a8605059a..8ebfcc267869 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1789,8 +1789,7 @@ def _check_and_adjust_attn_implementation(
 
             compatible_flash_implementations = getattr(self, "_compatible_flash_implementations", None)
             if (
-                compatible_flash_implementations
-                and is_flash_attention_requested(requested_attention_implementation=base_implementation)
+                is_flash_attention_requested(requested_attention_implementation=base_implementation)
                 and base_implementation not in compatible_flash_implementations
             ):
                 default_flash_implementation = (
@@ -1807,9 +1806,7 @@ def _check_and_adjust_attn_implementation(
         is_paged = attn_implementation is not None and attn_implementation.startswith("paged|")
 
         requested_original_flash_attn = False
-        if attn_implementation is not None and is_flash_attention_requested(
-            requested_attention_implementation=attn_implementation
-        ):
+        if is_flash_attention_requested(requested_attention_implementation=attn_implementation):
             # If FA not installed, do not fail but use kernels instead if possible
             for fa_version in FLASH_ATTENTION_COMPATIBILITY_MATRIX.keys():
                 # No kernels support for FA4 for now
diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index 447cac2e9cf6..93f0e5346453 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -237,8 +237,8 @@ def is_flash_attention_requested(
 
     if config is not None:
         checked_attention_implementation = config._attn_implementation
-    else:
-        checked_attention_implementation = requested_attention_implementation
+    elif (checked_attention_implementation := requested_attention_implementation) is None:
+        return False
 
     # If a specific version is requested, look for a pattern of type "flash...{version}"
     if version is not None:

From eae216e6613d966a80e770eccfb34d9dccfe207d Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:46:23 +0000
Subject: [PATCH 35/36] fix

---
 src/transformers/utils/generic.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/utils/generic.py b/src/transformers/utils/generic.py
index 93f0e5346453..379b23b58de6 100644
--- a/src/transformers/utils/generic.py
+++ b/src/transformers/utils/generic.py
@@ -237,7 +237,11 @@ def is_flash_attention_requested(
 
     if config is not None:
         checked_attention_implementation = config._attn_implementation
-    elif (checked_attention_implementation := requested_attention_implementation) is None:
+    else:
+        checked_attention_implementation = requested_attention_implementation
+
+    # theoretically can happen, equivalent to default implementation (sdpa/eager)
+    if checked_attention_implementation is None:
         return False
 
     # If a specific version is requested, look for a pattern of type "flash...{version}"

From 15f6ba97bd3386ce816b46bab82a9f074b82ee49 Mon Sep 17 00:00:00 2001
From: Vasqu <antonprogamer@gmail.com>
Date: Fri, 13 Mar 2026 18:49:46 +0000
Subject: [PATCH 36/36] oops

---
 src/transformers/modeling_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 8ebfcc267869..a4928a331c11 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1790,6 +1790,7 @@ def _check_and_adjust_attn_implementation(
             compatible_flash_implementations = getattr(self, "_compatible_flash_implementations", None)
             if (
                 is_flash_attention_requested(requested_attention_implementation=base_implementation)
+                and compatible_flash_implementations is not None
                 and base_implementation not in compatible_flash_implementations
             ):
                 default_flash_implementation = (