diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index cbaebcc7ac..b529ba0dd8 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -234,6 +234,18 @@ def prefer_flex_attn_if_supported(model_class, config):
             model_class, "_supports_flex_attn", False
         ):
             return None
+        # flex_attention Triton kernels require sm80+ (Ampere and above).
+        # On older GPUs (T4/sm75, V100/sm70) the dense Python fallback runs
+        # instead, but sdpa_dense_backward has a dtype mismatch under fp16
+        # autocast (Half @ Float matmul). Skip flex_attention there, and
+        # also skip it when CUDA is not available at all.
+        import torch
+
+        if torch.cuda.is_available():
+            major, _ = torch.cuda.get_device_capability()
+            if major < 8:
+                return None
+        else:
+            return None
         # GPT-OSS, Mllama and Gemma3N use eager/sdpa attention during
         # inference since flex attention returns incorrect results or errors out.
         # GPT-OSS: left padding issues cause incorrect outputs.