From f73106779b6157be987717d688acbe4d9c1115c2 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 5 Feb 2026 12:27:20 +0000
Subject: [PATCH] Fix GPT-OSS BlockMask error during inference

GPT-OSS models use eager attention during inference because flex
attention returns incorrect results (likely due to left padding).
However, when _attn_implementation is set to "flex_attention",
transformers creates BlockMask objects, which cause a TypeError
when passed to the eager attention path:

  TypeError: unsupported operand type(s) for +=: 'Tensor' and 'BlockMask'

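This fix makes prefer_flex_attn_if_supported return None early when
config.model_type is "gpt_oss", so _attn_implementation is never set
to "flex_attention" for GPT-OSS. The model stays on the eager path,
which avoids the BlockMask/Tensor type mismatch.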
---
 unsloth/models/_utils.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index 47934e7322..e5f5dfe68e 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -197,6 +197,12 @@ def prefer_flex_attn_if_supported(model_class, config):
             model_class, "_supports_flex_attn", False
         ):
             return None
+        # GPT-OSS uses eager attention during inference since flex attention
+        # returns incorrect results (likely due to left padding issues).
+        # Skip setting flex_attention to avoid BlockMask type errors.
+        model_type = getattr(config, "model_type", "") if config else ""
+        if model_type == "gpt_oss":
+            return None
         if config is not None:
             setattr(config, "_attn_implementation", "flex_attention")
             if hasattr(config, "attn_implementation"):