From f73106779b6157be987717d688acbe4d9c1115c2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 5 Feb 2026 12:27:20 +0000 Subject: [PATCH] Fix GPT-OSS BlockMask error during inference GPT-OSS models use eager attention during inference because flex attention returns incorrect results (likely due to left padding). However, when _attn_implementation is set to "flex_attention", transformers creates BlockMask objects which cause a TypeError when passed to the eager attention path: TypeError: unsupported operand type(s) for +=: 'Tensor' and 'BlockMask' This fix excludes GPT-OSS from using flex_attention, keeping it on the eager path to avoid the BlockMask/Tensor type mismatch. --- unsloth/models/_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index 47934e7322..e5f5dfe68e 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -197,6 +197,12 @@ def prefer_flex_attn_if_supported(model_class, config): model_class, "_supports_flex_attn", False ): return None + # GPT-OSS uses eager attention during inference since flex attention + # returns incorrect results (likely due to left padding issues). + # Skip setting flex_attention to avoid BlockMask type errors. + model_type = getattr(config, "model_type", "") if config else "" + if model_type == "gpt_oss": + return None if config is not None: setattr(config, "_attn_implementation", "flex_attention") if hasattr(config, "attn_implementation"):