From d66df48dbacfeb3e8239afbec2caeb01f46bc186 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 00:50:20 +0000
Subject: [PATCH 1/9] Skip GPT-OSS allocator warmup on low-memory 4-bit loads

---
 unsloth_zoo/temporary_patches/gpt_oss.py | 75 ++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index 1388d78cc..3dbc78629 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -1094,6 +1094,81 @@ def patch_gpt_oss_bnb4bit_auto():
 TEMPORARY_PATCHES.append(patch_gpt_oss_bnb4bit_auto)
 
 
+def _get_accelerator_total_memory_bytes():
+    try:
+        if DEVICE_TYPE == "xpu":
+            return int(torch.xpu.memory.mem_get_info(0)[-1])
+        return int(torch.cuda.memory.mem_get_info(0)[-1])
+    except Exception:
+        return None
+
+
+def _get_effective_accelerator_memory_bytes():
+    total_memory = _get_accelerator_total_memory_bytes()
+    if total_memory is None:
+        return None
+    if DEVICE_TYPE != "xpu" and hasattr(torch.cuda, "get_per_process_memory_fraction"):
+        try:
+            fraction = float(torch.cuda.get_per_process_memory_fraction(0))
+            if 0.0 < fraction < 1.0:
+                return int(total_memory * fraction)
+        except Exception:
+            pass
+    return total_memory
+
+
+def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool:
+    """
+    GPT-OSS 4-bit can trigger a large allocator warmup in transformers
+    (`caching_allocator_warmup`) before weights are loaded. On 16GB GPUs this
+    warmup can OOM even though the actual loaded model fits.
+    """
+    model_name = os.environ.get("UNSLOTH_MODEL_NAME", "").replace("-", "_")
+    if "gpt_oss" not in model_name:
+        return False
+    if "_load_in_4bit_" not in model_name:
+        return False
+
+    mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower()
+    if mode in ("off", "disable", "0", "false"):
+        return True
+    if mode in ("on", "enable", "1", "true"):
+        return False
+
+    total_memory = _get_effective_accelerator_memory_bytes()
+    if total_memory is None:
+        return False
+    return total_memory <= int(20 * 1024**3)
+
+
+def patch_transformers_caching_allocator_warmup_for_gpt_oss():
+    try:
+        import transformers.modeling_utils
+    except Exception as e:
+        return raise_error("transformers.modeling_utils", e)
+
+    warmup_fn = transformers.modeling_utils.caching_allocator_warmup
+    if hasattr(warmup_fn, "__unsloth_gpt_oss_guarded__"):
+        return
+
+    def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer):
+        if _should_skip_transformers_allocator_warmup_for_gpt_oss():
+            if UNSLOTH_ENABLE_LOGGING:
+                logger.warning_once(
+                    "Unsloth: Skipping transformers caching_allocator_warmup "
+                    "for GPT-OSS 4-bit on low-memory accelerators. "
+                    "Set UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP=on to keep warmup."
+                )
+            return
+        return warmup_fn(model, expanded_device_map, hf_quantizer)
+
+    guarded_caching_allocator_warmup.__unsloth_gpt_oss_guarded__ = True
+    transformers.modeling_utils.caching_allocator_warmup = guarded_caching_allocator_warmup
+
+
+TEMPORARY_PATCHES.append(patch_transformers_caching_allocator_warmup_for_gpt_oss)
+
+
 # Combo kernels uses too much VRAM for low memory GPUs
 from ..device_type import DEVICE_TYPE
 

From 054aa899858e5eddd681657290e27f1bce80e1fd Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 01:43:18 +0000
Subject: [PATCH 2/9] Apply low-memory warmup guard globally under 24GB

---
 unsloth_zoo/temporary_patches/gpt_oss.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index 3dbc78629..6808dc958 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -1119,17 +1119,15 @@ def _get_effective_accelerator_memory_bytes():
 
 def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool:
     """
-    GPT-OSS 4-bit can trigger a large allocator warmup in transformers
-    (`caching_allocator_warmup`) before weights are loaded. On 16GB GPUs this
-    warmup can OOM even though the actual loaded model fits.
-    """
-    model_name = os.environ.get("UNSLOTH_MODEL_NAME", "").replace("-", "_")
-    if "gpt_oss" not in model_name:
-        return False
-    if "_load_in_4bit_" not in model_name:
-        return False
+    Skip transformers allocator warmup on low-memory accelerators.
 
-    mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower()
+    `caching_allocator_warmup` can allocate large single chunks before weights
+    are loaded, which can OOM constrained GPUs.
+    """
+    mode = os.environ.get("UNSLOTH_ALLOCATOR_WARMUP", "").strip().lower()
+    if mode == "":
+        # Backward compatible alias for existing GPT-OSS override.
+        mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower()
     if mode in ("off", "disable", "0", "false"):
         return True
     if mode in ("on", "enable", "1", "true"):
@@ -1138,7 +1136,7 @@ def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool:
     total_memory = _get_effective_accelerator_memory_bytes()
     if total_memory is None:
         return False
-    return total_memory <= int(20 * 1024**3)
+    return total_memory <= int(24 * 1024**3)
 
 
 def patch_transformers_caching_allocator_warmup_for_gpt_oss():
@@ -1156,8 +1154,8 @@ def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer):
             if UNSLOTH_ENABLE_LOGGING:
                 logger.warning_once(
                     "Unsloth: Skipping transformers caching_allocator_warmup "
-                    "for GPT-OSS 4-bit on low-memory accelerators. "
-                    "Set UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP=on to keep warmup."
+                    "on low-memory accelerators (<24GB effective memory). "
+                    "Set UNSLOTH_ALLOCATOR_WARMUP=on to keep warmup."
                 )
             return
         return warmup_fn(model, expanded_device_map, hf_quantizer)

From 07df91b714f5a4c6e7ba11420a74acc44c9bb25e Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 01:53:34 +0000
Subject: [PATCH 3/9] Rename warmup guard identifiers to generic names

---
 unsloth_zoo/temporary_patches/gpt_oss.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index 6808dc958..4128b5146 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -1117,7 +1117,7 @@ def _get_effective_accelerator_memory_bytes():
     return total_memory
 
 
-def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool:
+def _should_skip_transformers_allocator_warmup() -> bool:
     """
     Skip transformers allocator warmup on low-memory accelerators.
 
@@ -1126,7 +1126,7 @@ def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool:
     """
     mode = os.environ.get("UNSLOTH_ALLOCATOR_WARMUP", "").strip().lower()
     if mode == "":
-        # Backward compatible alias for existing GPT-OSS override.
+        # Backward compatible alias for previous override variable.
         mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower()
     if mode in ("off", "disable", "0", "false"):
         return True
@@ -1139,18 +1139,21 @@ def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool:
     return total_memory <= int(24 * 1024**3)
 
 
-def patch_transformers_caching_allocator_warmup_for_gpt_oss():
+def patch_transformers_caching_allocator_warmup():
     try:
         import transformers.modeling_utils
     except Exception as e:
         return raise_error("transformers.modeling_utils", e)
 
     warmup_fn = transformers.modeling_utils.caching_allocator_warmup
+    if hasattr(warmup_fn, "__unsloth_allocator_warmup_guarded__"):
+        return
+    # Backward compatibility with previous guard attribute.
     if hasattr(warmup_fn, "__unsloth_gpt_oss_guarded__"):
         return
 
     def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer):
-        if _should_skip_transformers_allocator_warmup_for_gpt_oss():
+        if _should_skip_transformers_allocator_warmup():
             if UNSLOTH_ENABLE_LOGGING:
                 logger.warning_once(
                     "Unsloth: Skipping transformers caching_allocator_warmup "
@@ -1160,11 +1163,13 @@ def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer):
             return
         return warmup_fn(model, expanded_device_map, hf_quantizer)
 
+    guarded_caching_allocator_warmup.__unsloth_allocator_warmup_guarded__ = True
+    # Keep legacy marker so older checks still detect this as guarded.
     guarded_caching_allocator_warmup.__unsloth_gpt_oss_guarded__ = True
     transformers.modeling_utils.caching_allocator_warmup = guarded_caching_allocator_warmup
 
 
-TEMPORARY_PATCHES.append(patch_transformers_caching_allocator_warmup_for_gpt_oss)
+TEMPORARY_PATCHES.append(patch_transformers_caching_allocator_warmup)
 
 
 # Combo kernels uses too much VRAM for low memory GPUs

From 5b1f0ff6e931fc35f442f921f505d58ca48d3827 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 02:10:23 +0000
Subject: [PATCH 4/9] Normalize GPT-OSS model name guards for hyphenated names

---
 unsloth_zoo/temporary_patches/gpt_oss.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index 4128b5146..fe9db6505 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -1394,15 +1394,19 @@ def _should_use_gpt_oss_bnb4bit() -> bool:
     Default: True when load_in_4bit is active.
     Set UNSLOTH_GPT_OSS_BNB4BIT_DISABLE=1 to force BF16 path.
     """
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""):
+    if "gpt_oss" not in _normalized_unsloth_model_name():
         return False
-    if "_load_in_4bit_" not in os.environ.get("UNSLOTH_MODEL_NAME", ""):
+    if "_load_in_4bit_" not in _normalized_unsloth_model_name():
         return False
     return os.environ.get("UNSLOTH_GPT_OSS_BNB4BIT_DISABLE", "0") != "1"
 
 
 def _is_gpt_oss_4bit_load() -> bool:
-    return "_load_in_4bit_" in os.environ.get("UNSLOTH_MODEL_NAME", "")
+    return "_load_in_4bit_" in _normalized_unsloth_model_name()
+
+
+def _normalized_unsloth_model_name() -> str:
+    return os.environ.get("UNSLOTH_MODEL_NAME", "").replace("-", "_")
 
 
 def _is_transformers_v5() -> bool:
@@ -1418,7 +1422,7 @@ def patch_gpt_oss_moe_for_lora():
     IMPORTANT: We only patch the forward method, NOT replace the entire class.
     This preserves the original class structure so weights load correctly.
     """
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""):
+    if "gpt_oss" not in _normalized_unsloth_model_name():
         return
     if _is_gpt_oss_4bit_load() or _should_use_gpt_oss_bnb4bit():
         # 4-bit loads should keep quantized weights and use default PEFT LoRA.
@@ -1852,8 +1856,8 @@ def patch_gpt_oss_linearized():
     Patch GPT OSS for 4bit loading with grouped_mm support.
     Only patches the GptOssExperts forward method - keeps original classes for proper weight loading.
     """
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return
-    if "_load_in_4bit_" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return
+    if "gpt_oss" not in _normalized_unsloth_model_name(): return
+    if "_load_in_4bit_" not in _normalized_unsloth_model_name(): return
     if _should_use_gpt_oss_bnb4bit(): return
     try:
         import transformers.models.gpt_oss.modeling_gpt_oss
@@ -1891,7 +1895,7 @@ def experts_forward(
 
 def patch_GptOssAttention():
     if os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "1") == "0": return
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return
+    if "gpt_oss" not in _normalized_unsloth_model_name(): return
     try:
         from ..flex_attention import (
             flex_attention_with_sink,
@@ -2132,7 +2136,7 @@ def forward(
 
 def patch_GptOssModel():
     if os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "1") == "0": return
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return
+    if "gpt_oss" not in _normalized_unsloth_model_name(): return
     try:
         import transformers.models.gpt_oss.modeling_gpt_oss
         transformers.models.gpt_oss.modeling_gpt_oss.GptOssModel
@@ -2817,7 +2821,7 @@ def patch_gpt_oss_config():
 
 
 def patch_gpt_oss_init_weights_modulelist_fix():
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""):
+    if "gpt_oss" not in _normalized_unsloth_model_name():
         return
     try:
         import transformers.models.gpt_oss.modeling_gpt_oss
@@ -2862,7 +2866,7 @@ def patch_gpt_oss_for_grpo():
     When UNSLOTH_RETURN_HIDDEN_STATES=1, return hidden_states instead of logits.
     This fixes the matrix multiplication dimension mismatch issue in GRPO training.
     """
-    if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""):
+    if "gpt_oss" not in _normalized_unsloth_model_name():
         return
 
     try:

From b457eae0db67a9ebccdb1d62c7458852b5bcaf5f Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 02:14:44 +0000
Subject: [PATCH 5/9] Drop legacy GPT-OSS allocator warmup env alias

---
 unsloth_zoo/temporary_patches/gpt_oss.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index fe9db6505..c04154565 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -1125,9 +1125,6 @@ def _should_skip_transformers_allocator_warmup() -> bool:
     are loaded, which can OOM constrained GPUs.
     """
     mode = os.environ.get("UNSLOTH_ALLOCATOR_WARMUP", "").strip().lower()
-    if mode == "":
-        # Backward compatible alias for previous override variable.
-        mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower()
     if mode in ("off", "disable", "0", "false"):
         return True
     if mode in ("on", "enable", "1", "true"):

From 135d6ad2bc0fcb6751b00a3a5e7b79e20ff276bb Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 03:00:15 +0000
Subject: [PATCH 6/9] Use active accelerator index for warmup memory checks

---
 unsloth_zoo/temporary_patches/gpt_oss.py | 33 ++++++++++++++++++------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index c04154565..6d5f69ecc 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -1094,11 +1094,28 @@ def patch_gpt_oss_bnb4bit_auto():
 TEMPORARY_PATCHES.append(patch_gpt_oss_bnb4bit_auto)
 
 
+_LOW_MEMORY_ACCELERATOR_BYTES = int(24 * 1024**3)
+
+
+def _get_active_accelerator_index():
+    try:
+        if DEVICE_TYPE == "xpu":
+            if hasattr(torch, "xpu") and hasattr(torch.xpu, "current_device"):
+                return int(torch.xpu.current_device())
+            return 0
+        if hasattr(torch, "cuda") and hasattr(torch.cuda, "current_device"):
+            return int(torch.cuda.current_device())
+    except Exception:
+        pass
+    return 0
+
+
 def _get_accelerator_total_memory_bytes():
     try:
+        device_index = _get_active_accelerator_index()
         if DEVICE_TYPE == "xpu":
-            return int(torch.xpu.memory.mem_get_info(0)[-1])
-        return int(torch.cuda.memory.mem_get_info(0)[-1])
+            return int(torch.xpu.memory.mem_get_info(device_index)[-1])
+        return int(torch.cuda.memory.mem_get_info(device_index)[-1])
     except Exception:
         return None
 
@@ -1109,7 +1126,8 @@ def _get_effective_accelerator_memory_bytes():
         return None
     if DEVICE_TYPE != "xpu" and hasattr(torch.cuda, "get_per_process_memory_fraction"):
         try:
-            fraction = float(torch.cuda.get_per_process_memory_fraction(0))
+            device_index = _get_active_accelerator_index()
+            fraction = float(torch.cuda.get_per_process_memory_fraction(device_index))
             if 0.0 < fraction < 1.0:
                 return int(total_memory * fraction)
         except Exception:
@@ -1133,7 +1151,7 @@ def _should_skip_transformers_allocator_warmup() -> bool:
     total_memory = _get_effective_accelerator_memory_bytes()
     if total_memory is None:
         return False
-    return total_memory <= int(24 * 1024**3)
+    return total_memory <= _LOW_MEMORY_ACCELERATOR_BYTES
 
 
 def patch_transformers_caching_allocator_warmup():
@@ -1172,10 +1190,9 @@ def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer):
 # Combo kernels uses too much VRAM for low memory GPUs
 from ..device_type import DEVICE_TYPE
 
-if DEVICE_TYPE == "xpu":
-    device_memory = torch.xpu.memory.mem_get_info(0)[-1]
-else:
-    device_memory = torch.cuda.memory.mem_get_info(0)[-1]
+device_memory = _get_accelerator_total_memory_bytes()
+if device_memory is None:
+    device_memory = 0
 use_combo_kernels = False if device_memory/1024/1024/1024 <= 40 else True
 fused_torch_compile_options = get_torch_compile_options(
     epilogue_fusion = True,

From 684f684f7ddbe63e798212ab582eee173d136308 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 03:16:44 +0000
Subject: [PATCH 7/9] Strip use_kernel_forward_from_hub decorators during class
 rewrite

---
 unsloth_zoo/compiler.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/unsloth_zoo/compiler.py b/unsloth_zoo/compiler.py
index e3d42fda2..c4e74cc23 100644
--- a/unsloth_zoo/compiler.py
+++ b/unsloth_zoo/compiler.py
@@ -1151,8 +1151,12 @@ def create_standalone_class(
             for line in lines:
                 stripped = line.strip()
                 if stripped.startswith("@"):
-                    if "use_experts_implementation" in stripped:
-                        logger.info(f'Unsloth: stripped use_experts_implementation decorator from {module}')
+                    if (
+                        "use_experts_implementation" in stripped
+                        or "use_kernel_forward_from_hub" in stripped
+                    ):
+                        decorator_name = stripped.split("(")[0].lstrip("@")
+                        logger.info(f"Unsloth: stripped {decorator_name} decorator from {module}")
                         continue # Strip it
                     else:
                         logger.warning(f"Unsloth: Warning: Unknown decorator {stripped} found for {module}.")

From eaf1f4a28f58eb7e929206f25f526a0549f50642 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 03:18:32 +0000
Subject: [PATCH 8/9] Strip @auto_docstring decorators during class rewrite

---
 unsloth_zoo/compiler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unsloth_zoo/compiler.py b/unsloth_zoo/compiler.py
index c4e74cc23..f9ba0fb0b 100644
--- a/unsloth_zoo/compiler.py
+++ b/unsloth_zoo/compiler.py
@@ -1154,6 +1154,7 @@ def create_standalone_class(
                     if (
                         "use_experts_implementation" in stripped
                         or "use_kernel_forward_from_hub" in stripped
+                        or stripped.startswith("@auto_docstring")
                     ):
                         decorator_name = stripped.split("(")[0].lstrip("@")
                         logger.info(f"Unsloth: stripped {decorator_name} decorator from {module}")

From 9cd816674c9b1086403b498519f4e6409e7bb1d2 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Thu, 26 Feb 2026 04:07:00 +0000
Subject: [PATCH 9/9] Handle GPT-OSS 5.2 mask kwargs and strip kernelized
 decorators

---
 unsloth_zoo/compiler.py                  |  2 ++
 unsloth_zoo/temporary_patches/gpt_oss.py | 17 +++++++++++++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/unsloth_zoo/compiler.py b/unsloth_zoo/compiler.py
index f9ba0fb0b..ebf09124e 100644
--- a/unsloth_zoo/compiler.py
+++ b/unsloth_zoo/compiler.py
@@ -1154,6 +1154,7 @@ def create_standalone_class(
                     if (
                         "use_experts_implementation" in stripped
                         or "use_kernel_forward_from_hub" in stripped
+                        or "use_kernelized_func" in stripped
                         or stripped.startswith("@auto_docstring")
                     ):
                         decorator_name = stripped.split("(")[0].lstrip("@")
@@ -1274,6 +1275,7 @@ def create_standalone_class(
 
     # Remove @auto_docstring
     source = re.sub(r"@auto_docstring[\s]{0,}(\([^\)]{0,}\))?", "", source)
+    source = re.sub(r"@use_kernelized_func[\s]{0,}(\([^\)]{0,}\))?", "", source)
     source = re.sub(r"@check_model_inputs[\s]{0,}(\([^\)]{0,}\))?", "", source)
     # source = source.replace("@auto_docstring", "")
 
diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py
index 6d5f69ecc..b565a8212 100644
--- a/unsloth_zoo/temporary_patches/gpt_oss.py
+++ b/unsloth_zoo/temporary_patches/gpt_oss.py
@@ -2171,12 +2171,25 @@ def patch_GptOssModel():
     import transformers.generation.utils
     def wrap(f):
         def return_attention_mask(*args, **kwargs):
-            if kwargs["input_embeds"].requires_grad:
+            input_embeds = kwargs.get("input_embeds", None)
+            if input_embeds is None:
+                input_embeds = kwargs.get("inputs_embeds", None)
+            if input_embeds is None:
+                for arg in args:
+                    if type(arg) is torch.Tensor and arg.is_floating_point():
+                        input_embeds = arg
+                        break
+
+            if input_embeds is not None and input_embeds.requires_grad:
                 if "attention_mask" in kwargs:
                     return kwargs["attention_mask"]
                 for arg in args:
-                    if type(arg) is torch.Tensor and arg.dtype == torch.int32:
+                    if (
+                        type(arg) is torch.Tensor and
+                        arg.dtype in (torch.int32, torch.int64, torch.bool)
+                    ):
                         return arg
+                return f(*args, **kwargs)
             else:
                 # Eager
                 return f(*args, **kwargs)