From d66df48dbacfeb3e8239afbec2caeb01f46bc186 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 00:50:20 +0000 Subject: [PATCH 1/9] Skip GPT-OSS allocator warmup on low-memory 4-bit loads --- unsloth_zoo/temporary_patches/gpt_oss.py | 75 ++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index 1388d78cc..3dbc78629 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -1094,6 +1094,81 @@ def patch_gpt_oss_bnb4bit_auto(): TEMPORARY_PATCHES.append(patch_gpt_oss_bnb4bit_auto) +def _get_accelerator_total_memory_bytes(): + try: + if DEVICE_TYPE == "xpu": + return int(torch.xpu.memory.mem_get_info(0)[-1]) + return int(torch.cuda.memory.mem_get_info(0)[-1]) + except Exception: + return None + + +def _get_effective_accelerator_memory_bytes(): + total_memory = _get_accelerator_total_memory_bytes() + if total_memory is None: + return None + if DEVICE_TYPE != "xpu" and hasattr(torch.cuda, "get_per_process_memory_fraction"): + try: + fraction = float(torch.cuda.get_per_process_memory_fraction(0)) + if 0.0 < fraction < 1.0: + return int(total_memory * fraction) + except Exception: + pass + return total_memory + + +def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool: + """ + GPT-OSS 4-bit can trigger a large allocator warmup in transformers + (`caching_allocator_warmup`) before weights are loaded. On 16GB GPUs this + warmup can OOM even though the actual loaded model fits. + """ + model_name = os.environ.get("UNSLOTH_MODEL_NAME", "").replace("-", "_") + if "gpt_oss" not in model_name: + return False + if "_load_in_4bit_" not in model_name: + return False + + mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower() + if mode in ("off", "disable", "0", "false"): + return True + if mode in ("on", "enable", "1", "true"): + return False + + total_memory = _get_effective_accelerator_memory_bytes() + if total_memory is None: + return False + return total_memory <= int(20 * 1024**3) + + +def patch_transformers_caching_allocator_warmup_for_gpt_oss(): + try: + import transformers.modeling_utils + except Exception as e: + return raise_error("transformers.modeling_utils", e) + + warmup_fn = transformers.modeling_utils.caching_allocator_warmup + if hasattr(warmup_fn, "__unsloth_gpt_oss_guarded__"): + return + + def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer): + if _should_skip_transformers_allocator_warmup_for_gpt_oss(): + if UNSLOTH_ENABLE_LOGGING: + logger.warning_once( + "Unsloth: Skipping transformers caching_allocator_warmup " + "for GPT-OSS 4-bit on low-memory accelerators. " + "Set UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP=on to keep warmup." + ) + return + return warmup_fn(model, expanded_device_map, hf_quantizer) + + guarded_caching_allocator_warmup.__unsloth_gpt_oss_guarded__ = True + transformers.modeling_utils.caching_allocator_warmup = guarded_caching_allocator_warmup + + +TEMPORARY_PATCHES.append(patch_transformers_caching_allocator_warmup_for_gpt_oss) + + # Combo kernels uses too much VRAM for low memory GPUs from ..device_type import DEVICE_TYPE From 054aa899858e5eddd681657290e27f1bce80e1fd Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 01:43:18 +0000 Subject: [PATCH 2/9] Apply low-memory warmup guard globally under 24GB --- unsloth_zoo/temporary_patches/gpt_oss.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index 3dbc78629..6808dc958 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -1119,17 +1119,15 @@ def _get_effective_accelerator_memory_bytes(): def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool: """ - GPT-OSS 4-bit can trigger a large allocator warmup in transformers - (`caching_allocator_warmup`) before weights are loaded. On 16GB GPUs this - warmup can OOM even though the actual loaded model fits. - """ - model_name = os.environ.get("UNSLOTH_MODEL_NAME", "").replace("-", "_") - if "gpt_oss" not in model_name: - return False - if "_load_in_4bit_" not in model_name: - return False + Skip transformers allocator warmup on low-memory accelerators. - mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower() + `caching_allocator_warmup` can allocate large single chunks before weights + are loaded, which can OOM constrained GPUs. + """ + mode = os.environ.get("UNSLOTH_ALLOCATOR_WARMUP", "").strip().lower() + if mode == "": + # Backward compatible alias for existing GPT-OSS override. + mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower() if mode in ("off", "disable", "0", "false"): return True if mode in ("on", "enable", "1", "true"): @@ -1138,7 +1136,7 @@ def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool: total_memory = _get_effective_accelerator_memory_bytes() if total_memory is None: return False - return total_memory <= int(20 * 1024**3) + return total_memory <= int(24 * 1024**3) def patch_transformers_caching_allocator_warmup_for_gpt_oss(): @@ -1156,8 +1154,8 @@ def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer): if UNSLOTH_ENABLE_LOGGING: logger.warning_once( "Unsloth: Skipping transformers caching_allocator_warmup " - "for GPT-OSS 4-bit on low-memory accelerators. " - "Set UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP=on to keep warmup." + "on low-memory accelerators (<24GB effective memory). " + "Set UNSLOTH_ALLOCATOR_WARMUP=on to keep warmup." ) return return warmup_fn(model, expanded_device_map, hf_quantizer) From 07df91b714f5a4c6e7ba11420a74acc44c9bb25e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 01:53:34 +0000 Subject: [PATCH 3/9] Rename warmup guard identifiers to generic names --- unsloth_zoo/temporary_patches/gpt_oss.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index 6808dc958..4128b5146 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -1117,7 +1117,7 @@ def _get_effective_accelerator_memory_bytes(): return total_memory -def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool: +def _should_skip_transformers_allocator_warmup() -> bool: """ Skip transformers allocator warmup on low-memory accelerators. @@ -1126,7 +1126,7 @@ def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool: """ mode = os.environ.get("UNSLOTH_ALLOCATOR_WARMUP", "").strip().lower() if mode == "": - # Backward compatible alias for existing GPT-OSS override. + # Backward compatible alias for previous override variable. mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower() if mode in ("off", "disable", "0", "false"): return True @@ -1139,18 +1139,21 @@ def _should_skip_transformers_allocator_warmup_for_gpt_oss() -> bool: return total_memory <= int(24 * 1024**3) -def patch_transformers_caching_allocator_warmup_for_gpt_oss(): +def patch_transformers_caching_allocator_warmup(): try: import transformers.modeling_utils except Exception as e: return raise_error("transformers.modeling_utils", e) warmup_fn = transformers.modeling_utils.caching_allocator_warmup + if hasattr(warmup_fn, "__unsloth_allocator_warmup_guarded__"): + return + # Backward compatibility with previous guard attribute. if hasattr(warmup_fn, "__unsloth_gpt_oss_guarded__"): return def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer): - if _should_skip_transformers_allocator_warmup_for_gpt_oss(): + if _should_skip_transformers_allocator_warmup(): if UNSLOTH_ENABLE_LOGGING: logger.warning_once( "Unsloth: Skipping transformers caching_allocator_warmup " @@ -1160,11 +1163,13 @@ def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer): return return warmup_fn(model, expanded_device_map, hf_quantizer) + guarded_caching_allocator_warmup.__unsloth_allocator_warmup_guarded__ = True + # Keep legacy marker so older checks still detect this as guarded. guarded_caching_allocator_warmup.__unsloth_gpt_oss_guarded__ = True transformers.modeling_utils.caching_allocator_warmup = guarded_caching_allocator_warmup -TEMPORARY_PATCHES.append(patch_transformers_caching_allocator_warmup_for_gpt_oss) +TEMPORARY_PATCHES.append(patch_transformers_caching_allocator_warmup) # Combo kernels uses too much VRAM for low memory GPUs From 5b1f0ff6e931fc35f442f921f505d58ca48d3827 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 02:10:23 +0000 Subject: [PATCH 4/9] Normalize GPT-OSS model name guards for hyphenated names --- unsloth_zoo/temporary_patches/gpt_oss.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index 4128b5146..fe9db6505 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -1394,15 +1394,19 @@ def _should_use_gpt_oss_bnb4bit() -> bool: Default: True when load_in_4bit is active. Set UNSLOTH_GPT_OSS_BNB4BIT_DISABLE=1 to force BF16 path. """ - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): + if "gpt_oss" not in _normalized_unsloth_model_name(): return False - if "_load_in_4bit_" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): + if "_load_in_4bit_" not in _normalized_unsloth_model_name(): return False return os.environ.get("UNSLOTH_GPT_OSS_BNB4BIT_DISABLE", "0") != "1" def _is_gpt_oss_4bit_load() -> bool: - return "_load_in_4bit_" in os.environ.get("UNSLOTH_MODEL_NAME", "") + return "_load_in_4bit_" in _normalized_unsloth_model_name() + + +def _normalized_unsloth_model_name() -> str: + return os.environ.get("UNSLOTH_MODEL_NAME", "").replace("-", "_") def _is_transformers_v5() -> bool: @@ -1418,7 +1422,7 @@ def patch_gpt_oss_moe_for_lora(): IMPORTANT: We only patch the forward method, NOT replace the entire class. This preserves the original class structure so weights load correctly. """ - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): + if "gpt_oss" not in _normalized_unsloth_model_name(): return if _is_gpt_oss_4bit_load() or _should_use_gpt_oss_bnb4bit(): # 4-bit loads should keep quantized weights and use default PEFT LoRA. @@ -1852,8 +1856,8 @@ def patch_gpt_oss_linearized(): Patch GPT OSS for 4bit loading with grouped_mm support. Only patches the GptOssExperts forward method - keeps original classes for proper weight loading. """ - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return - if "_load_in_4bit_" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return + if "gpt_oss" not in _normalized_unsloth_model_name(): return + if "_load_in_4bit_" not in _normalized_unsloth_model_name(): return if _should_use_gpt_oss_bnb4bit(): return try: import transformers.models.gpt_oss.modeling_gpt_oss @@ -1891,7 +1895,7 @@ def experts_forward( def patch_GptOssAttention(): if os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "1") == "0": return - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return + if "gpt_oss" not in _normalized_unsloth_model_name(): return try: from ..flex_attention import ( flex_attention_with_sink, @@ -2132,7 +2136,7 @@ def forward( def patch_GptOssModel(): if os.environ.get("UNSLOTH_ENABLE_FLEX_ATTENTION", "1") == "0": return - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): return + if "gpt_oss" not in _normalized_unsloth_model_name(): return try: import transformers.models.gpt_oss.modeling_gpt_oss transformers.models.gpt_oss.modeling_gpt_oss.GptOssModel @@ -2817,7 +2821,7 @@ def patch_gpt_oss_config(): def patch_gpt_oss_init_weights_modulelist_fix(): - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): + if "gpt_oss" not in _normalized_unsloth_model_name(): return try: import transformers.models.gpt_oss.modeling_gpt_oss @@ -2862,7 +2866,7 @@ def patch_gpt_oss_for_grpo(): When UNSLOTH_RETURN_HIDDEN_STATES=1, return hidden_states instead of logits. This fixes the matrix multiplication dimension mismatch issue in GRPO training. """ - if "gpt_oss" not in os.environ.get("UNSLOTH_MODEL_NAME", ""): + if "gpt_oss" not in _normalized_unsloth_model_name(): return try: From b457eae0db67a9ebccdb1d62c7458852b5bcaf5f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 02:14:44 +0000 Subject: [PATCH 5/9] Drop legacy GPT-OSS allocator warmup env alias --- unsloth_zoo/temporary_patches/gpt_oss.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index fe9db6505..c04154565 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -1125,9 +1125,6 @@ def _should_skip_transformers_allocator_warmup() -> bool: are loaded, which can OOM constrained GPUs. """ mode = os.environ.get("UNSLOTH_ALLOCATOR_WARMUP", "").strip().lower() - if mode == "": - # Backward compatible alias for previous override variable. - mode = os.environ.get("UNSLOTH_GPT_OSS_ALLOCATOR_WARMUP", "auto").strip().lower() if mode in ("off", "disable", "0", "false"): return True if mode in ("on", "enable", "1", "true"): From 135d6ad2bc0fcb6751b00a3a5e7b79e20ff276bb Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 03:00:15 +0000 Subject: [PATCH 6/9] Use active accelerator index for warmup memory checks --- unsloth_zoo/temporary_patches/gpt_oss.py | 33 ++++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index c04154565..6d5f69ecc 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -1094,11 +1094,28 @@ def patch_gpt_oss_bnb4bit_auto(): TEMPORARY_PATCHES.append(patch_gpt_oss_bnb4bit_auto) +_LOW_MEMORY_ACCELERATOR_BYTES = int(24 * 1024**3) + + +def _get_active_accelerator_index(): + try: + if DEVICE_TYPE == "xpu": + if hasattr(torch, "xpu") and hasattr(torch.xpu, "current_device"): + return int(torch.xpu.current_device()) + return 0 + if hasattr(torch, "cuda") and hasattr(torch.cuda, "current_device"): + return int(torch.cuda.current_device()) + except Exception: + pass + return 0 + + def _get_accelerator_total_memory_bytes(): try: + device_index = _get_active_accelerator_index() if DEVICE_TYPE == "xpu": - return int(torch.xpu.memory.mem_get_info(0)[-1]) - return int(torch.cuda.memory.mem_get_info(0)[-1]) + return int(torch.xpu.memory.mem_get_info(device_index)[-1]) + return int(torch.cuda.memory.mem_get_info(device_index)[-1]) except Exception: return None @@ -1109,7 +1126,8 @@ def _get_effective_accelerator_memory_bytes(): return None if DEVICE_TYPE != "xpu" and hasattr(torch.cuda, "get_per_process_memory_fraction"): try: - fraction = float(torch.cuda.get_per_process_memory_fraction(0)) + device_index = _get_active_accelerator_index() + fraction = float(torch.cuda.get_per_process_memory_fraction(device_index)) if 0.0 < fraction < 1.0: return int(total_memory * fraction) except Exception: @@ -1133,7 +1151,7 @@ def _should_skip_transformers_allocator_warmup() -> bool: total_memory = _get_effective_accelerator_memory_bytes() if total_memory is None: return False - return total_memory <= int(24 * 1024**3) + return total_memory <= _LOW_MEMORY_ACCELERATOR_BYTES def patch_transformers_caching_allocator_warmup(): @@ -1172,10 +1190,9 @@ def guarded_caching_allocator_warmup(model, expanded_device_map, hf_quantizer): # Combo kernels uses too much VRAM for low memory GPUs from ..device_type import DEVICE_TYPE -if DEVICE_TYPE == "xpu": - device_memory = torch.xpu.memory.mem_get_info(0)[-1] -else: - device_memory = torch.cuda.memory.mem_get_info(0)[-1] +device_memory = _get_accelerator_total_memory_bytes() +if device_memory is None: + device_memory = 0 use_combo_kernels = False if device_memory/1024/1024/1024 <= 40 else True fused_torch_compile_options = get_torch_compile_options( epilogue_fusion = True, From 684f684f7ddbe63e798212ab582eee173d136308 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 03:16:44 +0000 Subject: [PATCH 7/9] Strip use_kernel_forward_from_hub decorators during class rewrite --- unsloth_zoo/compiler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/unsloth_zoo/compiler.py b/unsloth_zoo/compiler.py index e3d42fda2..c4e74cc23 100644 --- a/unsloth_zoo/compiler.py +++ b/unsloth_zoo/compiler.py @@ -1151,8 +1151,12 @@ def create_standalone_class( for line in lines: stripped = line.strip() if stripped.startswith("@"): - if "use_experts_implementation" in stripped: - logger.info(f'Unsloth: stripped use_experts_implementation decorator from {module}') + if ( + "use_experts_implementation" in stripped + or "use_kernel_forward_from_hub" in stripped + ): + decorator_name = stripped.split("(")[0].lstrip("@") + logger.info(f"Unsloth: stripped {decorator_name} decorator from {module}") continue # Strip it else: logger.warning(f"Unsloth: Warning: Unknown decorator {stripped} found for {module}.") From eaf1f4a28f58eb7e929206f25f526a0549f50642 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 03:18:32 +0000 Subject: [PATCH 8/9] Strip @auto_docstring decorators during class rewrite --- unsloth_zoo/compiler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unsloth_zoo/compiler.py b/unsloth_zoo/compiler.py index c4e74cc23..f9ba0fb0b 100644 --- a/unsloth_zoo/compiler.py +++ b/unsloth_zoo/compiler.py @@ -1154,6 +1154,7 @@ def create_standalone_class( if ( "use_experts_implementation" in stripped or "use_kernel_forward_from_hub" in stripped + or stripped.startswith("@auto_docstring") ): decorator_name = stripped.split("(")[0].lstrip("@") logger.info(f"Unsloth: stripped {decorator_name} decorator from {module}") From 9cd816674c9b1086403b498519f4e6409e7bb1d2 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 26 Feb 2026 04:07:00 +0000 Subject: [PATCH 9/9] Handle GPT-OSS 5.2 mask kwargs and strip kernelized decorators --- unsloth_zoo/compiler.py | 2 ++ unsloth_zoo/temporary_patches/gpt_oss.py | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/unsloth_zoo/compiler.py b/unsloth_zoo/compiler.py index f9ba0fb0b..ebf09124e 100644 --- a/unsloth_zoo/compiler.py +++ b/unsloth_zoo/compiler.py @@ -1154,6 +1154,7 @@ def create_standalone_class( if ( "use_experts_implementation" in stripped or "use_kernel_forward_from_hub" in stripped + or "use_kernelized_func" in stripped or stripped.startswith("@auto_docstring") ): decorator_name = stripped.split("(")[0].lstrip("@") @@ -1274,6 +1275,7 @@ def create_standalone_class( # Remove @auto_docstring source = re.sub(r"@auto_docstring[\s]{0,}(\([^\)]{0,}\))?", "", source) + source = re.sub(r"@use_kernelized_func[\s]{0,}(\([^\)]{0,}\))?", "", source) source = re.sub(r"@check_model_inputs[\s]{0,}(\([^\)]{0,}\))?", "", source) # source = source.replace("@auto_docstring", "") diff --git a/unsloth_zoo/temporary_patches/gpt_oss.py b/unsloth_zoo/temporary_patches/gpt_oss.py index 6d5f69ecc..b565a8212 100644 --- a/unsloth_zoo/temporary_patches/gpt_oss.py +++ b/unsloth_zoo/temporary_patches/gpt_oss.py @@ -2171,12 +2171,25 @@ def patch_GptOssModel(): import transformers.generation.utils def wrap(f): def return_attention_mask(*args, **kwargs): - if kwargs["input_embeds"].requires_grad: + input_embeds = kwargs.get("input_embeds", None) + if input_embeds is None: + input_embeds = kwargs.get("inputs_embeds", None) + if input_embeds is None: + for arg in args: + if type(arg) is torch.Tensor and arg.is_floating_point(): + input_embeds = arg + break + + if input_embeds is not None and input_embeds.requires_grad: if "attention_mask" in kwargs: return kwargs["attention_mask"] for arg in args: - if type(arg) is torch.Tensor and arg.dtype == torch.int32: + if ( + type(arg) is torch.Tensor and + arg.dtype in (torch.int32, torch.int64, torch.bool) + ): return arg + return f(*args, **kwargs) else: # Eager return f(*args, **kwargs)