From 6ef7cbd61c993e82ab7a2c1f67baaec5d9149021 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 15 May 2026 03:26:26 +0000
Subject: [PATCH] add UNSLOTH_ALLOW_CPU=1 path for CPU-only CI

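Lets `import unsloth.trainer` succeed on hosts without a CUDA/XPU/HIP
accelerator (typical of zoo's source-inspection test matrix). Inside
`get_device_type()` the env var is consulted only on the no-accelerator
failure paths, and the function is @functools.cache'd, so production
hosts pay no runtime cost there. The env var is also checked once at
import time in `_gpu_init.py` and on each `PatchFastRL()` call.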

Four edits beyond the device_type fallback:

* `_gpu_init.py:212/247` -- the bf16 + libcuda/bnb setup blocks call
  `torch.cuda.get_device_capability()` and `libcuda_dirs()`/`bnb.functional.lib.*`
  unconditionally when DEVICE_TYPE == "cuda". Guard with
  `and torch.cuda.is_available()` so the new CPU-CI sentinel doesn't
  fault those.
* `_gpu_init.py:353` -- gate `_patch_trl_trainer()` (the
  `_backwards_compatible_trainer.__init__` wrapper). Under
  UNSLOTH_ALLOW_CPU we want pristine upstream TRL classes for
  downstream `inspect.getsource(SFTTrainer)` drift detectors.
* `models/_utils.py:1196` -- same `and torch.cuda.is_available()` guard
  for `get_device_capability()` at import time.
* `models/rl.py:PatchFastRL` -- early-return under UNSLOTH_ALLOW_CPU=1
  so the heavier `patch_trl_rl_trainers()` (which replaces
  `trl.SFTTrainer` with the compiled `UnslothSFTTrainer` class)
  doesn't fire either. Without this gate, the drift detectors that
  call `inspect.getsource(SFTTrainer)` see the wrapper source and
  fail spuriously.

Local sanity: `UNSLOTH_ALLOW_CPU=1 python -c "import unsloth.trainer"`
succeeds on a CPU-only venv, `trl.SFTTrainer.__init__.__qualname__`
stays `SFTTrainer.__init__` (not `UnslothSFTTrainer.__init__`), and
`inspect.getsource(SFTTrainer)` still contains `self._signature_columns`.
Without the env var on a CUDA host, TRL is still patched normally
(verified `UnslothSFTTrainer.__init__`).
---
 unsloth/_gpu_init.py     | 19 +++++++++++++++----
 unsloth/device_type.py   |  6 ++++++
 unsloth/models/_utils.py |  2 +-
 unsloth/models/rl.py     |  5 +++++
 4 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/unsloth/_gpu_init.py b/unsloth/_gpu_init.py
index aa94c4a568..a30111b529 100644
--- a/unsloth/_gpu_init.py
+++ b/unsloth/_gpu_init.py
@@ -209,7 +209,7 @@
 del patch_peft_weight_converter_compatibility
 
 # Torch 2.4 has including_emulation
-if DEVICE_TYPE == "cuda":
+if DEVICE_TYPE == "cuda" and torch.cuda.is_available():
     major_version, minor_version = torch.cuda.get_device_capability()
     SUPPORTS_BFLOAT16 = major_version >= 8
 
@@ -233,12 +233,18 @@ def is_bf16_supported():
     # torch.xpu.is_bf16_supported() does not have including_emulation
     # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported()
     SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported()
+else:
+    # CPU-only CI under UNSLOTH_ALLOW_CPU=1. We can't probe device
+    # capability, so assume no bf16 -- training won't run on this host
+    # anyway, this branch only exists to let `import unsloth.trainer`
+    # succeed for source-inspection tests.
+    SUPPORTS_BFLOAT16 = False
 
 # For Gradio HF Spaces?
 # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ:
 import triton
 
-if DEVICE_TYPE == "cuda":
+if DEVICE_TYPE == "cuda" and torch.cuda.is_available():
     libcuda_dirs = lambda: None
     if Version(triton.__version__) >= Version("3.0.0"):
         try:
@@ -349,5 +355,10 @@ def is_bf16_supported():
     launch_openenv,
 )
 
-# Patch TRL trainers for backwards compatibility
-_patch_trl_trainer()
+# Patch TRL trainers for backwards compatibility.
+# Skipped under UNSLOTH_ALLOW_CPU=1 (CPU-only CI) because rebinding
+# trl.SFTTrainer.__init__ to a generic wrapper changes
+# inspect.getsource(SFTTrainer.__init__) and corrupts downstream
+# drift detectors that anchor on the pristine upstream source.
+if os.environ.get("UNSLOTH_ALLOW_CPU", "0") != "1":
+    _patch_trl_trainer()
diff --git a/unsloth/device_type.py b/unsloth/device_type.py
index 9bad9be0e4..6a82e42e8c 100644
--- a/unsloth/device_type.py
+++ b/unsloth/device_type.py
@@ -63,6 +63,10 @@ def get_device_type():
     # Check torch.accelerator
     if hasattr(torch, "accelerator"):
         if not torch.accelerator.is_available():
+            # Test-only CPU fallback. The env var is read exactly once per
+            # process because get_device_type is @functools.cache'd.
+            if os.environ.get("UNSLOTH_ALLOW_CPU", "0") == "1":
+                return "cuda"
             raise NotImplementedError(
                 "Unsloth cannot find any torch accelerator? You need a GPU."
             )
@@ -73,6 +77,8 @@ def get_device_type():
                 f"But `torch.accelerator.current_accelerator()` works with it being = `{accelerator}`\n"
                 f"Please reinstall torch - it's most likely broken :("
             )
+    if os.environ.get("UNSLOTH_ALLOW_CPU", "0") == "1":
+        return "cuda"
     raise NotImplementedError(
         "Unsloth currently only works on NVIDIA, AMD and Intel GPUs."
     )
diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py
index ccd75aa000..df498e89fb 100644
--- a/unsloth/models/_utils.py
+++ b/unsloth/models/_utils.py
@@ -1193,7 +1193,7 @@ def _is_openai_available():
 HAS_FLASH_ATTENTION = False
 HAS_FLASH_ATTENTION_SOFTCAPPING = False
 
-if DEVICE_TYPE == "cuda":
+if DEVICE_TYPE == "cuda" and torch.cuda.is_available():
     major_version, minor_version = torch.cuda.get_device_capability()
     torch.cuda.get_device_capability = functools.cache(torch.cuda.get_device_capability)
 
diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py
index ee9bdda26a..31a498eada 100644
--- a/unsloth/models/rl.py
+++ b/unsloth/models/rl.py
@@ -2270,6 +2270,11 @@ def patch_trl_vllm_generation():
 def PatchFastRL(algorithm = None, FastLanguageModel = None):
     if FastLanguageModel is not None:
         PatchRL(FastLanguageModel)
+    # Under UNSLOTH_ALLOW_CPU=1 (CPU-only CI), skip TRL trainer rewriting so
+    # downstream `inspect.getsource(trl.SFTTrainer)` drift detectors see the
+    # pristine upstream class, not the compiled Unsloth* wrappers.
+    if os.environ.get("UNSLOTH_ALLOW_CPU", "0") == "1":
+        return
     # Install the disable_gradient_checkpointing noop BEFORE
     # patch_trl_rl_trainers. patch_trl_rl_trainers imports extra trl.* trainer
     # submodules while generating the compiled cache; any new trl.* modules