From 6ef7cbd61c993e82ab7a2c1f67baaec5d9149021 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 15 May 2026 03:26:26 +0000 Subject: [PATCH] add UNSLOTH_ALLOW_CPU=1 path for CPU-only CI Lets `import unsloth.trainer` succeed on hosts without a CUDA/XPU/HIP accelerator (typical of zoo's source-inspection test matrix). Inside `get_device_type()` the env var is read at most once per process (the function is @functools.cache'd), so production hosts pay no runtime cost there. Four edits beyond the device_type fallback: * `_gpu_init.py:212/247` -- the bf16 + libcuda/bnb setup blocks call `torch.cuda.get_device_capability()` and `libcuda_dirs()`/`bnb.functional.lib.*` unconditionally when DEVICE_TYPE == "cuda". Guard with `and torch.cuda.is_available()` so the new CPU-CI sentinel doesn't fault those. * `_gpu_init.py:353` -- gate `_patch_trl_trainer()` (the `_backwards_compatible_trainer.__init__` wrapper). Under UNSLOTH_ALLOW_CPU we want pristine upstream TRL classes for downstream `inspect.getsource(SFTTrainer)` drift detectors. * `models/_utils.py:1196` -- same `and torch.cuda.is_available()` guard for `get_device_capability()` at import time. * `models/rl.py:PatchFastRL` -- early-return under UNSLOTH_ALLOW_CPU=1 so the heavier `patch_trl_rl_trainers()` (which replaces `trl.SFTTrainer` with the compiled `UnslothSFTTrainer` class) doesn't fire either. Without this gate the drift detectors that do `inspect.getsource(SFTTrainer)` see the wrapper source and fail spuriously. Local sanity: `UNSLOTH_ALLOW_CPU=1 python -c "import unsloth.trainer"` succeeds on a CPU-only venv, `trl.SFTTrainer.__init__.__qualname__` stays `SFTTrainer.__init__` (not `UnslothSFTTrainer.__init__`), and `inspect.getsource(SFTTrainer)` still contains `self._signature_columns`. Without the env var on a CUDA host, TRL is still patched normally (verified `UnslothSFTTrainer.__init__`). 
--- unsloth/_gpu_init.py | 19 +++++++++++++++---- unsloth/device_type.py | 6 ++++++ unsloth/models/_utils.py | 2 +- unsloth/models/rl.py | 5 +++++ 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/unsloth/_gpu_init.py b/unsloth/_gpu_init.py index aa94c4a568..a30111b529 100644 --- a/unsloth/_gpu_init.py +++ b/unsloth/_gpu_init.py @@ -209,7 +209,7 @@ del patch_peft_weight_converter_compatibility # Torch 2.4 has including_emulation -if DEVICE_TYPE == "cuda": +if DEVICE_TYPE == "cuda" and torch.cuda.is_available(): major_version, minor_version = torch.cuda.get_device_capability() SUPPORTS_BFLOAT16 = major_version >= 8 @@ -233,12 +233,18 @@ def is_bf16_supported(): # torch.xpu.is_bf16_supported() does not have including_emulation # set SUPPORTS_BFLOAT16 as torch.xpu.is_bf16_supported() SUPPORTS_BFLOAT16 = torch.xpu.is_bf16_supported() +else: + # CPU-only CI under UNSLOTH_ALLOW_CPU=1. We can't probe device + # capability, so assume no bf16 -- training won't run on this host + # anyway, this branch only exists to let `import unsloth.trainer` + # succeed for source-inspection tests. + SUPPORTS_BFLOAT16 = False # For Gradio HF Spaces? # if "SPACE_AUTHOR_NAME" not in os.environ and "SPACE_REPO_NAME" not in os.environ: import triton -if DEVICE_TYPE == "cuda": +if DEVICE_TYPE == "cuda" and torch.cuda.is_available(): libcuda_dirs = lambda: None if Version(triton.__version__) >= Version("3.0.0"): try: @@ -349,5 +355,10 @@ def is_bf16_supported(): launch_openenv, ) -# Patch TRL trainers for backwards compatibility -_patch_trl_trainer() +# Patch TRL trainers for backwards compatibility. +# Skipped under UNSLOTH_ALLOW_CPU=1 (CPU-only CI) because rebinding +# trl.SFTTrainer.__init__ to a generic wrapper changes +# inspect.getsource(SFTTrainer.__init__) and corrupts downstream +# drift detectors that anchor on the pristine upstream source. 
+if os.environ.get("UNSLOTH_ALLOW_CPU", "0") != "1": + _patch_trl_trainer() diff --git a/unsloth/device_type.py b/unsloth/device_type.py index 9bad9be0e4..6a82e42e8c 100644 --- a/unsloth/device_type.py +++ b/unsloth/device_type.py @@ -63,6 +63,10 @@ def get_device_type(): # Check torch.accelerator if hasattr(torch, "accelerator"): if not torch.accelerator.is_available(): + # Test-only CPU fallback. The env var is read exactly once per + # process because get_device_type is @functools.cache'd. + if os.environ.get("UNSLOTH_ALLOW_CPU", "0") == "1": + return "cuda" raise NotImplementedError( "Unsloth cannot find any torch accelerator? You need a GPU." ) @@ -73,6 +77,8 @@ def get_device_type(): f"But `torch.accelerator.current_accelerator()` works with it being = `{accelerator}`\n" f"Please reinstall torch - it's most likely broken :(" ) + if os.environ.get("UNSLOTH_ALLOW_CPU", "0") == "1": + return "cuda" raise NotImplementedError( "Unsloth currently only works on NVIDIA, AMD and Intel GPUs." 
) diff --git a/unsloth/models/_utils.py b/unsloth/models/_utils.py index ccd75aa000..df498e89fb 100644 --- a/unsloth/models/_utils.py +++ b/unsloth/models/_utils.py @@ -1193,7 +1193,7 @@ def _is_openai_available(): HAS_FLASH_ATTENTION = False HAS_FLASH_ATTENTION_SOFTCAPPING = False -if DEVICE_TYPE == "cuda": +if DEVICE_TYPE == "cuda" and torch.cuda.is_available(): major_version, minor_version = torch.cuda.get_device_capability() torch.cuda.get_device_capability = functools.cache(torch.cuda.get_device_capability) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index ee9bdda26a..31a498eada 100644 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -2270,6 +2270,11 @@ def patch_trl_vllm_generation(): def PatchFastRL(algorithm = None, FastLanguageModel = None): if FastLanguageModel is not None: PatchRL(FastLanguageModel) + # Under UNSLOTH_ALLOW_CPU=1 (CPU-only CI), skip TRL trainer rewriting so + # downstream `inspect.getsource(trl.SFTTrainer)` drift detectors see the + # pristine upstream class, not the compiled Unsloth* wrappers. + if os.environ.get("UNSLOTH_ALLOW_CPU", "0") == "1": + return # Install the disable_gradient_checkpointing noop BEFORE # patch_trl_rl_trainers. patch_trl_rl_trainers imports extra trl.* trainer # submodules while generating the compiled cache; any new trl.* modules