unslothai · danielhanchen · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026 · Mar 31, 2026
diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
@@ -10,6 +10,7 @@
 from transformers import TextStreamer
 from peft import PeftModel, PeftModelForCausalLM
 
+import contextlib
 import json
 import sys
 import torch
@@ -1646,8 +1647,30 @@ def _generate_dac(
             + text
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
+
         with torch.inference_mode():
-            with torch.amp.autocast("cuda", dtype = model.dtype):
+            # Derive the autocast device from the loaded model, not from the
+            # global backend: a CPU-fallback DAC on an XPU/CUDA host must not
+            # open a GPU autocast context around CPU tensors.
+            device_type = (
+                model.device.type
+                if hasattr(model.device, "type")
+                else str(model.device).split(":", 1)[0]
+            )
+            # Clamp to autocast-supported backends so exotic devices
+            # (e.g. "meta" during accelerate offloaded loading) do not raise.
+            # MPS has supported autocast since torch 2.3, so keep it in the set.
+            if device_type not in ("cuda", "xpu", "mps", "cpu"):
+                device_type = "cpu"
+            # CPU and XPU autocast only accept bfloat16/float16. For a
+            # float32 model, skip autocast entirely to avoid raising or
+            # producing a warning on every generate call.
+            autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16)
+            if device_type in ("cpu", "xpu") and not autocast_dtype_supported:
+                autocast_ctx = contextlib.nullcontext()
+            else:
+                autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype)
+            with autocast_ctx:
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
                     **inputs,

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -26,6 +26,8 @@
 
 import httpx
 
+from utils.hardware import clear_gpu_cache
+
 logger = get_logger(__name__)
 
 # ── Pre-compiled patterns for plan-without-action re-prompt ──
@@ -1512,9 +1514,20 @@ def load_model(
                     f"{new_ld}:{existing_ld}" if existing_ld else new_ld
                 )
 
-            # Pin to selected GPU(s) via CUDA_VISIBLE_DEVICES
+            # Pin to selected GPU(s) via the backend-appropriate visibility
+            # env var: CUDA_VISIBLE_DEVICES on NVIDIA/ROCm, ZE_AFFINITY_MASK
+            # on Intel XPU (llama-server's SYCL build reads ZE_AFFINITY_MASK,
+            # not CUDA_VISIBLE_DEVICES).
             if gpu_indices is not None:
-                env["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpu_indices)
+                from utils.hardware import get_device
+                from utils.hardware.hardware import DeviceType
+
+                mask = ",".join(str(i) for i in gpu_indices)
+                if get_device() == DeviceType.XPU:
+                    env["ZE_AFFINITY_MASK"] = mask
+                    env.pop("CUDA_VISIBLE_DEVICES", None)
+                else:
+                    env["CUDA_VISIBLE_DEVICES"] = mask
 
             self._stdout_lines = []
             self._process = subprocess.Popen(
@@ -1625,10 +1638,7 @@ def unload_model(self) -> bool:
             if LlamaCppBackend._codec_mgr is not None:
                 LlamaCppBackend._codec_mgr.unload()
                 LlamaCppBackend._codec_mgr = None
-                import torch
-
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
+                clear_gpu_cache()
             return True
 
     def _kill_process(self):
@@ -3261,6 +3271,11 @@ def init_audio_codec(self, audio_type: str) -> None:
         if LlamaCppBackend._codec_mgr is None:
             LlamaCppBackend._codec_mgr = AudioCodecManager()
 
+        # Preserve the existing CPU fallback on non-CUDA hosts: the SNAC /
+        # BiCodec / DAC codecs are not yet validated on Intel XPU, so
+        # only promote to a GPU device when CUDA is actually available.
+        # A follow-up can extend this once an XPU-specific codec path is
+        # added.
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model_repo_path = None
 
@@ -3333,6 +3348,8 @@ def generate_audio_response(
             else None
         )
 
+        # Match init_audio_codec: stay on CPU for non-CUDA hosts until the
+        # codec path is validated on XPU.
         import torch
 
         device = "cuda" if torch.cuda.is_available() else "cpu"

diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
@@ -1540,6 +1540,10 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
 
         SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
         SNAC_SAMPLE_RATE = 24000
+
+        # SNAC codec has not been validated on Intel XPU yet; keep the
+        # existing CPU fallback for non-CUDA hosts until an XPU-specific
+        # path is added.
         device = "cuda" if torch.cuda.is_available() else "cpu"
         max_length = self.max_seq_length or 2048
         tokenizer = self.tokenizer
@@ -1716,7 +1720,8 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
         import gc
 
         gc.collect()
-        torch.cuda.empty_cache()
+
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
@@ -1744,6 +1749,8 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):
 
         import subprocess
 
+        # Spark-TTS BiCodec has not been validated on Intel XPU; keep the
+        # existing CPU fallback for non-CUDA hosts.
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
@@ -1944,7 +1951,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
         import gc
 
         gc.collect()
-        torch.cuda.empty_cache()
+
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
@@ -1979,6 +1987,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         from datasets import Dataset as HFDataset
         from utils.paths import ensure_dir, tmp_root
 
+        # OuteTTS DAC/Whisper preprocessing has not been validated on Intel
+        # XPU; keep the existing CPU fallback for non-CUDA hosts.
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # Clone OuteTTS repo (same as audio_codecs._load_dac)
@@ -2157,7 +2167,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         import gc
 
         gc.collect()
-        torch.cuda.empty_cache()
+
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:

@@ -102,7 +102,8 @@ def test_explicit_ids_are_rejected_for_uuid_parent_visibility(self):
             patch("utils.hardware.hardware.get_physical_gpu_count", return_value = 8),
         ):
             with self.assertRaisesRegex(
-                ValueError, "unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG"
+                ValueError,
+                "unsupported when CUDA_VISIBLE_DEVICES uses non-numeric or subdevice",
             ):
                 resolve_requested_gpu_ids([1])
 
@@ -711,12 +712,14 @@ def start(self):
 
 
 class TestRouteErrors(unittest.TestCase):
-    def test_prepare_gpu_selection_rejects_gpu_ids_on_non_cuda_backend(self):
+    def test_prepare_gpu_selection_rejects_gpu_ids_on_non_accelerator_backend(self):
         with patch("utils.hardware.hardware.get_device", return_value = DeviceType.CPU):
             with self.assertRaises(ValueError) as exc_info:
                 prepare_gpu_selection([0], model_name = "unsloth/test")
 
-        self.assertIn("only supported on CUDA devices", str(exc_info.exception))
+        self.assertIn(
+            "only supported on CUDA and Intel XPU", str(exc_info.exception)
+        )
 
     def test_inference_route_rejects_gpu_ids_for_gguf(self):
         inference_route = _load_route_module(
@@ -1089,15 +1092,66 @@ def test_auto_select_falls_back_when_estimate_unavailable(self):
         self.assertEqual(metadata["selection_mode"], "fallback_all")
 
 
-class TestXpuRejection(_GpuCacheResetMixin, unittest.TestCase):
-    def test_auto_select_returns_non_cuda_for_xpu(self):
-        with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU):
+class TestXpuSelection(_GpuCacheResetMixin, unittest.TestCase):
+    def test_auto_select_supports_xpu(self):
+        with (
+            patch(
+                "utils.hardware.hardware.get_device", return_value = DeviceType.XPU
+            ),
+            patch(
+                "utils.hardware.hardware.estimate_required_model_memory_gb",
+                return_value = (1.0, {}),
+            ),
+            patch(
+                "utils.hardware.hardware.get_visible_gpu_utilization",
+                return_value = {
+                    "devices": [
+                        {"index": 0, "vram_total_gb": 8, "vram_used_gb": 1},
+                    ]
+                },
+            ),
+            patch(
+                "utils.hardware.hardware._get_parent_visible_gpu_spec",
+                return_value = {
+                    "raw": None,
+                    "numeric_ids": [0],
+                    "supports_explicit_gpu_ids": True,
+                },
+            ),
+            patch(
+                "utils.hardware.hardware.get_parent_visible_gpu_ids",
+                return_value = [0],
+            ),
+        ):
             selected, metadata = auto_select_gpu_ids("unsloth/test")
 
-        self.assertIsNone(selected)
-        self.assertEqual(metadata["selection_mode"], "non_cuda")
+        self.assertEqual(selected, [0])
+        self.assertEqual(metadata["selection_mode"], "auto")
 
-    def test_prepare_gpu_selection_rejects_explicit_ids_on_xpu(self):
-        with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU):
-            with self.assertRaisesRegex(ValueError, "only supported on CUDA"):
-                prepare_gpu_selection([0], model_name = "unsloth/test")
+    def test_prepare_gpu_selection_accepts_explicit_ids_on_xpu(self):
+        with (
+            patch(
+                "utils.hardware.hardware.get_device", return_value = DeviceType.XPU
+            ),
+            patch(
+                "utils.hardware.hardware._get_parent_visible_gpu_spec",
+                return_value = {
+                    "raw": "0",
+                    "numeric_ids": [0],
+                    "supports_explicit_gpu_ids": True,
+                },
+            ),
+            patch(
+                "utils.hardware.hardware.get_parent_visible_gpu_ids",
+                return_value = [0],
+            ),
+            patch(
+                "utils.hardware.hardware.get_physical_gpu_count", return_value = 1
+            ),
+        ):
+            selected, metadata = prepare_gpu_selection(
+                [0], model_name = "unsloth/test"
+            )
+
+        self.assertEqual(selected, [0])
+        self.assertEqual(metadata["selection_mode"], "explicit")
@@ -302,14 +302,14 @@ def test_two_gpus_needed(self):
             # 35GB (first) + 30*0.85 (second) = 60.5GB > 50GB
             self.assertEqual(len(selected), 2)
 
-    def test_non_cuda_returns_none(self):
+    def test_non_accelerator_returns_none(self):
         from utils.hardware.hardware import auto_select_gpu_ids
         import utils.hardware.hardware as hw
 
         with patch.object(hw, "get_device", return_value = hw.DeviceType.CPU):
             selected, meta = auto_select_gpu_ids("test/model")
             self.assertIsNone(selected)
-            self.assertEqual(meta["selection_mode"], "non_cuda")
+            self.assertEqual(meta["selection_mode"], "non_accelerator")
 
 
 class TestGetDeviceMap(unittest.TestCase):

diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py
@@ -29,6 +29,7 @@
     estimate_required_model_memory_gb,
     auto_select_gpu_ids,
     prepare_gpu_selection,
+    get_torch_device_str,
     safe_num_proc,
     safe_thread_num_proc,
     dataset_map_num_proc,
@@ -70,6 +71,7 @@
     "estimate_required_model_memory_gb",
     "auto_select_gpu_ids",
     "prepare_gpu_selection",
+    "get_torch_device_str",
     "safe_num_proc",
     "safe_thread_num_proc",
     "dataset_map_num_proc",