unslothai · LeoBorcherding · Apr 11, 2026 · Apr 11, 2026 · Apr 15, 2026 · Apr 16, 2026
diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
@@ -10,6 +10,7 @@
 from transformers import TextStreamer
 from peft import PeftModel, PeftModelForCausalLM
 
+import contextlib
 import json
 import sys
 import torch
@@ -1670,8 +1671,30 @@ def _generate_dac(
             + text
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
+
         with torch.inference_mode():
-            with torch.amp.autocast("cuda", dtype = model.dtype):
+            # Derive the autocast device from the loaded model, not from the
+            # global backend: a CPU-fallback DAC on an XPU/CUDA host must not
+            # open a GPU autocast context around CPU tensors.
+            device_type = (
+                model.device.type
+                if hasattr(model.device, "type")
+                else str(model.device).split(":", 1)[0]
+            )
+            # Clamp to autocast-supported backends so exotic devices
+            # (e.g. "meta" during accelerate offloaded loading) do not raise.
+            # MPS is autocast-supported since torch 2.3, keep it in the set.
+            if device_type not in ("cuda", "xpu", "mps", "cpu"):
+                device_type = "cpu"
+            # CPU and XPU autocast only accept bfloat16/float16. For a
+            # float32 model, skip autocast entirely to avoid raising or
+            # producing a warning on every generate call.
+            autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16)
+            if device_type in ("cpu", "xpu") and not autocast_dtype_supported:
+                autocast_ctx = contextlib.nullcontext()
+            else:
+                autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype)
+            with autocast_ctx:
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
                     **inputs,

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -55,6 +55,7 @@
     RENDER_HTML_REPEAT_NUDGE,
     parse_tool_calls_from_text as _shared_parse_tool_calls_from_text,
 )
+from utils.hardware import clear_gpu_cache
 
 logger = get_logger(__name__)
 
@@ -1245,23 +1246,28 @@ def _amd_apu_wants_unified_memory() -> bool:
 
     @staticmethod
     def _get_gpu_free_memory() -> list[tuple[int, int]]:
-        """Query free memory per GPU.
-
-        Order:
-          1. ``nvidia-smi`` (NVIDIA CUDA hosts) -- respects
-             ``CUDA_VISIBLE_DEVICES``.
-          2. ``torch.cuda.mem_get_info`` -- universal fallback that
-             works on AMD ROCm too because the HIP runtime
-             reuses the entire ``torch.cuda.*`` namespace. Covers the
-             AMD case for issue #5106 (nvidia-smi-only probe silently
-             returned [] on AMD hosts) and also rescues NVIDIA hosts
-             where ``nvidia-smi`` is missing from PATH.
-
-        Returns list of (gpu_index, free_mib) sorted by index. Empty
-        list if no supported GPU is reachable.
+        """Query free memory per visible GPU, backend-aware.
+
+        Returns list of ``(gpu_index, free_mib)`` sorted by index. The index
+        space matches whatever the active backend exposes: physical
+        ``nvidia-smi`` indices on NVIDIA; parent-visible numeric IDs on
+        AMD/ROCm and Intel XPU (via Studio's hardware telemetry layer).
+        Returns an empty list if no per-GPU free-memory data is available,
+        which lets the caller fall through to a non-placement launch path.
         """
-        # ── NVIDIA via nvidia-smi ────────────────────────────────────
+        import os
+
+        from utils.hardware import get_device
+        from utils.hardware.hardware import DeviceType
+        import utils.hardware.hardware as _hw_mod
+
+        # Fast path: NVIDIA / nvidia-smi. Skip it only when the backend is known
+        # to be XPU or ROCm; CUDA, CPU-only, and undetected hosts still try it.
+        _detected = get_device()
+        nvidia_eligible = _detected != DeviceType.XPU and not getattr(_hw_mod, "IS_ROCM", False)
         try:
+            if not nvidia_eligible:
+                raise FileNotFoundError  # skip to generic telemetry path
             result = subprocess.run(
                 [
                     "nvidia-smi",
@@ -1275,7 +1281,9 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
                 **_windows_hidden_subprocess_kwargs(),
             )
             if result.returncode == 0:
-                allowed: Optional[set[int]] = None
+                # Filter nvidia-smi output by CUDA_VISIBLE_DEVICES.
+                # Skip empty tokens so trailing commas don't disable the filter.
+                allowed = None
                 cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
                 if cvd is not None:
                     try:
@@ -1286,8 +1294,9 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
                         # filtered out, matching the codebase convention.
                         allowed = set(int(x.strip()) for x in cvd.split(",") if x.strip())
                     except ValueError:
-                        pass
-                gpus: list[tuple[int, int]] = []
+                        pass  # Non-numeric (e.g., "GPU-uuid"), ignore filter
+
+                gpus = []
                 for line in result.stdout.strip().splitlines():
                     parts = line.split(",")
                     if len(parts) == 2:
@@ -1296,71 +1305,55 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
                         if allowed is not None and idx not in allowed:
                             continue
                         gpus.append((idx, free_mib))
-                # Match the docstring's sort-by-id guarantee. nvidia-smi
-                # almost always returns sorted output, but driver order
-                # is not formally guaranteed.
-                gpus.sort(key = lambda g: g[0])
                 if gpus:
-                    return gpus
+                    return sorted(gpus, key = lambda item: item[0])
+        except FileNotFoundError:
+            pass  # nvidia-smi missing or deliberately skipped above — fall through to generic path
         except Exception as e:
-            logger.debug(f"nvidia-smi probe failed: {e}")
+            logger.debug(f"nvidia-smi free-memory query failed: {e}")
 
-        # ── Torch fallback (covers AMD ROCm and missing nvidia-smi) ──
+        # Generic path: ROCm, XPU, or nvidia-smi absent/failed.
         try:
-            import torch
-
-            if not hasattr(torch, "cuda") or not torch.cuda.is_available():
-                return []
-            if not hasattr(torch.cuda, "mem_get_info"):
-                return []
-            # torch.cuda enumerates GPUs RELATIVE to the visibility mask.
-            # On NVIDIA builds the mask is CUDA_VISIBLE_DEVICES; on AMD
-            # ROCm builds it is HIP_VISIBLE_DEVICES (or ROCR_VISIBLE_DEVICES
-            # if HIP is unset). Downstream we feed these IDs back into the
-            # llama-server subprocess as CVD, so we must translate visible
-            # ordinals back to physical indices first; otherwise launching
-            # with ``CUDA_VISIBLE_DEVICES=2,3`` would get rewritten to
-            # ``CUDA_VISIBLE_DEVICES=0,1`` and target the wrong GPUs.
-            physical_ids: Optional[list[int]] = None
-            # Match the codebase convention in
-            # ``utils/hardware/hardware.py::_get_parent_visible_gpu_spec``:
-            # treat an explicitly empty mask (``HIP_VISIBLE_DEVICES=""``)
-            # as "set to no GPUs" rather than falling through to the next
-            # var. ``or`` would coerce empty string to falsy and silently
-            # promote the wrong source.
-            if getattr(torch.version, "hip", None) is not None:
-                hip_v = os.environ.get("HIP_VISIBLE_DEVICES")
-                rocr_v = os.environ.get("ROCR_VISIBLE_DEVICES")
-                cvd = (
-                    hip_v
-                    if hip_v is not None
-                    else rocr_v
-                    if rocr_v is not None
-                    else os.environ.get("CUDA_VISIBLE_DEVICES")
+            from utils.hardware import get_visible_gpu_utilization
+
+            utilization = get_visible_gpu_utilization()
+
+            # Relative ordinals are not safe to round-trip into
+            # visibility env vars. Return [] so llama-server inherits
+            # the parent's mask unchanged.
+            if utilization.get("index_kind") not in (None, "physical"):
+                logger.debug(
+                    "Skipping GPU placement: telemetry reports index_kind=%r "
+                    "(not reusable for placement)",
+                    utilization.get("index_kind"),
                 )
-            else:
-                cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
-            if cvd is not None:
-                try:
-                    # Empty mask (CVD="") yields an empty list so the
-                    # below loop produces no GPUs, consistent with the
-                    # nvidia-smi path and utils/hardware/hardware.py.
-                    physical_ids = [int(x.strip()) for x in cvd.split(",") if x.strip()]
-                except ValueError:
-                    physical_ids = None
-            gpus = []
-            for ordinal in range(torch.cuda.device_count()):
-                free_bytes, _total_bytes = torch.cuda.mem_get_info(ordinal)
-                idx = (
-                    physical_ids[ordinal]
-                    if physical_ids is not None and ordinal < len(physical_ids)
-                    else ordinal
-                )
-                gpus.append((idx, free_bytes // (1024 * 1024)))
-            # Match the nvidia-smi path's docstring guarantee of sorted-by-id.
-            return sorted(gpus, key = lambda g: g[0])
+                return []
+
+            gpus: list[tuple[int, int]] = []
+            for device in utilization.get("devices", []) or []:
+                index = device.get("index")
+
+                # Use explicit ``is None`` checks -- ``or`` would treat an
+                # idle GPU with vram_used_gb == 0.0 as missing telemetry and
+                # silently drop a perfectly valid free card.
+                total_gb = device.get("vram_total_gb")
+                if total_gb is None:
+                    total_gb = device.get("total_gb")
+
+                used_gb = device.get("vram_used_gb")
+                if used_gb is None:
+                    used_gb = device.get("used_gb")
+
+                if index is None or total_gb is None or used_gb is None:
+                    # Missing telemetry for this device -- skip rather than
+                    # invent a free-memory number that drives placement.
+                    continue
+
+                free_mib = max(int((float(total_gb) - float(used_gb)) * 1024), 0)
+                gpus.append((int(index), free_mib))
+            return sorted(gpus, key = lambda item: item[0])
         except Exception as e:
-            logger.debug(f"torch GPU probe failed: {e}")
+            logger.debug(f"Generic GPU free-memory query failed: {e}")
             return []
 
     # Skip the wait when the last kill is older than this; the GPU
@@ -3849,10 +3842,7 @@ def unload_model(self) -> bool:
             if LlamaCppBackend._codec_mgr is not None:
                 LlamaCppBackend._codec_mgr.unload()
                 LlamaCppBackend._codec_mgr = None
-                import torch
-
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
+                clear_gpu_cache()
             return True
 
     def _kill_process(self):
@@ -5439,6 +5429,7 @@ def init_audio_codec(self, audio_type: str) -> None:
         if LlamaCppBackend._codec_mgr is None:
             LlamaCppBackend._codec_mgr = AudioCodecManager()
 
+        # Audio codecs are only validated on CUDA; stay on CPU otherwise.
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model_repo_path = None
 
@@ -5501,6 +5492,8 @@ def generate_audio_response(
             else None
         )
 
+        # Match init_audio_codec: stay on CPU for non-CUDA hosts until the
+        # codec path is validated on XPU.
         import torch
 
         device = "cuda" if torch.cuda.is_available() else "cpu"

diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
@@ -1505,6 +1505,10 @@ def _preprocess_snac_dataset(
 
         SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
         SNAC_SAMPLE_RATE = 24000
+
+        # The SNAC codec has not been validated on Intel XPU yet; keep the
+        # existing CPU fallback for non-CUDA hosts until an XPU-specific
+        # path is added.
         device = "cuda" if torch.cuda.is_available() else "cpu"
         max_length = self.max_seq_length or 2048
         tokenizer = self.tokenizer
@@ -1666,7 +1670,8 @@ def _preprocess_snac_dataset(
         del snac_model
 
         gc.collect()
-        torch.cuda.empty_cache()
+
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
@@ -1692,6 +1697,10 @@ def _preprocess_bicodec_dataset(
         import numpy as np
         import torchaudio.transforms as T
 
+        import subprocess
+
+        # Spark-TTS BiCodec has not been validated on Intel XPU; keep the
+        # existing CPU fallback for non-CUDA hosts.
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
@@ -1880,7 +1889,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
         del audio_tokenizer
 
         gc.collect()
-        torch.cuda.empty_cache()
+
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
@@ -1916,6 +1926,8 @@ def _preprocess_dac_dataset(
         from datasets import Dataset as HFDataset
         from utils.paths import ensure_dir, tmp_root
 
+        # OuteTTS DAC/Whisper preprocess has not been validated on Intel
+        # XPU; keep the existing CPU fallback for non-CUDA hosts.
         device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # Clone OuteTTS repo (same as audio_codecs._load_dac)
@@ -2087,7 +2099,8 @@ def _preprocess_dac_dataset(
         del prompt_processor
 
         gc.collect()
-        torch.cuda.empty_cache()
+
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:

diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
@@ -62,7 +62,15 @@ def normalize_blank_chat_template_override(cls, value: Optional[str]) -> Optiona
     )
     gpu_ids: Optional[List[int]] = Field(
         None,
-        description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries. Not supported for GGUF models.",
+        description = (
+            "Physical GPU indices to use, for example [0, 1]. Omit or pass "
+            "[] to use automatic selection. Explicit gpu_ids are unsupported "
+            "when the parent visibility mask uses non-numeric or subdevice "
+            "entries -- this includes CUDA_VISIBLE_DEVICES with UUID/MIG "
+            "entries on NVIDIA, and ZE_AFFINITY_MASK with subdevice tokens "
+            "(e.g. '0.0,0.1') or FLAT-hierarchy (default) tile handles on "
+            "Intel XPU. Not supported for GGUF models."
+        ),
     )
     speculative_type: Optional[str] = Field(
         None,

diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py
@@ -337,7 +337,15 @@ def _check_lora_dropout(cls, v: float) -> float:
     # GPU selection
     gpu_ids: Optional[List[int]] = Field(
         None,
-        description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries.",
+        description = (
+            "Physical GPU indices to use, for example [0, 1]. Omit or pass "
+            "[] to use automatic selection. Explicit gpu_ids are unsupported "
+            "when the parent visibility mask uses non-numeric or subdevice "
+            "entries -- this includes CUDA_VISIBLE_DEVICES with UUID/MIG "
+            "entries on NVIDIA, and ZE_AFFINITY_MASK with subdevice tokens "
+            "(e.g. '0.0,0.1') or FLAT-hierarchy (default) tile handles on "
+            "Intel XPU."
+        ),
     )
 
     @model_validator(mode = "after")