From 9db527a717e74f1cac27bec778392a2e976484b1 Mon Sep 17 00:00:00 2001
From: leizhenyuan <zhenyuan.lei@intel.com>
Date: Tue, 31 Mar 2026 10:46:23 +0000
Subject: [PATCH 01/18] Add Intel GPU (XPU) support to Unsloth Studio

---
 studio/backend/core/inference/inference.py |   3 +-
 studio/backend/core/inference/llama_cpp.py |  10 +-
 studio/backend/core/training/trainer.py    |  18 ++-
 studio/backend/main.py                     |  59 +++++++
 studio/backend/utils/hardware/__init__.py  |   8 +
 studio/backend/utils/hardware/hardware.py  | 171 ++++++++++++++++++++-
 studio/backend/utils/utils.py              |   3 +-
 unsloth/import_fixes.py                    |  22 ++-
 unsloth/models/rl.py                       |   4 +
 9 files changed, 273 insertions(+), 25 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index 867bdefc62..085f01a194 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -1643,7 +1643,8 @@ def _generate_dac(
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
         with torch.inference_mode():
-            with torch.amp.autocast("cuda", dtype = model.dtype):
+            from utils.hardware import get_torch_device_str
+            with torch.amp.autocast(get_torch_device_str(), dtype = model.dtype):
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
                     **inputs,
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index c1f87ff936..19a169425f 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1438,8 +1438,8 @@ def unload_model(self) -> bool:
                 LlamaCppBackend._codec_mgr = None
                 import torch
 
-                if torch.cuda.is_available():
-                    torch.cuda.empty_cache()
+                from utils.hardware import clear_gpu_cache
+                clear_gpu_cache()
             return True
 
     def _kill_process(self):
@@ -3016,7 +3016,8 @@ def init_audio_codec(self, audio_type: str) -> None:
         if LlamaCppBackend._codec_mgr is None:
             LlamaCppBackend._codec_mgr = AudioCodecManager()
 
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        from utils.hardware import get_torch_device_str
+        device = get_torch_device_str()
         model_repo_path = None
 
         # BiCodec needs a repo with BiCodec/ weights — download canonical SparkTTS
@@ -3090,7 +3091,8 @@ def generate_audio_response(
 
         import torch
 
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        from utils.hardware import get_torch_device_str
+        device = get_torch_device_str()
         return LlamaCppBackend._codec_mgr.decode(
             audio_type, device, token_ids = token_ids, text = data.get("content", "")
         )
diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index ab1825d94a..f25e848c8f 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -1532,7 +1532,8 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
 
         SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
         SNAC_SAMPLE_RATE = 24000
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        from utils.hardware import get_torch_device_str
+        device = get_torch_device_str()
         max_length = self.max_seq_length or 2048
         tokenizer = self.tokenizer
 
@@ -1708,7 +1709,8 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
         import gc
 
         gc.collect()
-        torch.cuda.empty_cache()
+        from utils.hardware import clear_gpu_cache
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
@@ -1736,7 +1738,8 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):
 
         import subprocess
 
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        from utils.hardware import get_torch_device_str
+        device = get_torch_device_str()
 
         # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
         # NOT in the unsloth/Spark-TTS-0.5B HF model repo. Clone it if needed.
@@ -1936,7 +1939,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
         import gc
 
         gc.collect()
-        torch.cuda.empty_cache()
+        from utils.hardware import clear_gpu_cache
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
@@ -1971,7 +1975,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         from datasets import Dataset as HFDataset
         from utils.paths import ensure_dir, tmp_root
 
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        from utils.hardware import get_torch_device_str
+        device = get_torch_device_str()
 
         # Clone OuteTTS repo (same as audio_codecs._load_dac)
         import subprocess
@@ -2149,7 +2154,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         import gc
 
         gc.collect()
-        torch.cuda.empty_cache()
+        from utils.hardware import clear_gpu_cache
+        clear_gpu_cache()
         self._cuda_audio_used = True
 
         if not processed_examples:
diff --git a/studio/backend/main.py b/studio/backend/main.py
index c18f18a743..99358ae9de 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -238,11 +238,70 @@ async def get_system_info():
     import psutil
     from utils.hardware import get_device
 
+<<<<<<< Updated upstream
     visibility_info = get_backend_visible_gpu_info()
     gpu_info = {
         "available": visibility_info["available"],
         "devices": visibility_info["devices"],
     }
+=======
+        try:
+            result = subprocess.run(
+                [
+                    "nvidia-smi",
+                    "--query-gpu=index,name,memory.total",
+                    "--format=csv,noheader,nounits",
+                ],
+                capture_output = True,
+                text = True,
+                timeout = 10,
+            )
+            if result.returncode == 0:
+                for line in result.stdout.strip().splitlines():
+                    parts = [p.strip() for p in line.split(",")]
+                    if len(parts) == 3:
+                        idx = int(parts[0])
+                        if allowed_indices is not None and idx not in allowed_indices:
+                            continue
+                        gpu_info["devices"].append(
+                            {
+                                "index": idx,
+                                "name": parts[1],
+                                "memory_total_gb": round(int(parts[2]) / 1024, 2),
+                            }
+                        )
+                gpu_info["available"] = len(gpu_info["devices"]) > 0
+        except Exception:
+            pass
+    elif device == DeviceType.XPU:
+        try:
+            import torch
+            for i in range(torch.xpu.device_count()):
+                props = torch.xpu.get_device_properties(i)
+                gpu_info["devices"].append(
+                    {
+                        "index": i,
+                        "name": props.name,
+                        "memory_total_gb": round(props.total_memory / (1024**3), 2),
+                    }
+                )
+            gpu_info["available"] = len(gpu_info["devices"]) > 0
+        except Exception:
+            pass
+
+    # Fallback to torch-based single-GPU detection
+    if not gpu_info["available"]:
+        mem_info = get_gpu_memory_info()
+        if mem_info.get("available"):
+            gpu_info["available"] = True
+            gpu_info["devices"].append(
+                {
+                    "index": mem_info.get("device", 0),
+                    "name": mem_info.get("device_name", "Unknown"),
+                    "memory_total_gb": round(mem_info.get("total_gb", 0), 2),
+                }
+            )
+>>>>>>> Stashed changes
 
     # CPU & Memory
     memory = psutil.virtual_memory()
diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py
index aaa0452406..981c0d2453 100644
--- a/studio/backend/utils/hardware/__init__.py
+++ b/studio/backend/utils/hardware/__init__.py
@@ -22,12 +22,16 @@
     get_backend_visible_gpu_info,
     get_physical_gpu_count,
     get_visible_gpu_count,
+<<<<<<< Updated upstream
     get_parent_visible_gpu_ids,
     resolve_requested_gpu_ids,
     estimate_fp16_model_size_bytes,
     estimate_required_model_memory_gb,
     auto_select_gpu_ids,
     prepare_gpu_selection,
+=======
+    get_torch_device_str,
+>>>>>>> Stashed changes
     safe_num_proc,
     safe_thread_num_proc,
     dataset_map_num_proc,
@@ -62,12 +66,16 @@
     "get_backend_visible_gpu_info",
     "get_physical_gpu_count",
     "get_visible_gpu_count",
+<<<<<<< Updated upstream
     "get_parent_visible_gpu_ids",
     "resolve_requested_gpu_ids",
     "estimate_fp16_model_size_bytes",
     "estimate_required_model_memory_gb",
     "auto_select_gpu_ids",
     "prepare_gpu_selection",
+=======
+    "get_torch_device_str",
+>>>>>>> Stashed changes
     "safe_num_proc",
     "safe_thread_num_proc",
     "dataset_map_num_proc",
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index b6d3faf6d7..af5fbc51bb 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -42,7 +42,7 @@ class DeviceType(str, Enum):
 # ========== Global State (set once by detect_hardware) ==========
 
 DEVICE: Optional[DeviceType] = None
-CHAT_ONLY: bool = True  # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.)
+CHAT_ONLY: bool = True  # No CUDA/XPU GPU -> GGUF chat only (Mac, CPU-only, etc.)
 
 
 # ========== Detection ==========
@@ -82,16 +82,17 @@ def detect_hardware() -> DeviceType:
 
     Detection order:
       1. CUDA  (NVIDIA GPU, requires torch)
-      2. MLX   (Apple Silicon via MLX framework)
-      3. CPU   (fallback)
+      2. XPU   (Intel GPU, requires torch with XPU support)
+      3. MLX   (Apple Silicon via MLX framework)
+      4. CPU   (fallback)
     """
     global DEVICE, CHAT_ONLY
-    CHAT_ONLY = True  # reset -- only CUDA sets it to False
+    CHAT_ONLY = True  # reset -- only CUDA/XPU sets it to False
 
-    # --- CUDA: try PyTorch ---
     if _has_torch():
         import torch
 
+        # --- CUDA: NVIDIA GPU ---
         if torch.cuda.is_available():
             DEVICE = DeviceType.CUDA
             CHAT_ONLY = False
@@ -99,10 +100,14 @@ def detect_hardware() -> DeviceType:
             print(f"Hardware detected: CUDA — {device_name}")
             return DEVICE
 
+<<<<<<< Updated upstream
     # --- XPU: Intel GPU ---
     if _has_torch():
         import torch
 
+=======
+        # --- XPU: Intel GPU ---
+>>>>>>> Stashed changes
         if hasattr(torch, "xpu") and torch.xpu.is_available():
             DEVICE = DeviceType.XPU
             CHAT_ONLY = False
@@ -223,7 +228,11 @@ def get_gpu_memory_info() -> Dict[str, Any]:
                 "utilization_pct": (allocated / total) * 100,
             }
         except Exception as e:
+<<<<<<< Updated upstream
             logger.error("Error getting XPU GPU info: %s", e)
+=======
+            logger.error(f"Error getting XPU GPU info: {e}")
+>>>>>>> Stashed changes
             return {"available": False, "backend": device.value, "error": str(e)}
 
     # ---- MLX path (Apple Silicon) ----
@@ -315,18 +324,96 @@ def get_package_versions() -> Dict[str, Optional[str]]:
         except PackageNotFoundError:
             versions[name] = None
 
-    # CUDA toolkit version bundled with torch
+    # CUDA/XPU toolkit version bundled with torch
     try:
         import torch
 
         versions["cuda"] = getattr(torch.version, "cuda", None)
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            versions["xpu"] = True
     except Exception:
         versions["cuda"] = None
 
     return versions
 
 
+<<<<<<< Updated upstream
 # ========== Torch-based GPU fallbacks (AMD ROCm, Intel XPU, nvidia-smi missing) ==========
+=======
+# ========== Live GPU Utilization ==========
+
+
+def _get_xpu_utilization() -> Dict[str, Any]:
+    """Return a live snapshot of Intel XPU utilization (best-effort).
+
+    Utilization/temperature come from one ``xpu-smi dump`` sample of device
+    0; VRAM comes from ``torch.xpu``. A failed probe leaves its fields ``None``.
+    """
+    gpu_util = None
+    temp = None
+    try:
+        import subprocess
+
+        # "-n 1" makes xpu-smi exit after one sample (without it the dump
+        # streams until killed and always hits the timeout). Metrics 0 and 3
+        # are utilization and core temperature: Timestamp, DeviceId, util, temp.
+        result = subprocess.run(
+            ["xpu-smi", "dump", "-d", "0", "-m", "0,3", "-n", "1"],
+            capture_output=True, text=True, timeout=5,
+        )
+        if result.returncode == 0:
+            for line in reversed(result.stdout.strip().splitlines()):
+                if line.startswith(("Timestamp", "#")):
+                    continue
+                parts = [p.strip() for p in line.split(",")]
+                if len(parts) >= 4:
+                    gpu_util = float(parts[2]) if parts[2] not in ("", "N/A") else None
+                    temp = float(parts[3]) if parts[3] not in ("", "N/A") else None
+                    break
+    except Exception as e:
+        logger.debug("xpu-smi utilization query failed: %s", e)
+        gpu_util = temp = None
+
+    # VRAM total and torch-allocated usage for the current XPU device.
+    vram_used_gb = None
+    vram_total_gb = None
+    try:
+        import torch
+
+        idx = torch.xpu.current_device()
+        props = torch.xpu.get_device_properties(idx)
+        vram_total_gb = round(props.total_memory / (1024**3), 2)
+        vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2)
+    except Exception as e:
+        logger.debug("torch.xpu memory query failed: %s", e)
+
+    vram_pct = (
+        round((vram_used_gb / vram_total_gb) * 100, 1)
+        if vram_used_gb is not None and vram_total_gb and vram_total_gb > 0
+        else None
+    )
+
+    if gpu_util is None and temp is None and vram_used_gb is None:
+        return {"available": False, "backend": "xpu"}
+
+    return {
+        "available": True,
+        "backend": "xpu",
+        "gpu_utilization_pct": gpu_util,
+        "temperature_c": temp,
+        "vram_used_gb": vram_used_gb,
+        "vram_total_gb": vram_total_gb,
+        "vram_utilization_pct": vram_pct,
+        "power_draw_w": None,
+        "power_limit_w": None,
+        "power_utilization_pct": None,
+    }
+
+
+def get_gpu_utilization() -> Dict[str, Any]:
+    """
+    Return a live snapshot of GPU utilization via ``nvidia-smi``.
+>>>>>>> Stashed changes
 
 
 def _torch_get_device_module():
@@ -334,11 +421,19 @@ def _torch_get_device_module():
     device = get_device()
     import torch
 
+<<<<<<< Updated upstream
     if device == DeviceType.CUDA:
         return torch.cuda, "cuda"
     if device == DeviceType.XPU and hasattr(torch, "xpu"):
         return torch.xpu, "xpu"
     return None, None
+=======
+    if device == DeviceType.XPU:
+        return _get_xpu_utilization()
+
+    if device != DeviceType.CUDA:
+        return {"available": False, "backend": device.value}
+>>>>>>> Stashed changes
 
 
 def _torch_get_physical_gpu_count() -> Optional[int]:
@@ -1097,8 +1192,12 @@ def get_physical_gpu_count() -> int:
     """
     Return the number of physical GPUs on the machine.
 
+<<<<<<< Updated upstream
     Uses ``nvidia-smi -L`` on NVIDIA (unaffected by CUDA_VISIBLE_DEVICES),
     with a torch-based fallback for AMD ROCm and Intel XPU.
+=======
+    For NVIDIA uses ``nvidia-smi -L``; for Intel XPU uses ``torch.xpu.device_count()``.
+>>>>>>> Stashed changes
     Result is cached after the first call.
     """
     global _physical_gpu_count
@@ -1106,6 +1205,20 @@ def get_physical_gpu_count() -> int:
         return _physical_gpu_count
 
     device = get_device()
+<<<<<<< Updated upstream
+=======
+
+    if device == DeviceType.XPU:
+        try:
+            import torch
+            _physical_gpu_count = torch.xpu.device_count()
+        except Exception:
+            _physical_gpu_count = 1
+        return _physical_gpu_count
+
+    try:
+        import subprocess
+>>>>>>> Stashed changes
 
     if device == DeviceType.CUDA:
         try:
@@ -1246,6 +1359,30 @@ def get_visible_gpu_count() -> int:
     if _visible_gpu_count is not None:
         return _visible_gpu_count
 
+<<<<<<< Updated upstream
+=======
+    # Check XPU visibility env var or CUDA_VISIBLE_DEVICES
+    import os
+
+    device = get_device()
+
+    if device == DeviceType.XPU:
+        xpu_visible = os.environ.get("ZE_AFFINITY_MASK")
+        if xpu_visible is not None:
+            xpu_visible = xpu_visible.strip()
+            if xpu_visible == "":
+                _visible_gpu_count = 0
+            else:
+                _visible_gpu_count = len([x for x in xpu_visible.split(",") if x.strip()])
+            return _visible_gpu_count
+        try:
+            import torch
+            _visible_gpu_count = torch.xpu.device_count()
+        except Exception:
+            _visible_gpu_count = get_physical_gpu_count()
+        return _visible_gpu_count
+
+>>>>>>> Stashed changes
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
     if cuda_visible is not None:
         # "" means zero GPUs, "0" means 1, "0,1,2" means 3
@@ -1270,6 +1407,7 @@ def get_visible_gpu_count() -> int:
     return _visible_gpu_count
 
 
+<<<<<<< Updated upstream
 def apply_gpu_ids(gpu_ids) -> None:
     if gpu_ids is None:
         return
@@ -1353,6 +1491,19 @@ def raise_if_offloaded(model, device_map: str, context: str = "Loading") -> None
         f"{context} does not support models loaded with CPU or disk offload. "
         f"device_map='{device_map}' produced offloaded modules: {example}"
     )
+=======
+def get_torch_device_str() -> str:
+    """
+    Return the torch device string for the detected hardware.
+    E.g. "cuda", "xpu", or "cpu".
+    """
+    device = get_device()
+    if device == DeviceType.CUDA:
+        return "cuda"
+    elif device == DeviceType.XPU:
+        return "xpu"
+    return "cpu"
+>>>>>>> Stashed changes
 
 
 def safe_num_proc(desired: Optional[int] = None) -> int:
@@ -1430,9 +1581,17 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]:
     Returns ``None`` on spawn-based platforms (Windows, macOS) because
     ``datasets`` treats ``num_proc=1`` as multiprocessing (creates ``Pool(1)``).
     Only ``num_proc=None`` guarantees in-process execution.
+
+    Also returns ``None`` on XPU devices because ``os.fork()`` corrupts the
+    Level-Zero GPU context, causing Triton kernel launches to fail with
+    "Pointer argument doesn't reference XPU device memory".
     """
     import sys
 
     if sys.platform in ("win32", "darwin"):
         return None
+
+    if get_device() == DeviceType.XPU:
+        return None
+
     return safe_num_proc(desired)
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index 4e61a5b969..a544bb0802 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -103,13 +103,14 @@ def format_error_message(error: Exception, model_name: str) -> str:
     if (
         "memory" in error_str
         or "cuda" in error_str
+        or "xpu" in error_str
         or "mlx" in error_str
         or "out of memory" in error_str
     ):
         from utils.hardware import get_device
 
         device = get_device()
-        device_label = {"cuda": "GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get(
+        device_label = {"cuda": "GPU", "xpu": "Intel GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get(
             device.value, "GPU"
         )
         return f"Not enough {device_label} memory to load '{model_short}'. Try a smaller model or free memory."
diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index ca44a0ce7e..195dc14d85 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -443,7 +443,9 @@ def fix_vllm_aimv2_issue():
 
 
 def fix_vllm_guided_decoding_params():
-    def _maybe_raise_vllm_transformers_mismatch(error):
+    def _maybe_disable_vllm_transformers_mismatch(error):
+        """If vLLM fails due to transformers version mismatch, disable it gracefully."""
+        global VLLM_BROKEN
         error_text = str(error)
         if (
             "ALLOWED_LAYER_TYPES" in error_text
@@ -453,13 +455,17 @@ def _maybe_raise_vllm_transformers_mismatch(error):
                 vllm_version = importlib_version("vllm")
             except Exception:
                 vllm_version = "unknown"
-            raise RuntimeError(
+            logger.warning(
                 "Unsloth: vLLM with version "
                 f"{vllm_version} does not yet support transformers>=5.0.0. "
-                "Please downgrade to transformers==4.57.3 via "
-                'pip install --force-reinstall "transformers==4.57.3". '
+                "Disabling vLLM and continuing without it. "
                 f"Original error: {error}"
-            ) from error
+            )
+            VLLM_BROKEN = True
+            _clear_vllm_modules()
+            _install_vllm_blocker()
+            return True
+        return False
 
     if importlib.util.find_spec("vllm") is None:
         return
@@ -469,7 +475,8 @@ def _maybe_raise_vllm_transformers_mismatch(error):
     try:
         import vllm
     except (ImportError, OSError) as e:
-        _maybe_raise_vllm_transformers_mismatch(e)
+        if _maybe_disable_vllm_transformers_mismatch(e):
+            return
         if disable_broken_vllm(e):
             return
         raise
@@ -477,7 +484,8 @@ def _maybe_raise_vllm_transformers_mismatch(error):
     try:
         from vllm.sampling_params import GuidedDecodingParams
     except (ImportError, OSError) as e:
-        _maybe_raise_vllm_transformers_mismatch(e)
+        if _maybe_disable_vllm_transformers_mismatch(e):
+            return
         if disable_broken_vllm(e):
             return
         if not hasattr(vllm, "sampling_params") or not hasattr(
diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py
index 5651a7da41..449419acf6 100755
--- a/unsloth/models/rl.py
+++ b/unsloth/models/rl.py
@@ -1202,6 +1202,10 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"):
             "        memory_gb_left = psutil.virtual_memory().available / (1024**3)\n"
             "        if memory_gb_left <= 2: dataset_num_proc = 1\n"
             "        else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left))\n"
+            "# XPU: forking corrupts Level-Zero context, force single process\n"
+            "import torch as _torch\n"
+            "if hasattr(_torch, 'xpu') and _torch.xpu.is_available():\n"
+            "    dataset_num_proc = 1\n"
         )
         extra_args += num_proc_check
 

From f1c426b2f1e42536f49c60c508c1e11f7984bbe5 Mon Sep 17 00:00:00 2001
From: leizhenyuan <zhenyuan.lei@intel.com>
Date: Tue, 31 Mar 2026 11:04:29 +0000
Subject: [PATCH 02/18] Remove leftover stash merge-conflict markers

---
 studio/backend/main.py                    |  59 --------
 studio/backend/utils/hardware/__init__.py |   6 -
 studio/backend/utils/hardware/hardware.py | 171 ++++++++--------------
 3 files changed, 61 insertions(+), 175 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 99358ae9de..c18f18a743 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -238,70 +238,11 @@ async def get_system_info():
     import psutil
     from utils.hardware import get_device
 
-<<<<<<< Updated upstream
     visibility_info = get_backend_visible_gpu_info()
     gpu_info = {
         "available": visibility_info["available"],
         "devices": visibility_info["devices"],
     }
-=======
-        try:
-            result = subprocess.run(
-                [
-                    "nvidia-smi",
-                    "--query-gpu=index,name,memory.total",
-                    "--format=csv,noheader,nounits",
-                ],
-                capture_output = True,
-                text = True,
-                timeout = 10,
-            )
-            if result.returncode == 0:
-                for line in result.stdout.strip().splitlines():
-                    parts = [p.strip() for p in line.split(",")]
-                    if len(parts) == 3:
-                        idx = int(parts[0])
-                        if allowed_indices is not None and idx not in allowed_indices:
-                            continue
-                        gpu_info["devices"].append(
-                            {
-                                "index": idx,
-                                "name": parts[1],
-                                "memory_total_gb": round(int(parts[2]) / 1024, 2),
-                            }
-                        )
-                gpu_info["available"] = len(gpu_info["devices"]) > 0
-        except Exception:
-            pass
-    elif device == DeviceType.XPU:
-        try:
-            import torch
-            for i in range(torch.xpu.device_count()):
-                props = torch.xpu.get_device_properties(i)
-                gpu_info["devices"].append(
-                    {
-                        "index": i,
-                        "name": props.name,
-                        "memory_total_gb": round(props.total_memory / (1024**3), 2),
-                    }
-                )
-            gpu_info["available"] = len(gpu_info["devices"]) > 0
-        except Exception:
-            pass
-
-    # Fallback to torch-based single-GPU detection
-    if not gpu_info["available"]:
-        mem_info = get_gpu_memory_info()
-        if mem_info.get("available"):
-            gpu_info["available"] = True
-            gpu_info["devices"].append(
-                {
-                    "index": mem_info.get("device", 0),
-                    "name": mem_info.get("device_name", "Unknown"),
-                    "memory_total_gb": round(mem_info.get("total_gb", 0), 2),
-                }
-            )
->>>>>>> Stashed changes
 
     # CPU & Memory
     memory = psutil.virtual_memory()
diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py
index 981c0d2453..df67052389 100644
--- a/studio/backend/utils/hardware/__init__.py
+++ b/studio/backend/utils/hardware/__init__.py
@@ -22,16 +22,13 @@
     get_backend_visible_gpu_info,
     get_physical_gpu_count,
     get_visible_gpu_count,
-<<<<<<< Updated upstream
     get_parent_visible_gpu_ids,
     resolve_requested_gpu_ids,
     estimate_fp16_model_size_bytes,
     estimate_required_model_memory_gb,
     auto_select_gpu_ids,
     prepare_gpu_selection,
-=======
     get_torch_device_str,
->>>>>>> Stashed changes
     safe_num_proc,
     safe_thread_num_proc,
     dataset_map_num_proc,
@@ -66,16 +63,13 @@
     "get_backend_visible_gpu_info",
     "get_physical_gpu_count",
     "get_visible_gpu_count",
-<<<<<<< Updated upstream
     "get_parent_visible_gpu_ids",
     "resolve_requested_gpu_ids",
     "estimate_fp16_model_size_bytes",
     "estimate_required_model_memory_gb",
     "auto_select_gpu_ids",
     "prepare_gpu_selection",
-=======
     "get_torch_device_str",
->>>>>>> Stashed changes
     "safe_num_proc",
     "safe_thread_num_proc",
     "dataset_map_num_proc",
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index af5fbc51bb..746dc17039 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -100,14 +100,7 @@ def detect_hardware() -> DeviceType:
             print(f"Hardware detected: CUDA — {device_name}")
             return DEVICE
 
-<<<<<<< Updated upstream
-    # --- XPU: Intel GPU ---
-    if _has_torch():
-        import torch
-
-=======
         # --- XPU: Intel GPU ---
->>>>>>> Stashed changes
         if hasattr(torch, "xpu") and torch.xpu.is_available():
             DEVICE = DeviceType.XPU
             CHAT_ONLY = False
@@ -228,11 +221,7 @@ def get_gpu_memory_info() -> Dict[str, Any]:
                 "utilization_pct": (allocated / total) * 100,
             }
         except Exception as e:
-<<<<<<< Updated upstream
             logger.error("Error getting XPU GPU info: %s", e)
-=======
-            logger.error(f"Error getting XPU GPU info: {e}")
->>>>>>> Stashed changes
             return {"available": False, "backend": device.value, "error": str(e)}
 
     # ---- MLX path (Apple Silicon) ----
@@ -337,9 +326,64 @@ def get_package_versions() -> Dict[str, Optional[str]]:
     return versions
 
 
-<<<<<<< Updated upstream
 # ========== Torch-based GPU fallbacks (AMD ROCm, Intel XPU, nvidia-smi missing) ==========
-=======
+
+
+def _torch_get_device_module():
+    """Return the appropriate torch device module (cuda or xpu) and its name."""
+    device = get_device()
+    import torch
+
+    if device == DeviceType.CUDA:
+        return torch.cuda, "cuda"
+    if device == DeviceType.XPU and hasattr(torch, "xpu"):
+        return torch.xpu, "xpu"
+    return None, None
+
+
+def _torch_get_physical_gpu_count() -> Optional[int]:
+    mod, _ = _torch_get_device_module()
+    if mod is None:
+        return None
+    try:
+        return mod.device_count()
+    except Exception:
+        return None
+
+
+def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]]:
+    """Query torch for per-GPU name, total VRAM, and used VRAM."""
+    mod, _ = _torch_get_device_module()
+    if mod is None:
+        return []
+
+    devices = []
+    for ordinal, phys_idx in enumerate(device_indices):
+        try:
+            # torch uses 0-based ordinals relative to CUDA_VISIBLE_DEVICES
+            props = mod.get_device_properties(ordinal)
+            total_bytes = props.total_memory
+            # Prefer mem_get_info (reports system-wide usage, not just this
+            # process) so auto-selection accounts for other GPU consumers.
+            if hasattr(mod, "mem_get_info"):
+                free_bytes, total_bytes = mod.mem_get_info(ordinal)
+                used_bytes = total_bytes - free_bytes
+            else:
+                used_bytes = mod.memory_allocated(ordinal)
+            devices.append(
+                {
+                    "index": phys_idx,
+                    "visible_ordinal": ordinal,
+                    "name": props.name,
+                    "total_gb": round(total_bytes / (1024**3), 2),
+                    "used_gb": round(used_bytes / (1024**3), 2),
+                }
+            )
+        except Exception as e:
+            logger.debug("torch device query failed for ordinal %d: %s", ordinal, e)
+    return devices
+
+
 # ========== Live GPU Utilization ==========
 
 
@@ -411,81 +455,12 @@ def _get_xpu_utilization() -> Dict[str, Any]:
 
 
 def get_gpu_utilization() -> Dict[str, Any]:
-    """
-    Return a live snapshot of GPU utilization via ``nvidia-smi``.
->>>>>>> Stashed changes
-
-
-def _torch_get_device_module():
-    """Return the appropriate torch device module (cuda or xpu) and its name."""
+    """Return a live snapshot of device utilization information."""
     device = get_device()
-    import torch
 
-<<<<<<< Updated upstream
-    if device == DeviceType.CUDA:
-        return torch.cuda, "cuda"
-    if device == DeviceType.XPU and hasattr(torch, "xpu"):
-        return torch.xpu, "xpu"
-    return None, None
-=======
     if device == DeviceType.XPU:
         return _get_xpu_utilization()
 
-    if device != DeviceType.CUDA:
-        return {"available": False, "backend": device.value}
->>>>>>> Stashed changes
-
-
-def _torch_get_physical_gpu_count() -> Optional[int]:
-    mod, _ = _torch_get_device_module()
-    if mod is None:
-        return None
-    try:
-        return mod.device_count()
-    except Exception:
-        return None
-
-
-def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]]:
-    """Query torch for per-GPU name, total VRAM, and used VRAM."""
-    mod, _ = _torch_get_device_module()
-    if mod is None:
-        return []
-
-    devices = []
-    for ordinal, phys_idx in enumerate(device_indices):
-        try:
-            # torch uses 0-based ordinals relative to CUDA_VISIBLE_DEVICES
-            props = mod.get_device_properties(ordinal)
-            total_bytes = props.total_memory
-            # Prefer mem_get_info (reports system-wide usage, not just this
-            # process) so auto-selection accounts for other GPU consumers.
-            if hasattr(mod, "mem_get_info"):
-                free_bytes, total_bytes = mod.mem_get_info(ordinal)
-                used_bytes = total_bytes - free_bytes
-            else:
-                used_bytes = mod.memory_allocated(ordinal)
-            devices.append(
-                {
-                    "index": phys_idx,
-                    "visible_ordinal": ordinal,
-                    "name": props.name,
-                    "total_gb": round(total_bytes / (1024**3), 2),
-                    "used_gb": round(used_bytes / (1024**3), 2),
-                }
-            )
-        except Exception as e:
-            logger.debug("torch device query failed for ordinal %d: %s", ordinal, e)
-    return devices
-
-
-# ========== Live GPU Utilization ==========
-
-
-def get_gpu_utilization() -> Dict[str, Any]:
-    """Return a live snapshot of device utilization information."""
-    device = get_device()
-
     if device == DeviceType.CUDA:
         try:
             from . import nvidia
@@ -1192,12 +1167,8 @@ def get_physical_gpu_count() -> int:
     """
     Return the number of physical GPUs on the machine.
 
-<<<<<<< Updated upstream
     Uses ``nvidia-smi -L`` on NVIDIA (unaffected by CUDA_VISIBLE_DEVICES),
     with a torch-based fallback for AMD ROCm and Intel XPU.
-=======
-    For NVIDIA uses ``nvidia-smi -L``; for Intel XPU uses ``torch.xpu.device_count()``.
->>>>>>> Stashed changes
     Result is cached after the first call.
     """
     global _physical_gpu_count
@@ -1205,20 +1176,6 @@ def get_physical_gpu_count() -> int:
         return _physical_gpu_count
 
     device = get_device()
-<<<<<<< Updated upstream
-=======
-
-    if device == DeviceType.XPU:
-        try:
-            import torch
-            _physical_gpu_count = torch.xpu.device_count()
-        except Exception:
-            _physical_gpu_count = 1
-        return _physical_gpu_count
-
-    try:
-        import subprocess
->>>>>>> Stashed changes
 
     if device == DeviceType.CUDA:
         try:
@@ -1359,11 +1316,7 @@ def get_visible_gpu_count() -> int:
     if _visible_gpu_count is not None:
         return _visible_gpu_count
 
-<<<<<<< Updated upstream
-=======
-    # Check XPU visibility env var or CUDA_VISIBLE_DEVICES
-    import os
-
+    # Check XPU visibility env var
     device = get_device()
 
     if device == DeviceType.XPU:
@@ -1382,7 +1335,6 @@ def get_visible_gpu_count() -> int:
             _visible_gpu_count = get_physical_gpu_count()
         return _visible_gpu_count
 
->>>>>>> Stashed changes
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
     if cuda_visible is not None:
         # "" means zero GPUs, "0" means 1, "0,1,2" means 3
@@ -1407,7 +1359,6 @@ def get_visible_gpu_count() -> int:
     return _visible_gpu_count
 
 
-<<<<<<< Updated upstream
 def apply_gpu_ids(gpu_ids) -> None:
     if gpu_ids is None:
         return
@@ -1491,7 +1442,8 @@ def raise_if_offloaded(model, device_map: str, context: str = "Loading") -> None
         f"{context} does not support models loaded with CPU or disk offload. "
         f"device_map='{device_map}' produced offloaded modules: {example}"
     )
-=======
+
+
 def get_torch_device_str() -> str:
     """
     Return the torch device string for the detected hardware.
@@ -1503,7 +1455,6 @@ def get_torch_device_str() -> str:
     elif device == DeviceType.XPU:
         return "xpu"
     return "cpu"
->>>>>>> Stashed changes
 
 
 def safe_num_proc(desired: Optional[int] = None) -> int:

From dc55c950bb6815325b4bbdfa1b6ebb49615f466e Mon Sep 17 00:00:00 2001
From: leizhenyuan <zhenyuan.lei@intel.com>
Date: Tue, 31 Mar 2026 11:13:54 +0000
Subject: [PATCH 03/18] remove unused code

---
 unsloth/import_fixes.py | 2 +-
 unsloth/models/rl.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index 195dc14d85..394e6809c4 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -1828,4 +1828,4 @@ def disable_broken_causal_conv1d():
     print(
         "Unsloth: Detected broken causal_conv1d binary; "
         "disabling causal_conv1d fast path and continuing import."
-    )
+    )
\ No newline at end of file
diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py
index 449419acf6..710e21943e 100755
--- a/unsloth/models/rl.py
+++ b/unsloth/models/rl.py
@@ -1989,4 +1989,4 @@ def PatchFastRL(algorithm = None, FastLanguageModel = None):
     patch_trl_openenv()
     patch_trl_vllm_generation()
     if type(algorithm) is str and algorithm.islower():
-        PatchRLStatistics(algorithm)
+        PatchRLStatistics(algorithm)
\ No newline at end of file

From 5487a1b59ba40a7c1a28a00c791a0df7fff4d185 Mon Sep 17 00:00:00 2001
From: leizhenyuan <zhenyuan.lei@intel.com>
Date: Tue, 31 Mar 2026 11:15:57 +0000
Subject: [PATCH 04/18] remove rl changes

---
 unsloth/models/rl.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py
index 710e21943e..5651a7da41 100755
--- a/unsloth/models/rl.py
+++ b/unsloth/models/rl.py
@@ -1202,10 +1202,6 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"):
             "        memory_gb_left = psutil.virtual_memory().available / (1024**3)\n"
             "        if memory_gb_left <= 2: dataset_num_proc = 1\n"
             "        else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left))\n"
-            "# XPU: forking corrupts Level-Zero context, force single process\n"
-            "import torch as _torch\n"
-            "if hasattr(_torch, 'xpu') and _torch.xpu.is_available():\n"
-            "    dataset_num_proc = 1\n"
         )
         extra_args += num_proc_check
 
@@ -1989,4 +1985,4 @@ def PatchFastRL(algorithm = None, FastLanguageModel = None):
     patch_trl_openenv()
     patch_trl_vllm_generation()
     if type(algorithm) is str and algorithm.islower():
-        PatchRLStatistics(algorithm)
\ No newline at end of file
+        PatchRLStatistics(algorithm)

From 72eced43f69adc151e68a4f04f1ddf808f1207bc Mon Sep 17 00:00:00 2001
From: leizhenyuan <zhenyuan.lei@intel.com>
Date: Tue, 31 Mar 2026 11:18:37 +0000
Subject: [PATCH 05/18] remove unused code

---
 unsloth/import_fixes.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index 394e6809c4..3e7c2069a2 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -443,9 +443,7 @@ def fix_vllm_aimv2_issue():
 
 
 def fix_vllm_guided_decoding_params():
-    def _maybe_disable_vllm_transformers_mismatch(error):
-        """If vLLM fails due to transformers version mismatch, disable it gracefully."""
-        global VLLM_BROKEN
+    def _maybe_raise_vllm_transformers_mismatch(error):
         error_text = str(error)
         if (
             "ALLOWED_LAYER_TYPES" in error_text
@@ -455,17 +453,13 @@ def _maybe_disable_vllm_transformers_mismatch(error):
                 vllm_version = importlib_version("vllm")
             except Exception:
                 vllm_version = "unknown"
-            logger.warning(
+            raise RuntimeError(
                 "Unsloth: vLLM with version "
                 f"{vllm_version} does not yet support transformers>=5.0.0. "
-                "Disabling vLLM and continuing without it. "
+                "Please downgrade to transformers==4.57.3 via "
+                'pip install --force-reinstall "transformers==4.57.3". '
                 f"Original error: {error}"
-            )
-            VLLM_BROKEN = True
-            _clear_vllm_modules()
-            _install_vllm_blocker()
-            return True
-        return False
+            ) from error
 
     if importlib.util.find_spec("vllm") is None:
         return
@@ -475,8 +469,7 @@ def _maybe_disable_vllm_transformers_mismatch(error):
     try:
         import vllm
     except (ImportError, OSError) as e:
-        if _maybe_disable_vllm_transformers_mismatch(e):
-            return
+        _maybe_raise_vllm_transformers_mismatch(e):
         if disable_broken_vllm(e):
             return
         raise
@@ -484,8 +477,7 @@ def _maybe_disable_vllm_transformers_mismatch(error):
     try:
         from vllm.sampling_params import GuidedDecodingParams
     except (ImportError, OSError) as e:
-        if _maybe_disable_vllm_transformers_mismatch(e):
-            return
+        _maybe_raise_vllm_transformers_mismatch(e)
         if disable_broken_vllm(e):
             return
         if not hasattr(vllm, "sampling_params") or not hasattr(
@@ -1828,4 +1820,4 @@ def disable_broken_causal_conv1d():
     print(
         "Unsloth: Detected broken causal_conv1d binary; "
         "disabling causal_conv1d fast path and continuing import."
-    )
\ No newline at end of file
+    )

From f679dcae7fc5c9a8a19a6440908b0b64005cb2d4 Mon Sep 17 00:00:00 2001
From: leizhenyuan <zhenyuan.lei@intel.com>
Date: Tue, 31 Mar 2026 11:19:16 +0000
Subject: [PATCH 06/18] remove unused code

---
 unsloth/import_fixes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py
index 3e7c2069a2..ca44a0ce7e 100644
--- a/unsloth/import_fixes.py
+++ b/unsloth/import_fixes.py
@@ -469,7 +469,7 @@ def _maybe_raise_vllm_transformers_mismatch(error):
     try:
         import vllm
     except (ImportError, OSError) as e:
-        _maybe_raise_vllm_transformers_mismatch(e):
+        _maybe_raise_vllm_transformers_mismatch(e)
         if disable_broken_vllm(e):
             return
         raise

From 8dd4290132b6a2894c82e1ded0f3b917f9bf3c95 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 31 Mar 2026 11:19:34 +0000
Subject: [PATCH 07/18] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/inference.py | 1 +
 studio/backend/core/inference/llama_cpp.py | 3 +++
 studio/backend/core/training/trainer.py    | 6 ++++++
 studio/backend/utils/hardware/hardware.py  | 9 +++++++--
 studio/backend/utils/utils.py              | 9 ++++++---
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index 085f01a194..e79d1fa78f 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -1644,6 +1644,7 @@ def _generate_dac(
         )
         with torch.inference_mode():
             from utils.hardware import get_torch_device_str
+
             with torch.amp.autocast(get_torch_device_str(), dtype = model.dtype):
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 19a169425f..14173984c5 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1439,6 +1439,7 @@ def unload_model(self) -> bool:
                 import torch
 
                 from utils.hardware import clear_gpu_cache
+
                 clear_gpu_cache()
             return True
 
@@ -3017,6 +3018,7 @@ def init_audio_codec(self, audio_type: str) -> None:
             LlamaCppBackend._codec_mgr = AudioCodecManager()
 
         from utils.hardware import get_torch_device_str
+
         device = get_torch_device_str()
         model_repo_path = None
 
@@ -3092,6 +3094,7 @@ def generate_audio_response(
         import torch
 
         from utils.hardware import get_torch_device_str
+
         device = get_torch_device_str()
         return LlamaCppBackend._codec_mgr.decode(
             audio_type, device, token_ids = token_ids, text = data.get("content", "")
diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index f25e848c8f..8f76e71c7c 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -1533,6 +1533,7 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
         SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
         SNAC_SAMPLE_RATE = 24000
         from utils.hardware import get_torch_device_str
+
         device = get_torch_device_str()
         max_length = self.max_seq_length or 2048
         tokenizer = self.tokenizer
@@ -1710,6 +1711,7 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
 
         gc.collect()
         from utils.hardware import clear_gpu_cache
+
         clear_gpu_cache()
         self._cuda_audio_used = True
 
@@ -1739,6 +1741,7 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):
         import subprocess
 
         from utils.hardware import get_torch_device_str
+
         device = get_torch_device_str()
 
         # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
@@ -1940,6 +1943,7 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
 
         gc.collect()
         from utils.hardware import clear_gpu_cache
+
         clear_gpu_cache()
         self._cuda_audio_used = True
 
@@ -1976,6 +1980,7 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         from utils.paths import ensure_dir, tmp_root
 
         from utils.hardware import get_torch_device_str
+
         device = get_torch_device_str()
 
         # Clone OuteTTS repo (same as audio_codecs._load_dac)
@@ -2155,6 +2160,7 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
 
         gc.collect()
         from utils.hardware import clear_gpu_cache
+
         clear_gpu_cache()
         self._cuda_audio_used = True
 
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 746dc17039..35826979fd 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -394,7 +394,9 @@ def _get_xpu_utilization() -> Dict[str, Any]:
 
         result = subprocess.run(
             ["xpu-smi", "dump", "-d", "0", "-m", "0,1,2,18"],
-            capture_output=True, text=True, timeout=5,
+            capture_output = True,
+            text = True,
+            timeout = 5,
         )
         if result.returncode == 0 and result.stdout.strip():
             # xpu-smi dump outputs CSV: Timestamp, DeviceId, GPU Utilization (%), ...
@@ -1326,10 +1328,13 @@ def get_visible_gpu_count() -> int:
             if xpu_visible == "":
                 _visible_gpu_count = 0
             else:
-                _visible_gpu_count = len([x for x in xpu_visible.split(",") if x.strip()])
+                _visible_gpu_count = len(
+                    [x for x in xpu_visible.split(",") if x.strip()]
+                )
             return _visible_gpu_count
         try:
             import torch
+
             _visible_gpu_count = torch.xpu.device_count()
         except Exception:
             _visible_gpu_count = get_physical_gpu_count()
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index a544bb0802..290b5ad92e 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -110,9 +110,12 @@ def format_error_message(error: Exception, model_name: str) -> str:
         from utils.hardware import get_device
 
         device = get_device()
-        device_label = {"cuda": "GPU", "xpu": "Intel GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get(
-            device.value, "GPU"
-        )
+        device_label = {
+            "cuda": "GPU",
+            "xpu": "Intel GPU",
+            "mlx": "Apple Silicon GPU",
+            "cpu": "system",
+        }.get(device.value, "GPU")
         return f"Not enough {device_label} memory to load '{model_short}'. Try a smaller model or free memory."
 
     # Generic fallback

From 03e15c75f6326eba0e11b88c64c1161bd9a38df5 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 3 Apr 2026 19:11:23 +0000
Subject: [PATCH 08/18] Fix xpu-smi metric parsing, dead code, and type
 inconsistency

- Fix _get_xpu_utilization() metric indices: use -m 0,2,3 (GPU Util,
  Power, Core Temp) instead of -m 0,1,2,18 which mapped parts[3] to
  temperature incorrectly (it was actually GPU Memory Utilization).
  Now correctly parses utilization, power draw, and temperature.
- Add -n 1 flag so xpu-smi dump exits after one sample instead of
  running indefinitely until the 5s timeout kills it.
- Use torch.xpu.current_device() for the -d flag instead of hardcoding
  device 0, so multi-GPU XPU setups query the correct device.
- Populate power_draw_w in the returned dict instead of always None.
- Fix versions["xpu"] = True (bool) to use the actual XPU version
  string from torch.version.xpu, falling back to "available". This
  keeps the dict type-consistent (all str or None).
- Remove dead code in get_visible_gpu_count() where the XPU branch
  at line 1357 was unreachable because the XPU early-return block
  above always returns before that point.
---
 studio/backend/utils/hardware/hardware.py | 48 +++++++++++++----------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 35826979fd..4be0ce23db 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -319,7 +319,7 @@ def get_package_versions() -> Dict[str, Optional[str]]:
 
         versions["cuda"] = getattr(torch.version, "cuda", None)
         if hasattr(torch, "xpu") and torch.xpu.is_available():
-            versions["xpu"] = True
+            versions["xpu"] = getattr(torch.version, "xpu", "available")
     except Exception:
         versions["cuda"] = None
 
@@ -389,37 +389,48 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]
 
 def _get_xpu_utilization() -> Dict[str, Any]:
     """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu."""
+    gpu_util = None
+    temp = None
+    power_w = None
+
+    # Resolve which physical device to query
+    dev_idx = 0
+    try:
+        import torch
+
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            dev_idx = torch.xpu.current_device()
+    except Exception:
+        pass
+
     try:
         import subprocess
 
+        # xpu-smi metric IDs: 0 = GPU Utilization (%), 2 = GPU Power (W),
+        # 3 = GPU Core Temperature (C).
+        # -n 1 requests exactly one sample so the command exits immediately.
+        # CSV columns: Timestamp, DeviceId, <metric0>, <metric1>, <metric2>
         result = subprocess.run(
-            ["xpu-smi", "dump", "-d", "0", "-m", "0,1,2,18"],
+            ["xpu-smi", "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"],
             capture_output = True,
             text = True,
-            timeout = 5,
+            timeout = 10,
         )
         if result.returncode == 0 and result.stdout.strip():
-            # xpu-smi dump outputs CSV: Timestamp, DeviceId, GPU Utilization (%), ...
             lines = result.stdout.strip().splitlines()
             for line in reversed(lines):
                 if line.startswith("Timestamp") or line.startswith("#"):
                     continue
                 parts = [p.strip() for p in line.split(",")]
-                if len(parts) >= 4:
+                if len(parts) >= 5:
                     gpu_util = float(parts[2]) if parts[2] not in ("", "N/A") else None
-                    temp = float(parts[3]) if parts[3] not in ("", "N/A") else None
+                    power_w = float(parts[3]) if parts[3] not in ("", "N/A") else None
+                    temp = float(parts[4]) if parts[4] not in ("", "N/A") else None
                     break
-            else:
-                gpu_util = None
-                temp = None
-        else:
-            gpu_util = None
-            temp = None
     except Exception:
-        gpu_util = None
-        temp = None
+        pass
 
-    # Get VRAM from torch.xpu
+    # Get VRAM from torch.xpu (only reports PyTorch-managed memory)
     vram_used_gb = None
     vram_total_gb = None
     try:
@@ -450,7 +461,7 @@ def _get_xpu_utilization() -> Dict[str, Any]:
         "vram_used_gb": vram_used_gb,
         "vram_total_gb": vram_total_gb,
         "vram_utilization_pct": vram_pct,
-        "power_draw_w": None,
+        "power_draw_w": power_w,
         "power_limit_w": None,
         "power_utilization_pct": None,
     }
@@ -1354,10 +1365,7 @@ def get_visible_gpu_count() -> int:
     try:
         import torch
 
-        if get_device() == DeviceType.XPU and hasattr(torch, "xpu"):
-            _visible_gpu_count = torch.xpu.device_count()
-        else:
-            _visible_gpu_count = torch.cuda.device_count()
+        _visible_gpu_count = torch.cuda.device_count()
     except Exception:
         _visible_gpu_count = get_physical_gpu_count()
 

From 3f8dc0dc69b47690a7ef34ae7821af468a765a68 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 3 Apr 2026 19:43:35 +0000
Subject: [PATCH 09/18] Add shutil.which guard before xpu-smi subprocess call

Skip the xpu-smi subprocess entirely when the binary is not on
PATH. This avoids a multi-second timeout on Intel GPU systems
that have PyTorch XPU support but no xpu-smi tooling installed.
The function still falls back to torch.xpu for VRAM metrics.
---
 studio/backend/utils/hardware/hardware.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 4be0ce23db..f5d9cb6938 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -404,14 +404,21 @@ def _get_xpu_utilization() -> Dict[str, Any]:
         pass
 
     try:
+        import shutil
         import subprocess
 
+        # Skip subprocess entirely when xpu-smi is not on PATH, avoiding
+        # a multi-second timeout on systems without the Intel tooling.
+        xpu_smi = shutil.which("xpu-smi")
+        if xpu_smi is None:
+            raise FileNotFoundError("xpu-smi not found")
+
         # xpu-smi metric IDs: 0 = GPU Utilization (%), 2 = GPU Power (W),
         # 3 = GPU Core Temperature (C).
         # -n 1 requests exactly one sample so the command exits immediately.
         # CSV columns: Timestamp, DeviceId, <metric0>, <metric1>, <metric2>
         result = subprocess.run(
-            ["xpu-smi", "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"],
+            [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"],
             capture_output = True,
             text = True,
             timeout = 10,

From 568a1a4781299c5c1c611368c9c40cfc976b1c1a Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 3 Apr 2026 20:00:06 +0000
Subject: [PATCH 10/18] Improve get_visible_gpu_count() ZE_AFFINITY_MASK
 handling

Prefer torch.xpu.device_count() over manual mask parsing since the
runtime correctly interprets all ZE_AFFINITY_MASK syntax including
subdevice notation (e.g. "0.0,0.1" is 1 root device, not 2).

The manual parsing fallback now counts unique root device IDs from
the mask, handling "device.subdevice" notation correctly.
---
 studio/backend/utils/hardware/hardware.py | 24 +++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index f5d9cb6938..ea6262524c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1345,17 +1345,29 @@ def get_visible_gpu_count() -> int:
             xpu_visible = xpu_visible.strip()
             if xpu_visible == "":
                 _visible_gpu_count = 0
-            else:
-                _visible_gpu_count = len(
-                    [x for x in xpu_visible.split(",") if x.strip()]
-                )
-            return _visible_gpu_count
+                return _visible_gpu_count
+
+        # Prefer torch.xpu.device_count() as it correctly interprets
+        # ZE_AFFINITY_MASK including subdevice syntax (e.g. "0.0,0.1").
         try:
             import torch
 
             _visible_gpu_count = torch.xpu.device_count()
         except Exception:
-            _visible_gpu_count = get_physical_gpu_count()
+            if xpu_visible:
+                # Fallback: count unique root device IDs from the mask.
+                # ZE_AFFINITY_MASK can use "device.subdevice" notation,
+                # so "0.0,0.1" is 1 root device, not 2.
+                roots = set()
+                for token in xpu_visible.split(","):
+                    token = token.strip()
+                    if token:
+                        root = token.split(".", 1)[0]
+                        if root.isdigit():
+                            roots.add(int(root))
+                _visible_gpu_count = len(roots) if roots else 0
+            else:
+                _visible_gpu_count = get_physical_gpu_count()
         return _visible_gpu_count
 
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")

From 9f9637e752e8a318ee6d4e61026817db2ec19549 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 8 Apr 2026 09:06:48 +0000
Subject: [PATCH 11/18] Fix xpu-smi metrics, device resolution, and XPU OOM
 detection

- _get_xpu_utilization: request metrics -m 0,1,3 (Util, Power, Temp)
  rather than 0,2,3 so the power column no longer reports MHz as watts.
- _resolve_xpu_smi_device_id: map torch.xpu.current_device() (logical
  ordinal under ZE_AFFINITY_MASK) to the physical root device id that
  xpu-smi -d expects, so telemetry targets the active GPU.
- Merge the duplicated torch blocks in _get_xpu_utilization so the
  VRAM lookup is guarded and the device index is computed once.
- format_error_message: only rewrite true OOM errors (out of memory
  substrings) as memory errors, so non-OOM XPU/CUDA failures surface
  their real cause instead of a misleading memory message.
- inference.py DAC generation: derive autocast device from
  model.device.type, not the global backend, so CPU-fallback models
  on an XPU host do not open a GPU autocast context.
- dataset_map_num_proc: only disable XPU multiprocessing after the
  XPU runtime is actually initialized in this process, so pure
  CPU-side dataset preprocessing can still parallelize on Intel hosts.
- get_package_versions: preserve the "available" fallback for xpu
  when torch.version.xpu exists as None.
- get_visible_gpu_count: normalize ZE_AFFINITY_MASK parsing so the
  None and empty-string branches do not rely on implicit scoping.
---
 studio/backend/core/inference/inference.py |  12 ++-
 studio/backend/utils/hardware/hardware.py  | 104 +++++++++++++++------
 studio/backend/utils/utils.py              |   9 +-
 3 files changed, 89 insertions(+), 36 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index e79d1fa78f..1e2d7030e6 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -1643,9 +1643,15 @@ def _generate_dac(
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
         with torch.inference_mode():
-            from utils.hardware import get_torch_device_str
-
-            with torch.amp.autocast(get_torch_device_str(), dtype = model.dtype):
+            # Derive the autocast device from the loaded model, not from the
+            # global backend: a CPU-fallback DAC on an XPU/CUDA host must not
+            # open a GPU autocast context around CPU tensors.
+            device_type = (
+                model.device.type
+                if hasattr(model.device, "type")
+                else str(model.device).split(":", 1)[0]
+            )
+            with torch.amp.autocast(device_type, dtype = model.dtype):
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
                     **inputs,
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index ea6262524c..0f61d8d2d4 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -319,7 +319,11 @@ def get_package_versions() -> Dict[str, Optional[str]]:
 
         versions["cuda"] = getattr(torch.version, "cuda", None)
         if hasattr(torch, "xpu") and torch.xpu.is_available():
-            versions["xpu"] = getattr(torch.version, "xpu", "available")
+            # torch.version.xpu exists on modern torch builds but may be None;
+            # fall back to "available" so the UI distinguishes present-but-unknown
+            # from "package not found".
+            xpu_ver = getattr(torch.version, "xpu", None)
+            versions["xpu"] = xpu_ver if xpu_ver is not None else "available"
     except Exception:
         versions["cuda"] = None
 
@@ -387,22 +391,52 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]
 # ========== Live GPU Utilization ==========
 
 
-def _get_xpu_utilization() -> Dict[str, Any]:
-    """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu."""
-    gpu_util = None
-    temp = None
-    power_w = None
+def _resolve_xpu_smi_device_id() -> int:
+    """Resolve the physical root device ID used by ``xpu-smi -d``.
 
-    # Resolve which physical device to query
-    dev_idx = 0
+    ``torch.xpu.current_device()`` returns the logical ordinal after
+    ``ZE_AFFINITY_MASK`` remapping, whereas ``xpu-smi`` addresses physical
+    root devices. Translate the ordinal through the mask roots so telemetry
+    targets the GPU the process is actually running on. Subdevice syntax
+    such as ``0.0,0.1`` collapses to a single root device.
+    """
+    ordinal = 0
+    xpu_ok = False
     try:
         import torch
 
-        if hasattr(torch, "xpu") and torch.xpu.is_available():
-            dev_idx = torch.xpu.current_device()
+        xpu_ok = hasattr(torch, "xpu") and torch.xpu.is_available()
+        if xpu_ok:
+            ordinal = int(torch.xpu.current_device())
     except Exception:
         pass
 
+    mask = (os.environ.get("ZE_AFFINITY_MASK") or "").strip()
+    if mask:
+        roots: list[int] = []
+        for token in mask.split(","):
+            token = token.strip()
+            if not token:
+                continue
+            root = token.split(".", 1)[0]
+            if root.isdigit():
+                root_id = int(root)
+                if root_id not in roots:
+                    roots.append(root_id)
+        if roots:
+            return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0]
+
+    return ordinal if xpu_ok else 0
+
+
+def _get_xpu_utilization() -> Dict[str, Any]:
+    """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu."""
+    gpu_util = None
+    temp = None
+    power_w = None
+
+    dev_idx = _resolve_xpu_smi_device_id()
+
     try:
         import shutil
         import subprocess
@@ -413,12 +447,15 @@ def _get_xpu_utilization() -> Dict[str, Any]:
         if xpu_smi is None:
             raise FileNotFoundError("xpu-smi not found")
 
-        # xpu-smi metric IDs: 0 = GPU Utilization (%), 2 = GPU Power (W),
-        # 3 = GPU Core Temperature (C).
+        # xpu-smi metric IDs (per Intel xpu-smi docs):
+        #   0 = GPU Utilization (%)
+        #   1 = GPU Power (W)
+        #   2 = GPU Frequency (MHz)
+        #   3 = GPU Core Temperature (C)
         # -n 1 requests exactly one sample so the command exits immediately.
         # CSV columns: Timestamp, DeviceId, <metric0>, <metric1>, <metric2>
         result = subprocess.run(
-            [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"],
+            [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,1,3", "-n", "1"],
             capture_output = True,
             text = True,
             timeout = 10,
@@ -437,16 +474,19 @@ def _get_xpu_utilization() -> Dict[str, Any]:
     except Exception:
         pass
 
-    # Get VRAM from torch.xpu (only reports PyTorch-managed memory)
+    # Get VRAM from torch.xpu (only reports PyTorch-managed memory).
+    # Use the same logical ordinal that torch exposes; xpu-smi physical id is
+    # only needed by the subprocess call above.
     vram_used_gb = None
     vram_total_gb = None
     try:
         import torch
 
-        idx = torch.xpu.current_device()
-        props = torch.xpu.get_device_properties(idx)
-        vram_total_gb = round(props.total_memory / (1024**3), 2)
-        vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2)
+        if hasattr(torch, "xpu") and torch.xpu.is_available():
+            idx = torch.xpu.current_device()
+            props = torch.xpu.get_device_properties(idx)
+            vram_total_gb = round(props.total_memory / (1024**3), 2)
+            vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2)
     except Exception:
         pass
 
@@ -1340,12 +1380,12 @@ def get_visible_gpu_count() -> int:
     device = get_device()
 
     if device == DeviceType.XPU:
-        xpu_visible = os.environ.get("ZE_AFFINITY_MASK")
-        if xpu_visible is not None:
-            xpu_visible = xpu_visible.strip()
-            if xpu_visible == "":
-                _visible_gpu_count = 0
-                return _visible_gpu_count
+        xpu_mask_raw = os.environ.get("ZE_AFFINITY_MASK")
+        xpu_mask_set = xpu_mask_raw is not None
+        xpu_visible = (xpu_mask_raw or "").strip()
+        if xpu_mask_set and xpu_visible == "":
+            _visible_gpu_count = 0
+            return _visible_gpu_count
 
         # Prefer torch.xpu.device_count() as it correctly interprets
         # ZE_AFFINITY_MASK including subdevice syntax (e.g. "0.0,0.1").
@@ -1565,9 +1605,11 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]:
     ``datasets`` treats ``num_proc=1`` as multiprocessing (creates ``Pool(1)``).
     Only ``num_proc=None`` guarantees in-process execution.
 
-    Also returns ``None`` on XPU devices because ``os.fork()`` corrupts the
-    Level-Zero GPU context, causing Triton kernel launches to fail with
-    "Pointer argument doesn't reference XPU device memory".
+    Also returns ``None`` on XPU devices once the XPU runtime has been
+    initialized in this process, because ``os.fork()`` corrupts the
+    Level-Zero GPU context and causes Triton kernel launches to fail with
+    "Pointer argument doesn't reference XPU device memory". Pre-init XPU
+    hosts can still parallelize pure CPU-side dataset preprocessing.
     """
     import sys
 
@@ -1575,6 +1617,12 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]:
         return None
 
     if get_device() == DeviceType.XPU:
-        return None
+        try:
+            import torch
+
+            if hasattr(torch, "xpu") and torch.xpu.is_initialized():
+                return None
+        except Exception:
+            return None
 
     return safe_num_proc(desired)
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index 290b5ad92e..fc4674a1d5 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -101,11 +101,10 @@ def format_error_message(error: Exception, model_name: str) -> str:
         return "Invalid HF token. Please check your token and try again."
 
     if (
-        "memory" in error_str
-        or "cuda" in error_str
-        or "xpu" in error_str
-        or "mlx" in error_str
-        or "out of memory" in error_str
+        "out of memory" in error_str
+        or "cuda out of memory" in error_str
+        or "xpu out of memory" in error_str
+        or ("mlx" in error_str and "memory" in error_str)
     ):
         from utils.hardware import get_device
 

From 393a70d9986322da45c26286f678b97b06f537d5 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 8 Apr 2026 09:26:26 +0000
Subject: [PATCH 12/18] Wire XPU through gpu-id pinning and visibility, restore
 CPU OOM detection

Round 2 fixes addressing reviewer feedback:

- format_error_message: tightening "out of memory" coverage in round 1
  dropped CPU allocator failures like "not enough memory to allocate"
  and "cannot allocate memory", and Level Zero "out of device memory"
  errors. Restore those patterns while still excluding non-memory
  XPU/CUDA exceptions. Note that the underscore-delimited enum name
  ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY is not matched by these
  substrings.
- apply_gpu_ids: route Intel XPU through ZE_AFFINITY_MASK instead of
  CUDA_VISIBLE_DEVICES so worker subprocesses are actually pinned to
  the requested GPUs on multi-XPU hosts.
- _get_parent_visible_gpu_spec: add an XPU branch that reads
  ZE_AFFINITY_MASK and returns physical root device IDs, so the
  visibility/selection stack reports the correct devices on Intel
  hosts. Honors subdevice syntax and wildcards.
- Extract _parse_ze_mask_roots helper for the ZE_AFFINITY_MASK
  parsing previously duplicated between _resolve_xpu_smi_device_id
  and get_visible_gpu_count. Single source of truth for the mask
  semantics.
- get_visible_gpu_count: treat non-digit wildcard masks (e.g. "*")
  as "all physical XPUs visible" rather than zero.
- get_package_versions: also set versions["xpu"] = None in the
  except block so a failing XPU probe does not leave the key missing.
- inference.py DAC autocast: clamp the resolved device_type to
  ("cuda", "xpu", "cpu") so exotic devices like "meta" during
  accelerate offloaded loading do not raise.
---
 studio/backend/core/inference/inference.py |  4 +
 studio/backend/utils/hardware/hardware.py  | 99 +++++++++++++++++-----
 studio/backend/utils/utils.py              |  7 +-
 3 files changed, 86 insertions(+), 24 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index 1e2d7030e6..f07d40a0e5 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -1651,6 +1651,10 @@ def _generate_dac(
                 if hasattr(model.device, "type")
                 else str(model.device).split(":", 1)[0]
             )
+            # Clamp to autocast-supported backends so exotic devices
+            # (e.g. "meta" during accelerate offloaded loading) do not raise.
+            if device_type not in ("cuda", "xpu", "cpu"):
+                device_type = "cpu"
             with torch.amp.autocast(device_type, dtype = model.dtype):
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 0f61d8d2d4..b397d6196c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -326,6 +326,7 @@ def get_package_versions() -> Dict[str, Optional[str]]:
             versions["xpu"] = xpu_ver if xpu_ver is not None else "available"
     except Exception:
         versions["cuda"] = None
+        versions["xpu"] = None
 
     return versions
 
@@ -391,6 +392,29 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]
 # ========== Live GPU Utilization ==========
 
 
+def _parse_ze_mask_roots(mask: str) -> list[int]:
+    """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of unique root device IDs.
+
+    Accepts subdevice syntax such as ``0.0,0.1`` which collapses to ``[0]``.
+    Returns an empty list if the mask is empty or contains no parseable digits.
+    Insertion order is preserved so callers can map logical ordinals back to
+    physical root IDs via the returned list.
+    """
+    roots: list[int] = []
+    if not mask:
+        return roots
+    for token in mask.split(","):
+        token = token.strip()
+        if not token:
+            continue
+        root = token.split(".", 1)[0]
+        if root.isdigit():
+            root_id = int(root)
+            if root_id not in roots:
+                roots.append(root_id)
+    return roots
+
+
 def _resolve_xpu_smi_device_id() -> int:
     """Resolve the physical root device ID used by ``xpu-smi -d``.
 
@@ -412,19 +436,9 @@ def _resolve_xpu_smi_device_id() -> int:
         pass
 
     mask = (os.environ.get("ZE_AFFINITY_MASK") or "").strip()
-    if mask:
-        roots: list[int] = []
-        for token in mask.split(","):
-            token = token.strip()
-            if not token:
-                continue
-            root = token.split(".", 1)[0]
-            if root.isdigit():
-                root_id = int(root)
-                if root_id not in roots:
-                    roots.append(root_id)
-        if roots:
-            return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0]
+    roots = _parse_ze_mask_roots(mask)
+    if roots:
+        return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0]
 
     return ordinal if xpu_ok else 0
 
@@ -659,6 +673,42 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
 
 
 def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
+    # On Intel XPU hosts, device visibility is controlled by ZE_AFFINITY_MASK
+    # (the Level Zero affinity variable) rather than CUDA_VISIBLE_DEVICES.
+    if get_device() == DeviceType.XPU:
+        xpu_mask_raw = os.environ.get("ZE_AFFINITY_MASK")
+        if xpu_mask_raw is None:
+            return {
+                "raw": None,
+                "numeric_ids": list(range(get_physical_gpu_count())),
+                "supports_explicit_gpu_ids": True,
+            }
+
+        xpu_mask = xpu_mask_raw.strip()
+        if xpu_mask == "":
+            return {
+                "raw": xpu_mask,
+                "numeric_ids": [],
+                "supports_explicit_gpu_ids": True,
+            }
+
+        roots = _parse_ze_mask_roots(xpu_mask)
+        if not roots:
+            # Non-digit wildcard (e.g. "*") or unparseable mask: treat the
+            # same as "all physical XPUs visible" but disable explicit ids
+            # since we cannot map logical ordinals to root IDs.
+            return {
+                "raw": xpu_mask,
+                "numeric_ids": None,
+                "supports_explicit_gpu_ids": False,
+            }
+
+        return {
+            "raw": xpu_mask,
+            "numeric_ids": roots,
+            "supports_explicit_gpu_ids": True,
+        }
+
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
 
     if cuda_visible is None:
@@ -1398,14 +1448,12 @@ def get_visible_gpu_count() -> int:
                 # Fallback: count unique root device IDs from the mask.
                 # ZE_AFFINITY_MASK can use "device.subdevice" notation,
                 # so "0.0,0.1" is 1 root device, not 2.
-                roots = set()
-                for token in xpu_visible.split(","):
-                    token = token.strip()
-                    if token:
-                        root = token.split(".", 1)[0]
-                        if root.isdigit():
-                            roots.add(int(root))
-                _visible_gpu_count = len(roots) if roots else 0
+                roots = _parse_ze_mask_roots(xpu_visible)
+                # Non-digit wildcards (e.g. "*") yield an empty roots list;
+                # treat those as "all physical XPUs visible".
+                _visible_gpu_count = (
+                    len(roots) if roots else get_physical_gpu_count()
+                )
             else:
                 _visible_gpu_count = get_physical_gpu_count()
         return _visible_gpu_count
@@ -1448,6 +1496,15 @@ def apply_gpu_ids(gpu_ids) -> None:
     else:
         value = str(gpu_ids)
 
+    # Intel XPU uses Level Zero and honors ZE_AFFINITY_MASK, not
+    # CUDA_VISIBLE_DEVICES. Route XPU pinning through the correct env var
+    # so worker subprocesses are actually restricted to the intended GPU.
+    if get_device() == DeviceType.XPU:
+        os.environ["ZE_AFFINITY_MASK"] = value
+        _visible_gpu_count = None
+        logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value)
+        return
+
     os.environ["CUDA_VISIBLE_DEVICES"] = value
     _visible_gpu_count = None
     logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value)
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index fc4674a1d5..5555c14d8e 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -102,9 +102,10 @@ def format_error_message(error: Exception, model_name: str) -> str:
 
     if (
         "out of memory" in error_str
-        or "cuda out of memory" in error_str
-        or "xpu out of memory" in error_str
-        or ("mlx" in error_str and "memory" in error_str)
+        or "out of device memory" in error_str
+        or "not enough memory" in error_str
+        or "cannot allocate memory" in error_str
+        or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str))
     ):
         from utils.hardware import get_device
 

From 2cf6d0254c680c841171b38ad39375e47890f624 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 8 Apr 2026 09:47:29 +0000
Subject: [PATCH 13/18] Unblock XPU gpu_ids selection and harden OOM/autocast
 edge cases

Round 3 fixes targeting the remaining gaps reviewers flagged:

- prepare_gpu_selection: allow explicit gpu_ids on Intel XPU so the
  apply_gpu_ids() XPU branch (and _get_parent_visible_gpu_spec XPU
  branch) are actually reachable from the normal request path.
- _parse_ze_mask_roots: stop deduplicating. Keep one root ID per mask
  token so the logical-ordinal-to-physical-root mapping used by
  _resolve_xpu_smi_device_id() stays 1-to-1 even for mixed subdevice
  masks like "2.0,0.1,0.2". Update the docstring to document the
  new shape.
- _get_parent_visible_gpu_spec: dedupe roots only at the visibility
  layer, and flag subdevice masks as supports_explicit_gpu_ids=False
  so resolve_requested_gpu_ids() does not try to match duplicate IDs.
  Treat wildcard masks as "all physical XPUs visible".
- format_error_message: also match the literal Level Zero enum names
  ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY / _HOST_MEMORY which use
  underscores and were not caught by the "out of device memory"
  substring.
- inference.py DAC autocast: accept "mps" in the clamp list (it has
  been an autocast-supported backend since torch 2.3) and skip
  autocast entirely when the model is on CPU with an unsupported
  dtype like float32, since torch.amp.autocast("cpu", dtype=float32)
  raises.
- resolve_requested_gpu_ids: tailor the "unsupported explicit ids"
  error message to the current backend so XPU users see a
  ZE_AFFINITY_MASK reference instead of a CUDA one.
---
 studio/backend/core/inference/inference.py | 14 ++++-
 studio/backend/utils/hardware/hardware.py  | 60 +++++++++++++++-------
 studio/backend/utils/utils.py              |  2 +
 3 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index f07d40a0e5..79c52ed6bd 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -1643,6 +1643,8 @@ def _generate_dac(
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
         with torch.inference_mode():
+            import contextlib
+
             # Derive the autocast device from the loaded model, not from the
             # global backend: a CPU-fallback DAC on an XPU/CUDA host must not
             # open a GPU autocast context around CPU tensors.
@@ -1653,9 +1655,17 @@ def _generate_dac(
             )
             # Clamp to autocast-supported backends so exotic devices
             # (e.g. "meta" during accelerate offloaded loading) do not raise.
-            if device_type not in ("cuda", "xpu", "cpu"):
+            # MPS is autocast-supported since torch 2.3, keep it in the set.
+            if device_type not in ("cuda", "xpu", "mps", "cpu"):
                 device_type = "cpu"
-            with torch.amp.autocast(device_type, dtype = model.dtype):
+            # CPU autocast only accepts bfloat16/float16. For a float32 CPU
+            # model, skip autocast entirely to avoid raising before generate.
+            cpu_autocast_supported = model.dtype in (torch.bfloat16, torch.float16)
+            if device_type == "cpu" and not cpu_autocast_supported:
+                autocast_ctx = contextlib.nullcontext()
+            else:
+                autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype)
+            with autocast_ctx:
                 inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
                 generated = model.generate(
                     **inputs,
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index b397d6196c..26791b5363 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -393,12 +393,13 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]
 
 
 def _parse_ze_mask_roots(mask: str) -> list[int]:
-    """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of unique root device IDs.
+    """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of root device IDs.
 
-    Accepts subdevice syntax such as ``0.0,0.1`` which collapses to ``[0]``.
-    Returns an empty list if the mask is empty or contains no parseable digits.
-    Insertion order is preserved so callers can map logical ordinals back to
-    physical root IDs via the returned list.
+    Returns one root ID per mask token, preserving order and duplicates so
+    that logical ordinals map 1-to-1 back to physical root IDs. For example
+    ``"0.0,0.1"`` yields ``[0, 0]`` (two logical devices, both under root
+    GPU 0) and ``"2.0,0.1,0.2"`` yields ``[2, 0, 0]``. Returns an empty
+    list if the mask is empty or contains no parseable digits.
     """
     roots: list[int] = []
     if not mask:
@@ -409,9 +410,7 @@ def _parse_ze_mask_roots(mask: str) -> list[int]:
             continue
         root = token.split(".", 1)[0]
         if root.isdigit():
-            root_id = int(root)
-            if root_id not in roots:
-                roots.append(root_id)
+            roots.append(int(root))
     return roots
 
 
@@ -692,20 +691,40 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
                 "supports_explicit_gpu_ids": True,
             }
 
-        roots = _parse_ze_mask_roots(xpu_mask)
-        if not roots:
+        # Subdevice syntax (e.g. "0.0,0.1") expands a single root GPU into
+        # multiple logical devices. Explicit root-ID selection is not
+        # meaningful for subdevice masks, so surface them as unsupported.
+        has_subdevice = any(
+            "." in token.strip() for token in xpu_mask.split(",") if token.strip()
+        )
+
+        roots_with_dupes = _parse_ze_mask_roots(xpu_mask)
+        if not roots_with_dupes:
             # Non-digit wildcard (e.g. "*") or unparseable mask: treat the
             # same as "all physical XPUs visible" but disable explicit ids
             # since we cannot map logical ordinals to root IDs.
             return {
                 "raw": xpu_mask,
-                "numeric_ids": None,
+                "numeric_ids": list(range(get_physical_gpu_count())),
+                "supports_explicit_gpu_ids": False,
+            }
+
+        if has_subdevice:
+            # Dedup for display: multiple subdevice entries under the same
+            # root collapse to that root ID.
+            unique_roots: list[int] = []
+            for rid in roots_with_dupes:
+                if rid not in unique_roots:
+                    unique_roots.append(rid)
+            return {
+                "raw": xpu_mask,
+                "numeric_ids": unique_roots,
                 "supports_explicit_gpu_ids": False,
             }
 
         return {
             "raw": xpu_mask,
-            "numeric_ids": roots,
+            "numeric_ids": roots_with_dupes,
             "supports_explicit_gpu_ids": True,
         }
 
@@ -761,11 +780,16 @@ def resolve_requested_gpu_ids(gpu_ids: Optional[list[int]]) -> list[int]:
         return parent_visible_ids
 
     if not parent_visible_spec["supports_explicit_gpu_ids"]:
+        env_var_name = (
+            "ZE_AFFINITY_MASK"
+            if get_device() == DeviceType.XPU
+            else "CUDA_VISIBLE_DEVICES"
+        )
         raise ValueError(
             f"Invalid gpu_ids {requested_ids}: explicit physical GPU IDs are "
-            f"unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG entries "
-            f"({parent_visible_spec['raw']!r}). Omit gpu_ids to use the "
-            "parent-visible devices."
+            f"unsupported when {env_var_name} uses non-numeric or subdevice "
+            f"entries ({parent_visible_spec['raw']!r}). Omit gpu_ids to use "
+            "the parent-visible devices."
         )
 
     if len(set(requested_ids)) != len(requested_ids):
@@ -1244,10 +1268,10 @@ def prepare_gpu_selection(
     in the worker subprocess which narrows ``CUDA_VISIBLE_DEVICES`` before any
     torch/CUDA initialisation.
     """
-    if gpu_ids and get_device() != DeviceType.CUDA:
+    if gpu_ids and get_device() not in (DeviceType.CUDA, DeviceType.XPU):
         raise ValueError(
-            f"gpu_ids {list(gpu_ids)} is only supported on CUDA devices, "
-            f"but the current backend is '{get_device().value}'."
+            f"gpu_ids {list(gpu_ids)} is only supported on CUDA and Intel XPU "
+            f"devices, but the current backend is '{get_device().value}'."
         )
 
     if gpu_ids:
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index 5555c14d8e..08c5754ce7 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -103,6 +103,8 @@ def format_error_message(error: Exception, model_name: str) -> str:
     if (
         "out of memory" in error_str
         or "out of device memory" in error_str
+        or "out_of_device_memory" in error_str  # ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
+        or "out_of_host_memory" in error_str  # ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
         or "not enough memory" in error_str
         or "cannot allocate memory" in error_str
         or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str))

From 5fbcb0fce4e25eed61f7926982fda2119ff30a89 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 8 Apr 2026 10:03:42 +0000
Subject: [PATCH 14/18] Enable multi-XPU sharding and auto-select, tighten XPU
 edge cases

Round 4 fixes completing the multi-XPU story unlocked in round 3:

- get_device_map: include DeviceType.XPU in the multi-GPU branch so
  explicit XPU gpu_ids=[0, 1] (or a wildcard-masked multi-XPU host)
  loads with device_map="balanced" instead of falling back to
  "sequential" and pinning the model to a single device.
- auto_select_gpu_ids: allow XPU auto mode. The function relies on
  get_visible_gpu_utilization() for per-device free-VRAM telemetry,
  which already has an XPU path via _get_xpu_utilization. XPU hosts
  omitting gpu_ids now benefit from VRAM-aware selection.
- get_visible_gpu_count torch-less fallback: count unique mask roots
  via len(set(roots)) so subdevice masks like "0.0,0.1" report the
  intended 1 root GPU, not 2. The ordinal-preserving semantics of
  _parse_ze_mask_roots are kept so _resolve_xpu_smi_device_id still
  maps logical ordinals to physical roots correctly.
- xpu-smi subprocess timeout lowered from 10s to 3s so a hung driver
  stalls status polls / UI refreshes for at most 3s instead of 10s.
- DAC autocast nullcontext fallback now covers XPU+float32 as well
  as CPU+float32, since XPU autocast only accepts bfloat16/float16
  and otherwise warns on every generate call.
- _get_parent_visible_gpu_spec subdevice dedup uses
  list(dict.fromkeys(...)) instead of an O(n^2) manual loop.
---
 studio/backend/core/inference/inference.py | 13 ++++----
 studio/backend/utils/hardware/hardware.py  | 39 ++++++++++++----------
 2 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index 79c52ed6bd..64e413ffe7 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -1642,9 +1642,9 @@ def _generate_dac(
             + text
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
-        with torch.inference_mode():
-            import contextlib
+        import contextlib
 
+        with torch.inference_mode():
             # Derive the autocast device from the loaded model, not from the
             # global backend: a CPU-fallback DAC on an XPU/CUDA host must not
             # open a GPU autocast context around CPU tensors.
@@ -1658,10 +1658,11 @@ def _generate_dac(
             # MPS is autocast-supported since torch 2.3, keep it in the set.
             if device_type not in ("cuda", "xpu", "mps", "cpu"):
                 device_type = "cpu"
-            # CPU autocast only accepts bfloat16/float16. For a float32 CPU
-            # model, skip autocast entirely to avoid raising before generate.
-            cpu_autocast_supported = model.dtype in (torch.bfloat16, torch.float16)
-            if device_type == "cpu" and not cpu_autocast_supported:
+            # CPU and XPU autocast only accept bfloat16/float16. For a
+            # float32 model, skip autocast entirely to avoid raising or
+            # producing a warning on every generate call.
+            autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16)
+            if device_type in ("cpu", "xpu") and not autocast_dtype_supported:
                 autocast_ctx = contextlib.nullcontext()
             else:
                 autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype)
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 26791b5363..519fd848d0 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -471,7 +471,7 @@ def _get_xpu_utilization() -> Dict[str, Any]:
             [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,1,3", "-n", "1"],
             capture_output = True,
             text = True,
-            timeout = 10,
+            timeout = 3,
         )
         if result.returncode == 0 and result.stdout.strip():
             lines = result.stdout.strip().splitlines()
@@ -711,11 +711,8 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
 
         if has_subdevice:
             # Dedup for display: multiple subdevice entries under the same
-            # root collapse to that root ID.
-            unique_roots: list[int] = []
-            for rid in roots_with_dupes:
-                if rid not in unique_roots:
-                    unique_roots.append(rid)
+            # root collapse to that root ID while preserving insertion order.
+            unique_roots = list(dict.fromkeys(roots_with_dupes))
             return {
                 "raw": xpu_mask,
                 "numeric_ids": unique_roots,
@@ -1101,8 +1098,12 @@ def auto_select_gpu_ids(
 ) -> tuple[Optional[list[int]], Dict[str, Any]]:
     metadata: Dict[str, Any] = {"selection_mode": "auto"}
 
-    if get_device() != DeviceType.CUDA:
-        metadata["selection_mode"] = "non_cuda"
+    # Auto-selection relies on per-device free-VRAM telemetry which is
+    # available on both CUDA (via nvidia-smi) and XPU (via torch.xpu +
+    # xpu-smi). Other backends (MLX, CPU) do not expose the required
+    # information, so fall through to inheriting parent visibility.
+    if get_device() not in (DeviceType.CUDA, DeviceType.XPU):
+        metadata["selection_mode"] = "non_accelerator"
         return None, metadata
 
     required_gb, estimate_metadata = estimate_required_model_memory_gb(
@@ -1471,12 +1472,14 @@ def get_visible_gpu_count() -> int:
             if xpu_visible:
                 # Fallback: count unique root device IDs from the mask.
                 # ZE_AFFINITY_MASK can use "device.subdevice" notation,
-                # so "0.0,0.1" is 1 root device, not 2.
+                # so "0.0,0.1" is 1 root device, not 2. Without torch we
+                # cannot know which hierarchy mode is active, so fall back
+                # to root-device counting (the more conservative choice).
                 roots = _parse_ze_mask_roots(xpu_visible)
                 # Non-digit wildcards (e.g. "*") yield an empty roots list;
                 # treat those as "all physical XPUs visible".
                 _visible_gpu_count = (
-                    len(roots) if roots else get_physical_gpu_count()
+                    len(set(roots)) if roots else get_physical_gpu_count()
                 )
             else:
                 _visible_gpu_count = get_physical_gpu_count()
@@ -1541,24 +1544,26 @@ def get_device_map(
 
     Returns ``"balanced"`` (shard evenly across GPUs) when:
       - ``gpu_ids`` explicitly lists >1 GPU, **or**
-      - ``CUDA_VISIBLE_DEVICES`` uses UUID/MIG identifiers (non-numeric) and
-        more than one GPU is visible (fallback: we cannot resolve numeric IDs,
-        so we assume the caller intends multi-GPU).
+      - ``CUDA_VISIBLE_DEVICES``/``ZE_AFFINITY_MASK`` uses non-numeric
+        identifiers (UUID/MIG/wildcard) and more than one GPU is visible
+        (fallback: we cannot resolve numeric IDs, so we assume the caller
+        intends multi-GPU).
 
     Returns ``"sequential"`` (single device) in all other cases, including
-    non-CUDA backends (CPU, MLX).
+    CPU/MLX backends.
 
     Callers should use ``prepare_gpu_selection()`` upstream to determine the
     ``gpu_ids`` list -- that function handles the smart auto-selection of the
     minimum number of GPUs needed for a given model.
     """
     device = get_device()
-    if device == DeviceType.CUDA:
+    if device in (DeviceType.CUDA, DeviceType.XPU):
         multi_gpu = gpu_ids is not None and len(gpu_ids) > 1
 
         if not multi_gpu:
-            # UUID/MIG masks cannot be split into numeric IDs, so if multiple
-            # GPUs are visible we assume multi-GPU sharding is intended.
+            # UUID/MIG/wildcard masks cannot be split into numeric IDs, so if
+            # multiple GPUs are visible we assume multi-GPU sharding is
+            # intended.
             parent_visible_spec = _get_parent_visible_gpu_spec()
             if (
                 parent_visible_spec["numeric_ids"] is None

From 9ce735d847a13bc32eda6295290c16e1a5505177 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 8 Apr 2026 10:29:42 +0000
Subject: [PATCH 15/18] Align XPU wildcard mask with CUDA UUID path so
 multi-GPU sharding triggers

_get_parent_visible_gpu_spec returned numeric_ids=list(range(physical))
for wildcard ZE_AFFINITY_MASK=*, which blocked get_device_map from
reaching its "unresolved multi-visible" fallback. Mirror the CUDA
UUID/MIG behavior by returning numeric_ids=None with
supports_explicit_gpu_ids=False, so explicit ids are still rejected
and get_device_map falls back to sharding across visible devices
when more than one is present.
---
 studio/backend/utils/hardware/hardware.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 519fd848d0..e63ee6edca 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -700,12 +700,14 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
 
         roots_with_dupes = _parse_ze_mask_roots(xpu_mask)
         if not roots_with_dupes:
-            # Non-digit wildcard (e.g. "*") or unparseable mask: treat the
-            # same as "all physical XPUs visible" but disable explicit ids
-            # since we cannot map logical ordinals to root IDs.
+            # Non-digit wildcard (e.g. "*") or unparseable mask: we cannot map
+            # logical ordinals to physical root IDs. Mirror the CUDA UUID/MIG
+            # path by returning numeric_ids=None + supports_explicit_gpu_ids
+            # False, so get_device_map() falls back to its multi-visible
+            # heuristic and explicit ids are rejected.
             return {
                 "raw": xpu_mask,
-                "numeric_ids": list(range(get_physical_gpu_count())),
+                "numeric_ids": None,
                 "supports_explicit_gpu_ids": False,
             }
 

From d6f1ea8ba41878f91d542642f43cbcbd9aa8335a Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 11 Apr 2026 12:05:12 +0000
Subject: [PATCH 16/18] Fix loop 1 XPU review findings

- _backend_visible_devices_env: return ZE_AFFINITY_MASK on XPU so
  get_backend_visible_gpu_info reports the active mask instead of a stale
  or None CUDA_VISIBLE_DEVICES after apply_gpu_ids runs.
- _get_parent_visible_gpu_spec: return numeric_ids=None for subdevice
  masks like 0.0,0.1 so get_visible_gpu_utilization, get_backend_visible_gpu_info
  and get_device_map enumerate torch-visible ordinals and can still shard
  across logical XPUs instead of collapsing to a single root.
- _parse_ze_mask_roots: use str.isdecimal() instead of str.isdigit();
  isdigit() admits Unicode superscripts, which then crash int().
- _get_xpu_utilization xpu-smi parsing: accept n/a, NA, - and lowercase
  variants as missing, and wrap the float parse so one bad column does
  not drop the whole telemetry row.
- clear_gpu_cache XPU branch: guard synchronize/empty_cache with hasattr
  + try/except so older torch-xpu builds do not propagate AttributeError.
- apply_gpu_ids XPU branch: pop stale CUDA_VISIBLE_DEVICES so
  environment-inspection tools do not show conflicting pinning state.
- format_error_message: add memory allocation failed pattern and
  isinstance(error, MemoryError) so CPU hosts still classify OOMs that
  the tightened substring list dropped.
- test_gpu_selection/test_gpu_selection_sandbox: rename TestXpuRejection
  to TestXpuSelection, update the expected selection_mode from "non_cuda"
  to "non_accelerator", and update the expected error substring from
  "only supported on CUDA devices" to "only supported on CUDA and Intel
  XPU" so the suite matches the new behavior.
- inference.py/llama_cpp.py/trainer.py/utils.py: hoist contextlib,
  clear_gpu_cache, get_torch_device_str and get_device imports to module
  top per PEP 8 feedback from the hosted gemini bot.
---
 studio/backend/core/inference/inference.py    |  2 +-
 studio/backend/core/inference/llama_cpp.py    | 12 +--
 studio/backend/core/training/trainer.py       |  7 +-
 studio/backend/tests/test_gpu_selection.py    | 75 ++++++++++++++++---
 .../tests/test_gpu_selection_sandbox.py       |  4 +-
 studio/backend/utils/hardware/hardware.py     | 67 +++++++++++++----
 studio/backend/utils/utils.py                 |  6 +-
 7 files changed, 127 insertions(+), 46 deletions(-)

diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py
index fd16299719..0f6a035887 100644
--- a/studio/backend/core/inference/inference.py
+++ b/studio/backend/core/inference/inference.py
@@ -10,6 +10,7 @@
 from transformers import TextStreamer
 from peft import PeftModel, PeftModelForCausalLM
 
+import contextlib
 import json
 import sys
 import torch
@@ -1646,7 +1647,6 @@ def _generate_dac(
             + text
             + "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
         )
-        import contextlib
 
         with torch.inference_mode():
             # Derive the autocast device from the loaded model, not from the
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index d924776c34..2fce68be28 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -26,6 +26,8 @@
 
 import httpx
 
+from utils.hardware import clear_gpu_cache, get_torch_device_str
+
 logger = get_logger(__name__)
 
 # ── Pre-compiled patterns for plan-without-action re-prompt ──
@@ -1625,10 +1627,6 @@ def unload_model(self) -> bool:
             if LlamaCppBackend._codec_mgr is not None:
                 LlamaCppBackend._codec_mgr.unload()
                 LlamaCppBackend._codec_mgr = None
-                import torch
-
-                from utils.hardware import clear_gpu_cache
-
                 clear_gpu_cache()
             return True
 
@@ -3262,8 +3260,6 @@ def init_audio_codec(self, audio_type: str) -> None:
         if LlamaCppBackend._codec_mgr is None:
             LlamaCppBackend._codec_mgr = AudioCodecManager()
 
-        from utils.hardware import get_torch_device_str
-
         device = get_torch_device_str()
         model_repo_path = None
 
@@ -3336,10 +3332,6 @@ def generate_audio_response(
             else None
         )
 
-        import torch
-
-        from utils.hardware import get_torch_device_str
-
         device = get_torch_device_str()
         return LlamaCppBackend._codec_mgr.decode(
             audio_type, device, token_ids = token_ids, text = data.get("content", "")
diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index 1f64ea8d76..41b1cfa7bf 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -38,6 +38,7 @@
     safe_num_proc,
     dataset_map_num_proc,
     get_device_map,
+    get_torch_device_str,
     raise_if_offloaded,
     get_visible_gpu_count,
 )
@@ -1540,7 +1541,6 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
 
         SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
         SNAC_SAMPLE_RATE = 24000
-        from utils.hardware import get_torch_device_str
 
         device = get_torch_device_str()
         max_length = self.max_seq_length or 2048
@@ -1718,7 +1718,6 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
         import gc
 
         gc.collect()
-        from utils.hardware import clear_gpu_cache
 
         clear_gpu_cache()
         self._cuda_audio_used = True
@@ -1748,7 +1747,6 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):
 
         import subprocess
 
-        from utils.hardware import get_torch_device_str
 
         device = get_torch_device_str()
 
@@ -1950,7 +1948,6 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
         import gc
 
         gc.collect()
-        from utils.hardware import clear_gpu_cache
 
         clear_gpu_cache()
         self._cuda_audio_used = True
@@ -1987,7 +1984,6 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         from datasets import Dataset as HFDataset
         from utils.paths import ensure_dir, tmp_root
 
-        from utils.hardware import get_torch_device_str
 
         device = get_torch_device_str()
 
@@ -2167,7 +2163,6 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         import gc
 
         gc.collect()
-        from utils.hardware import clear_gpu_cache
 
         clear_gpu_cache()
         self._cuda_audio_used = True
diff --git a/studio/backend/tests/test_gpu_selection.py b/studio/backend/tests/test_gpu_selection.py
index c6f26037af..3ee39ea785 100644
--- a/studio/backend/tests/test_gpu_selection.py
+++ b/studio/backend/tests/test_gpu_selection.py
@@ -711,12 +711,14 @@ def start(self):
 
 
 class TestRouteErrors(unittest.TestCase):
-    def test_prepare_gpu_selection_rejects_gpu_ids_on_non_cuda_backend(self):
+    def test_prepare_gpu_selection_rejects_gpu_ids_on_non_accelerator_backend(self):
         with patch("utils.hardware.hardware.get_device", return_value = DeviceType.CPU):
             with self.assertRaises(ValueError) as exc_info:
                 prepare_gpu_selection([0], model_name = "unsloth/test")
 
-        self.assertIn("only supported on CUDA devices", str(exc_info.exception))
+        self.assertIn(
+            "only supported on CUDA and Intel XPU", str(exc_info.exception)
+        )
 
     def test_inference_route_rejects_gpu_ids_for_gguf(self):
         inference_route = _load_route_module(
@@ -1089,15 +1091,66 @@ def test_auto_select_falls_back_when_estimate_unavailable(self):
         self.assertEqual(metadata["selection_mode"], "fallback_all")
 
 
-class TestXpuRejection(_GpuCacheResetMixin, unittest.TestCase):
-    def test_auto_select_returns_non_cuda_for_xpu(self):
-        with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU):
+class TestXpuSelection(_GpuCacheResetMixin, unittest.TestCase):
+    def test_auto_select_supports_xpu(self):
+        with (
+            patch(
+                "utils.hardware.hardware.get_device", return_value = DeviceType.XPU
+            ),
+            patch(
+                "utils.hardware.hardware.estimate_required_model_memory_gb",
+                return_value = (1.0, {}),
+            ),
+            patch(
+                "utils.hardware.hardware.get_visible_gpu_utilization",
+                return_value = {
+                    "devices": [
+                        {"index": 0, "vram_total_gb": 8, "vram_used_gb": 1},
+                    ]
+                },
+            ),
+            patch(
+                "utils.hardware.hardware._get_parent_visible_gpu_spec",
+                return_value = {
+                    "raw": None,
+                    "numeric_ids": [0],
+                    "supports_explicit_gpu_ids": True,
+                },
+            ),
+            patch(
+                "utils.hardware.hardware.get_parent_visible_gpu_ids",
+                return_value = [0],
+            ),
+        ):
             selected, metadata = auto_select_gpu_ids("unsloth/test")
 
-        self.assertIsNone(selected)
-        self.assertEqual(metadata["selection_mode"], "non_cuda")
+        self.assertEqual(selected, [0])
+        self.assertEqual(metadata["selection_mode"], "auto")
 
-    def test_prepare_gpu_selection_rejects_explicit_ids_on_xpu(self):
-        with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU):
-            with self.assertRaisesRegex(ValueError, "only supported on CUDA"):
-                prepare_gpu_selection([0], model_name = "unsloth/test")
+    def test_prepare_gpu_selection_accepts_explicit_ids_on_xpu(self):
+        with (
+            patch(
+                "utils.hardware.hardware.get_device", return_value = DeviceType.XPU
+            ),
+            patch(
+                "utils.hardware.hardware._get_parent_visible_gpu_spec",
+                return_value = {
+                    "raw": "0",
+                    "numeric_ids": [0],
+                    "supports_explicit_gpu_ids": True,
+                },
+            ),
+            patch(
+                "utils.hardware.hardware.get_parent_visible_gpu_ids",
+                return_value = [0],
+            ),
+            patch(
+                "utils.hardware.hardware.get_physical_gpu_count", return_value = 1
+            ),
+        ):
+            selected, metadata = prepare_gpu_selection(
+                [0], model_name = "unsloth/test"
+            )
+
+        self.assertEqual(selected, [0])
+        self.assertEqual(metadata["selection_mode"], "explicit")
diff --git a/studio/backend/tests/test_gpu_selection_sandbox.py b/studio/backend/tests/test_gpu_selection_sandbox.py
index 830a98a2fb..3c7792472b 100644
--- a/studio/backend/tests/test_gpu_selection_sandbox.py
+++ b/studio/backend/tests/test_gpu_selection_sandbox.py
@@ -302,14 +302,14 @@ def test_two_gpus_needed(self):
             # 35GB (first) + 30*0.85 (second) = 60.5GB > 50GB
             self.assertEqual(len(selected), 2)
 
-    def test_non_cuda_returns_none(self):
+    def test_non_accelerator_returns_none(self):
         from utils.hardware.hardware import auto_select_gpu_ids
         import utils.hardware.hardware as hw
 
         with patch.object(hw, "get_device", return_value = hw.DeviceType.CPU):
             selected, meta = auto_select_gpu_ids("test/model")
             self.assertIsNone(selected)
-            self.assertEqual(meta["selection_mode"], "non_cuda")
+            self.assertEqual(meta["selection_mode"], "non_accelerator")
 
 
 class TestGetDeviceMap(unittest.TestCase):
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index d438ff87e6..14fa5f0e4d 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -185,10 +185,19 @@ def clear_gpu_cache():
         torch.cuda.empty_cache()
         torch.cuda.ipc_collect()
     elif device == DeviceType.XPU:
-        import torch
+        # Older torch-xpu builds may be missing synchronize/empty_cache;
+        # guard the calls so a stale build does not propagate AttributeError
+        # through callers that do not wrap clear_gpu_cache() themselves.
+        try:
+            import torch
 
-        torch.xpu.synchronize()
-        torch.xpu.empty_cache()
+            if hasattr(torch, "xpu"):
+                if hasattr(torch.xpu, "synchronize"):
+                    torch.xpu.synchronize()
+                if hasattr(torch.xpu, "empty_cache"):
+                    torch.xpu.empty_cache()
+        except Exception:
+            pass
     elif device == DeviceType.MLX:
         # MLX manages memory automatically; no explicit cache clear needed.
         # mlx.core has no empty_cache equivalent — gc.collect() above is enough.
@@ -455,7 +464,10 @@ def _parse_ze_mask_roots(mask: str) -> list[int]:
         if not token:
             continue
         root = token.split(".", 1)[0]
-        if root.isdigit():
+        # Use str.isdecimal() (not str.isdigit()) so Unicode superscripts
+        # like "\u00b2" / "\u00b3" are rejected -- they satisfy isdigit() but
+        # crash int() with ValueError.
+        if root.isdecimal():
             roots.append(int(root))
     return roots
 
@@ -520,15 +532,27 @@ def _get_xpu_utilization() -> Dict[str, Any]:
             timeout = 3,
         )
         if result.returncode == 0 and result.stdout.strip():
+            # xpu-smi versions differ slightly in how they render unknown
+            # metrics: empty string, "N/A", "n/a", "NA", or "-". Treat any
+            # of these as "value not available" so a single missing column
+            # does not silently drop the entire telemetry row.
+            _NA = frozenset(("", "n/a", "na", "-"))
+            def _parse_metric(value: str) -> Optional[float]:
+                if value.strip().lower() in _NA:
+                    return None
+                try:
+                    return float(value)
+                except ValueError:
+                    return None
             lines = result.stdout.strip().splitlines()
             for line in reversed(lines):
                 if line.startswith("Timestamp") or line.startswith("#"):
                     continue
                 parts = [p.strip() for p in line.split(",")]
                 if len(parts) >= 5:
-                    gpu_util = float(parts[2]) if parts[2] not in ("", "N/A") else None
-                    power_w = float(parts[3]) if parts[3] not in ("", "N/A") else None
-                    temp = float(parts[4]) if parts[4] not in ("", "N/A") else None
+                    gpu_util = _parse_metric(parts[2])
+                    power_w = _parse_metric(parts[3])
+                    temp = _parse_metric(parts[4])
                     break
     except Exception:
         pass
@@ -778,12 +802,18 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
             }
 
         if has_subdevice:
-            # Dedup for display: multiple subdevice entries under the same
-            # root collapse to that root ID while preserving insertion order.
-            unique_roots = list(dict.fromkeys(roots_with_dupes))
+            # Subdevice syntax (e.g. "0.0,0.1") expands one or more root
+            # GPUs into multiple logical devices. These logical ordinals
+            # do not map cleanly back to stable physical root IDs for
+            # explicit selection, so mirror the CUDA UUID/MIG and wildcard
+            # path: return numeric_ids=None and supports_explicit_gpu_ids
+            # False. Downstream (get_visible_gpu_utilization,
+            # get_backend_visible_gpu_info, get_device_map) then enumerates
+            # torch-visible ordinals and can still shard across the logical
+            # devices instead of collapsing them onto a single root.
             return {
                 "raw": xpu_mask,
-                "numeric_ids": unique_roots,
+                "numeric_ids": None,
                 "supports_explicit_gpu_ids": False,
             }
 
@@ -1428,11 +1458,16 @@ def get_physical_gpu_count() -> int:
 def _backend_visible_devices_env() -> Optional[str]:
     """Return the raw visibility env string that applies to this backend.
 
-    On ROCm, HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES take precedence
-    over CUDA_VISIBLE_DEVICES; the helper mirrors the resolution logic in
+    On XPU, ``ZE_AFFINITY_MASK`` is the visibility control (not
+    ``CUDA_VISIBLE_DEVICES``). On ROCm, ``HIP_VISIBLE_DEVICES`` /
+    ``ROCR_VISIBLE_DEVICES`` take precedence over ``CUDA_VISIBLE_DEVICES``;
+    the helper mirrors the resolution logic in
     ``_get_parent_visible_gpu_spec`` so ``backend_cuda_visible_devices``
-    reports the value that is actually narrowing the visible device set.
+    reports the value that is actually narrowing the visible device set on
+    the current backend.
     """
+    if get_device() == DeviceType.XPU:
+        return os.environ.get("ZE_AFFINITY_MASK")
     if IS_ROCM:
         return _get_parent_visible_gpu_spec().get("raw")
     return os.environ.get("CUDA_VISIBLE_DEVICES")
@@ -1629,6 +1664,10 @@ def apply_gpu_ids(gpu_ids) -> None:
     # so worker subprocesses are actually restricted to the intended GPU.
     if get_device() == DeviceType.XPU:
         os.environ["ZE_AFFINITY_MASK"] = value
+        # Clear any stale CUDA_VISIBLE_DEVICES the parent may have inherited
+        # so tools that inspect the environment do not show conflicting
+        # pinning state (torch.xpu itself only reads ZE_AFFINITY_MASK).
+        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
         _visible_gpu_count = None
         logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value)
         return
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index 08c5754ce7..0b7e4d0de8 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -13,6 +13,8 @@
 import shutil
 import tempfile
 
+from utils.hardware import get_device
+
 
 logger = get_logger(__name__)
 
@@ -107,10 +109,10 @@ def format_error_message(error: Exception, model_name: str) -> str:
         or "out_of_host_memory" in error_str  # ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY
         or "not enough memory" in error_str
         or "cannot allocate memory" in error_str
+        or "memory allocation failed" in error_str
+        or isinstance(error, MemoryError)
         or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str))
     ):
-        from utils.hardware import get_device
-
         device = get_device()
         device_label = {
             "cuda": "GPU",

From e22db8fdfda6ffa8803258c306324d239d43807b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 11 Apr 2026 12:23:23 +0000
Subject: [PATCH 17/18] Fix loop 2 XPU review findings

- test_gpu_selection.py:105 regex: update stale assertion from "uses
  UUID/MIG" to "uses non-numeric or subdevice" after the PR broadened
  resolve_requested_gpu_ids' error message to cover XPU subdevice masks.
  Three reviewers independently reproduced the suite failure.
- utils/utils.py: revert the module-top `from utils.hardware import
  get_device` hoist that broke test_utils.py::TestFormatErrorMessage::test_cpu_oom
  -- the test patches utils.hardware.get_device at call time, so the
  import must stay function-local. Add a comment explaining why.
- hardware.py _get_xpu_utilization: lift _NA and _parse_metric out of
  the hot path to module scope (renamed _XPU_SMI_NA /
  _parse_xpu_smi_metric); re-instantiating them on every successful
  xpu-smi call is wasteful.
- hardware.py has_any check: include power_w alongside gpu_util, temp
  and vram_used_gb so a row that only exposes power is not silently
  discarded.
- hardware.py get_visible_gpu_utilization + get_backend_visible_gpu_info:
  honor explicit "no devices visible" masks (ZE_AFFINITY_MASK="" or
  CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the
  enumerate-visible-ordinals fallback. Previously get_visible_gpu_count
  returned 0 correctly but the telemetry helpers still enumerated torch
  devices, letting auto_select_gpu_ids pick a GPU the process explicitly
  hid.
- trainer.py: collapse the two consecutive blank lines left after
  removing inline `from utils.hardware import get_torch_device_str`
  imports at lines 1749 and 1985.
---
 studio/backend/core/training/trainer.py    |  2 -
 studio/backend/tests/test_gpu_selection.py |  3 +-
 studio/backend/utils/hardware/hardware.py  | 78 +++++++++++++++++-----
 studio/backend/utils/utils.py              |  7 +-
 4 files changed, 68 insertions(+), 22 deletions(-)

diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index 41b1cfa7bf..881350610e 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -1747,7 +1747,6 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):
 
         import subprocess
 
-
         device = get_torch_device_str()
 
         # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
@@ -1984,7 +1983,6 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         from datasets import Dataset as HFDataset
         from utils.paths import ensure_dir, tmp_root
 
-
         device = get_torch_device_str()
 
         # Clone OuteTTS repo (same as audio_codecs._load_dac)
diff --git a/studio/backend/tests/test_gpu_selection.py b/studio/backend/tests/test_gpu_selection.py
index 3ee39ea785..73110cd0eb 100644
--- a/studio/backend/tests/test_gpu_selection.py
+++ b/studio/backend/tests/test_gpu_selection.py
@@ -102,7 +102,8 @@ def test_explicit_ids_are_rejected_for_uuid_parent_visibility(self):
             patch("utils.hardware.hardware.get_physical_gpu_count", return_value = 8),
         ):
             with self.assertRaisesRegex(
-                ValueError, "unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG"
+                ValueError,
+                "unsupported when CUDA_VISIBLE_DEVICES uses non-numeric or subdevice",
             ):
                 resolve_requested_gpu_ids([1])
 
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 14fa5f0e4d..f310fbee2a 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -500,6 +500,25 @@ def _resolve_xpu_smi_device_id() -> int:
     return ordinal if xpu_ok else 0
 
 
+_XPU_SMI_NA = frozenset(("", "n/a", "na", "-"))
+
+
+def _parse_xpu_smi_metric(value: str) -> Optional[float]:
+    """Return float or None for missing/unknown xpu-smi CSV column values.
+
+    xpu-smi versions differ slightly in how they render unknown metrics:
+    empty string, "N/A", "n/a", "NA", or "-". Treat any of these as "value
+    not available" so a single missing column does not silently drop the
+    entire telemetry row.
+    """
+    if value.strip().lower() in _XPU_SMI_NA:
+        return None
+    try:
+        return float(value)
+    except ValueError:
+        return None
+
+
 def _get_xpu_utilization() -> Dict[str, Any]:
     """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu."""
     gpu_util = None
@@ -532,27 +551,15 @@ def _get_xpu_utilization() -> Dict[str, Any]:
             timeout = 3,
         )
         if result.returncode == 0 and result.stdout.strip():
-            # xpu-smi versions differ slightly in how they render unknown
-            # metrics: empty string, "N/A", "n/a", "NA", or "-". Treat any
-            # of these as "value not available" so a single missing column
-            # does not silently drop the entire telemetry row.
-            _NA = frozenset(("", "n/a", "na", "-"))
-            def _parse_metric(value: str) -> Optional[float]:
-                if value.strip().lower() in _NA:
-                    return None
-                try:
-                    return float(value)
-                except ValueError:
-                    return None
             lines = result.stdout.strip().splitlines()
             for line in reversed(lines):
                 if line.startswith("Timestamp") or line.startswith("#"):
                     continue
                 parts = [p.strip() for p in line.split(",")]
                 if len(parts) >= 5:
-                    gpu_util = _parse_metric(parts[2])
-                    power_w = _parse_metric(parts[3])
-                    temp = _parse_metric(parts[4])
+                    gpu_util = _parse_xpu_smi_metric(parts[2])
+                    power_w = _parse_xpu_smi_metric(parts[3])
+                    temp = _parse_xpu_smi_metric(parts[4])
                     break
     except Exception:
         pass
@@ -579,7 +586,9 @@ def _parse_metric(value: str) -> Optional[float]:
         else None
     )
 
-    has_any = any(v is not None for v in [gpu_util, temp, vram_used_gb])
+    has_any = any(
+        v is not None for v in [gpu_util, temp, vram_used_gb, power_w]
+    )
     if not has_any:
         return {"available": False, "backend": "xpu"}
 
@@ -673,6 +682,23 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
 
     # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel)
     if device in (DeviceType.CUDA, DeviceType.XPU):
+        parent_visible_spec = _get_parent_visible_gpu_spec()
+        # Honor an explicit empty visibility env (ZE_AFFINITY_MASK="" or
+        # CUDA_VISIBLE_DEVICES="" / "-1") as "no devices visible". Without
+        # this guard, the enumerate-visible-ordinals fallback below would
+        # happily report devices the process explicitly hid.
+        if (
+            parent_visible_spec["raw"] is not None
+            and parent_visible_spec["numeric_ids"] == []
+        ):
+            return {
+                "available": False,
+                "backend": _backend_label(device),
+                "parent_visible_gpu_ids": [],
+                "devices": [],
+                "index_kind": "relative",
+            }
+
         parent_ids = get_parent_visible_gpu_ids()
         # When parent_visible_ids is empty (UUID/MIG mask or no CVD set),
         # enumerate torch-visible ordinals so the UI still shows devices.
@@ -1476,13 +1502,31 @@ def _backend_visible_devices_env() -> Optional[str]:
 def get_backend_visible_gpu_info() -> Dict[str, Any]:
     device = get_device()
     if device in (DeviceType.CUDA, DeviceType.XPU):
+        parent_visible_spec = _get_parent_visible_gpu_spec()
         parent_visible_ids = get_parent_visible_gpu_ids()
+
+        # Honor an explicit "no devices visible" mask (ZE_AFFINITY_MASK=""
+        # or CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the
+        # torch-ordinal enumeration fallback, which would otherwise report
+        # devices that the process explicitly hid.
+        if (
+            parent_visible_spec["raw"] is not None
+            and parent_visible_spec["numeric_ids"] == []
+        ):
+            return {
+                "available": False,
+                "backend": _backend_label(device),
+                "backend_cuda_visible_devices": _backend_visible_devices_env(),
+                "parent_visible_gpu_ids": [],
+                "devices": [],
+                "index_kind": "relative",
+            }
+
         # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm)
         if device == DeviceType.CUDA and not IS_ROCM:
             try:
                 from . import nvidia
 
-                parent_visible_spec = _get_parent_visible_gpu_spec()
                 result = nvidia.get_backend_visible_gpu_info(
                     parent_visible_spec["numeric_ids"],
                     parent_visible_spec["raw"],
diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py
index 0b7e4d0de8..dfa399105d 100644
--- a/studio/backend/utils/utils.py
+++ b/studio/backend/utils/utils.py
@@ -13,8 +13,6 @@
 import shutil
 import tempfile
 
-from utils.hardware import get_device
-
 
 logger = get_logger(__name__)
 
@@ -113,6 +111,11 @@ def format_error_message(error: Exception, model_name: str) -> str:
         or isinstance(error, MemoryError)
         or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str))
     ):
+        # Resolve get_device() at call time (not import time) so tests that
+        # monkey-patch utils.hardware.get_device after this module is loaded
+        # still see the patched backend.
+        from utils.hardware import get_device
+
         device = get_device()
         device_label = {
             "cuda": "GPU",

From c19476ddbd4c0bfbf6500924c788d63a639d98f9 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Sat, 11 Apr 2026 12:40:33 +0000
Subject: [PATCH 18/18] Fix loop 3 XPU review findings

- apply_gpu_ids XPU: revert the CUDA_VISIBLE_DEVICES pop from loop 1.
  Popping it re-enabled CUDA detection on hybrid NVIDIA+Intel hosts
  where the parent had set CUDA_VISIBLE_DEVICES="" to force Studio
  onto XPU; the worker's follow-up detect_hardware() then flipped
  back to CUDA. torch.xpu only reads ZE_AFFINITY_MASK so the stale
  CUDA_VISIBLE_DEVICES is cosmetically redundant but functionally
  harmless, and leaving it alone preserves hybrid-host detection.

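- llama_cpp load_model (llama-server process start): pin the
  llama-server subprocess via ZE_AFFINITY_MASK on XPU hosts and
  CUDA_VISIBLE_DEVICES elsewhere. llama-server's SYCL build reads
  ZE_AFFINITY_MASK, not CUDA_VISIBLE_DEVICES, so previous pinning was
  silently ignored on Intel. On XPU the subprocess env also drops any
  inherited CUDA_VISIBLE_DEVICES; this only affects the llama-server
  child env, not os.environ.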

- llama_cpp init_audio_codec / generate_audio_response: stop selecting
  the codec device via get_torch_device_str(), which promoted it to
  "xpu" on Intel hosts, and restore the "cuda" if available else "cpu"
  choice. SNAC / BiCodec / DAC codecs are not yet validated on Intel
  XPU and the old CPU fallback was the known-working non-CUDA path.
  Drop the now-unused get_torch_device_str import from llama_cpp.py.

- trainer.py _preprocess_snac_dataset / _preprocess_bicodec_dataset /
  _preprocess_dac_dataset: revert the same unconditional XPU routing
  for audio dataset preprocessing back to the pre-PR CPU fallback on
  non-CUDA hosts. Spark-TTS BiCodec, SNAC, and OuteTTS DAC / Whisper
  paths were all CPU-backed on every non-CUDA host before this PR;
  promoting them to XPU without capability probes regressed the
  previously working CPU path. Drop the now-unused get_torch_device_str
  import from trainer.py.

- dataset_map_num_proc: only disable multiprocessing when
  torch.xpu.is_initialized exists and returns True. On older torch-xpu
  builds without is_initialized() the attribute lookup raised inside
  the try, hit the broad except, and returned None, silently disabling
  the pre-init CPU dataset parallelism the docstring explicitly says
  should still work. A failed torch import or a raising
  is_initialized() probe now also falls back to safe_num_proc(desired)
  instead of returning None.

- _get_xpu_utilization: cache the resolved xpu-smi binary path in a
  module-level sentinel via _resolve_xpu_smi_binary() so repeated
  telemetry polls do not re-scan PATH on every tick.

- get_backend_visible_gpu_info: move the parent_visible_ids lookup
  below the empty-mask short-circuit so the spec is not computed twice
  on the fast exit path.
---
 studio/backend/core/inference/llama_cpp.py | 30 ++++++++++--
 studio/backend/core/training/trainer.py    | 14 ++++--
 studio/backend/utils/hardware/hardware.py  | 55 +++++++++++++++++-----
 3 files changed, 79 insertions(+), 20 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 2fce68be28..2ac7d22afc 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -26,7 +26,7 @@
 
 import httpx
 
-from utils.hardware import clear_gpu_cache, get_torch_device_str
+from utils.hardware import clear_gpu_cache
 
 logger = get_logger(__name__)
 
@@ -1514,9 +1514,20 @@ def load_model(
                     f"{new_ld}:{existing_ld}" if existing_ld else new_ld
                 )
 
-            # Pin to selected GPU(s) via CUDA_VISIBLE_DEVICES
+            # Pin to selected GPU(s) via the backend-appropriate visibility
+            # env var: CUDA_VISIBLE_DEVICES on NVIDIA/ROCm, ZE_AFFINITY_MASK
+            # on Intel XPU (llama-server's SYCL build reads ZE_AFFINITY_MASK,
+            # not CUDA_VISIBLE_DEVICES).
             if gpu_indices is not None:
-                env["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpu_indices)
+                from utils.hardware import get_device
+                from utils.hardware.hardware import DeviceType
+
+                mask = ",".join(str(i) for i in gpu_indices)
+                if get_device() == DeviceType.XPU:
+                    env["ZE_AFFINITY_MASK"] = mask
+                    env.pop("CUDA_VISIBLE_DEVICES", None)
+                else:
+                    env["CUDA_VISIBLE_DEVICES"] = mask
 
             self._stdout_lines = []
             self._process = subprocess.Popen(
@@ -3260,7 +3271,12 @@ def init_audio_codec(self, audio_type: str) -> None:
         if LlamaCppBackend._codec_mgr is None:
             LlamaCppBackend._codec_mgr = AudioCodecManager()
 
-        device = get_torch_device_str()
+        # Preserve the pre-PR CPU fallback on non-CUDA hosts: the SNAC /
+        # BiCodec / DAC codecs are not yet validated on Intel XPU, so
+        # only promote to a GPU device when CUDA is actually available.
+        # A follow-up can extend this once an XPU-specific codec path is
+        # added.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         model_repo_path = None
 
         # BiCodec needs a repo with BiCodec/ weights — download canonical SparkTTS
@@ -3332,7 +3348,11 @@ def generate_audio_response(
             else None
         )
 
-        device = get_torch_device_str()
+        # Match init_audio_codec: stay on CPU for non-CUDA hosts until the
+        # codec path is validated on XPU.
+        import torch
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         return LlamaCppBackend._codec_mgr.decode(
             audio_type, device, token_ids = token_ids, text = data.get("content", "")
         )
diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index 881350610e..b00d1b6c8b 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -38,7 +38,6 @@
     safe_num_proc,
     dataset_map_num_proc,
     get_device_map,
-    get_torch_device_str,
     raise_if_offloaded,
     get_visible_gpu_count,
 )
@@ -1542,7 +1541,10 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
         SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
         SNAC_SAMPLE_RATE = 24000
 
-        device = get_torch_device_str()
+        # SNAC codec has not been validated on Intel XPU yet; keep the
+        # pre-PR CPU fallback for non-CUDA hosts until an XPU-specific
+        # path is added.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
         max_length = self.max_seq_length or 2048
         tokenizer = self.tokenizer
 
@@ -1747,7 +1749,9 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):
 
         import subprocess
 
-        device = get_torch_device_str()
+        # Spark-TTS BiCodec has not been validated on Intel XPU; keep the
+        # pre-PR CPU fallback for non-CUDA hosts.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
         # NOT in the unsloth/Spark-TTS-0.5B HF model repo. Clone it if needed.
@@ -1983,7 +1987,9 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
         from datasets import Dataset as HFDataset
         from utils.paths import ensure_dir, tmp_root
 
-        device = get_torch_device_str()
+        # OuteTTS DAC/Whisper preprocess has not been validated on Intel
+        # XPU; keep the pre-PR CPU fallback for non-CUDA hosts.
+        device = "cuda" if torch.cuda.is_available() else "cpu"
 
         # Clone OuteTTS repo (same as audio_codecs._load_dac)
         import subprocess
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index f310fbee2a..0627212499 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -502,6 +502,23 @@ def _resolve_xpu_smi_device_id() -> int:
 
 _XPU_SMI_NA = frozenset(("", "n/a", "na", "-"))
 
+# Cached xpu-smi binary path. _XPU_SMI_PATH_UNSET is a sentinel distinct
+# from None: None means "scanned PATH and not found" while the sentinel
+# means "not scanned yet". Resolved once by _resolve_xpu_smi_binary() so
+# live telemetry polls do not re-scan PATH on every tick.
+_XPU_SMI_PATH_UNSET: Any = object()
+_xpu_smi_binary: Any = _XPU_SMI_PATH_UNSET
+
+
+def _resolve_xpu_smi_binary() -> Optional[str]:
+    """Return cached absolute path to ``xpu-smi`` or None if not on PATH."""
+    global _xpu_smi_binary
+    if _xpu_smi_binary is _XPU_SMI_PATH_UNSET:
+        import shutil as _shutil
+
+        _xpu_smi_binary = _shutil.which("xpu-smi")
+    return _xpu_smi_binary
+
 
 def _parse_xpu_smi_metric(value: str) -> Optional[float]:
     """Return float or None for missing/unknown xpu-smi CSV column values.
@@ -528,12 +545,14 @@ def _get_xpu_utilization() -> Dict[str, Any]:
     dev_idx = _resolve_xpu_smi_device_id()
 
     try:
-        import shutil
         import subprocess
 
         # Skip subprocess entirely when xpu-smi is not on PATH, avoiding
         # a multi-second timeout on systems without the Intel tooling.
-        xpu_smi = shutil.which("xpu-smi")
+        # The binary path is resolved once and cached by
+        # _resolve_xpu_smi_binary() so repeated telemetry polls do not
+        # re-scan PATH on every tick.
+        xpu_smi = _resolve_xpu_smi_binary()
         if xpu_smi is None:
             raise FileNotFoundError("xpu-smi not found")
 
@@ -1503,7 +1522,6 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
     device = get_device()
     if device in (DeviceType.CUDA, DeviceType.XPU):
         parent_visible_spec = _get_parent_visible_gpu_spec()
-        parent_visible_ids = get_parent_visible_gpu_ids()
 
         # Honor an explicit "no devices visible" mask (ZE_AFFINITY_MASK=""
         # or CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the
@@ -1522,6 +1540,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
                 "index_kind": "relative",
             }
 
+        parent_visible_ids = get_parent_visible_gpu_ids()
+
         # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm)
         if device == DeviceType.CUDA and not IS_ROCM:
             try:
@@ -1708,10 +1728,13 @@ def apply_gpu_ids(gpu_ids) -> None:
     # so worker subprocesses are actually restricted to the intended GPU.
     if get_device() == DeviceType.XPU:
         os.environ["ZE_AFFINITY_MASK"] = value
-        # Clear any stale CUDA_VISIBLE_DEVICES the parent may have inherited
-        # so tools that inspect the environment do not show conflicting
-        # pinning state (torch.xpu itself only reads ZE_AFFINITY_MASK).
-        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+        # Deliberately leave any inherited CUDA_VISIBLE_DEVICES alone: on
+        # hybrid NVIDIA+Intel hosts the parent may have set
+        # CUDA_VISIBLE_DEVICES="" to disable NVIDIA and force Studio onto
+        # XPU. Popping the variable here would let the worker's follow-up
+        # detect_hardware() call flip back to CUDA. torch.xpu only reads
+        # ZE_AFFINITY_MASK, so an extra CUDA_VISIBLE_DEVICES entry in env
+        # is cosmetically stale but functionally harmless.
         _visible_gpu_count = None
         logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value)
         return
@@ -1905,10 +1928,20 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]:
     if get_device() == DeviceType.XPU:
         try:
             import torch
-
-            if hasattr(torch, "xpu") and torch.xpu.is_initialized():
-                return None
         except Exception:
-            return None
+            # No torch means no XPU runtime is active here, so CPU-side
+            # dataset parallelism is still safe.
+            return safe_num_proc(desired)
+
+        xpu = getattr(torch, "xpu", None)
+        is_initialized = getattr(xpu, "is_initialized", None)
+        if callable(is_initialized):
+            try:
+                if is_initialized():
+                    return None
+            except Exception:
+                # Treat a failing probe as "runtime not touched yet" so
+                # pre-init CPU preprocessing can still parallelize.
+                pass
 
     return safe_num_proc(desired)