From 9db527a717e74f1cac27bec778392a2e976484b1 Mon Sep 17 00:00:00 2001 From: leizhenyuan Date: Tue, 31 Mar 2026 10:46:23 +0000 Subject: [PATCH 01/18] add intel GPU for unsloth studio --- studio/backend/core/inference/inference.py | 3 +- studio/backend/core/inference/llama_cpp.py | 10 +- studio/backend/core/training/trainer.py | 18 ++- studio/backend/main.py | 59 +++++++ studio/backend/utils/hardware/__init__.py | 8 + studio/backend/utils/hardware/hardware.py | 171 ++++++++++++++++++++- studio/backend/utils/utils.py | 3 +- unsloth/import_fixes.py | 22 ++- unsloth/models/rl.py | 4 + 9 files changed, 273 insertions(+), 25 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index 867bdefc62..085f01a194 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -1643,7 +1643,8 @@ def _generate_dac( + "<|text_end|>\n<|audio_start|><|global_features_start|>\n" ) with torch.inference_mode(): - with torch.amp.autocast("cuda", dtype = model.dtype): + from utils.hardware import get_torch_device_str + with torch.amp.autocast(get_torch_device_str(), dtype = model.dtype): inputs = tokenizer([prompt], return_tensors = "pt").to(model.device) generated = model.generate( **inputs, diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index c1f87ff936..19a169425f 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1438,8 +1438,8 @@ def unload_model(self) -> bool: LlamaCppBackend._codec_mgr = None import torch - if torch.cuda.is_available(): - torch.cuda.empty_cache() + from utils.hardware import clear_gpu_cache + clear_gpu_cache() return True def _kill_process(self): @@ -3016,7 +3016,8 @@ def init_audio_codec(self, audio_type: str) -> None: if LlamaCppBackend._codec_mgr is None: LlamaCppBackend._codec_mgr = AudioCodecManager() - device = "cuda" if torch.cuda.is_available() 
else "cpu" + from utils.hardware import get_torch_device_str + device = get_torch_device_str() model_repo_path = None # BiCodec needs a repo with BiCodec/ weights — download canonical SparkTTS @@ -3090,7 +3091,8 @@ def generate_audio_response( import torch - device = "cuda" if torch.cuda.is_available() else "cpu" + from utils.hardware import get_torch_device_str + device = get_torch_device_str() return LlamaCppBackend._codec_mgr.decode( audio_type, device, token_ids = token_ids, text = data.get("content", "") ) diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py index ab1825d94a..f25e848c8f 100644 --- a/studio/backend/core/training/trainer.py +++ b/studio/backend/core/training/trainer.py @@ -1532,7 +1532,8 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz" SNAC_SAMPLE_RATE = 24000 - device = "cuda" if torch.cuda.is_available() else "cpu" + from utils.hardware import get_torch_device_str + device = get_torch_device_str() max_length = self.max_seq_length or 2048 tokenizer = self.tokenizer @@ -1708,7 +1709,8 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): import gc gc.collect() - torch.cuda.empty_cache() + from utils.hardware import clear_gpu_cache + clear_gpu_cache() self._cuda_audio_used = True if not processed_examples: @@ -1736,7 +1738,8 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None): import subprocess - device = "cuda" if torch.cuda.is_available() else "cpu" + from utils.hardware import get_torch_device_str + device = get_torch_device_str() # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo, # NOT in the unsloth/Spark-TTS-0.5B HF model repo. Clone it if needed. 
@@ -1936,7 +1939,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor: import gc gc.collect() - torch.cuda.empty_cache() + from utils.hardware import clear_gpu_cache + clear_gpu_cache() self._cuda_audio_used = True if not processed_examples: @@ -1971,7 +1975,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): from datasets import Dataset as HFDataset from utils.paths import ensure_dir, tmp_root - device = "cuda" if torch.cuda.is_available() else "cpu" + from utils.hardware import get_torch_device_str + device = get_torch_device_str() # Clone OuteTTS repo (same as audio_codecs._load_dac) import subprocess @@ -2149,7 +2154,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): import gc gc.collect() - torch.cuda.empty_cache() + from utils.hardware import clear_gpu_cache + clear_gpu_cache() self._cuda_audio_used = True if not processed_examples: diff --git a/studio/backend/main.py b/studio/backend/main.py index c18f18a743..99358ae9de 100644 --- a/studio/backend/main.py +++ b/studio/backend/main.py @@ -238,11 +238,70 @@ async def get_system_info(): import psutil from utils.hardware import get_device +<<<<<<< Updated upstream visibility_info = get_backend_visible_gpu_info() gpu_info = { "available": visibility_info["available"], "devices": visibility_info["devices"], } +======= + try: + result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=index,name,memory.total", + "--format=csv,noheader,nounits", + ], + capture_output = True, + text = True, + timeout = 10, + ) + if result.returncode == 0: + for line in result.stdout.strip().splitlines(): + parts = [p.strip() for p in line.split(",")] + if len(parts) == 3: + idx = int(parts[0]) + if allowed_indices is not None and idx not in allowed_indices: + continue + gpu_info["devices"].append( + { + "index": idx, + "name": parts[1], + "memory_total_gb": round(int(parts[2]) / 1024, 2), + } + ) + gpu_info["available"] = len(gpu_info["devices"]) > 0 + except 
Exception: + pass + elif device == DeviceType.XPU: + try: + import torch + for i in range(torch.xpu.device_count()): + props = torch.xpu.get_device_properties(i) + gpu_info["devices"].append( + { + "index": i, + "name": props.name, + "memory_total_gb": round(props.total_memory / (1024**3), 2), + } + ) + gpu_info["available"] = len(gpu_info["devices"]) > 0 + except Exception: + pass + + # Fallback to torch-based single-GPU detection + if not gpu_info["available"]: + mem_info = get_gpu_memory_info() + if mem_info.get("available"): + gpu_info["available"] = True + gpu_info["devices"].append( + { + "index": mem_info.get("device", 0), + "name": mem_info.get("device_name", "Unknown"), + "memory_total_gb": round(mem_info.get("total_gb", 0), 2), + } + ) +>>>>>>> Stashed changes # CPU & Memory memory = psutil.virtual_memory() diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index aaa0452406..981c0d2453 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -22,12 +22,16 @@ get_backend_visible_gpu_info, get_physical_gpu_count, get_visible_gpu_count, +<<<<<<< Updated upstream get_parent_visible_gpu_ids, resolve_requested_gpu_ids, estimate_fp16_model_size_bytes, estimate_required_model_memory_gb, auto_select_gpu_ids, prepare_gpu_selection, +======= + get_torch_device_str, +>>>>>>> Stashed changes safe_num_proc, safe_thread_num_proc, dataset_map_num_proc, @@ -62,12 +66,16 @@ "get_backend_visible_gpu_info", "get_physical_gpu_count", "get_visible_gpu_count", +<<<<<<< Updated upstream "get_parent_visible_gpu_ids", "resolve_requested_gpu_ids", "estimate_fp16_model_size_bytes", "estimate_required_model_memory_gb", "auto_select_gpu_ids", "prepare_gpu_selection", +======= + "get_torch_device_str", +>>>>>>> Stashed changes "safe_num_proc", "safe_thread_num_proc", "dataset_map_num_proc", diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py 
index b6d3faf6d7..af5fbc51bb 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -42,7 +42,7 @@ class DeviceType(str, Enum): # ========== Global State (set once by detect_hardware) ========== DEVICE: Optional[DeviceType] = None -CHAT_ONLY: bool = True # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.) +CHAT_ONLY: bool = True # No CUDA/XPU GPU -> GGUF chat only (Mac, CPU-only, etc.) # ========== Detection ========== @@ -82,16 +82,17 @@ def detect_hardware() -> DeviceType: Detection order: 1. CUDA (NVIDIA GPU, requires torch) - 2. MLX (Apple Silicon via MLX framework) - 3. CPU (fallback) + 2. XPU (Intel GPU, requires torch with XPU support) + 3. MLX (Apple Silicon via MLX framework) + 4. CPU (fallback) """ global DEVICE, CHAT_ONLY - CHAT_ONLY = True # reset -- only CUDA sets it to False + CHAT_ONLY = True # reset -- only CUDA/XPU sets it to False - # --- CUDA: try PyTorch --- if _has_torch(): import torch + # --- CUDA: NVIDIA GPU --- if torch.cuda.is_available(): DEVICE = DeviceType.CUDA CHAT_ONLY = False @@ -99,10 +100,14 @@ def detect_hardware() -> DeviceType: print(f"Hardware detected: CUDA — {device_name}") return DEVICE +<<<<<<< Updated upstream # --- XPU: Intel GPU --- if _has_torch(): import torch +======= + # --- XPU: Intel GPU --- +>>>>>>> Stashed changes if hasattr(torch, "xpu") and torch.xpu.is_available(): DEVICE = DeviceType.XPU CHAT_ONLY = False @@ -223,7 +228,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: "utilization_pct": (allocated / total) * 100, } except Exception as e: +<<<<<<< Updated upstream logger.error("Error getting XPU GPU info: %s", e) +======= + logger.error(f"Error getting XPU GPU info: {e}") +>>>>>>> Stashed changes return {"available": False, "backend": device.value, "error": str(e)} # ---- MLX path (Apple Silicon) ---- @@ -315,18 +324,96 @@ def get_package_versions() -> Dict[str, Optional[str]]: except PackageNotFoundError: versions[name] = None - # CUDA toolkit version 
bundled with torch + # CUDA/XPU toolkit version bundled with torch try: import torch versions["cuda"] = getattr(torch.version, "cuda", None) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + versions["xpu"] = True except Exception: versions["cuda"] = None return versions +<<<<<<< Updated upstream # ========== Torch-based GPU fallbacks (AMD ROCm, Intel XPU, nvidia-smi missing) ========== +======= +# ========== Live GPU Utilization ========== + + +def _get_xpu_utilization() -> Dict[str, Any]: + """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu.""" + try: + import subprocess + + result = subprocess.run( + ["xpu-smi", "dump", "-d", "0", "-m", "0,1,2,18"], + capture_output=True, text=True, timeout=5, + ) + if result.returncode == 0 and result.stdout.strip(): + # xpu-smi dump outputs CSV: Timestamp, DeviceId, GPU Utilization (%), ... + lines = result.stdout.strip().splitlines() + for line in reversed(lines): + if line.startswith("Timestamp") or line.startswith("#"): + continue + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 4: + gpu_util = float(parts[2]) if parts[2] not in ("", "N/A") else None + temp = float(parts[3]) if parts[3] not in ("", "N/A") else None + break + else: + gpu_util = None + temp = None + else: + gpu_util = None + temp = None + except Exception: + gpu_util = None + temp = None + + # Get VRAM from torch.xpu + vram_used_gb = None + vram_total_gb = None + try: + import torch + + idx = torch.xpu.current_device() + props = torch.xpu.get_device_properties(idx) + vram_total_gb = round(props.total_memory / (1024**3), 2) + vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2) + except Exception: + pass + + vram_pct = ( + round((vram_used_gb / vram_total_gb) * 100, 1) + if vram_used_gb is not None and vram_total_gb and vram_total_gb > 0 + else None + ) + + has_any = any(v is not None for v in [gpu_util, temp, vram_used_gb]) + if not has_any: + return {"available": False, "backend": 
"xpu"} + + return { + "available": True, + "backend": "xpu", + "gpu_utilization_pct": gpu_util, + "temperature_c": temp, + "vram_used_gb": vram_used_gb, + "vram_total_gb": vram_total_gb, + "vram_utilization_pct": vram_pct, + "power_draw_w": None, + "power_limit_w": None, + "power_utilization_pct": None, + } + + +def get_gpu_utilization() -> Dict[str, Any]: + """ + Return a live snapshot of GPU utilization via ``nvidia-smi``. +>>>>>>> Stashed changes def _torch_get_device_module(): @@ -334,11 +421,19 @@ def _torch_get_device_module(): device = get_device() import torch +<<<<<<< Updated upstream if device == DeviceType.CUDA: return torch.cuda, "cuda" if device == DeviceType.XPU and hasattr(torch, "xpu"): return torch.xpu, "xpu" return None, None +======= + if device == DeviceType.XPU: + return _get_xpu_utilization() + + if device != DeviceType.CUDA: + return {"available": False, "backend": device.value} +>>>>>>> Stashed changes def _torch_get_physical_gpu_count() -> Optional[int]: @@ -1097,8 +1192,12 @@ def get_physical_gpu_count() -> int: """ Return the number of physical GPUs on the machine. +<<<<<<< Updated upstream Uses ``nvidia-smi -L`` on NVIDIA (unaffected by CUDA_VISIBLE_DEVICES), with a torch-based fallback for AMD ROCm and Intel XPU. +======= + For NVIDIA uses ``nvidia-smi -L``; for Intel XPU uses ``torch.xpu.device_count()``. +>>>>>>> Stashed changes Result is cached after the first call. 
""" global _physical_gpu_count @@ -1106,6 +1205,20 @@ def get_physical_gpu_count() -> int: return _physical_gpu_count device = get_device() +<<<<<<< Updated upstream +======= + + if device == DeviceType.XPU: + try: + import torch + _physical_gpu_count = torch.xpu.device_count() + except Exception: + _physical_gpu_count = 1 + return _physical_gpu_count + + try: + import subprocess +>>>>>>> Stashed changes if device == DeviceType.CUDA: try: @@ -1246,6 +1359,30 @@ def get_visible_gpu_count() -> int: if _visible_gpu_count is not None: return _visible_gpu_count +<<<<<<< Updated upstream +======= + # Check XPU visibility env var or CUDA_VISIBLE_DEVICES + import os + + device = get_device() + + if device == DeviceType.XPU: + xpu_visible = os.environ.get("ZE_AFFINITY_MASK") + if xpu_visible is not None: + xpu_visible = xpu_visible.strip() + if xpu_visible == "": + _visible_gpu_count = 0 + else: + _visible_gpu_count = len([x for x in xpu_visible.split(",") if x.strip()]) + return _visible_gpu_count + try: + import torch + _visible_gpu_count = torch.xpu.device_count() + except Exception: + _visible_gpu_count = get_physical_gpu_count() + return _visible_gpu_count + +>>>>>>> Stashed changes cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_visible is not None: # "" means zero GPUs, "0" means 1, "0,1,2" means 3 @@ -1270,6 +1407,7 @@ def get_visible_gpu_count() -> int: return _visible_gpu_count +<<<<<<< Updated upstream def apply_gpu_ids(gpu_ids) -> None: if gpu_ids is None: return @@ -1353,6 +1491,19 @@ def raise_if_offloaded(model, device_map: str, context: str = "Loading") -> None f"{context} does not support models loaded with CPU or disk offload. " f"device_map='{device_map}' produced offloaded modules: {example}" ) +======= +def get_torch_device_str() -> str: + """ + Return the torch device string for the detected hardware. + E.g. "cuda", "xpu", or "cpu". 
+ """ + device = get_device() + if device == DeviceType.CUDA: + return "cuda" + elif device == DeviceType.XPU: + return "xpu" + return "cpu" +>>>>>>> Stashed changes def safe_num_proc(desired: Optional[int] = None) -> int: @@ -1430,9 +1581,17 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]: Returns ``None`` on spawn-based platforms (Windows, macOS) because ``datasets`` treats ``num_proc=1`` as multiprocessing (creates ``Pool(1)``). Only ``num_proc=None`` guarantees in-process execution. + + Also returns ``None`` on XPU devices because ``os.fork()`` corrupts the + Level-Zero GPU context, causing Triton kernel launches to fail with + "Pointer argument doesn't reference XPU device memory". """ import sys if sys.platform in ("win32", "darwin"): return None + + if get_device() == DeviceType.XPU: + return None + return safe_num_proc(desired) diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index 4e61a5b969..a544bb0802 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -103,13 +103,14 @@ def format_error_message(error: Exception, model_name: str) -> str: if ( "memory" in error_str or "cuda" in error_str + or "xpu" in error_str or "mlx" in error_str or "out of memory" in error_str ): from utils.hardware import get_device device = get_device() - device_label = {"cuda": "GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get( + device_label = {"cuda": "GPU", "xpu": "Intel GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get( device.value, "GPU" ) return f"Not enough {device_label} memory to load '{model_short}'. Try a smaller model or free memory." 
diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index ca44a0ce7e..195dc14d85 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -443,7 +443,9 @@ def fix_vllm_aimv2_issue(): def fix_vllm_guided_decoding_params(): - def _maybe_raise_vllm_transformers_mismatch(error): + def _maybe_disable_vllm_transformers_mismatch(error): + """If vLLM fails due to transformers version mismatch, disable it gracefully.""" + global VLLM_BROKEN error_text = str(error) if ( "ALLOWED_LAYER_TYPES" in error_text @@ -453,13 +455,17 @@ def _maybe_raise_vllm_transformers_mismatch(error): vllm_version = importlib_version("vllm") except Exception: vllm_version = "unknown" - raise RuntimeError( + logger.warning( "Unsloth: vLLM with version " f"{vllm_version} does not yet support transformers>=5.0.0. " - "Please downgrade to transformers==4.57.3 via " - 'pip install --force-reinstall "transformers==4.57.3". ' + "Disabling vLLM and continuing without it. " f"Original error: {error}" - ) from error + ) + VLLM_BROKEN = True + _clear_vllm_modules() + _install_vllm_blocker() + return True + return False if importlib.util.find_spec("vllm") is None: return @@ -469,7 +475,8 @@ def _maybe_raise_vllm_transformers_mismatch(error): try: import vllm except (ImportError, OSError) as e: - _maybe_raise_vllm_transformers_mismatch(e) + if _maybe_disable_vllm_transformers_mismatch(e): + return if disable_broken_vllm(e): return raise @@ -477,7 +484,8 @@ def _maybe_raise_vllm_transformers_mismatch(error): try: from vllm.sampling_params import GuidedDecodingParams except (ImportError, OSError) as e: - _maybe_raise_vllm_transformers_mismatch(e) + if _maybe_disable_vllm_transformers_mismatch(e): + return if disable_broken_vllm(e): return if not hasattr(vllm, "sampling_params") or not hasattr( diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 5651a7da41..449419acf6 100755 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -1202,6 +1202,10 @@ def 
_patch_trl_rl_trainers(trainer_file = "grpo_trainer"): " memory_gb_left = psutil.virtual_memory().available / (1024**3)\n" " if memory_gb_left <= 2: dataset_num_proc = 1\n" " else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left))\n" + "# XPU: forking corrupts Level-Zero context, force single process\n" + "import torch as _torch\n" + "if hasattr(_torch, 'xpu') and _torch.xpu.is_available():\n" + " dataset_num_proc = 1\n" ) extra_args += num_proc_check From f1c426b2f1e42536f49c60c508c1e11f7984bbe5 Mon Sep 17 00:00:00 2001 From: leizhenyuan Date: Tue, 31 Mar 2026 11:04:29 +0000 Subject: [PATCH 02/18] clean stash --- studio/backend/main.py | 59 -------- studio/backend/utils/hardware/__init__.py | 6 - studio/backend/utils/hardware/hardware.py | 171 ++++++++-------------- 3 files changed, 61 insertions(+), 175 deletions(-) diff --git a/studio/backend/main.py b/studio/backend/main.py index 99358ae9de..c18f18a743 100644 --- a/studio/backend/main.py +++ b/studio/backend/main.py @@ -238,70 +238,11 @@ async def get_system_info(): import psutil from utils.hardware import get_device -<<<<<<< Updated upstream visibility_info = get_backend_visible_gpu_info() gpu_info = { "available": visibility_info["available"], "devices": visibility_info["devices"], } -======= - try: - result = subprocess.run( - [ - "nvidia-smi", - "--query-gpu=index,name,memory.total", - "--format=csv,noheader,nounits", - ], - capture_output = True, - text = True, - timeout = 10, - ) - if result.returncode == 0: - for line in result.stdout.strip().splitlines(): - parts = [p.strip() for p in line.split(",")] - if len(parts) == 3: - idx = int(parts[0]) - if allowed_indices is not None and idx not in allowed_indices: - continue - gpu_info["devices"].append( - { - "index": idx, - "name": parts[1], - "memory_total_gb": round(int(parts[2]) / 1024, 2), - } - ) - gpu_info["available"] = len(gpu_info["devices"]) > 0 - except Exception: - pass - elif device == DeviceType.XPU: - try: - import torch - for i 
in range(torch.xpu.device_count()): - props = torch.xpu.get_device_properties(i) - gpu_info["devices"].append( - { - "index": i, - "name": props.name, - "memory_total_gb": round(props.total_memory / (1024**3), 2), - } - ) - gpu_info["available"] = len(gpu_info["devices"]) > 0 - except Exception: - pass - - # Fallback to torch-based single-GPU detection - if not gpu_info["available"]: - mem_info = get_gpu_memory_info() - if mem_info.get("available"): - gpu_info["available"] = True - gpu_info["devices"].append( - { - "index": mem_info.get("device", 0), - "name": mem_info.get("device_name", "Unknown"), - "memory_total_gb": round(mem_info.get("total_gb", 0), 2), - } - ) ->>>>>>> Stashed changes # CPU & Memory memory = psutil.virtual_memory() diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index 981c0d2453..df67052389 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -22,16 +22,13 @@ get_backend_visible_gpu_info, get_physical_gpu_count, get_visible_gpu_count, -<<<<<<< Updated upstream get_parent_visible_gpu_ids, resolve_requested_gpu_ids, estimate_fp16_model_size_bytes, estimate_required_model_memory_gb, auto_select_gpu_ids, prepare_gpu_selection, -======= get_torch_device_str, ->>>>>>> Stashed changes safe_num_proc, safe_thread_num_proc, dataset_map_num_proc, @@ -66,16 +63,13 @@ "get_backend_visible_gpu_info", "get_physical_gpu_count", "get_visible_gpu_count", -<<<<<<< Updated upstream "get_parent_visible_gpu_ids", "resolve_requested_gpu_ids", "estimate_fp16_model_size_bytes", "estimate_required_model_memory_gb", "auto_select_gpu_ids", "prepare_gpu_selection", -======= "get_torch_device_str", ->>>>>>> Stashed changes "safe_num_proc", "safe_thread_num_proc", "dataset_map_num_proc", diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index af5fbc51bb..746dc17039 100644 --- a/studio/backend/utils/hardware/hardware.py +++ 
b/studio/backend/utils/hardware/hardware.py @@ -100,14 +100,7 @@ def detect_hardware() -> DeviceType: print(f"Hardware detected: CUDA — {device_name}") return DEVICE -<<<<<<< Updated upstream - # --- XPU: Intel GPU --- - if _has_torch(): - import torch - -======= # --- XPU: Intel GPU --- ->>>>>>> Stashed changes if hasattr(torch, "xpu") and torch.xpu.is_available(): DEVICE = DeviceType.XPU CHAT_ONLY = False @@ -228,11 +221,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: "utilization_pct": (allocated / total) * 100, } except Exception as e: -<<<<<<< Updated upstream logger.error("Error getting XPU GPU info: %s", e) -======= - logger.error(f"Error getting XPU GPU info: {e}") ->>>>>>> Stashed changes return {"available": False, "backend": device.value, "error": str(e)} # ---- MLX path (Apple Silicon) ---- @@ -337,9 +326,64 @@ def get_package_versions() -> Dict[str, Optional[str]]: return versions -<<<<<<< Updated upstream # ========== Torch-based GPU fallbacks (AMD ROCm, Intel XPU, nvidia-smi missing) ========== -======= + + +def _torch_get_device_module(): + """Return the appropriate torch device module (cuda or xpu) and its name.""" + device = get_device() + import torch + + if device == DeviceType.CUDA: + return torch.cuda, "cuda" + if device == DeviceType.XPU and hasattr(torch, "xpu"): + return torch.xpu, "xpu" + return None, None + + +def _torch_get_physical_gpu_count() -> Optional[int]: + mod, _ = _torch_get_device_module() + if mod is None: + return None + try: + return mod.device_count() + except Exception: + return None + + +def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]]: + """Query torch for per-GPU name, total VRAM, and used VRAM.""" + mod, _ = _torch_get_device_module() + if mod is None: + return [] + + devices = [] + for ordinal, phys_idx in enumerate(device_indices): + try: + # torch uses 0-based ordinals relative to CUDA_VISIBLE_DEVICES + props = mod.get_device_properties(ordinal) + total_bytes = props.total_memory + 
# Prefer mem_get_info (reports system-wide usage, not just this + # process) so auto-selection accounts for other GPU consumers. + if hasattr(mod, "mem_get_info"): + free_bytes, total_bytes = mod.mem_get_info(ordinal) + used_bytes = total_bytes - free_bytes + else: + used_bytes = mod.memory_allocated(ordinal) + devices.append( + { + "index": phys_idx, + "visible_ordinal": ordinal, + "name": props.name, + "total_gb": round(total_bytes / (1024**3), 2), + "used_gb": round(used_bytes / (1024**3), 2), + } + ) + except Exception as e: + logger.debug("torch device query failed for ordinal %d: %s", ordinal, e) + return devices + + # ========== Live GPU Utilization ========== @@ -411,81 +455,12 @@ def _get_xpu_utilization() -> Dict[str, Any]: def get_gpu_utilization() -> Dict[str, Any]: - """ - Return a live snapshot of GPU utilization via ``nvidia-smi``. ->>>>>>> Stashed changes - - -def _torch_get_device_module(): - """Return the appropriate torch device module (cuda or xpu) and its name.""" + """Return a live snapshot of device utilization information.""" device = get_device() - import torch -<<<<<<< Updated upstream - if device == DeviceType.CUDA: - return torch.cuda, "cuda" - if device == DeviceType.XPU and hasattr(torch, "xpu"): - return torch.xpu, "xpu" - return None, None -======= if device == DeviceType.XPU: return _get_xpu_utilization() - if device != DeviceType.CUDA: - return {"available": False, "backend": device.value} ->>>>>>> Stashed changes - - -def _torch_get_physical_gpu_count() -> Optional[int]: - mod, _ = _torch_get_device_module() - if mod is None: - return None - try: - return mod.device_count() - except Exception: - return None - - -def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]]: - """Query torch for per-GPU name, total VRAM, and used VRAM.""" - mod, _ = _torch_get_device_module() - if mod is None: - return [] - - devices = [] - for ordinal, phys_idx in enumerate(device_indices): - try: - # torch uses 0-based 
ordinals relative to CUDA_VISIBLE_DEVICES - props = mod.get_device_properties(ordinal) - total_bytes = props.total_memory - # Prefer mem_get_info (reports system-wide usage, not just this - # process) so auto-selection accounts for other GPU consumers. - if hasattr(mod, "mem_get_info"): - free_bytes, total_bytes = mod.mem_get_info(ordinal) - used_bytes = total_bytes - free_bytes - else: - used_bytes = mod.memory_allocated(ordinal) - devices.append( - { - "index": phys_idx, - "visible_ordinal": ordinal, - "name": props.name, - "total_gb": round(total_bytes / (1024**3), 2), - "used_gb": round(used_bytes / (1024**3), 2), - } - ) - except Exception as e: - logger.debug("torch device query failed for ordinal %d: %s", ordinal, e) - return devices - - -# ========== Live GPU Utilization ========== - - -def get_gpu_utilization() -> Dict[str, Any]: - """Return a live snapshot of device utilization information.""" - device = get_device() - if device == DeviceType.CUDA: try: from . import nvidia @@ -1192,12 +1167,8 @@ def get_physical_gpu_count() -> int: """ Return the number of physical GPUs on the machine. -<<<<<<< Updated upstream Uses ``nvidia-smi -L`` on NVIDIA (unaffected by CUDA_VISIBLE_DEVICES), with a torch-based fallback for AMD ROCm and Intel XPU. -======= - For NVIDIA uses ``nvidia-smi -L``; for Intel XPU uses ``torch.xpu.device_count()``. ->>>>>>> Stashed changes Result is cached after the first call. 
""" global _physical_gpu_count @@ -1205,20 +1176,6 @@ def get_physical_gpu_count() -> int: return _physical_gpu_count device = get_device() -<<<<<<< Updated upstream -======= - - if device == DeviceType.XPU: - try: - import torch - _physical_gpu_count = torch.xpu.device_count() - except Exception: - _physical_gpu_count = 1 - return _physical_gpu_count - - try: - import subprocess ->>>>>>> Stashed changes if device == DeviceType.CUDA: try: @@ -1359,11 +1316,7 @@ def get_visible_gpu_count() -> int: if _visible_gpu_count is not None: return _visible_gpu_count -<<<<<<< Updated upstream -======= - # Check XPU visibility env var or CUDA_VISIBLE_DEVICES - import os - + # Check XPU visibility env var device = get_device() if device == DeviceType.XPU: @@ -1382,7 +1335,6 @@ def get_visible_gpu_count() -> int: _visible_gpu_count = get_physical_gpu_count() return _visible_gpu_count ->>>>>>> Stashed changes cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_visible is not None: # "" means zero GPUs, "0" means 1, "0,1,2" means 3 @@ -1407,7 +1359,6 @@ def get_visible_gpu_count() -> int: return _visible_gpu_count -<<<<<<< Updated upstream def apply_gpu_ids(gpu_ids) -> None: if gpu_ids is None: return @@ -1491,7 +1442,8 @@ def raise_if_offloaded(model, device_map: str, context: str = "Loading") -> None f"{context} does not support models loaded with CPU or disk offload. " f"device_map='{device_map}' produced offloaded modules: {example}" ) -======= + + def get_torch_device_str() -> str: """ Return the torch device string for the detected hardware. 
@@ -1503,7 +1455,6 @@ def get_torch_device_str() -> str: elif device == DeviceType.XPU: return "xpu" return "cpu" ->>>>>>> Stashed changes def safe_num_proc(desired: Optional[int] = None) -> int: From dc55c950bb6815325b4bbdfa1b6ebb49615f466e Mon Sep 17 00:00:00 2001 From: leizhenyuan Date: Tue, 31 Mar 2026 11:13:54 +0000 Subject: [PATCH 03/18] remove unuse code --- unsloth/import_fixes.py | 2 +- unsloth/models/rl.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 195dc14d85..394e6809c4 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -1828,4 +1828,4 @@ def disable_broken_causal_conv1d(): print( "Unsloth: Detected broken causal_conv1d binary; " "disabling causal_conv1d fast path and continuing import." - ) + ) \ No newline at end of file diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 449419acf6..710e21943e 100755 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -1989,4 +1989,4 @@ def PatchFastRL(algorithm = None, FastLanguageModel = None): patch_trl_openenv() patch_trl_vllm_generation() if type(algorithm) is str and algorithm.islower(): - PatchRLStatistics(algorithm) + PatchRLStatistics(algorithm) \ No newline at end of file From 5487a1b59ba40a7c1a28a00c791a0df7fff4d185 Mon Sep 17 00:00:00 2001 From: leizhenyuan Date: Tue, 31 Mar 2026 11:15:57 +0000 Subject: [PATCH 04/18] remove rl changes --- unsloth/models/rl.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/unsloth/models/rl.py b/unsloth/models/rl.py index 710e21943e..5651a7da41 100755 --- a/unsloth/models/rl.py +++ b/unsloth/models/rl.py @@ -1202,10 +1202,6 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"): " memory_gb_left = psutil.virtual_memory().available / (1024**3)\n" " if memory_gb_left <= 2: dataset_num_proc = 1\n" " else: dataset_num_proc = min(dataset_num_proc, int(memory_gb_left))\n" - "# XPU: forking corrupts Level-Zero context, force single 
process\n" - "import torch as _torch\n" - "if hasattr(_torch, 'xpu') and _torch.xpu.is_available():\n" - " dataset_num_proc = 1\n" ) extra_args += num_proc_check @@ -1989,4 +1985,4 @@ def PatchFastRL(algorithm = None, FastLanguageModel = None): patch_trl_openenv() patch_trl_vllm_generation() if type(algorithm) is str and algorithm.islower(): - PatchRLStatistics(algorithm) \ No newline at end of file + PatchRLStatistics(algorithm) From 72eced43f69adc151e68a4f04f1ddf808f1207bc Mon Sep 17 00:00:00 2001 From: leizhenyuan Date: Tue, 31 Mar 2026 11:18:37 +0000 Subject: [PATCH 05/18] remove unuse code --- unsloth/import_fixes.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 394e6809c4..3e7c2069a2 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -443,9 +443,7 @@ def fix_vllm_aimv2_issue(): def fix_vllm_guided_decoding_params(): - def _maybe_disable_vllm_transformers_mismatch(error): - """If vLLM fails due to transformers version mismatch, disable it gracefully.""" - global VLLM_BROKEN + def _maybe_raise_vllm_transformers_mismatch(error): error_text = str(error) if ( "ALLOWED_LAYER_TYPES" in error_text @@ -455,17 +453,13 @@ def _maybe_disable_vllm_transformers_mismatch(error): vllm_version = importlib_version("vllm") except Exception: vllm_version = "unknown" - logger.warning( + raise RuntimeError( "Unsloth: vLLM with version " f"{vllm_version} does not yet support transformers>=5.0.0. " - "Disabling vLLM and continuing without it. " + "Please downgrade to transformers==4.57.3 via " + 'pip install --force-reinstall "transformers==4.57.3". 
' f"Original error: {error}" - ) - VLLM_BROKEN = True - _clear_vllm_modules() - _install_vllm_blocker() - return True - return False + ) from error if importlib.util.find_spec("vllm") is None: return @@ -475,8 +469,7 @@ def _maybe_disable_vllm_transformers_mismatch(error): try: import vllm except (ImportError, OSError) as e: - if _maybe_disable_vllm_transformers_mismatch(e): - return + _maybe_raise_vllm_transformers_mismatch(e): if disable_broken_vllm(e): return raise @@ -484,8 +477,7 @@ def _maybe_disable_vllm_transformers_mismatch(error): try: from vllm.sampling_params import GuidedDecodingParams except (ImportError, OSError) as e: - if _maybe_disable_vllm_transformers_mismatch(e): - return + _maybe_raise_vllm_transformers_mismatch(e) if disable_broken_vllm(e): return if not hasattr(vllm, "sampling_params") or not hasattr( @@ -1828,4 +1820,4 @@ def disable_broken_causal_conv1d(): print( "Unsloth: Detected broken causal_conv1d binary; " "disabling causal_conv1d fast path and continuing import." 
- ) \ No newline at end of file + ) From f679dcae7fc5c9a8a19a6440908b0b64005cb2d4 Mon Sep 17 00:00:00 2001 From: leizhenyuan Date: Tue, 31 Mar 2026 11:19:16 +0000 Subject: [PATCH 06/18] remove unuse code --- unsloth/import_fixes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsloth/import_fixes.py b/unsloth/import_fixes.py index 3e7c2069a2..ca44a0ce7e 100644 --- a/unsloth/import_fixes.py +++ b/unsloth/import_fixes.py @@ -469,7 +469,7 @@ def _maybe_raise_vllm_transformers_mismatch(error): try: import vllm except (ImportError, OSError) as e: - _maybe_raise_vllm_transformers_mismatch(e): + _maybe_raise_vllm_transformers_mismatch(e) if disable_broken_vllm(e): return raise From 8dd4290132b6a2894c82e1ded0f3b917f9bf3c95 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:19:34 +0000 Subject: [PATCH 07/18] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/inference.py | 1 + studio/backend/core/inference/llama_cpp.py | 3 +++ studio/backend/core/training/trainer.py | 6 ++++++ studio/backend/utils/hardware/hardware.py | 9 +++++++-- studio/backend/utils/utils.py | 9 ++++++--- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index 085f01a194..e79d1fa78f 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -1644,6 +1644,7 @@ def _generate_dac( ) with torch.inference_mode(): from utils.hardware import get_torch_device_str + with torch.amp.autocast(get_torch_device_str(), dtype = model.dtype): inputs = tokenizer([prompt], return_tensors = "pt").to(model.device) generated = model.generate( diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 19a169425f..14173984c5 100644 --- 
a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1439,6 +1439,7 @@ def unload_model(self) -> bool: import torch from utils.hardware import clear_gpu_cache + clear_gpu_cache() return True @@ -3017,6 +3018,7 @@ def init_audio_codec(self, audio_type: str) -> None: LlamaCppBackend._codec_mgr = AudioCodecManager() from utils.hardware import get_torch_device_str + device = get_torch_device_str() model_repo_path = None @@ -3092,6 +3094,7 @@ def generate_audio_response( import torch from utils.hardware import get_torch_device_str + device = get_torch_device_str() return LlamaCppBackend._codec_mgr.decode( audio_type, device, token_ids = token_ids, text = data.get("content", "") diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py index f25e848c8f..8f76e71c7c 100644 --- a/studio/backend/core/training/trainer.py +++ b/studio/backend/core/training/trainer.py @@ -1533,6 +1533,7 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz" SNAC_SAMPLE_RATE = 24000 from utils.hardware import get_torch_device_str + device = get_torch_device_str() max_length = self.max_seq_length or 2048 tokenizer = self.tokenizer @@ -1710,6 +1711,7 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): gc.collect() from utils.hardware import clear_gpu_cache + clear_gpu_cache() self._cuda_audio_used = True @@ -1739,6 +1741,7 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None): import subprocess from utils.hardware import get_torch_device_str + device = get_torch_device_str() # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo, @@ -1940,6 +1943,7 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor: gc.collect() from utils.hardware import clear_gpu_cache + clear_gpu_cache() self._cuda_audio_used = True @@ -1976,6 +1980,7 @@ def _preprocess_dac_dataset(self, 
dataset, custom_format_mapping = None): from utils.paths import ensure_dir, tmp_root from utils.hardware import get_torch_device_str + device = get_torch_device_str() # Clone OuteTTS repo (same as audio_codecs._load_dac) @@ -2155,6 +2160,7 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): gc.collect() from utils.hardware import clear_gpu_cache + clear_gpu_cache() self._cuda_audio_used = True diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 746dc17039..35826979fd 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -394,7 +394,9 @@ def _get_xpu_utilization() -> Dict[str, Any]: result = subprocess.run( ["xpu-smi", "dump", "-d", "0", "-m", "0,1,2,18"], - capture_output=True, text=True, timeout=5, + capture_output = True, + text = True, + timeout = 5, ) if result.returncode == 0 and result.stdout.strip(): # xpu-smi dump outputs CSV: Timestamp, DeviceId, GPU Utilization (%), ... 
@@ -1326,10 +1328,13 @@ def get_visible_gpu_count() -> int: if xpu_visible == "": _visible_gpu_count = 0 else: - _visible_gpu_count = len([x for x in xpu_visible.split(",") if x.strip()]) + _visible_gpu_count = len( + [x for x in xpu_visible.split(",") if x.strip()] + ) return _visible_gpu_count try: import torch + _visible_gpu_count = torch.xpu.device_count() except Exception: _visible_gpu_count = get_physical_gpu_count() diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index a544bb0802..290b5ad92e 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -110,9 +110,12 @@ def format_error_message(error: Exception, model_name: str) -> str: from utils.hardware import get_device device = get_device() - device_label = {"cuda": "GPU", "xpu": "Intel GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get( - device.value, "GPU" - ) + device_label = { + "cuda": "GPU", + "xpu": "Intel GPU", + "mlx": "Apple Silicon GPU", + "cpu": "system", + }.get(device.value, "GPU") return f"Not enough {device_label} memory to load '{model_short}'. Try a smaller model or free memory." # Generic fallback From 03e15c75f6326eba0e11b88c64c1161bd9a38df5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 19:11:23 +0000 Subject: [PATCH 08/18] Fix xpu-smi metric parsing, dead code, and type inconsistency - Fix _get_xpu_utilization() metric indices: use -m 0,2,3 (GPU Util, Power, Core Temp) instead of -m 0,1,2,18 which mapped parts[3] to temperature incorrectly (it was actually GPU Memory Utilization). Now correctly parses utilization, power draw, and temperature. - Add -n 1 flag so xpu-smi dump exits after one sample instead of running indefinitely until the 5s timeout kills it. - Use torch.xpu.current_device() for the -d flag instead of hardcoding device 0, so multi-GPU XPU setups query the correct device. - Populate power_draw_w in the returned dict instead of always None. 
- Fix versions["xpu"] = True (bool) to use the actual XPU version string from torch.version.xpu, falling back to "available". This keeps the dict type-consistent (all str or None). - Remove dead code in get_visible_gpu_count() where the XPU branch at line 1357 was unreachable because the XPU early-return block above always returns before that point. --- studio/backend/utils/hardware/hardware.py | 48 +++++++++++++---------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 35826979fd..4be0ce23db 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -319,7 +319,7 @@ def get_package_versions() -> Dict[str, Optional[str]]: versions["cuda"] = getattr(torch.version, "cuda", None) if hasattr(torch, "xpu") and torch.xpu.is_available(): - versions["xpu"] = True + versions["xpu"] = getattr(torch.version, "xpu", "available") except Exception: versions["cuda"] = None @@ -389,37 +389,48 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] def _get_xpu_utilization() -> Dict[str, Any]: """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu.""" + gpu_util = None + temp = None + power_w = None + + # Resolve which physical device to query + dev_idx = 0 + try: + import torch + + if hasattr(torch, "xpu") and torch.xpu.is_available(): + dev_idx = torch.xpu.current_device() + except Exception: + pass + try: import subprocess + # xpu-smi metric IDs: 0 = GPU Utilization (%), 2 = GPU Power (W), + # 3 = GPU Core Temperature (C). + # -n 1 requests exactly one sample so the command exits immediately. 
+ # CSV columns: Timestamp, DeviceId, , , result = subprocess.run( - ["xpu-smi", "dump", "-d", "0", "-m", "0,1,2,18"], + ["xpu-smi", "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"], capture_output = True, text = True, - timeout = 5, + timeout = 10, ) if result.returncode == 0 and result.stdout.strip(): - # xpu-smi dump outputs CSV: Timestamp, DeviceId, GPU Utilization (%), ... lines = result.stdout.strip().splitlines() for line in reversed(lines): if line.startswith("Timestamp") or line.startswith("#"): continue parts = [p.strip() for p in line.split(",")] - if len(parts) >= 4: + if len(parts) >= 5: gpu_util = float(parts[2]) if parts[2] not in ("", "N/A") else None - temp = float(parts[3]) if parts[3] not in ("", "N/A") else None + power_w = float(parts[3]) if parts[3] not in ("", "N/A") else None + temp = float(parts[4]) if parts[4] not in ("", "N/A") else None break - else: - gpu_util = None - temp = None - else: - gpu_util = None - temp = None except Exception: - gpu_util = None - temp = None + pass - # Get VRAM from torch.xpu + # Get VRAM from torch.xpu (only reports PyTorch-managed memory) vram_used_gb = None vram_total_gb = None try: @@ -450,7 +461,7 @@ def _get_xpu_utilization() -> Dict[str, Any]: "vram_used_gb": vram_used_gb, "vram_total_gb": vram_total_gb, "vram_utilization_pct": vram_pct, - "power_draw_w": None, + "power_draw_w": power_w, "power_limit_w": None, "power_utilization_pct": None, } @@ -1354,10 +1365,7 @@ def get_visible_gpu_count() -> int: try: import torch - if get_device() == DeviceType.XPU and hasattr(torch, "xpu"): - _visible_gpu_count = torch.xpu.device_count() - else: - _visible_gpu_count = torch.cuda.device_count() + _visible_gpu_count = torch.cuda.device_count() except Exception: _visible_gpu_count = get_physical_gpu_count() From 3f8dc0dc69b47690a7ef34ae7821af468a765a68 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 19:43:35 +0000 Subject: [PATCH 09/18] Add shutil.which guard before xpu-smi subprocess call Skip 
the xpu-smi subprocess entirely when the binary is not on PATH. This avoids a multi-second timeout on Intel GPU systems that have PyTorch XPU support but no xpu-smi tooling installed. The function still falls back to torch.xpu for VRAM metrics. --- studio/backend/utils/hardware/hardware.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 4be0ce23db..f5d9cb6938 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -404,14 +404,21 @@ def _get_xpu_utilization() -> Dict[str, Any]: pass try: + import shutil import subprocess + # Skip subprocess entirely when xpu-smi is not on PATH, avoiding + # a multi-second timeout on systems without the Intel tooling. + xpu_smi = shutil.which("xpu-smi") + if xpu_smi is None: + raise FileNotFoundError("xpu-smi not found") + # xpu-smi metric IDs: 0 = GPU Utilization (%), 2 = GPU Power (W), # 3 = GPU Core Temperature (C). # -n 1 requests exactly one sample so the command exits immediately. # CSV columns: Timestamp, DeviceId, , , result = subprocess.run( - ["xpu-smi", "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"], + [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"], capture_output = True, text = True, timeout = 10, From 568a1a4781299c5c1c611368c9c40cfc976b1c1a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 20:00:06 +0000 Subject: [PATCH 10/18] Improve get_visible_gpu_count() ZE_AFFINITY_MASK handling Prefer torch.xpu.device_count() over manual mask parsing since the runtime correctly interprets all ZE_AFFINITY_MASK syntax including subdevice notation (e.g. "0.0,0.1" is 1 root device, not 2). The manual parsing fallback now counts unique root device IDs from the mask, handling "device.subdevice" notation correctly. 
--- studio/backend/utils/hardware/hardware.py | 24 +++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index f5d9cb6938..ea6262524c 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -1345,17 +1345,29 @@ def get_visible_gpu_count() -> int: xpu_visible = xpu_visible.strip() if xpu_visible == "": _visible_gpu_count = 0 - else: - _visible_gpu_count = len( - [x for x in xpu_visible.split(",") if x.strip()] - ) - return _visible_gpu_count + return _visible_gpu_count + + # Prefer torch.xpu.device_count() as it correctly interprets + # ZE_AFFINITY_MASK including subdevice syntax (e.g. "0.0,0.1"). try: import torch _visible_gpu_count = torch.xpu.device_count() except Exception: - _visible_gpu_count = get_physical_gpu_count() + if xpu_visible: + # Fallback: count unique root device IDs from the mask. + # ZE_AFFINITY_MASK can use "device.subdevice" notation, + # so "0.0,0.1" is 1 root device, not 2. + roots = set() + for token in xpu_visible.split(","): + token = token.strip() + if token: + root = token.split(".", 1)[0] + if root.isdigit(): + roots.add(int(root)) + _visible_gpu_count = len(roots) if roots else 0 + else: + _visible_gpu_count = get_physical_gpu_count() return _visible_gpu_count cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") From 9f9637e752e8a318ee6d4e61026817db2ec19549 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 09:06:48 +0000 Subject: [PATCH 11/18] Fix xpu-smi metrics, device resolution, and XPU OOM detection - _get_xpu_utilization: request metrics -m 0,1,3 (Util, Power, Temp) rather than 0,2,3 so the power column no longer reports MHz as watts. - _resolve_xpu_smi_device_id: map torch.xpu.current_device() (logical ordinal under ZE_AFFINITY_MASK) to the physical root device id that xpu-smi -d expects, so telemetry targets the active GPU. 
- Merge the duplicated torch blocks in _get_xpu_utilization so the VRAM lookup is guarded and the device index is computed once. - format_error_message: only rewrite true OOM errors (out of memory substrings) as memory errors, so non-OOM XPU/CUDA failures surface their real cause instead of a misleading memory message. - inference.py DAC generation: derive autocast device from model.device.type, not the global backend, so CPU-fallback models on an XPU host do not open a GPU autocast context. - dataset_map_num_proc: only disable XPU multiprocessing after the XPU runtime is actually initialized in this process, so pure CPU-side dataset preprocessing can still parallelize on Intel hosts. - get_package_versions: preserve the "available" fallback for xpu when torch.version.xpu exists as None. - get_visible_gpu_count: normalize ZE_AFFINITY_MASK parsing so the None and empty-string branches do not rely on implicit scoping. --- studio/backend/core/inference/inference.py | 12 ++- studio/backend/utils/hardware/hardware.py | 104 +++++++++++++++------ studio/backend/utils/utils.py | 9 +- 3 files changed, 89 insertions(+), 36 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index e79d1fa78f..1e2d7030e6 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -1643,9 +1643,15 @@ def _generate_dac( + "<|text_end|>\n<|audio_start|><|global_features_start|>\n" ) with torch.inference_mode(): - from utils.hardware import get_torch_device_str - - with torch.amp.autocast(get_torch_device_str(), dtype = model.dtype): + # Derive the autocast device from the loaded model, not from the + # global backend: a CPU-fallback DAC on an XPU/CUDA host must not + # open a GPU autocast context around CPU tensors. 
+ device_type = ( + model.device.type + if hasattr(model.device, "type") + else str(model.device).split(":", 1)[0] + ) + with torch.amp.autocast(device_type, dtype = model.dtype): inputs = tokenizer([prompt], return_tensors = "pt").to(model.device) generated = model.generate( **inputs, diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index ea6262524c..0f61d8d2d4 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -319,7 +319,11 @@ def get_package_versions() -> Dict[str, Optional[str]]: versions["cuda"] = getattr(torch.version, "cuda", None) if hasattr(torch, "xpu") and torch.xpu.is_available(): - versions["xpu"] = getattr(torch.version, "xpu", "available") + # torch.version.xpu exists on modern torch builds but may be None; + # fall back to "available" so the UI distinguishes present-but-unknown + # from "package not found". + xpu_ver = getattr(torch.version, "xpu", None) + versions["xpu"] = xpu_ver if xpu_ver is not None else "available" except Exception: versions["cuda"] = None @@ -387,22 +391,52 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] # ========== Live GPU Utilization ========== -def _get_xpu_utilization() -> Dict[str, Any]: - """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu.""" - gpu_util = None - temp = None - power_w = None +def _resolve_xpu_smi_device_id() -> int: + """Resolve the physical root device ID used by ``xpu-smi -d``. - # Resolve which physical device to query - dev_idx = 0 + ``torch.xpu.current_device()`` returns the logical ordinal after + ``ZE_AFFINITY_MASK`` remapping, whereas ``xpu-smi`` addresses physical + root devices. Translate the ordinal through the mask roots so telemetry + targets the GPU the process is actually running on. Subdevice syntax + such as ``0.0,0.1`` collapses to a single root device. 
+ """ + ordinal = 0 + xpu_ok = False try: import torch - if hasattr(torch, "xpu") and torch.xpu.is_available(): - dev_idx = torch.xpu.current_device() + xpu_ok = hasattr(torch, "xpu") and torch.xpu.is_available() + if xpu_ok: + ordinal = int(torch.xpu.current_device()) except Exception: pass + mask = (os.environ.get("ZE_AFFINITY_MASK") or "").strip() + if mask: + roots: list[int] = [] + for token in mask.split(","): + token = token.strip() + if not token: + continue + root = token.split(".", 1)[0] + if root.isdigit(): + root_id = int(root) + if root_id not in roots: + roots.append(root_id) + if roots: + return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0] + + return ordinal if xpu_ok else 0 + + +def _get_xpu_utilization() -> Dict[str, Any]: + """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu.""" + gpu_util = None + temp = None + power_w = None + + dev_idx = _resolve_xpu_smi_device_id() + try: import shutil import subprocess @@ -413,12 +447,15 @@ def _get_xpu_utilization() -> Dict[str, Any]: if xpu_smi is None: raise FileNotFoundError("xpu-smi not found") - # xpu-smi metric IDs: 0 = GPU Utilization (%), 2 = GPU Power (W), - # 3 = GPU Core Temperature (C). + # xpu-smi metric IDs (per Intel xpu-smi docs): + # 0 = GPU Utilization (%) + # 1 = GPU Power (W) + # 2 = GPU Frequency (MHz) + # 3 = GPU Core Temperature (C) # -n 1 requests exactly one sample so the command exits immediately. # CSV columns: Timestamp, DeviceId, , , result = subprocess.run( - [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,2,3", "-n", "1"], + [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,1,3", "-n", "1"], capture_output = True, text = True, timeout = 10, @@ -437,16 +474,19 @@ def _get_xpu_utilization() -> Dict[str, Any]: except Exception: pass - # Get VRAM from torch.xpu (only reports PyTorch-managed memory) + # Get VRAM from torch.xpu (only reports PyTorch-managed memory). 
+ # Use the same logical ordinal that torch exposes; xpu-smi physical id is + # only needed by the subprocess call above. vram_used_gb = None vram_total_gb = None try: import torch - idx = torch.xpu.current_device() - props = torch.xpu.get_device_properties(idx) - vram_total_gb = round(props.total_memory / (1024**3), 2) - vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + idx = torch.xpu.current_device() + props = torch.xpu.get_device_properties(idx) + vram_total_gb = round(props.total_memory / (1024**3), 2) + vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2) except Exception: pass @@ -1340,12 +1380,12 @@ def get_visible_gpu_count() -> int: device = get_device() if device == DeviceType.XPU: - xpu_visible = os.environ.get("ZE_AFFINITY_MASK") - if xpu_visible is not None: - xpu_visible = xpu_visible.strip() - if xpu_visible == "": - _visible_gpu_count = 0 - return _visible_gpu_count + xpu_mask_raw = os.environ.get("ZE_AFFINITY_MASK") + xpu_mask_set = xpu_mask_raw is not None + xpu_visible = (xpu_mask_raw or "").strip() + if xpu_mask_set and xpu_visible == "": + _visible_gpu_count = 0 + return _visible_gpu_count # Prefer torch.xpu.device_count() as it correctly interprets # ZE_AFFINITY_MASK including subdevice syntax (e.g. "0.0,0.1"). @@ -1565,9 +1605,11 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]: ``datasets`` treats ``num_proc=1`` as multiprocessing (creates ``Pool(1)``). Only ``num_proc=None`` guarantees in-process execution. - Also returns ``None`` on XPU devices because ``os.fork()`` corrupts the - Level-Zero GPU context, causing Triton kernel launches to fail with - "Pointer argument doesn't reference XPU device memory". 
+ Also returns ``None`` on XPU devices once the XPU runtime has been + initialized in this process, because ``os.fork()`` corrupts the + Level-Zero GPU context and causes Triton kernel launches to fail with + "Pointer argument doesn't reference XPU device memory". Pre-init XPU + hosts can still parallelize pure CPU-side dataset preprocessing. """ import sys @@ -1575,6 +1617,12 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]: return None if get_device() == DeviceType.XPU: - return None + try: + import torch + + if hasattr(torch, "xpu") and torch.xpu.is_initialized(): + return None + except Exception: + return None return safe_num_proc(desired) diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index 290b5ad92e..fc4674a1d5 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -101,11 +101,10 @@ def format_error_message(error: Exception, model_name: str) -> str: return "Invalid HF token. Please check your token and try again." if ( - "memory" in error_str - or "cuda" in error_str - or "xpu" in error_str - or "mlx" in error_str - or "out of memory" in error_str + "out of memory" in error_str + or "cuda out of memory" in error_str + or "xpu out of memory" in error_str + or ("mlx" in error_str and "memory" in error_str) ): from utils.hardware import get_device From 393a70d9986322da45c26286f678b97b06f537d5 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 09:26:26 +0000 Subject: [PATCH 12/18] Wire XPU through gpu-id pinning and visibility, restore CPU OOM detection Round 2 fixes addressing reviewer feedback: - format_error_message: tightening "out of memory" coverage in round 1 dropped CPU allocator failures like "not enough memory to allocate" and "cannot allocate memory", and Level Zero ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY. Restore those patterns while still excluding non-memory XPU/CUDA exceptions. 
- apply_gpu_ids: route Intel XPU through ZE_AFFINITY_MASK instead of CUDA_VISIBLE_DEVICES so worker subprocesses are actually pinned to the requested GPUs on multi-XPU hosts. - _get_parent_visible_gpu_spec: add an XPU branch that reads ZE_AFFINITY_MASK and returns physical root device IDs, so the visibility/selection stack reports the correct devices on Intel hosts. Honors subdevice syntax and wildcards. - Extract _parse_ze_mask_roots helper for the ZE_AFFINITY_MASK parsing previously duplicated between _resolve_xpu_smi_device_id and get_visible_gpu_count. Single source of truth for the mask semantics. - get_visible_gpu_count: treat non-digit wildcard masks (e.g. "*") as "all physical XPUs visible" rather than zero. - get_package_versions: also set versions["xpu"] = None in the except block so a failing XPU probe does not leave the key missing. - inference.py DAC autocast: clamp the resolved device_type to ("cuda", "xpu", "cpu") so exotic devices like "meta" during accelerate offloaded loading do not raise. --- studio/backend/core/inference/inference.py | 4 + studio/backend/utils/hardware/hardware.py | 99 +++++++++++++++++----- studio/backend/utils/utils.py | 7 +- 3 files changed, 86 insertions(+), 24 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index 1e2d7030e6..f07d40a0e5 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -1651,6 +1651,10 @@ def _generate_dac( if hasattr(model.device, "type") else str(model.device).split(":", 1)[0] ) + # Clamp to autocast-supported backends so exotic devices + # (e.g. "meta" during accelerate offloaded loading) do not raise. 
+ if device_type not in ("cuda", "xpu", "cpu"): + device_type = "cpu" with torch.amp.autocast(device_type, dtype = model.dtype): inputs = tokenizer([prompt], return_tensors = "pt").to(model.device) generated = model.generate( diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 0f61d8d2d4..b397d6196c 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -326,6 +326,7 @@ def get_package_versions() -> Dict[str, Optional[str]]: versions["xpu"] = xpu_ver if xpu_ver is not None else "available" except Exception: versions["cuda"] = None + versions["xpu"] = None return versions @@ -391,6 +392,29 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] # ========== Live GPU Utilization ========== +def _parse_ze_mask_roots(mask: str) -> list[int]: + """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of unique root device IDs. + + Accepts subdevice syntax such as ``0.0,0.1`` which collapses to ``[0]``. + Returns an empty list if the mask is empty or contains no parseable digits. + Insertion order is preserved so callers can map logical ordinals back to + physical root IDs via the returned list. + """ + roots: list[int] = [] + if not mask: + return roots + for token in mask.split(","): + token = token.strip() + if not token: + continue + root = token.split(".", 1)[0] + if root.isdigit(): + root_id = int(root) + if root_id not in roots: + roots.append(root_id) + return roots + + def _resolve_xpu_smi_device_id() -> int: """Resolve the physical root device ID used by ``xpu-smi -d``. 
@@ -412,19 +436,9 @@ def _resolve_xpu_smi_device_id() -> int: pass mask = (os.environ.get("ZE_AFFINITY_MASK") or "").strip() - if mask: - roots: list[int] = [] - for token in mask.split(","): - token = token.strip() - if not token: - continue - root = token.split(".", 1)[0] - if root.isdigit(): - root_id = int(root) - if root_id not in roots: - roots.append(root_id) - if roots: - return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0] + roots = _parse_ze_mask_roots(mask) + if roots: + return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0] return ordinal if xpu_ok else 0 @@ -659,6 +673,42 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: def _get_parent_visible_gpu_spec() -> Dict[str, Any]: + # On Intel XPU hosts, device visibility is controlled by ZE_AFFINITY_MASK + # (the Level Zero affinity variable) rather than CUDA_VISIBLE_DEVICES. + if get_device() == DeviceType.XPU: + xpu_mask_raw = os.environ.get("ZE_AFFINITY_MASK") + if xpu_mask_raw is None: + return { + "raw": None, + "numeric_ids": list(range(get_physical_gpu_count())), + "supports_explicit_gpu_ids": True, + } + + xpu_mask = xpu_mask_raw.strip() + if xpu_mask == "": + return { + "raw": xpu_mask, + "numeric_ids": [], + "supports_explicit_gpu_ids": True, + } + + roots = _parse_ze_mask_roots(xpu_mask) + if not roots: + # Non-digit wildcard (e.g. "*") or unparseable mask: treat the + # same as "all physical XPUs visible" but disable explicit ids + # since we cannot map logical ordinals to root IDs. + return { + "raw": xpu_mask, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + + return { + "raw": xpu_mask, + "numeric_ids": roots, + "supports_explicit_gpu_ids": True, + } + cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_visible is None: @@ -1398,14 +1448,12 @@ def get_visible_gpu_count() -> int: # Fallback: count unique root device IDs from the mask. # ZE_AFFINITY_MASK can use "device.subdevice" notation, # so "0.0,0.1" is 1 root device, not 2. 
- roots = set() - for token in xpu_visible.split(","): - token = token.strip() - if token: - root = token.split(".", 1)[0] - if root.isdigit(): - roots.add(int(root)) - _visible_gpu_count = len(roots) if roots else 0 + roots = _parse_ze_mask_roots(xpu_visible) + # Non-digit wildcards (e.g. "*") yield an empty roots list; + # treat those as "all physical XPUs visible". + _visible_gpu_count = ( + len(roots) if roots else get_physical_gpu_count() + ) else: _visible_gpu_count = get_physical_gpu_count() return _visible_gpu_count @@ -1448,6 +1496,15 @@ def apply_gpu_ids(gpu_ids) -> None: else: value = str(gpu_ids) + # Intel XPU uses Level Zero and honors ZE_AFFINITY_MASK, not + # CUDA_VISIBLE_DEVICES. Route XPU pinning through the correct env var + # so worker subprocesses are actually restricted to the intended GPU. + if get_device() == DeviceType.XPU: + os.environ["ZE_AFFINITY_MASK"] = value + _visible_gpu_count = None + logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value) + return + os.environ["CUDA_VISIBLE_DEVICES"] = value _visible_gpu_count = None logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index fc4674a1d5..5555c14d8e 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -102,9 +102,10 @@ def format_error_message(error: Exception, model_name: str) -> str: if ( "out of memory" in error_str - or "cuda out of memory" in error_str - or "xpu out of memory" in error_str - or ("mlx" in error_str and "memory" in error_str) + or "out of device memory" in error_str + or "not enough memory" in error_str + or "cannot allocate memory" in error_str + or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str)) ): from utils.hardware import get_device From 2cf6d0254c680c841171b38ad39375e47890f624 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 09:47:29 +0000 Subject: [PATCH 13/18] Unblock XPU gpu_ids selection and 
harden OOM/autocast edge cases Round 3 fixes targeting the remaining gaps reviewers flagged: - prepare_gpu_selection: allow explicit gpu_ids on Intel XPU so the apply_gpu_ids() XPU branch (and _get_parent_visible_gpu_spec XPU branch) are actually reachable from the normal request path. - _parse_ze_mask_roots: stop deduplicating. Keep one root ID per mask token so the logical-ordinal-to-physical-root mapping used by _resolve_xpu_smi_device_id() stays 1-to-1 even for mixed subdevice masks like "2.0,0.1,0.2". Update the docstring to document the new shape. - _get_parent_visible_gpu_spec: dedupe roots only at the visibility layer, and flag subdevice masks as supports_explicit_gpu_ids=False so resolve_requested_gpu_ids() does not try to match duplicate IDs. Treat wildcard masks as "all physical XPUs visible". - format_error_message: also match the literal Level Zero enum names ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY / _HOST_MEMORY which use underscores and were not caught by the "out of device memory" substring. - inference.py DAC autocast: accept "mps" in the clamp list (it has been an autocast-supported backend since torch 2.3) and skip autocast entirely when the model is on CPU with an unsupported dtype like float32, since torch.amp.autocast("cpu", dtype=float32) raises. - resolve_requested_gpu_ids: tailor the "unsupported explicit ids" error message to the current backend so XPU users see a ZE_AFFINITY_MASK reference instead of a CUDA one. 
--- studio/backend/core/inference/inference.py | 14 ++++- studio/backend/utils/hardware/hardware.py | 60 +++++++++++++++------- studio/backend/utils/utils.py | 2 + 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index f07d40a0e5..79c52ed6bd 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -1643,6 +1643,8 @@ def _generate_dac( + "<|text_end|>\n<|audio_start|><|global_features_start|>\n" ) with torch.inference_mode(): + import contextlib + # Derive the autocast device from the loaded model, not from the # global backend: a CPU-fallback DAC on an XPU/CUDA host must not # open a GPU autocast context around CPU tensors. @@ -1653,9 +1655,17 @@ def _generate_dac( ) # Clamp to autocast-supported backends so exotic devices # (e.g. "meta" during accelerate offloaded loading) do not raise. - if device_type not in ("cuda", "xpu", "cpu"): + # MPS is autocast-supported since torch 2.3, keep it in the set. + if device_type not in ("cuda", "xpu", "mps", "cpu"): device_type = "cpu" - with torch.amp.autocast(device_type, dtype = model.dtype): + # CPU autocast only accepts bfloat16/float16. For a float32 CPU + # model, skip autocast entirely to avoid raising before generate. 
+ cpu_autocast_supported = model.dtype in (torch.bfloat16, torch.float16) + if device_type == "cpu" and not cpu_autocast_supported: + autocast_ctx = contextlib.nullcontext() + else: + autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype) + with autocast_ctx: inputs = tokenizer([prompt], return_tensors = "pt").to(model.device) generated = model.generate( **inputs, diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index b397d6196c..26791b5363 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -393,12 +393,13 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] def _parse_ze_mask_roots(mask: str) -> list[int]: - """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of unique root device IDs. + """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of root device IDs. - Accepts subdevice syntax such as ``0.0,0.1`` which collapses to ``[0]``. - Returns an empty list if the mask is empty or contains no parseable digits. - Insertion order is preserved so callers can map logical ordinals back to - physical root IDs via the returned list. + Returns one root ID per mask token, preserving order and duplicates so + that logical ordinals map 1-to-1 back to physical root IDs. For example + ``"0.0,0.1"`` yields ``[0, 0]`` (two logical devices, both under root + GPU 0) and ``"2.0,0.1,0.2"`` yields ``[2, 0, 0]``. Returns an empty + list if the mask is empty or contains no parseable digits. 
""" roots: list[int] = [] if not mask: @@ -409,9 +410,7 @@ def _parse_ze_mask_roots(mask: str) -> list[int]: continue root = token.split(".", 1)[0] if root.isdigit(): - root_id = int(root) - if root_id not in roots: - roots.append(root_id) + roots.append(int(root)) return roots @@ -692,20 +691,40 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]: "supports_explicit_gpu_ids": True, } - roots = _parse_ze_mask_roots(xpu_mask) - if not roots: + # Subdevice syntax (e.g. "0.0,0.1") expands a single root GPU into + # multiple logical devices. Explicit root-ID selection is not + # meaningful for subdevice masks, so surface them as unsupported. + has_subdevice = any( + "." in token.strip() for token in xpu_mask.split(",") if token.strip() + ) + + roots_with_dupes = _parse_ze_mask_roots(xpu_mask) + if not roots_with_dupes: # Non-digit wildcard (e.g. "*") or unparseable mask: treat the # same as "all physical XPUs visible" but disable explicit ids # since we cannot map logical ordinals to root IDs. return { "raw": xpu_mask, - "numeric_ids": None, + "numeric_ids": list(range(get_physical_gpu_count())), + "supports_explicit_gpu_ids": False, + } + + if has_subdevice: + # Dedup for display: multiple subdevice entries under the same + # root collapse to that root ID. 
+ unique_roots: list[int] = [] + for rid in roots_with_dupes: + if rid not in unique_roots: + unique_roots.append(rid) + return { + "raw": xpu_mask, + "numeric_ids": unique_roots, "supports_explicit_gpu_ids": False, } return { "raw": xpu_mask, - "numeric_ids": roots, + "numeric_ids": roots_with_dupes, "supports_explicit_gpu_ids": True, } @@ -761,11 +780,16 @@ def resolve_requested_gpu_ids(gpu_ids: Optional[list[int]]) -> list[int]: return parent_visible_ids if not parent_visible_spec["supports_explicit_gpu_ids"]: + env_var_name = ( + "ZE_AFFINITY_MASK" + if get_device() == DeviceType.XPU + else "CUDA_VISIBLE_DEVICES" + ) raise ValueError( f"Invalid gpu_ids {requested_ids}: explicit physical GPU IDs are " - f"unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG entries " - f"({parent_visible_spec['raw']!r}). Omit gpu_ids to use the " - "parent-visible devices." + f"unsupported when {env_var_name} uses non-numeric or subdevice " + f"entries ({parent_visible_spec['raw']!r}). Omit gpu_ids to use " + "the parent-visible devices." ) if len(set(requested_ids)) != len(requested_ids): @@ -1244,10 +1268,10 @@ def prepare_gpu_selection( in the worker subprocess which narrows ``CUDA_VISIBLE_DEVICES`` before any torch/CUDA initialisation. """ - if gpu_ids and get_device() != DeviceType.CUDA: + if gpu_ids and get_device() not in (DeviceType.CUDA, DeviceType.XPU): raise ValueError( - f"gpu_ids {list(gpu_ids)} is only supported on CUDA devices, " - f"but the current backend is '{get_device().value}'." + f"gpu_ids {list(gpu_ids)} is only supported on CUDA and Intel XPU " + f"devices, but the current backend is '{get_device().value}'." 
) if gpu_ids: diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index 5555c14d8e..08c5754ce7 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -103,6 +103,8 @@ def format_error_message(error: Exception, model_name: str) -> str: if ( "out of memory" in error_str or "out of device memory" in error_str + or "out_of_device_memory" in error_str # ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY + or "out_of_host_memory" in error_str # ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY or "not enough memory" in error_str or "cannot allocate memory" in error_str or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str)) From 5fbcb0fce4e25eed61f7926982fda2119ff30a89 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 10:03:42 +0000 Subject: [PATCH 14/18] Enable multi-XPU sharding and auto-select, tighten XPU edge cases Round 4 fixes completing the multi-XPU story unlocked in round 3: - get_device_map: include DeviceType.XPU in the multi-GPU branch so explicit XPU gpu_ids=[0, 1] (or a wildcard-masked multi-XPU host) loads with device_map="balanced" instead of falling back to "sequential" and pinning the model to a single device. - auto_select_gpu_ids: allow XPU auto mode. The function relies on get_visible_gpu_utilization() for per-device free-VRAM telemetry, which already has an XPU path via _get_xpu_utilization. XPU hosts omitting gpu_ids now benefit from VRAM-aware selection. - get_visible_gpu_count torch-less fallback: count unique mask roots via len(set(roots)) so subdevice masks like "0.0,0.1" report the intended 1 root GPU, not 2. The ordinal-preserving semantics of _parse_ze_mask_roots are kept so _resolve_xpu_smi_device_id still maps logical ordinals to physical roots correctly. - xpu-smi subprocess timeout lowered from 10s to 3s so a hung driver does not block status polls / UI refreshes. 
- DAC autocast nullcontext fallback now covers XPU+float32 as well as CPU+float32, since XPU autocast only accepts bfloat16/float16 and otherwise warns on every generate call. - _get_parent_visible_gpu_spec subdevice dedup uses list(dict.fromkeys(...)) instead of an O(n^2) manual loop. --- studio/backend/core/inference/inference.py | 13 ++++---- studio/backend/utils/hardware/hardware.py | 39 ++++++++++++---------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index 79c52ed6bd..64e413ffe7 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -1642,9 +1642,9 @@ def _generate_dac( + text + "<|text_end|>\n<|audio_start|><|global_features_start|>\n" ) - with torch.inference_mode(): - import contextlib + import contextlib + with torch.inference_mode(): # Derive the autocast device from the loaded model, not from the # global backend: a CPU-fallback DAC on an XPU/CUDA host must not # open a GPU autocast context around CPU tensors. @@ -1658,10 +1658,11 @@ def _generate_dac( # MPS is autocast-supported since torch 2.3, keep it in the set. if device_type not in ("cuda", "xpu", "mps", "cpu"): device_type = "cpu" - # CPU autocast only accepts bfloat16/float16. For a float32 CPU - # model, skip autocast entirely to avoid raising before generate. - cpu_autocast_supported = model.dtype in (torch.bfloat16, torch.float16) - if device_type == "cpu" and not cpu_autocast_supported: + # CPU and XPU autocast only accept bfloat16/float16. For a + # float32 model, skip autocast entirely to avoid raising or + # producing a warning on every generate call. 
+ autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16) + if device_type in ("cpu", "xpu") and not autocast_dtype_supported: autocast_ctx = contextlib.nullcontext() else: autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 26791b5363..519fd848d0 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -471,7 +471,7 @@ def _get_xpu_utilization() -> Dict[str, Any]: [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,1,3", "-n", "1"], capture_output = True, text = True, - timeout = 10, + timeout = 3, ) if result.returncode == 0 and result.stdout.strip(): lines = result.stdout.strip().splitlines() @@ -711,11 +711,8 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]: if has_subdevice: # Dedup for display: multiple subdevice entries under the same - # root collapse to that root ID. - unique_roots: list[int] = [] - for rid in roots_with_dupes: - if rid not in unique_roots: - unique_roots.append(rid) + # root collapse to that root ID while preserving insertion order. + unique_roots = list(dict.fromkeys(roots_with_dupes)) return { "raw": xpu_mask, "numeric_ids": unique_roots, @@ -1101,8 +1098,12 @@ def auto_select_gpu_ids( ) -> tuple[Optional[list[int]], Dict[str, Any]]: metadata: Dict[str, Any] = {"selection_mode": "auto"} - if get_device() != DeviceType.CUDA: - metadata["selection_mode"] = "non_cuda" + # Auto-selection relies on per-device free-VRAM telemetry which is + # available on both CUDA (via nvidia-smi) and XPU (via torch.xpu + + # xpu-smi). Other backends (MLX, CPU) do not expose the required + # information, so fall through to inheriting parent visibility. 
+ if get_device() not in (DeviceType.CUDA, DeviceType.XPU): + metadata["selection_mode"] = "non_accelerator" return None, metadata required_gb, estimate_metadata = estimate_required_model_memory_gb( @@ -1471,12 +1472,14 @@ def get_visible_gpu_count() -> int: if xpu_visible: # Fallback: count unique root device IDs from the mask. # ZE_AFFINITY_MASK can use "device.subdevice" notation, - # so "0.0,0.1" is 1 root device, not 2. + # so "0.0,0.1" is 1 root device, not 2. Without torch we + # cannot know which hierarchy mode is active, so fall back + # to root-device counting (the more conservative choice). roots = _parse_ze_mask_roots(xpu_visible) # Non-digit wildcards (e.g. "*") yield an empty roots list; # treat those as "all physical XPUs visible". _visible_gpu_count = ( - len(roots) if roots else get_physical_gpu_count() + len(set(roots)) if roots else get_physical_gpu_count() ) else: _visible_gpu_count = get_physical_gpu_count() @@ -1541,24 +1544,26 @@ def get_device_map( Returns ``"balanced"`` (shard evenly across GPUs) when: - ``gpu_ids`` explicitly lists >1 GPU, **or** - - ``CUDA_VISIBLE_DEVICES`` uses UUID/MIG identifiers (non-numeric) and - more than one GPU is visible (fallback: we cannot resolve numeric IDs, - so we assume the caller intends multi-GPU). + - ``CUDA_VISIBLE_DEVICES``/``ZE_AFFINITY_MASK`` uses non-numeric + identifiers (UUID/MIG/wildcard) and more than one GPU is visible + (fallback: we cannot resolve numeric IDs, so we assume the caller + intends multi-GPU). Returns ``"sequential"`` (single device) in all other cases, including - non-CUDA backends (CPU, MLX). + CPU/MLX backends. Callers should use ``prepare_gpu_selection()`` upstream to determine the ``gpu_ids`` list -- that function handles the smart auto-selection of the minimum number of GPUs needed for a given model. 
""" device = get_device() - if device == DeviceType.CUDA: + if device in (DeviceType.CUDA, DeviceType.XPU): multi_gpu = gpu_ids is not None and len(gpu_ids) > 1 if not multi_gpu: - # UUID/MIG masks cannot be split into numeric IDs, so if multiple - # GPUs are visible we assume multi-GPU sharding is intended. + # UUID/MIG/wildcard masks cannot be split into numeric IDs, so if + # multiple GPUs are visible we assume multi-GPU sharding is + # intended. parent_visible_spec = _get_parent_visible_gpu_spec() if ( parent_visible_spec["numeric_ids"] is None From 9ce735d847a13bc32eda6295290c16e1a5505177 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 10:29:42 +0000 Subject: [PATCH 15/18] Align XPU wildcard mask with CUDA UUID path so multi-GPU sharding triggers _get_parent_visible_gpu_spec returned numeric_ids=list(range(physical)) for wildcard ZE_AFFINITY_MASK=*, which blocked get_device_map from reaching its "unresolved multi-visible" fallback. Mirror the CUDA UUID/MIG behavior by returning numeric_ids=None with supports_explicit_gpu_ids=False, so explicit ids are still rejected and get_device_map falls back to sharding across visible devices when more than one is present. --- studio/backend/utils/hardware/hardware.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 519fd848d0..e63ee6edca 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -700,12 +700,14 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]: roots_with_dupes = _parse_ze_mask_roots(xpu_mask) if not roots_with_dupes: - # Non-digit wildcard (e.g. "*") or unparseable mask: treat the - # same as "all physical XPUs visible" but disable explicit ids - # since we cannot map logical ordinals to root IDs. + # Non-digit wildcard (e.g. "*") or unparseable mask: we cannot map + # logical ordinals to physical root IDs. 
Mirror the CUDA UUID/MIG + # path by returning numeric_ids=None + supports_explicit_gpu_ids + # False, so get_device_map() falls back to its multi-visible + # heuristic and explicit ids are rejected. return { "raw": xpu_mask, - "numeric_ids": list(range(get_physical_gpu_count())), + "numeric_ids": None, "supports_explicit_gpu_ids": False, } From d6f1ea8ba41878f91d542642f43cbcbd9aa8335a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 11 Apr 2026 12:05:12 +0000 Subject: [PATCH 16/18] Fix loop 1 XPU review findings - _backend_visible_devices_env: return ZE_AFFINITY_MASK on XPU so get_backend_visible_gpu_info reports the active mask instead of a stale or None CUDA_VISIBLE_DEVICES after apply_gpu_ids runs. - _get_parent_visible_gpu_spec: return numeric_ids=None for subdevice masks like 0.0,0.1 so get_visible_gpu_utilization, get_backend_visible_gpu_info and get_device_map enumerate torch-visible ordinals and can still shard across logical XPUs instead of collapsing to a single root. - _parse_ze_mask_roots: use str.isdecimal() so Unicode superscripts do not crash int() via str.isdigit() admitting them. - _get_xpu_utilization xpu-smi parsing: accept n/a, NA, - and lowercase variants as missing, and wrap the float parse so one bad column does not drop the whole telemetry row. - clear_gpu_cache XPU branch: guard synchronize/empty_cache with hasattr + try/except so older torch-xpu builds do not propagate AttributeError. - apply_gpu_ids XPU branch: pop stale CUDA_VISIBLE_DEVICES so environment-inspection tools do not show conflicting pinning state. - format_error_message: add memory allocation failed pattern and isinstance(error, MemoryError) so CPU hosts still classify OOMs that the tightened substring list dropped. - test_gpu_selection/test_gpu_selection_sandbox: rename TestXpuRejection to TestXpuSelection and update non_cuda -> non_accelerator and CUDA-only error substring to CUDA and Intel XPU so the suite matches the new behavior. 
- inference.py/llama_cpp.py/trainer.py/utils.py: hoist contextlib, clear_gpu_cache, get_torch_device_str and get_device imports to module top per PEP 8 feedback from the hosted gemini bot. --- studio/backend/core/inference/inference.py | 2 +- studio/backend/core/inference/llama_cpp.py | 12 +-- studio/backend/core/training/trainer.py | 7 +- studio/backend/tests/test_gpu_selection.py | 75 ++++++++++++++++--- .../tests/test_gpu_selection_sandbox.py | 4 +- studio/backend/utils/hardware/hardware.py | 67 +++++++++++++---- studio/backend/utils/utils.py | 6 +- 7 files changed, 127 insertions(+), 46 deletions(-) diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index fd16299719..0f6a035887 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -10,6 +10,7 @@ from transformers import TextStreamer from peft import PeftModel, PeftModelForCausalLM +import contextlib import json import sys import torch @@ -1646,7 +1647,6 @@ def _generate_dac( + text + "<|text_end|>\n<|audio_start|><|global_features_start|>\n" ) - import contextlib with torch.inference_mode(): # Derive the autocast device from the loaded model, not from the diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index d924776c34..2fce68be28 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -26,6 +26,8 @@ import httpx +from utils.hardware import clear_gpu_cache, get_torch_device_str + logger = get_logger(__name__) # ── Pre-compiled patterns for plan-without-action re-prompt ── @@ -1625,10 +1627,6 @@ def unload_model(self) -> bool: if LlamaCppBackend._codec_mgr is not None: LlamaCppBackend._codec_mgr.unload() LlamaCppBackend._codec_mgr = None - import torch - - from utils.hardware import clear_gpu_cache - clear_gpu_cache() return True @@ -3262,8 +3260,6 @@ def init_audio_codec(self, audio_type: str) -> None: if 
LlamaCppBackend._codec_mgr is None: LlamaCppBackend._codec_mgr = AudioCodecManager() - from utils.hardware import get_torch_device_str - device = get_torch_device_str() model_repo_path = None @@ -3336,10 +3332,6 @@ def generate_audio_response( else None ) - import torch - - from utils.hardware import get_torch_device_str - device = get_torch_device_str() return LlamaCppBackend._codec_mgr.decode( audio_type, device, token_ids = token_ids, text = data.get("content", "") diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py index 1f64ea8d76..41b1cfa7bf 100644 --- a/studio/backend/core/training/trainer.py +++ b/studio/backend/core/training/trainer.py @@ -38,6 +38,7 @@ safe_num_proc, dataset_map_num_proc, get_device_map, + get_torch_device_str, raise_if_offloaded, get_visible_gpu_count, ) @@ -1540,7 +1541,6 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz" SNAC_SAMPLE_RATE = 24000 - from utils.hardware import get_torch_device_str device = get_torch_device_str() max_length = self.max_seq_length or 2048 @@ -1718,7 +1718,6 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): import gc gc.collect() - from utils.hardware import clear_gpu_cache clear_gpu_cache() self._cuda_audio_used = True @@ -1748,7 +1747,6 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None): import subprocess - from utils.hardware import get_torch_device_str device = get_torch_device_str() @@ -1950,7 +1948,6 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor: import gc gc.collect() - from utils.hardware import clear_gpu_cache clear_gpu_cache() self._cuda_audio_used = True @@ -1987,7 +1984,6 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): from datasets import Dataset as HFDataset from utils.paths import ensure_dir, tmp_root - from utils.hardware import get_torch_device_str device = 
get_torch_device_str() @@ -2167,7 +2163,6 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): import gc gc.collect() - from utils.hardware import clear_gpu_cache clear_gpu_cache() self._cuda_audio_used = True diff --git a/studio/backend/tests/test_gpu_selection.py b/studio/backend/tests/test_gpu_selection.py index c6f26037af..3ee39ea785 100644 --- a/studio/backend/tests/test_gpu_selection.py +++ b/studio/backend/tests/test_gpu_selection.py @@ -711,12 +711,14 @@ def start(self): class TestRouteErrors(unittest.TestCase): - def test_prepare_gpu_selection_rejects_gpu_ids_on_non_cuda_backend(self): + def test_prepare_gpu_selection_rejects_gpu_ids_on_non_accelerator_backend(self): with patch("utils.hardware.hardware.get_device", return_value = DeviceType.CPU): with self.assertRaises(ValueError) as exc_info: prepare_gpu_selection([0], model_name = "unsloth/test") - self.assertIn("only supported on CUDA devices", str(exc_info.exception)) + self.assertIn( + "only supported on CUDA and Intel XPU", str(exc_info.exception) + ) def test_inference_route_rejects_gpu_ids_for_gguf(self): inference_route = _load_route_module( @@ -1089,15 +1091,66 @@ def test_auto_select_falls_back_when_estimate_unavailable(self): self.assertEqual(metadata["selection_mode"], "fallback_all") -class TestXpuRejection(_GpuCacheResetMixin, unittest.TestCase): - def test_auto_select_returns_non_cuda_for_xpu(self): - with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU): +class TestXpuSelection(_GpuCacheResetMixin, unittest.TestCase): + def test_auto_select_supports_xpu(self): + with ( + patch( + "utils.hardware.hardware.get_device", return_value = DeviceType.XPU + ), + patch( + "utils.hardware.hardware.estimate_required_model_memory_gb", + return_value = (1.0, {}), + ), + patch( + "utils.hardware.hardware.get_visible_gpu_utilization", + return_value = { + "devices": [ + {"index": 0, "vram_total_gb": 8, "vram_used_gb": 1}, + ] + }, + ), + patch( + 
"utils.hardware.hardware._get_parent_visible_gpu_spec", + return_value = { + "raw": None, + "numeric_ids": [0], + "supports_explicit_gpu_ids": True, + }, + ), + patch( + "utils.hardware.hardware.get_parent_visible_gpu_ids", + return_value = [0], + ), + ): selected, metadata = auto_select_gpu_ids("unsloth/test") - self.assertIsNone(selected) - self.assertEqual(metadata["selection_mode"], "non_cuda") + self.assertEqual(selected, [0]) + self.assertEqual(metadata["selection_mode"], "auto") - def test_prepare_gpu_selection_rejects_explicit_ids_on_xpu(self): - with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU): - with self.assertRaisesRegex(ValueError, "only supported on CUDA"): - prepare_gpu_selection([0], model_name = "unsloth/test") + def test_prepare_gpu_selection_accepts_explicit_ids_on_xpu(self): + with ( + patch( + "utils.hardware.hardware.get_device", return_value = DeviceType.XPU + ), + patch( + "utils.hardware.hardware._get_parent_visible_gpu_spec", + return_value = { + "raw": "0", + "numeric_ids": [0], + "supports_explicit_gpu_ids": True, + }, + ), + patch( + "utils.hardware.hardware.get_parent_visible_gpu_ids", + return_value = [0], + ), + patch( + "utils.hardware.hardware.get_physical_gpu_count", return_value = 1 + ), + ): + selected, metadata = prepare_gpu_selection( + [0], model_name = "unsloth/test" + ) + + self.assertEqual(selected, [0]) + self.assertEqual(metadata["selection_mode"], "explicit") diff --git a/studio/backend/tests/test_gpu_selection_sandbox.py b/studio/backend/tests/test_gpu_selection_sandbox.py index 830a98a2fb..3c7792472b 100644 --- a/studio/backend/tests/test_gpu_selection_sandbox.py +++ b/studio/backend/tests/test_gpu_selection_sandbox.py @@ -302,14 +302,14 @@ def test_two_gpus_needed(self): # 35GB (first) + 30*0.85 (second) = 60.5GB > 50GB self.assertEqual(len(selected), 2) - def test_non_cuda_returns_none(self): + def test_non_accelerator_returns_none(self): from utils.hardware.hardware import 
auto_select_gpu_ids import utils.hardware.hardware as hw with patch.object(hw, "get_device", return_value = hw.DeviceType.CPU): selected, meta = auto_select_gpu_ids("test/model") self.assertIsNone(selected) - self.assertEqual(meta["selection_mode"], "non_cuda") + self.assertEqual(meta["selection_mode"], "non_accelerator") class TestGetDeviceMap(unittest.TestCase): diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index d438ff87e6..14fa5f0e4d 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -185,10 +185,19 @@ def clear_gpu_cache(): torch.cuda.empty_cache() torch.cuda.ipc_collect() elif device == DeviceType.XPU: - import torch + # Older torch-xpu builds may be missing synchronize/empty_cache; + # guard the calls so a stale build does not propagate AttributeError + # through callers that do not wrap clear_gpu_cache() themselves. + try: + import torch - torch.xpu.synchronize() - torch.xpu.empty_cache() + if hasattr(torch, "xpu"): + if hasattr(torch.xpu, "synchronize"): + torch.xpu.synchronize() + if hasattr(torch.xpu, "empty_cache"): + torch.xpu.empty_cache() + except Exception: + pass elif device == DeviceType.MLX: # MLX manages memory automatically; no explicit cache clear needed. # mlx.core has no empty_cache equivalent — gc.collect() above is enough. @@ -455,7 +464,10 @@ def _parse_ze_mask_roots(mask: str) -> list[int]: if not token: continue root = token.split(".", 1)[0] - if root.isdigit(): + # Use str.isdecimal() (not str.isdigit()) so Unicode superscripts + # like U+00B2 / U+00B3 are rejected -- they satisfy isdigit() but crash + # int() with ValueError. + if root.isdecimal(): roots.append(int(root)) return roots @@ -520,15 +532,27 @@ def _get_xpu_utilization() -> Dict[str, Any]: timeout = 3, ) if result.returncode == 0 and result.stdout.strip(): + # xpu-smi versions differ slightly in how they render unknown + # metrics: empty string, "N/A", "n/a", "NA", or "-". 
Treat any + # of these as "value not available" so a single missing column + # does not silently drop the entire telemetry row. + _NA = frozenset(("", "n/a", "na", "-")) + def _parse_metric(value: str) -> Optional[float]: + if value.strip().lower() in _NA: + return None + try: + return float(value) + except ValueError: + return None lines = result.stdout.strip().splitlines() for line in reversed(lines): if line.startswith("Timestamp") or line.startswith("#"): continue parts = [p.strip() for p in line.split(",")] if len(parts) >= 5: - gpu_util = float(parts[2]) if parts[2] not in ("", "N/A") else None - power_w = float(parts[3]) if parts[3] not in ("", "N/A") else None - temp = float(parts[4]) if parts[4] not in ("", "N/A") else None + gpu_util = _parse_metric(parts[2]) + power_w = _parse_metric(parts[3]) + temp = _parse_metric(parts[4]) break except Exception: pass @@ -778,12 +802,18 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]: } if has_subdevice: - # Dedup for display: multiple subdevice entries under the same - # root collapse to that root ID while preserving insertion order. - unique_roots = list(dict.fromkeys(roots_with_dupes)) + # Subdevice syntax (e.g. "0.0,0.1") expands one or more root + # GPUs into multiple logical devices. These logical ordinals + # do not map cleanly back to stable physical root IDs for + # explicit selection, so mirror the CUDA UUID/MIG and wildcard + # path: return numeric_ids=None and supports_explicit_gpu_ids + # False. Downstream (get_visible_gpu_utilization, + # get_backend_visible_gpu_info, get_device_map) then enumerates + # torch-visible ordinals and can still shard across the logical + # devices instead of collapsing them onto a single root. 
return { "raw": xpu_mask, - "numeric_ids": unique_roots, + "numeric_ids": None, "supports_explicit_gpu_ids": False, } @@ -1428,11 +1458,16 @@ def get_physical_gpu_count() -> int: def _backend_visible_devices_env() -> Optional[str]: """Return the raw visibility env string that applies to this backend. - On ROCm, HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES take precedence - over CUDA_VISIBLE_DEVICES; the helper mirrors the resolution logic in + On XPU, ``ZE_AFFINITY_MASK`` is the visibility control (not + ``CUDA_VISIBLE_DEVICES``). On ROCm, ``HIP_VISIBLE_DEVICES`` / + ``ROCR_VISIBLE_DEVICES`` take precedence over ``CUDA_VISIBLE_DEVICES``; + the helper mirrors the resolution logic in ``_get_parent_visible_gpu_spec`` so ``backend_cuda_visible_devices`` - reports the value that is actually narrowing the visible device set. + reports the value that is actually narrowing the visible device set on + the current backend. """ + if get_device() == DeviceType.XPU: + return os.environ.get("ZE_AFFINITY_MASK") if IS_ROCM: return _get_parent_visible_gpu_spec().get("raw") return os.environ.get("CUDA_VISIBLE_DEVICES") @@ -1629,6 +1664,10 @@ def apply_gpu_ids(gpu_ids) -> None: # so worker subprocesses are actually restricted to the intended GPU. if get_device() == DeviceType.XPU: os.environ["ZE_AFFINITY_MASK"] = value + # Clear any stale CUDA_VISIBLE_DEVICES the parent may have inherited + # so tools that inspect the environment do not show conflicting + # pinning state (torch.xpu itself only reads ZE_AFFINITY_MASK). 
+ os.environ.pop("CUDA_VISIBLE_DEVICES", None) _visible_gpu_count = None logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value) return diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index 08c5754ce7..0b7e4d0de8 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -13,6 +13,8 @@ import shutil import tempfile +from utils.hardware import get_device + logger = get_logger(__name__) @@ -107,10 +109,10 @@ def format_error_message(error: Exception, model_name: str) -> str: or "out_of_host_memory" in error_str # ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY or "not enough memory" in error_str or "cannot allocate memory" in error_str + or "memory allocation failed" in error_str + or isinstance(error, MemoryError) or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str)) ): - from utils.hardware import get_device - device = get_device() device_label = { "cuda": "GPU", From e22db8fdfda6ffa8803258c306324d239d43807b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 11 Apr 2026 12:23:23 +0000 Subject: [PATCH 17/18] Fix loop 2 XPU review findings - test_gpu_selection.py:105 regex: update stale assertion from "uses UUID/MIG" to "uses non-numeric or subdevice" after the PR broadened resolve_requested_gpu_ids' error message to cover XPU subdevice masks. Three reviewers independently reproduced the suite failure. - utils/utils.py: revert the module-top `from utils.hardware import get_device` hoist that broke test_utils.py::TestFormatErrorMessage::test_cpu_oom -- the test patches utils.hardware.get_device at call time, so the import must stay function-local. Keep the comment explaining why. - hardware.py _get_xpu_utilization: lift _NA and _parse_metric out of the hot path to module scope (renamed _XPU_SMI_NA / _parse_xpu_smi_metric); re-instantiating them on every successful xpu-smi call is wasteful. 
- hardware.py has_any check: include power_w alongside gpu_util, temp and vram_used_gb so a row that only exposes power is not silently discarded. - hardware.py get_visible_gpu_utilization + get_backend_visible_gpu_info: honor explicit "no devices visible" masks (ZE_AFFINITY_MASK="" or CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the enumerate-visible-ordinals fallback. Previously get_visible_gpu_count returned 0 correctly but the telemetry helpers still enumerated torch devices, letting auto_select_gpu_ids pick a GPU the process explicitly hid. - trainer.py: collapse the two consecutive blank lines left after removing inline `from utils.hardware import get_torch_device_str` imports at lines 1749 and 1985. --- studio/backend/core/training/trainer.py | 2 - studio/backend/tests/test_gpu_selection.py | 3 +- studio/backend/utils/hardware/hardware.py | 78 +++++++++++++++++----- studio/backend/utils/utils.py | 7 +- 4 files changed, 68 insertions(+), 22 deletions(-) diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py index 41b1cfa7bf..881350610e 100644 --- a/studio/backend/core/training/trainer.py +++ b/studio/backend/core/training/trainer.py @@ -1747,7 +1747,6 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None): import subprocess - device = get_torch_device_str() # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo, @@ -1984,7 +1983,6 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): from datasets import Dataset as HFDataset from utils.paths import ensure_dir, tmp_root - device = get_torch_device_str() # Clone OuteTTS repo (same as audio_codecs._load_dac) diff --git a/studio/backend/tests/test_gpu_selection.py b/studio/backend/tests/test_gpu_selection.py index 3ee39ea785..73110cd0eb 100644 --- a/studio/backend/tests/test_gpu_selection.py +++ b/studio/backend/tests/test_gpu_selection.py @@ -102,7 +102,8 @@ def 
test_explicit_ids_are_rejected_for_uuid_parent_visibility(self): patch("utils.hardware.hardware.get_physical_gpu_count", return_value = 8), ): with self.assertRaisesRegex( - ValueError, "unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG" + ValueError, + "unsupported when CUDA_VISIBLE_DEVICES uses non-numeric or subdevice", ): resolve_requested_gpu_ids([1]) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 14fa5f0e4d..f310fbee2a 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -500,6 +500,25 @@ def _resolve_xpu_smi_device_id() -> int: return ordinal if xpu_ok else 0 +_XPU_SMI_NA = frozenset(("", "n/a", "na", "-")) + + +def _parse_xpu_smi_metric(value: str) -> Optional[float]: + """Return float or None for missing/unknown xpu-smi CSV column values. + + xpu-smi versions differ slightly in how they render unknown metrics: + empty string, "N/A", "n/a", "NA", or "-". Treat any of these as "value + not available" so a single missing column does not silently drop the + entire telemetry row. + """ + if value.strip().lower() in _XPU_SMI_NA: + return None + try: + return float(value) + except ValueError: + return None + + def _get_xpu_utilization() -> Dict[str, Any]: """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu.""" gpu_util = None @@ -532,27 +551,15 @@ def _get_xpu_utilization() -> Dict[str, Any]: timeout = 3, ) if result.returncode == 0 and result.stdout.strip(): - # xpu-smi versions differ slightly in how they render unknown - # metrics: empty string, "N/A", "n/a", "NA", or "-". Treat any - # of these as "value not available" so a single missing column - # does not silently drop the entire telemetry row. 
- _NA = frozenset(("", "n/a", "na", "-")) - def _parse_metric(value: str) -> Optional[float]: - if value.strip().lower() in _NA: - return None - try: - return float(value) - except ValueError: - return None lines = result.stdout.strip().splitlines() for line in reversed(lines): if line.startswith("Timestamp") or line.startswith("#"): continue parts = [p.strip() for p in line.split(",")] if len(parts) >= 5: - gpu_util = _parse_metric(parts[2]) - power_w = _parse_metric(parts[3]) - temp = _parse_metric(parts[4]) + gpu_util = _parse_xpu_smi_metric(parts[2]) + power_w = _parse_xpu_smi_metric(parts[3]) + temp = _parse_xpu_smi_metric(parts[4]) break except Exception: pass @@ -579,7 +586,9 @@ def _parse_metric(value: str) -> Optional[float]: else None ) - has_any = any(v is not None for v in [gpu_util, temp, vram_used_gb]) + has_any = any( + v is not None for v in [gpu_util, temp, vram_used_gb, power_w] + ) if not has_any: return {"available": False, "backend": "xpu"} @@ -673,6 +682,23 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) if device in (DeviceType.CUDA, DeviceType.XPU): + parent_visible_spec = _get_parent_visible_gpu_spec() + # Honor an explicit empty visibility env (ZE_AFFINITY_MASK="" or + # CUDA_VISIBLE_DEVICES="" / "-1") as "no devices visible". Without + # this guard, the enumerate-visible-ordinals fallback below would + # happily report devices the process explicitly hid. + if ( + parent_visible_spec["raw"] is not None + and parent_visible_spec["numeric_ids"] == [] + ): + return { + "available": False, + "backend": _backend_label(device), + "parent_visible_gpu_ids": [], + "devices": [], + "index_kind": "relative", + } + parent_ids = get_parent_visible_gpu_ids() # When parent_visible_ids is empty (UUID/MIG mask or no CVD set), # enumerate torch-visible ordinals so the UI still shows devices. 
@@ -1476,13 +1502,31 @@ def _backend_visible_devices_env() -> Optional[str]: def get_backend_visible_gpu_info() -> Dict[str, Any]: device = get_device() if device in (DeviceType.CUDA, DeviceType.XPU): + parent_visible_spec = _get_parent_visible_gpu_spec() parent_visible_ids = get_parent_visible_gpu_ids() + + # Honor an explicit "no devices visible" mask (ZE_AFFINITY_MASK="" + # or CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the + # torch-ordinal enumeration fallback, which would otherwise report + # devices that the process explicitly hid. + if ( + parent_visible_spec["raw"] is not None + and parent_visible_spec["numeric_ids"] == [] + ): + return { + "available": False, + "backend": _backend_label(device), + "backend_cuda_visible_devices": _backend_visible_devices_env(), + "parent_visible_gpu_ids": [], + "devices": [], + "index_kind": "relative", + } + # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm) if device == DeviceType.CUDA and not IS_ROCM: try: from . import nvidia - parent_visible_spec = _get_parent_visible_gpu_spec() result = nvidia.get_backend_visible_gpu_info( parent_visible_spec["numeric_ids"], parent_visible_spec["raw"], diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index 0b7e4d0de8..dfa399105d 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -13,8 +13,6 @@ import shutil import tempfile -from utils.hardware import get_device - logger = get_logger(__name__) @@ -113,6 +111,11 @@ def format_error_message(error: Exception, model_name: str) -> str: or isinstance(error, MemoryError) or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str)) ): + # Resolve get_device() at call time (not import time) so tests that + # monkey-patch utils.hardware.get_device after this module is loaded + # still see the patched backend. 
+ from utils.hardware import get_device + device = get_device() device_label = { "cuda": "GPU", From c19476ddbd4c0bfbf6500924c788d63a639d98f9 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sat, 11 Apr 2026 12:40:33 +0000 Subject: [PATCH 18/18] Fix loop 3 XPU review findings - apply_gpu_ids XPU: revert the CUDA_VISIBLE_DEVICES pop from loop 1. Popping it re-enabled CUDA detection on hybrid NVIDIA+Intel hosts where the parent had set CUDA_VISIBLE_DEVICES="" to force Studio onto XPU; the worker's follow-up detect_hardware() then flipped back to CUDA. torch.xpu only reads ZE_AFFINITY_MASK so the stale CUDA_VISIBLE_DEVICES is cosmetically redundant but functionally harmless, and leaving it alone preserves hybrid-host detection. - llama_cpp._start_process: pin the llama-server subprocess via ZE_AFFINITY_MASK on XPU hosts and CUDA_VISIBLE_DEVICES elsewhere. llama-server's SYCL build reads ZE_AFFINITY_MASK, not CUDA_VISIBLE_DEVICES, so previous pinning was silently ignored on Intel. - llama_cpp init_audio_codec / generate_audio_response: revert the promotion from get_torch_device_str() to "xpu" on Intel hosts. SNAC / BiCodec / DAC codecs are not yet validated on Intel XPU and the old CPU fallback was the known-working non-CUDA path. Drop the now-unused get_torch_device_str import from llama_cpp.py. - trainer.py _preprocess_snac_dataset / _preprocess_bicodec_dataset / _preprocess_dac_dataset: revert the same unconditional XPU routing for audio dataset preprocessing back to the pre-PR CPU fallback on non-CUDA hosts. Spark-TTS BiCodec, SNAC, and OuteTTS DAC / Whisper paths were all CPU-backed on every non-CUDA host before this PR; promoting them to XPU without capability probes regressed the previously working CPU path. Drop the now-unused get_torch_device_str import from trainer.py. - dataset_map_num_proc: only disable multiprocessing when torch.xpu.is_initialized exists and returns True. 
Older torch-xpu builds without is_initialized() were previously falling through the broad except and returning None, silently disabling pre-init CPU dataset parallelism the docstring explicitly says should still work. - _get_xpu_utilization: cache the resolved xpu-smi binary path in a module-level sentinel via _resolve_xpu_smi_binary() so repeated telemetry polls do not re-scan PATH on every tick. - get_backend_visible_gpu_info: move the parent_visible_ids lookup below the empty-mask short-circuit so the spec is not computed twice on the fast exit path. --- studio/backend/core/inference/llama_cpp.py | 30 ++++++++++-- studio/backend/core/training/trainer.py | 14 ++++-- studio/backend/utils/hardware/hardware.py | 55 +++++++++++++++++----- 3 files changed, 79 insertions(+), 20 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 2fce68be28..2ac7d22afc 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -26,7 +26,7 @@ import httpx -from utils.hardware import clear_gpu_cache, get_torch_device_str +from utils.hardware import clear_gpu_cache logger = get_logger(__name__) @@ -1514,9 +1514,20 @@ def load_model( f"{new_ld}:{existing_ld}" if existing_ld else new_ld ) - # Pin to selected GPU(s) via CUDA_VISIBLE_DEVICES + # Pin to selected GPU(s) via the backend-appropriate visibility + # env var: CUDA_VISIBLE_DEVICES on NVIDIA/ROCm, ZE_AFFINITY_MASK + # on Intel XPU (llama-server's SYCL build reads ZE_AFFINITY_MASK, + # not CUDA_VISIBLE_DEVICES). 
if gpu_indices is not None: - env["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpu_indices) + from utils.hardware import get_device + from utils.hardware.hardware import DeviceType + + mask = ",".join(str(i) for i in gpu_indices) + if get_device() == DeviceType.XPU: + env["ZE_AFFINITY_MASK"] = mask + env.pop("CUDA_VISIBLE_DEVICES", None) + else: + env["CUDA_VISIBLE_DEVICES"] = mask self._stdout_lines = [] self._process = subprocess.Popen( @@ -3260,7 +3271,12 @@ def init_audio_codec(self, audio_type: str) -> None: if LlamaCppBackend._codec_mgr is None: LlamaCppBackend._codec_mgr = AudioCodecManager() - device = get_torch_device_str() + # Preserve the pre-PR CPU fallback on non-CUDA hosts: the SNAC / + # BiCodec / DAC codecs are not yet validated on Intel XPU, so + # only promote to a GPU device when CUDA is actually available. + # A follow-up can extend this once an XPU-specific codec path is + # added. + device = "cuda" if torch.cuda.is_available() else "cpu" model_repo_path = None # BiCodec needs a repo with BiCodec/ weights — download canonical SparkTTS @@ -3332,7 +3348,11 @@ def generate_audio_response( else None ) - device = get_torch_device_str() + # Match init_audio_codec: stay on CPU for non-CUDA hosts until the + # codec path is validated on XPU. 
+ import torch + + device = "cuda" if torch.cuda.is_available() else "cpu" return LlamaCppBackend._codec_mgr.decode( audio_type, device, token_ids = token_ids, text = data.get("content", "") ) diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py index 881350610e..b00d1b6c8b 100644 --- a/studio/backend/core/training/trainer.py +++ b/studio/backend/core/training/trainer.py @@ -38,7 +38,6 @@ safe_num_proc, dataset_map_num_proc, get_device_map, - get_torch_device_str, raise_if_offloaded, get_visible_gpu_count, ) @@ -1542,7 +1541,10 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None): SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz" SNAC_SAMPLE_RATE = 24000 - device = get_torch_device_str() + # SNAC codec has not been validated on Intel XPU yet; keep the + # pre-PR CPU fallback for non-CUDA hosts until an XPU-specific + # path is added. + device = "cuda" if torch.cuda.is_available() else "cpu" max_length = self.max_seq_length or 2048 tokenizer = self.tokenizer @@ -1747,7 +1749,9 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None): import subprocess - device = get_torch_device_str() + # Spark-TTS BiCodec has not been validated on Intel XPU; keep the + # pre-PR CPU fallback for non-CUDA hosts. + device = "cuda" if torch.cuda.is_available() else "cpu" # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo, # NOT in the unsloth/Spark-TTS-0.5B HF model repo. Clone it if needed. @@ -1983,7 +1987,9 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None): from datasets import Dataset as HFDataset from utils.paths import ensure_dir, tmp_root - device = get_torch_device_str() + # OuteTTS DAC/Whisper preprocess has not been validated on Intel + # XPU; keep the pre-PR CPU fallback for non-CUDA hosts. 
+ device = "cuda" if torch.cuda.is_available() else "cpu" # Clone OuteTTS repo (same as audio_codecs._load_dac) import subprocess diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index f310fbee2a..0627212499 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -502,6 +502,23 @@ def _resolve_xpu_smi_device_id() -> int: _XPU_SMI_NA = frozenset(("", "n/a", "na", "-")) +# Cached xpu-smi binary path. _XPU_SMI_PATH_UNSET is a sentinel distinct +# from None: None means "scanned PATH and not found" while the sentinel +# means "not scanned yet". Resolved once by _resolve_xpu_smi_binary() so +# live telemetry polls do not re-scan PATH on every tick. +_XPU_SMI_PATH_UNSET: Any = object() +_xpu_smi_binary: Any = _XPU_SMI_PATH_UNSET + + +def _resolve_xpu_smi_binary() -> Optional[str]: + """Return cached absolute path to ``xpu-smi`` or None if not on PATH.""" + global _xpu_smi_binary + if _xpu_smi_binary is _XPU_SMI_PATH_UNSET: + import shutil as _shutil + + _xpu_smi_binary = _shutil.which("xpu-smi") + return _xpu_smi_binary + def _parse_xpu_smi_metric(value: str) -> Optional[float]: """Return float or None for missing/unknown xpu-smi CSV column values. @@ -528,12 +545,14 @@ def _get_xpu_utilization() -> Dict[str, Any]: dev_idx = _resolve_xpu_smi_device_id() try: - import shutil import subprocess # Skip subprocess entirely when xpu-smi is not on PATH, avoiding # a multi-second timeout on systems without the Intel tooling. - xpu_smi = shutil.which("xpu-smi") + # The binary path is resolved once and cached by + # _resolve_xpu_smi_binary() so repeated telemetry polls do not + # re-scan PATH on every tick. 
+ xpu_smi = _resolve_xpu_smi_binary() if xpu_smi is None: raise FileNotFoundError("xpu-smi not found") @@ -1503,7 +1522,6 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: device = get_device() if device in (DeviceType.CUDA, DeviceType.XPU): parent_visible_spec = _get_parent_visible_gpu_spec() - parent_visible_ids = get_parent_visible_gpu_ids() # Honor an explicit "no devices visible" mask (ZE_AFFINITY_MASK="" # or CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the @@ -1522,6 +1540,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: "index_kind": "relative", } + parent_visible_ids = get_parent_visible_gpu_ids() + # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm) if device == DeviceType.CUDA and not IS_ROCM: try: @@ -1708,10 +1728,13 @@ def apply_gpu_ids(gpu_ids) -> None: # so worker subprocesses are actually restricted to the intended GPU. if get_device() == DeviceType.XPU: os.environ["ZE_AFFINITY_MASK"] = value - # Clear any stale CUDA_VISIBLE_DEVICES the parent may have inherited - # so tools that inspect the environment do not show conflicting - # pinning state (torch.xpu itself only reads ZE_AFFINITY_MASK). - os.environ.pop("CUDA_VISIBLE_DEVICES", None) + # Deliberately leave any inherited CUDA_VISIBLE_DEVICES alone: on + # hybrid NVIDIA+Intel hosts the parent may have set + # CUDA_VISIBLE_DEVICES="" to disable NVIDIA and force Studio onto + # XPU. Popping the variable here would let the worker's follow-up + # detect_hardware() call flip back to CUDA. torch.xpu only reads + # ZE_AFFINITY_MASK, so an extra CUDA_VISIBLE_DEVICES entry in env + # is cosmetically stale but functionally harmless. 
_visible_gpu_count = None logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value) return @@ -1905,10 +1928,20 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]: if get_device() == DeviceType.XPU: try: import torch - - if hasattr(torch, "xpu") and torch.xpu.is_initialized(): - return None except Exception: - return None + # No torch means no XPU runtime is active here, so CPU-side + # dataset parallelism is still safe. + return safe_num_proc(desired) + + xpu = getattr(torch, "xpu", None) + is_initialized = getattr(xpu, "is_initialized", None) + if callable(is_initialized): + try: + if is_initialized(): + return None + except Exception: + # Treat a failing probe as "runtime not touched yet" so + # pre-init CPU preprocessing can still parallelize. + pass return safe_num_proc(desired)