diff --git a/studio/backend/core/inference/inference.py b/studio/backend/core/inference/inference.py index 33647a6cd1..743783d9d9 100644 --- a/studio/backend/core/inference/inference.py +++ b/studio/backend/core/inference/inference.py @@ -10,6 +10,7 @@ from transformers import TextStreamer from peft import PeftModel, PeftModelForCausalLM +import contextlib import json import sys import torch @@ -1670,8 +1671,30 @@ def _generate_dac( + text + "<|text_end|>\n<|audio_start|><|global_features_start|>\n" ) + with torch.inference_mode(): - with torch.amp.autocast("cuda", dtype = model.dtype): + # Derive the autocast device from the loaded model, not from the + # global backend: a CPU-fallback DAC on an XPU/CUDA host must not + # open a GPU autocast context around CPU tensors. + device_type = ( + model.device.type + if hasattr(model.device, "type") + else str(model.device).split(":", 1)[0] + ) + # Clamp to autocast-supported backends so exotic devices + # (e.g. "meta" during accelerate offloaded loading) do not raise. + # MPS is autocast-supported since torch 2.3, keep it in the set. + if device_type not in ("cuda", "xpu", "mps", "cpu"): + device_type = "cpu" + # CPU and XPU autocast only accept bfloat16/float16. For a + # float32 model, skip autocast entirely to avoid raising or + # producing a warning on every generate call. + autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16) + if device_type in ("cpu", "xpu") and not autocast_dtype_supported: + autocast_ctx = contextlib.nullcontext() + else: + autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype) + with autocast_ctx: inputs = tokenizer([prompt], return_tensors = "pt").to(model.device) generated = model.generate( **inputs, diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 72dcc11beb..7162e6bfcc 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -55,6 +55,7 @@ RENDER_HTML_REPEAT_NUDGE, parse_tool_calls_from_text as _shared_parse_tool_calls_from_text, ) +from utils.hardware import clear_gpu_cache logger = get_logger(__name__) @@ -1245,23 +1246,28 @@ def _amd_apu_wants_unified_memory() -> bool: @staticmethod def _get_gpu_free_memory() -> list[tuple[int, int]]: - """Query free memory per GPU. - - Order: - 1. ``nvidia-smi`` (NVIDIA CUDA hosts) -- respects - ``CUDA_VISIBLE_DEVICES``. - 2. ``torch.cuda.mem_get_info`` -- universal fallback that - works on AMD ROCm too because the HIP runtime - reuses the entire ``torch.cuda.*`` namespace. Covers the - AMD case for issue #5106 (nvidia-smi-only probe silently - returned [] on AMD hosts) and also rescues NVIDIA hosts - where ``nvidia-smi`` is missing from PATH. - - Returns list of (gpu_index, free_mib) sorted by index. Empty - list if no supported GPU is reachable. + """Query free memory per visible GPU, backend-aware. + + Returns list of ``(gpu_index, free_mib)`` sorted by index. The index + space matches whatever the active backend exposes: physical + ``nvidia-smi`` indices on NVIDIA; parent-visible numeric IDs on + AMD/ROCm and Intel XPU (via Studio's hardware telemetry layer). + Returns an empty list if no per-GPU free-memory data is available, + which lets the caller fall through to a non-placement launch path. """ - # ── NVIDIA via nvidia-smi ──────────────────────────────────── + import os + + from utils.hardware import get_device + from utils.hardware.hardware import DeviceType + import utils.hardware.hardware as _hw_mod + + # Fast path: NVIDIA / nvidia-smi. Skip only when we know the backend + # is XPU or ROCm -- not CUDA, CPU-only, or undetected. + _detected = get_device() + nvidia_eligible = _detected != DeviceType.XPU and not getattr(_hw_mod, "IS_ROCM", False) try: + if not nvidia_eligible: + raise FileNotFoundError # skip to generic telemetry path result = subprocess.run( [ "nvidia-smi", @@ -1275,7 +1281,9 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: **_windows_hidden_subprocess_kwargs(), ) if result.returncode == 0: - allowed: Optional[set[int]] = None + # Filter nvidia-smi output by CUDA_VISIBLE_DEVICES. + # Skip empty tokens so trailing commas don't disable the filter. + allowed = None cvd = os.environ.get("CUDA_VISIBLE_DEVICES") if cvd is not None: try: @@ -1286,8 +1294,9 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: # filtered out, matching the codebase convention. allowed = set(int(x.strip()) for x in cvd.split(",") if x.strip()) except ValueError: - pass - gpus: list[tuple[int, int]] = [] + pass # Non-numeric (e.g., "GPU-uuid"), ignore filter + + gpus = [] for line in result.stdout.strip().splitlines(): parts = line.split(",") if len(parts) == 2: @@ -1296,71 +1305,55 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: if allowed is not None and idx not in allowed: continue gpus.append((idx, free_mib)) - # Match the docstring's sort-by-id guarantee. nvidia-smi - # almost always returns sorted output, but driver order - # is not formally guaranteed. - gpus.sort(key = lambda g: g[0]) if gpus: - return gpus + return sorted(gpus, key = lambda item: item[0]) + except FileNotFoundError: + pass # nvidia-smi not on PATH — fall through to generic path except Exception as e: - logger.debug(f"nvidia-smi probe failed: {e}") + logger.debug(f"nvidia-smi free-memory query failed: {e}") - # ── Torch fallback (covers AMD ROCm and missing nvidia-smi) ── + # Generic path: ROCm, XPU, or nvidia-smi absent/failed. try: - import torch - - if not hasattr(torch, "cuda") or not torch.cuda.is_available(): - return [] - if not hasattr(torch.cuda, "mem_get_info"): - return [] - # torch.cuda enumerates GPUs RELATIVE to the visibility mask. - # On NVIDIA builds the mask is CUDA_VISIBLE_DEVICES; on AMD - # ROCm builds it is HIP_VISIBLE_DEVICES (or ROCR_VISIBLE_DEVICES - # if HIP is unset). Downstream we feed these IDs back into the - # llama-server subprocess as CVD, so we must translate visible - # ordinals back to physical indices first; otherwise launching - # with ``CUDA_VISIBLE_DEVICES=2,3`` would get rewritten to - # ``CUDA_VISIBLE_DEVICES=0,1`` and target the wrong GPUs. - physical_ids: Optional[list[int]] = None - # Match the codebase convention in - # ``utils/hardware/hardware.py::_get_parent_visible_gpu_spec``: - # treat an explicitly empty mask (``HIP_VISIBLE_DEVICES=""``) - # as "set to no GPUs" rather than falling through to the next - # var. ``or`` would coerce empty string to falsy and silently - # promote the wrong source. - if getattr(torch.version, "hip", None) is not None: - hip_v = os.environ.get("HIP_VISIBLE_DEVICES") - rocr_v = os.environ.get("ROCR_VISIBLE_DEVICES") - cvd = ( - hip_v - if hip_v is not None - else rocr_v - if rocr_v is not None - else os.environ.get("CUDA_VISIBLE_DEVICES") + from utils.hardware import get_visible_gpu_utilization + + utilization = get_visible_gpu_utilization() + + # Relative ordinals are not safe to round-trip into + # visibility env vars. Return [] so llama-server inherits + # the parent's mask unchanged. + if utilization.get("index_kind") not in (None, "physical"): + logger.debug( + "Skipping GPU placement: telemetry reports index_kind=%r " + "(not reusable for placement)", + utilization.get("index_kind"), ) - else: - cvd = os.environ.get("CUDA_VISIBLE_DEVICES") - if cvd is not None: - try: - # Empty mask (CVD="") yields an empty list so the - # below loop produces no GPUs, consistent with the - # nvidia-smi path and utils/hardware/hardware.py. - physical_ids = [int(x.strip()) for x in cvd.split(",") if x.strip()] - except ValueError: - physical_ids = None - gpus = [] - for ordinal in range(torch.cuda.device_count()): - free_bytes, _total_bytes = torch.cuda.mem_get_info(ordinal) - idx = ( - physical_ids[ordinal] - if physical_ids is not None and ordinal < len(physical_ids) - else ordinal - ) - gpus.append((idx, free_bytes // (1024 * 1024))) - # Match the nvidia-smi path's docstring guarantee of sorted-by-id. - return sorted(gpus, key = lambda g: g[0]) + return [] + + gpus: list[tuple[int, int]] = [] + for device in utilization.get("devices", []) or []: + index = device.get("index") + + # Use explicit ``is None`` checks -- ``or`` would treat an + # idle GPU with vram_used_gb == 0.0 as missing telemetry and + # silently drop a perfectly valid free card. + total_gb = device.get("vram_total_gb") + if total_gb is None: + total_gb = device.get("total_gb") + + used_gb = device.get("vram_used_gb") + if used_gb is None: + used_gb = device.get("used_gb") + + if index is None or total_gb is None or used_gb is None: + # Missing telemetry for this device -- skip rather than + # invent a free-memory number that drives placement. + continue + + free_mib = max(int((float(total_gb) - float(used_gb)) * 1024), 0) + gpus.append((int(index), free_mib)) + return sorted(gpus, key = lambda item: item[0]) except Exception as e: - logger.debug(f"torch GPU probe failed: {e}") + logger.debug(f"Generic GPU free-memory query failed: {e}") return [] # Skip the wait when the last kill is older than this; the GPU @@ -3849,10 +3842,7 @@ def unload_model(self) -> bool: if LlamaCppBackend._codec_mgr is not None: LlamaCppBackend._codec_mgr.unload() LlamaCppBackend._codec_mgr = None - import torch - - if torch.cuda.is_available(): - torch.cuda.empty_cache() + clear_gpu_cache() return True def _kill_process(self): @@ -5439,6 +5429,7 @@ def init_audio_codec(self, audio_type: str) -> None: if LlamaCppBackend._codec_mgr is None: LlamaCppBackend._codec_mgr = AudioCodecManager() + # Audio codecs are only validated on CUDA; stay on CPU otherwise. device = "cuda" if torch.cuda.is_available() else "cpu" model_repo_path = None @@ -5501,6 +5492,8 @@ def generate_audio_response( else None ) + # Match init_audio_codec: stay on CPU for non-CUDA hosts until the + # codec path is validated on XPU. import torch device = "cuda" if torch.cuda.is_available() else "cpu" diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py index 781c18ff15..5c9f196178 100644 --- a/studio/backend/core/training/trainer.py +++ b/studio/backend/core/training/trainer.py @@ -1505,6 +1505,10 @@ def _preprocess_snac_dataset( SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz" SNAC_SAMPLE_RATE = 24000 + + # SNAC codec has not been validated on Intel XPU yet; keep the + # pre-PR CPU fallback for non-CUDA hosts until an XPU-specific + # path is added. device = "cuda" if torch.cuda.is_available() else "cpu" max_length = self.max_seq_length or 2048 tokenizer = self.tokenizer @@ -1666,7 +1670,8 @@ def _preprocess_snac_dataset( del snac_model gc.collect() - torch.cuda.empty_cache() + + clear_gpu_cache() self._cuda_audio_used = True if not processed_examples: @@ -1692,6 +1697,10 @@ def _preprocess_bicodec_dataset( import numpy as np import torchaudio.transforms as T + import subprocess + + # Spark-TTS BiCodec has not been validated on Intel XPU; keep the + # pre-PR CPU fallback for non-CUDA hosts. device = "cuda" if torch.cuda.is_available() else "cpu" # The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo, @@ -1880,7 +1889,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor: del audio_tokenizer gc.collect() - torch.cuda.empty_cache() + + clear_gpu_cache() self._cuda_audio_used = True if not processed_examples: @@ -1916,6 +1926,8 @@ def _preprocess_dac_dataset( from datasets import Dataset as HFDataset from utils.paths import ensure_dir, tmp_root + # OuteTTS DAC/Whisper preprocess has not been validated on Intel + # XPU; keep the pre-PR CPU fallback for non-CUDA hosts. device = "cuda" if torch.cuda.is_available() else "cpu" # Clone OuteTTS repo (same as audio_codecs._load_dac) @@ -2087,7 +2099,8 @@ def _preprocess_dac_dataset( del prompt_processor gc.collect() - torch.cuda.empty_cache() + + clear_gpu_cache() self._cuda_audio_used = True if not processed_examples: diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py index d2fa45dd92..97c0057b32 100644 --- a/studio/backend/models/inference.py +++ b/studio/backend/models/inference.py @@ -62,7 +62,15 @@ def normalize_blank_chat_template_override(cls, value: Optional[str]) -> Optiona ) gpu_ids: Optional[List[int]] = Field( None, - description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries. Not supported for GGUF models.", + description = ( + "Physical GPU indices to use, for example [0, 1]. Omit or pass " + "[] to use automatic selection. Explicit gpu_ids are unsupported " + "when the parent visibility mask uses non-numeric or subdevice " + "entries -- this includes CUDA_VISIBLE_DEVICES with UUID/MIG " + "entries on NVIDIA, and ZE_AFFINITY_MASK with subdevice tokens " + "(e.g. '0.0,0.1') or FLAT-hierarchy (default) tile handles on " + "Intel XPU. Not supported for GGUF models." + ), ) speculative_type: Optional[str] = Field( None, diff --git a/studio/backend/models/training.py b/studio/backend/models/training.py index cef0b8f36f..1ac8306b9a 100644 --- a/studio/backend/models/training.py +++ b/studio/backend/models/training.py @@ -337,7 +337,15 @@ def _check_lora_dropout(cls, v: float) -> float: # GPU selection gpu_ids: Optional[List[int]] = Field( None, - description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries.", + description = ( + "Physical GPU indices to use, for example [0, 1]. Omit or pass " + "[] to use automatic selection. Explicit gpu_ids are unsupported " + "when the parent visibility mask uses non-numeric or subdevice " + "entries -- this includes CUDA_VISIBLE_DEVICES with UUID/MIG " + "entries on NVIDIA, and ZE_AFFINITY_MASK with subdevice tokens " + "(e.g. '0.0,0.1') or FLAT-hierarchy (default) tile handles on " + "Intel XPU." + ), ) @model_validator(mode = "after") diff --git a/studio/backend/tests/test_gpu_selection.py b/studio/backend/tests/test_gpu_selection.py index 90020efe43..61fcf3b09b 100644 --- a/studio/backend/tests/test_gpu_selection.py +++ b/studio/backend/tests/test_gpu_selection.py @@ -98,7 +98,8 @@ def test_explicit_ids_are_rejected_for_uuid_parent_visibility(self): patch("utils.hardware.hardware.get_physical_gpu_count", return_value = 8), ): with self.assertRaisesRegex( - ValueError, "unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG" + ValueError, + "unsupported when CUDA_VISIBLE_DEVICES uses non-numeric or subdevice", ): resolve_requested_gpu_ids([1]) @@ -681,12 +682,19 @@ def start(self): class TestRouteErrors(unittest.TestCase): + def test_prepare_gpu_selection_rejects_gpu_ids_on_non_accelerator_backend(self): + with patch("utils.hardware.hardware.get_device", return_value = DeviceType.CPU): + with self.assertRaises(ValueError) as exc_info: + prepare_gpu_selection([0], model_name = "unsloth/test") + + self.assertIn("only supported on CUDA and Intel XPU", str(exc_info.exception)) + def test_prepare_gpu_selection_rejects_gpu_ids_on_non_cuda_backend(self): with patch("utils.hardware.hardware.get_device", return_value = DeviceType.CPU): with self.assertRaises(ValueError) as exc_info: prepare_gpu_selection([0], model_name = "unsloth/test") - self.assertIn("only supported on CUDA devices", str(exc_info.exception)) + self.assertIn("only supported on", str(exc_info.exception)) def test_inference_route_rejects_gpu_ids_for_gguf(self): inference_route = _load_route_module( @@ -1246,19 +1254,85 @@ def test_auto_select_falls_back_when_estimate_unavailable(self): self.assertEqual(metadata["selection_mode"], "fallback_all") +class TestXpuSelection(_GpuCacheResetMixin, unittest.TestCase): + def test_auto_select_supports_xpu(self): + with ( + patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU), + patch( + "utils.hardware.hardware.estimate_required_model_memory_gb", + return_value = (1.0, {}), + ), + patch( + "utils.hardware.hardware.get_visible_gpu_utilization", + return_value = { + "devices": [ + {"index": 0, "vram_total_gb": 8, "vram_used_gb": 1}, + ] + }, + ), + patch( + "utils.hardware.hardware._get_parent_visible_gpu_spec", + return_value = { + "raw": None, + "numeric_ids": [0], + "supports_explicit_gpu_ids": True, + }, + ), + patch( + "utils.hardware.hardware.get_parent_visible_gpu_ids", + return_value = [0], + ), + ): + selected, metadata = auto_select_gpu_ids("unsloth/test") + + self.assertEqual(selected, [0]) + self.assertEqual(metadata["selection_mode"], "auto") + + def test_prepare_gpu_selection_accepts_explicit_ids_on_xpu(self): + with ( + patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU), + patch( + "utils.hardware.hardware._get_parent_visible_gpu_spec", + return_value = { + "raw": "0", + "numeric_ids": [0], + "supports_explicit_gpu_ids": True, + }, + ), + patch( + "utils.hardware.hardware.get_parent_visible_gpu_ids", + return_value = [0], + ), + patch("utils.hardware.hardware.get_physical_gpu_count", return_value = 1), + ): + selected, metadata = prepare_gpu_selection([0], model_name = "unsloth/test") + + self.assertEqual(selected, [0]) + self.assertEqual(metadata["selection_mode"], "explicit") + + class TestXpuRejection(_GpuCacheResetMixin, unittest.TestCase): def test_auto_select_returns_non_cuda_for_xpu(self): - with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU): + with ( + patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU), + patch( + "utils.hardware.hardware._get_parent_visible_gpu_spec", + return_value = { + "raw": None, + "numeric_ids": [], + "supports_explicit_gpu_ids": False, + }, + ), + patch( + "utils.hardware.hardware.get_parent_visible_gpu_ids", + return_value = [], + ), + ): selected, metadata = auto_select_gpu_ids("unsloth/test") self.assertIsNone(selected) self.assertEqual(metadata["selection_mode"], "non_cuda") - def test_prepare_gpu_selection_rejects_explicit_ids_on_xpu(self): - with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU): - with self.assertRaisesRegex(ValueError, "only supported on CUDA"): - prepare_gpu_selection([0], model_name = "unsloth/test") - class TestEstimateFp16ModelSizeBytesPrefersLocalWeights(unittest.TestCase): def _run( diff --git a/studio/backend/tests/test_gpu_selection_sandbox.py b/studio/backend/tests/test_gpu_selection_sandbox.py index 073f8d9437..1f2f5a228e 100644 --- a/studio/backend/tests/test_gpu_selection_sandbox.py +++ b/studio/backend/tests/test_gpu_selection_sandbox.py @@ -294,13 +294,13 @@ def test_two_gpus_needed(self): # 35GB (first) + 30*0.85 (second) = 60.5GB > 50GB self.assertEqual(len(selected), 2) - def test_non_cuda_returns_none(self): + def test_non_accelerator_returns_none(self): from utils.hardware.hardware import auto_select_gpu_ids import utils.hardware.hardware as hw with patch.object(hw, "get_device", return_value = hw.DeviceType.CPU): selected, meta = auto_select_gpu_ids("test/model") self.assertIsNone(selected) - self.assertEqual(meta["selection_mode"], "non_cuda") + self.assertEqual(meta["selection_mode"], "non_accelerator") class TestGetDeviceMap(unittest.TestCase): diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index 400b5dd066..c5bc97130c 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -29,6 +29,7 @@ estimate_required_model_memory_gb, auto_select_gpu_ids, prepare_gpu_selection, + get_torch_device_str, safe_num_proc, safe_thread_num_proc, dataset_map_num_proc, @@ -70,6 +71,7 @@ "estimate_required_model_memory_gb", "auto_select_gpu_ids", "prepare_gpu_selection", + "get_torch_device_str", "safe_num_proc", "safe_thread_num_proc", "dataset_map_num_proc", diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 3f2823302a..27bfc3f8a5 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -50,7 +50,7 @@ class DeviceType(str, Enum): # ========== Global State (set once by detect_hardware) ========== DEVICE: Optional[DeviceType] = None -CHAT_ONLY: bool = True # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.) +CHAT_ONLY: bool = True # No CUDA/XPU GPU -> GGUF chat only (Mac, CPU-only, etc.) IS_ROCM: bool = False # True when running on AMD ROCm (HIP) -- routes GPU monitoring to amd.py @@ -64,7 +64,8 @@ def _backend_label(device: DeviceType) -> str: to the Studio frontend and other clients, however, "cuda" is misleading on an AMD machine. This helper swaps the label to ``"rocm"`` when the module-level ``IS_ROCM`` flag is set so the UI can render the correct - backend name without every caller having to duplicate the check. + backend name without every caller having to duplicate the check. XPU + and other backends fall through to ``device.value`` unchanged. """ if IS_ROCM and device == DeviceType.CUDA: return "rocm" @@ -105,17 +106,61 @@ def detect_hardware() -> DeviceType: Safe to call multiple times (idempotent). Detection order: - 1. CUDA (NVIDIA GPU, requires torch) - 2. MLX (Apple Silicon via MLX framework) - 3. CPU (fallback) + 1. XPU-preferred: honoured only when the caller has sent an + unambiguous "prefer XPU" signal -- CUDA explicitly hidden via + ``CUDA_VISIBLE_DEVICES="" / "-1"``, or ``UNSLOTH_FORCE_XPU=1``, + or CUDA simply not available on the host -- AND a non-empty + ``ZE_AFFINITY_MASK`` is set, AND ``torch.xpu`` actually reports + a device. A stray ``ZE_AFFINITY_MASK=0`` inherited from the + shell is not sufficient: CUDA still wins on hybrid hosts in + that case. + 2. CUDA (NVIDIA GPU, requires torch) + 3. XPU (Intel GPU, requires torch with XPU support) + 4. MLX (Apple Silicon via MLX framework) + 5. CPU (fallback) """ global DEVICE, CHAT_ONLY, IS_ROCM - CHAT_ONLY = True # reset -- only CUDA/ROCm sets it to False + CHAT_ONLY = True # reset -- only CUDA/ROCm/XPU sets it to False IS_ROCM = False - # --- CUDA / ROCm: try PyTorch --- + # --- CUDA / ROCm / XPU: try PyTorch --- if _has_torch(): import torch + + # --- Explicit-XPU hint --- + # Prefer XPU when: (a) UNSLOTH_FORCE_XPU=1, or + # (b) ZE_AFFINITY_MASK set + CUDA hidden/unavailable. + # A bare ZE_AFFINITY_MASK alone is NOT enough (can leak from + # unrelated Intel tooling). torch.xpu must report a device. + ze_mask = os.environ.get("ZE_AFFINITY_MASK") + cvd = os.environ.get("CUDA_VISIBLE_DEVICES") + cuda_hidden = cvd is not None and cvd.strip() in ("", "-1") + force_xpu = os.environ.get("UNSLOTH_FORCE_XPU") == "1" + try: + cuda_unavailable = not torch.cuda.is_available() + except Exception: + cuda_unavailable = True + + prefer_xpu = force_xpu or (bool(ze_mask) and (cuda_hidden or cuda_unavailable)) + if prefer_xpu: + try: + xpu_ok = hasattr(torch, "xpu") and torch.xpu.is_available() + except Exception: + xpu_ok = False + if xpu_ok: + DEVICE = DeviceType.XPU + CHAT_ONLY = False + device_name = torch.xpu.get_device_name(0) + if force_xpu and not ze_mask: + reason = "UNSLOTH_FORCE_XPU=1" + elif force_xpu: + reason = "UNSLOTH_FORCE_XPU=1 + ZE_AFFINITY_MASK" + else: + reason = "ZE_AFFINITY_MASK hint honoured" + print(f"Hardware detected: XPU -- {device_name} ({reason})") + return DEVICE + + # --- CUDA: NVIDIA GPU --- if torch.cuda.is_available(): DEVICE = DeviceType.CUDA CHAT_ONLY = False @@ -134,9 +179,7 @@ def detect_hardware() -> DeviceType: print(f"Hardware detected: CUDA -- {device_name}") return DEVICE - # --- XPU: Intel GPU --- - if _has_torch(): - import torch + # --- XPU: Intel GPU --- if hasattr(torch, "xpu") and torch.xpu.is_available(): DEVICE = DeviceType.XPU CHAT_ONLY = False @@ -191,9 +234,18 @@ def clear_gpu_cache(): torch.cuda.empty_cache() torch.cuda.ipc_collect() elif device == DeviceType.XPU: - import torch - torch.xpu.synchronize() - torch.xpu.empty_cache() + # Older torch-xpu builds may be missing synchronize/empty_cache; + # guard the calls so a stale build does not propagate AttributeError + # through callers that do not wrap clear_gpu_cache() themselves. + try: + import torch + if hasattr(torch, "xpu"): + if hasattr(torch.xpu, "synchronize"): + torch.xpu.synchronize() + if hasattr(torch.xpu, "empty_cache"): + torch.xpu.empty_cache() + except Exception as e: + logger.debug("Failed to clear XPU cache: %s", e) elif device == DeviceType.MLX: # MLX manages memory automatically; no explicit cache clear needed. # mlx.core has no empty_cache equivalent — gc.collect() above is enough. @@ -369,14 +421,22 @@ def get_package_versions() -> Dict[str, Optional[str]]: except PackageNotFoundError: versions[name] = None - # GPU runtime version bundled with torch + # GPU runtime versions bundled with torch (CUDA, ROCm/HIP, Intel XPU) try: import torch + versions["cuda"] = getattr(torch.version, "cuda", None) versions["rocm"] = getattr(torch.version, "hip", None) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + # torch.version.xpu exists on modern torch builds but may be None; + # fall back to "available" so the UI distinguishes present-but-unknown + # from "package not found". + xpu_ver = getattr(torch.version, "xpu", None) + versions["xpu"] = xpu_ver if xpu_ver is not None else "available" except Exception: versions["cuda"] = None versions["rocm"] = None + versions["xpu"] = None return versions @@ -412,6 +472,7 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] if mod is None: return [] + device = get_device() devices = [] for ordinal, phys_idx in enumerate(device_indices): try: @@ -423,6 +484,11 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] if hasattr(mod, "mem_get_info"): free_bytes, total_bytes = mod.mem_get_info(ordinal) used_bytes = total_bytes - free_bytes + elif device == DeviceType.XPU: + # XPU without mem_get_info: memory_allocated() is process- + # local and misleading for placement. Return None so the + # selector uses its no-telemetry fallback. + used_bytes = None else: used_bytes = mod.memory_allocated(ordinal) devices.append( @@ -431,7 +497,9 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] "visible_ordinal": ordinal, "name": props.name, "total_gb": round(total_bytes / (1024**3), 2), - "used_gb": round(used_bytes / (1024**3), 2), + "used_gb": ( + round(used_bytes / (1024**3), 2) if used_bytes is not None else None + ), } ) except Exception as e: @@ -442,6 +510,213 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] # ========== Live GPU Utilization ========== +def _xpu_hierarchy_is_composite() -> bool: + """Return True iff Level Zero is running in COMPOSITE device hierarchy. + + In COMPOSITE mode, numeric ``ZE_AFFINITY_MASK`` entries address root + GPU IDs with subdevice syntax ``N.M`` for tiles. In FLAT mode (the + oneAPI default since the 2024 runtime), numeric entries address tile / + device-handle ordinals, so mapping them back to root GPU IDs is not + safe. Treat an unset ``ZE_FLAT_DEVICE_HIERARCHY`` as FLAT, which is the + documented default. Only COMPOSITE gives stable root-ID semantics. + """ + hierarchy = (os.environ.get("ZE_FLAT_DEVICE_HIERARCHY") or "FLAT").strip().upper() + return hierarchy == "COMPOSITE" + + +def _parse_ze_mask_roots(mask: str) -> list[int]: + """Parse a ``ZE_AFFINITY_MASK`` value into an ordered list of root device IDs. + + Returns one root ID per mask token, preserving order and duplicates so + that logical ordinals map 1-to-1 back to physical root IDs. For example + ``"0.0,0.1"`` yields ``[0, 0]`` (two logical devices, both under root + GPU 0) and ``"2.0,0.1,0.2"`` yields ``[2, 0, 0]``. Returns an empty + list if the mask is empty or contains no parseable digits. + + Only meaningful in COMPOSITE device hierarchy. Callers that need a + stable root-ID mapping must gate this call on + ``_xpu_hierarchy_is_composite()``. + """ + roots: list[int] = [] + if not mask: + return roots + for token in mask.split(","): + token = token.strip() + if not token: + continue + root = token.split(".", 1)[0] + # Use str.isdecimal() (not str.isdigit()) so Unicode superscripts + # like "2" / "3" are rejected -- they satisfy isdigit() but crash + # int() with ValueError. + if root.isdecimal(): + roots.append(int(root)) + return roots + + +def _resolve_xpu_smi_device_id() -> Optional[int]: + """Resolve the physical root device ID used by ``xpu-smi -d``. + + ``torch.xpu.current_device()`` returns the logical ordinal after + ``ZE_AFFINITY_MASK`` remapping, whereas ``xpu-smi`` addresses physical + root devices. Translate the ordinal through the mask roots so telemetry + targets the GPU the process is actually running on. Subdevice syntax + such as ``0.0,0.1`` collapses to a single root device. + + Returns ``None`` when we cannot produce a safe root-ID mapping -- + either because ``ZE_FLAT_DEVICE_HIERARCHY=FLAT`` (the oneAPI default, + in which case numeric mask entries address tile handles, not root GPU + IDs) or because torch.xpu is unavailable. Callers should treat ``None`` + as "skip xpu-smi -d and fall back to torch.xpu memory telemetry". + """ + # In FLAT mode, numeric ZE_AFFINITY_MASK entries are tile / device + # handles, not root GPU IDs. Querying xpu-smi -d would + # target the wrong physical device (or raise), so signal "no safe + # mapping" by returning None. + if not _xpu_hierarchy_is_composite(): + return None + + ordinal = 0 + xpu_ok = False + try: + import torch + xpu_ok = hasattr(torch, "xpu") and torch.xpu.is_available() + if xpu_ok: + ordinal = int(torch.xpu.current_device()) + except Exception as e: + logger.debug("torch.xpu.current_device() probe failed: %s", e) + + mask = (os.environ.get("ZE_AFFINITY_MASK") or "").strip() + roots = _parse_ze_mask_roots(mask) + if roots: + return roots[ordinal] if 0 <= ordinal < len(roots) else roots[0] + + return ordinal if xpu_ok else 0 + + +_XPU_SMI_NA = frozenset(("", "n/a", "na", "-")) + +# Cached xpu-smi binary path. _XPU_SMI_PATH_UNSET is a sentinel distinct +# from None: None means "scanned PATH and not found" while the sentinel +# means "not scanned yet". Resolved once by _resolve_xpu_smi_binary() so +# live telemetry polls do not re-scan PATH on every tick. +_XPU_SMI_PATH_UNSET: Any = object() +_xpu_smi_binary: Any = _XPU_SMI_PATH_UNSET + + +def _resolve_xpu_smi_binary() -> Optional[str]: + """Return cached absolute path to ``xpu-smi`` or None if not on PATH.""" + global _xpu_smi_binary + if _xpu_smi_binary is _XPU_SMI_PATH_UNSET: + import shutil as _shutil + _xpu_smi_binary = _shutil.which("xpu-smi") + return _xpu_smi_binary + + +def _parse_xpu_smi_metric(value: str) -> Optional[float]: + """Return float or None for missing/unknown xpu-smi CSV column values. + + xpu-smi versions differ slightly in how they render unknown metrics: + empty string, "N/A", "n/a", "NA", or "-". Treat any of these as "value + not available" so a single missing column does not silently drop the + entire telemetry row. + """ + if value.strip().lower() in _XPU_SMI_NA: + return None + try: + return float(value) + except ValueError: + return None + + +def _get_xpu_utilization() -> Dict[str, Any]: + """Return a live snapshot of Intel XPU GPU utilization via ``xpu-smi`` or torch.xpu.""" + gpu_util = None + temp = None + power_w = None + + dev_idx = _resolve_xpu_smi_device_id() + + try: + import subprocess + + # Skip subprocess entirely when xpu-smi is not on PATH, avoiding + # a multi-second timeout on systems without the Intel tooling. + # The binary path is resolved once and cached by + # _resolve_xpu_smi_binary() so repeated telemetry polls do not + # re-scan PATH on every tick. + xpu_smi = _resolve_xpu_smi_binary() + if xpu_smi is None or dev_idx is None: + # dev_idx is None in FLAT hierarchy where numeric affinity + # entries do not map to root GPU IDs -- skip xpu-smi -d and + # fall through to the torch.xpu VRAM-only telemetry below. + raise FileNotFoundError("xpu-smi unavailable for current hierarchy") + + # xpu-smi metric IDs (per Intel xpu-smi docs): + # 0 = GPU Utilization (%) + # 1 = GPU Power (W) + # 2 = GPU Frequency (MHz) + # 3 = GPU Core Temperature (C) + # -n 1 requests exactly one sample so the command exits immediately. + # CSV columns: Timestamp, DeviceId, , , + result = subprocess.run( + [xpu_smi, "dump", "-d", str(dev_idx), "-m", "0,1,3", "-n", "1"], + capture_output = True, + text = True, + timeout = 3, + ) + if result.returncode == 0 and result.stdout.strip(): + lines = result.stdout.strip().splitlines() + for line in reversed(lines): + if line.startswith("Timestamp") or line.startswith("#"): + continue + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 5: + gpu_util = _parse_xpu_smi_metric(parts[2]) + power_w = _parse_xpu_smi_metric(parts[3]) + temp = _parse_xpu_smi_metric(parts[4]) + break + except Exception as e: + logger.debug("xpu-smi query failed: %s", e) + + # Get VRAM from torch.xpu (only reports PyTorch-managed memory). + # Use the same logical ordinal that torch exposes; xpu-smi physical id is + # only needed by the subprocess call above. + vram_used_gb = None + vram_total_gb = None + try: + import torch + if hasattr(torch, "xpu") and torch.xpu.is_available(): + idx = torch.xpu.current_device() + props = torch.xpu.get_device_properties(idx) + vram_total_gb = round(props.total_memory / (1024**3), 2) + vram_used_gb = round(torch.xpu.memory_allocated(idx) / (1024**3), 2) + except Exception as e: + logger.debug("torch.xpu VRAM query failed: %s", e) + + vram_pct = ( + round((vram_used_gb / vram_total_gb) * 100, 1) + if vram_used_gb is not None and vram_total_gb and vram_total_gb > 0 + else None + ) + + has_any = any(v is not None for v in [gpu_util, temp, vram_used_gb, power_w]) + if not has_any: + return {"available": False, "backend": "xpu"} + + return { + "available": True, + "backend": "xpu", + "gpu_utilization_pct": gpu_util, + "temperature_c": temp, + "vram_used_gb": vram_used_gb, + "vram_total_gb": vram_total_gb, + "vram_utilization_pct": vram_pct, + "power_draw_w": power_w, + "power_limit_w": None, + "power_utilization_pct": None, + } + + def _smi_query(func_name: str, *args, **kwargs) -> Optional[Dict[str, Any]]: """Run a query against the appropriate SMI backend (amd-smi or nvidia-smi). @@ -632,6 +907,9 @@ def get_gpu_utilization() -> Dict[str, Any]: """Return a live snapshot of device utilization information.""" device = get_device() + if device == DeviceType.XPU: + return _get_xpu_utilization() + if device == DeviceType.CUDA: result = _smi_query("get_primary_gpu_utilization") if result is not None: @@ -853,22 +1131,48 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) if device in (DeviceType.CUDA, DeviceType.XPU): + parent_visible_spec = _get_parent_visible_gpu_spec() + # Honor an explicit empty visibility env (ZE_AFFINITY_MASK="" or + # CUDA_VISIBLE_DEVICES="" / "-1") as "no devices visible". Without + # this guard, the enumerate-visible-ordinals fallback below would + # happily report devices the process explicitly hid. + if parent_visible_spec["raw"] is not None and parent_visible_spec["numeric_ids"] == []: + return { + "available": False, + "backend": _backend_label(device), + "parent_visible_gpu_ids": [], + "devices": [], + "index_kind": "relative", + } + parent_ids = get_parent_visible_gpu_ids() - # When parent_visible_ids is empty (UUID/MIG mask or no CVD set), - # enumerate torch-visible ordinals so the UI still shows devices. - if parent_ids: + # Only label as "physical" when the IDs are safe to round-trip. + if parent_ids and parent_visible_spec["supports_explicit_gpu_ids"]: torch_indices = parent_ids index_kind = "physical" else: - visible_count = _torch_get_physical_gpu_count() or 0 + visible_count = ( + len(parent_ids) if parent_ids else (_torch_get_physical_gpu_count() or 0) + ) torch_indices = list(range(visible_count)) index_kind = "relative" + parent_ids = [] torch_devices = _torch_get_per_device_info(torch_indices) if torch_devices: devices = [] for td in torch_devices: total = td["total_gb"] used = td["used_gb"] + # used=None is a deliberate "telemetry unavailable" signal + # from _torch_get_per_device_info (e.g. XPU without + # mem_get_info). Propagate it through as None so downstream + # consumers fall back to their no-telemetry path instead of + # dividing by None. + vram_pct = ( + round((used / total) * 100, 1) + if used is not None and total and total > 0 + else None + ) devices.append( { "index": td["index"], @@ -878,9 +1182,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: "temperature_c": None, "vram_used_gb": used, "vram_total_gb": total, - "vram_utilization_pct": round((used / total) * 100, 1) - if total > 0 - else None, + "vram_utilization_pct": vram_pct, "power_draw_w": None, "power_limit_w": None, "power_utilization_pct": None, @@ -942,6 +1244,84 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: def _get_parent_visible_gpu_spec() -> Dict[str, Any]: + # On Intel XPU hosts, device visibility is controlled by ZE_AFFINITY_MASK + # (the Level Zero affinity variable) rather than CUDA_VISIBLE_DEVICES. + if get_device() == DeviceType.XPU: + xpu_mask_raw = os.environ.get("ZE_AFFINITY_MASK") + composite = _xpu_hierarchy_is_composite() + + if xpu_mask_raw is None: + # COMPOSITE: root GPU IDs are stable physical IDs. + if composite: + return { + "raw": None, + "numeric_ids": list(range(get_physical_gpu_count())), + "supports_explicit_gpu_ids": True, + } + # FLAT (oneAPI default): ordinals are tile/device handles, + # not physical GPU IDs. Leave numeric_ids=None so telemetry + # uses relative ordinals. Users who need explicit selection + # can set ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE. + return { + "raw": None, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + + xpu_mask = xpu_mask_raw.strip() + if xpu_mask == "": + return { + "raw": xpu_mask, + "numeric_ids": [], + "supports_explicit_gpu_ids": True, + } + + # Subdevice syntax ("N.M") expands one root into multiple + # logical devices -- not addressable by explicit root-ID selection. + has_subdevice = any("." in token.strip() for token in xpu_mask.split(",") if token.strip()) + if has_subdevice: + return { + "raw": xpu_mask, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + + # FLAT numeric entries are tile handles, not physical GPU IDs. + # Expose in numeric_ids for telemetry but reject as explicit + # gpu_ids so callers don't pin to a tile thinking it's a GPU. + if not composite: + tokens = [token.strip() for token in xpu_mask.split(",") if token.strip()] + if tokens and all(token.isdecimal() for token in tokens): + return { + "raw": xpu_mask, + "numeric_ids": [int(token) for token in tokens], + "supports_explicit_gpu_ids": False, + } + return { + "raw": xpu_mask, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + + # COMPOSITE + pure numeric (subdevice returned above). + # Parse the mask into root GPU IDs; _parse_ze_mask_roots silently + # drops any non-decimal tokens so "*" or "GPU-uuid" yield []. + roots_with_dupes = _parse_ze_mask_roots(xpu_mask) + if not roots_with_dupes: + # Unparseable mask (e.g. "*", "GPU-uuid") -- cannot map to + # physical root IDs. + return { + "raw": xpu_mask, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + + return { + "raw": xpu_mask, + "numeric_ids": roots_with_dupes, + "supports_explicit_gpu_ids": True, + } + # ROCm uses HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES in addition to # CUDA_VISIBLE_DEVICES (which HIP also respects). Check ROCm-specific # env vars first so multi-GPU AMD setups are handled correctly. @@ -1014,11 +1394,14 @@ def resolve_requested_gpu_ids(gpu_ids: Optional[list[int]]) -> list[int]: return parent_visible_ids if not parent_visible_spec["supports_explicit_gpu_ids"]: + env_var_name = ( + "ZE_AFFINITY_MASK" if get_device() == DeviceType.XPU else "CUDA_VISIBLE_DEVICES" + ) raise ValueError( f"Invalid gpu_ids {requested_ids}: explicit physical GPU IDs are " - f"unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG entries " - f"({parent_visible_spec['raw']!r}). Omit gpu_ids to use the " - "parent-visible devices." + f"unsupported when {env_var_name} uses non-numeric or subdevice " + f"entries ({parent_visible_spec['raw']!r}). Omit gpu_ids to use " + "the parent-visible devices." ) if len(set(requested_ids)) != len(requested_ids): @@ -1434,8 +1817,12 @@ def auto_select_gpu_ids( ) -> tuple[Optional[list[int]], Dict[str, Any]]: metadata: Dict[str, Any] = {"selection_mode": "auto"} - if get_device() != DeviceType.CUDA: - metadata["selection_mode"] = "non_cuda" + # Auto-selection relies on per-device free-VRAM telemetry which is + # available on both CUDA (via nvidia-smi) and XPU (via torch.xpu + + # xpu-smi). Other backends (MLX, CPU) do not expose the required + # information, so fall through to inheriting parent visibility. + if get_device() not in (DeviceType.CUDA, DeviceType.XPU): + metadata["selection_mode"] = "non_accelerator" return None, metadata required_gb, estimate_metadata = estimate_required_model_memory_gb( @@ -1454,23 +1841,27 @@ def auto_select_gpu_ids( parent_visible_spec = _get_parent_visible_gpu_spec() metadata["parent_cuda_visible_devices"] = parent_visible_spec["raw"] + # No stable physical GPU IDs available (FLAT, wildcard, subdevice). + # Do not synthesize gpu_ids -- on multi-tile devices that would + # rewrite ZE_AFFINITY_MASK with tile handles. get_device_map() + # still returns "balanced" for multi-visible XPU without gpu_ids. if not parent_visible_spec["supports_explicit_gpu_ids"]: metadata["selection_mode"] = "inherit_parent_visible" metadata["selected_gpu_ids"] = None return None, metadata + parent_ids = get_parent_visible_gpu_ids() + if required_gb is None: # Cannot estimate model size -- fall back to all visible GPUs # rather than risk loading on a single GPU that may not have # enough memory. - parent_ids = get_parent_visible_gpu_ids() metadata["selection_mode"] = "fallback_all" metadata["selected_gpu_ids"] = parent_ids return parent_ids, metadata utilization = get_visible_gpu_utilization() devices = utilization.get("devices", []) - parent_ids = get_parent_visible_gpu_ids() if not devices: metadata["selection_mode"] = "fallback_all" @@ -1599,10 +1990,10 @@ def prepare_gpu_selection( in the worker subprocess which narrows ``CUDA_VISIBLE_DEVICES`` before any torch/CUDA initialisation. """ - if gpu_ids and get_device() != DeviceType.CUDA: + if gpu_ids and get_device() not in (DeviceType.CUDA, DeviceType.XPU): raise ValueError( - f"gpu_ids {list(gpu_ids)} is only supported on CUDA devices, " - f"but the current backend is '{get_device().value}'." + f"gpu_ids {list(gpu_ids)} is only supported on CUDA and Intel XPU " + f"devices, but the current backend is '{get_device().value}'." ) if gpu_ids: @@ -1676,11 +2067,16 @@ def get_physical_gpu_count() -> int: def _backend_visible_devices_env() -> Optional[str]: """Return the raw visibility env string that applies to this backend. - On ROCm, HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES take precedence - over CUDA_VISIBLE_DEVICES; the helper mirrors the resolution logic in + On XPU, ``ZE_AFFINITY_MASK`` is the visibility control (not + ``CUDA_VISIBLE_DEVICES``). On ROCm, ``HIP_VISIBLE_DEVICES`` / + ``ROCR_VISIBLE_DEVICES`` take precedence over ``CUDA_VISIBLE_DEVICES``; + the helper mirrors the resolution logic in ``_get_parent_visible_gpu_spec`` so ``backend_cuda_visible_devices`` - reports the value that is actually narrowing the visible device set. + reports the value that is actually narrowing the visible device set on + the current backend. """ + if get_device() == DeviceType.XPU: + return os.environ.get("ZE_AFFINITY_MASK") if IS_ROCM: return _get_parent_visible_gpu_spec().get("raw") return os.environ.get("CUDA_VISIBLE_DEVICES") @@ -1689,13 +2085,28 @@ def _backend_visible_devices_env() -> Optional[str]: def get_backend_visible_gpu_info() -> Dict[str, Any]: device = get_device() if device in (DeviceType.CUDA, DeviceType.XPU): + parent_visible_spec = _get_parent_visible_gpu_spec() + + # Honor an explicit "no devices visible" mask (ZE_AFFINITY_MASK="" + # or CUDA_VISIBLE_DEVICES="" / "-1") by short-circuiting before the + # torch-ordinal enumeration fallback, which would otherwise report + # devices that the process explicitly hid. + if parent_visible_spec["raw"] is not None and parent_visible_spec["numeric_ids"] == []: + return { + "available": False, + "backend": _backend_label(device), + "backend_cuda_visible_devices": _backend_visible_devices_env(), + "parent_visible_gpu_ids": [], + "devices": [], + "index_kind": "relative", + } + parent_visible_ids = get_parent_visible_gpu_ids() + # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm) if device == DeviceType.CUDA and not IS_ROCM: try: from . import nvidia - - parent_visible_spec = _get_parent_visible_gpu_spec() result = nvidia.get_backend_visible_gpu_info( parent_visible_spec["numeric_ids"], parent_visible_spec["raw"], @@ -1706,16 +2117,20 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: except Exception as e: logger.warning("Backend GPU visibility query failed: %s", e) - # Torch fallback (AMD ROCm, Intel XPU, nvidia-smi missing/failed) - # When parent_visible_ids is empty (UUID/MIG mask), enumerate by - # torch ordinal so the UI still shows devices. - if parent_visible_ids: + # Torch fallback (AMD ROCm, Intel XPU, nvidia-smi missing/failed). + # Only label as "physical" when the IDs are safe to round-trip. + if parent_visible_ids and parent_visible_spec["supports_explicit_gpu_ids"]: torch_indices = parent_visible_ids index_kind = "physical" else: - visible_count = _torch_get_physical_gpu_count() or 0 + visible_count = ( + len(parent_visible_ids) + if parent_visible_ids + else (_torch_get_physical_gpu_count() or 0) + ) torch_indices = list(range(visible_count)) index_kind = "relative" + parent_visible_ids = [] torch_devices = _torch_get_per_device_info(torch_indices) if torch_devices: devices = [ @@ -1796,8 +2211,50 @@ def get_visible_gpu_count() -> int: if _visible_gpu_count is not None: return _visible_gpu_count + # Prefer torch.xpu.device_count() on Intel XPU hosts because the Level + # Zero runtime correctly interprets ZE_AFFINITY_MASK semantics (including + # subdevice syntax like "0.0,0.1", where two logical devices collapse + # onto a single root GPU). + device = get_device() + if device == DeviceType.XPU: + xpu_mask_raw = os.environ.get("ZE_AFFINITY_MASK") + xpu_mask_set = xpu_mask_raw is not None + xpu_visible = (xpu_mask_raw or "").strip() + if xpu_mask_set and xpu_visible == "": + _visible_gpu_count = 0 + return _visible_gpu_count + + try: + import torch + _visible_gpu_count = torch.xpu.device_count() + except Exception as e: + logger.debug( + "torch.xpu.device_count() failed, falling back to mask parsing: %s", + e, + ) + if xpu_visible: + # Fallback: count unique root device IDs from the mask. + # ZE_AFFINITY_MASK can use "device.subdevice" notation, + # so "0.0,0.1" is 1 root device, not 2. Without torch we + # cannot know which hierarchy mode is active, so fall back + # to root-device counting (the more conservative choice). + if xpu_visible == "*": + # Documented wildcard: all physical XPUs visible. + _visible_gpu_count = get_physical_gpu_count() + else: + roots = _parse_ze_mask_roots(xpu_visible) + # Non-parseable masks (",,,", "GPU-abc", etc.) yield an + # empty roots list and are treated as 0 visible devices, + # not "all visible" -- we have no evidence the user + # intended to expose the whole fleet. + _visible_gpu_count = len(set(roots)) + else: + _visible_gpu_count = get_physical_gpu_count() + return _visible_gpu_count + # Use _get_parent_visible_gpu_spec() which already handles - # HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES on ROCm. + # HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES on ROCm and + # CUDA_VISIBLE_DEVICES everywhere else. visible_spec = _get_parent_visible_gpu_spec() if visible_spec["raw"] is not None: raw = visible_spec["raw"].strip() @@ -1839,6 +2296,17 @@ def apply_gpu_ids(gpu_ids) -> None: else: value = str(gpu_ids) + # Intel XPU uses Level Zero and honors ZE_AFFINITY_MASK, not + # CUDA_VISIBLE_DEVICES. Route XPU pinning through the correct env var + # so worker subprocesses are actually restricted to the intended GPU. + if get_device() == DeviceType.XPU: + os.environ["ZE_AFFINITY_MASK"] = value + # Leave inherited CUDA_VISIBLE_DEVICES alone -- removing it could + # let the worker flip back to CUDA on hybrid hosts. + _visible_gpu_count = None + logger.info("Applied gpu_ids: ZE_AFFINITY_MASK='%s'", value) + return + os.environ["CUDA_VISIBLE_DEVICES"] = value # Keep ROCm visibility env vars in sync so _get_parent_visible_gpu_spec() # picks up the narrowed set on AMD systems. Workers can call @@ -1885,27 +2353,44 @@ def get_device_map(gpu_ids: Optional[list[int]] = None) -> str: Returns ``"balanced"`` (shard evenly across GPUs) when: - ``gpu_ids`` explicitly lists >1 GPU, **or** - - ``CUDA_VISIBLE_DEVICES`` uses UUID/MIG identifiers (non-numeric) and - more than one GPU is visible (fallback: we cannot resolve numeric IDs, - so we assume the caller intends multi-GPU). + - ``CUDA_VISIBLE_DEVICES``/``ZE_AFFINITY_MASK`` uses non-numeric + identifiers (UUID/MIG/wildcard) and more than one GPU is visible + (fallback: we cannot resolve numeric IDs, so we assume the caller + intends multi-GPU). Returns ``"sequential"`` (single device) in all other cases, including - non-CUDA backends (CPU, MLX). + CPU/MLX backends. Callers should use ``prepare_gpu_selection()`` upstream to determine the ``gpu_ids`` list -- that function handles the smart auto-selection of the minimum number of GPUs needed for a given model. """ device = get_device() - if device == DeviceType.CUDA: + if device in (DeviceType.CUDA, DeviceType.XPU): multi_gpu = gpu_ids is not None and len(gpu_ids) > 1 - if not multi_gpu: - # UUID/MIG masks cannot be split into numeric IDs, so if multiple - # GPUs are visible we assume multi-GPU sharding is intended. + # Only apply the "implicit multi-visible" heuristic when the + # caller did NOT pass any gpu_ids. Passing gpu_ids=[0] explicitly + # is a deliberate "use exactly device 0" signal that must stay + # sequential even if more devices are visible. + if not multi_gpu and gpu_ids is None: parent_visible_spec = _get_parent_visible_gpu_spec() if parent_visible_spec["numeric_ids"] is None and get_visible_gpu_count() > 1: multi_gpu = True + elif device == DeviceType.XPU: + # Shard across visible XPU ordinals via HF (no mask + # rewrite). Safe for tiles and roots alike since HF + # uses torch ordinals directly within the worker scope. + supports_physical = parent_visible_spec["supports_explicit_gpu_ids"] + has_multiple_numeric = ( + parent_visible_spec["numeric_ids"] is not None + and len(parent_visible_spec["numeric_ids"]) > 1 + ) + has_multiple_unresolved = ( + parent_visible_spec["numeric_ids"] is None and get_visible_gpu_count() > 1 + ) + if has_multiple_unresolved or (not supports_physical and has_multiple_numeric): + multi_gpu = True if multi_gpu: return "balanced" @@ -1940,6 +2425,19 @@ def raise_if_offloaded( ) +def get_torch_device_str() -> str: + """ + Return the torch device string for the detected hardware. + E.g. "cuda", "xpu", or "cpu". + """ + device = get_device() + if device == DeviceType.CUDA: + return "cuda" + elif device == DeviceType.XPU: + return "xpu" + return "cpu" + + def safe_num_proc(desired: Optional[int] = None) -> int: """ Return a safe ``num_proc`` for ``dataset.map()`` calls. @@ -2013,7 +2511,33 @@ def dataset_map_num_proc(desired: Optional[int] = None) -> Optional[int]: Returns ``None`` on spawn-based platforms (Windows, macOS) because ``datasets`` treats ``num_proc=1`` as multiprocessing (creates ``Pool(1)``). Only ``num_proc=None`` guarantees in-process execution. + + Also returns ``None`` on XPU devices once the XPU runtime has been + initialized in this process, because ``os.fork()`` corrupts the + Level-Zero GPU context and causes Triton kernel launches to fail with + "Pointer argument doesn't reference XPU device memory". Pre-init XPU + hosts can still parallelize pure CPU-side dataset preprocessing. """ if sys.platform in ("win32", "darwin"): return None + + if get_device() == DeviceType.XPU: + try: + import torch + except Exception: + # No torch means no XPU runtime is active here, so CPU-side + # dataset parallelism is still safe. + return safe_num_proc(desired) + + xpu = getattr(torch, "xpu", None) + is_initialized = getattr(xpu, "is_initialized", None) + if callable(is_initialized): + try: + if is_initialized(): + return None + except Exception as e: + # Treat a failing probe as "runtime not touched yet" so + # pre-init CPU preprocessing can still parallelize. + logger.debug("torch.xpu.is_initialized() probe failed: %s", e) + return safe_num_proc(desired) diff --git a/studio/backend/utils/utils.py b/studio/backend/utils/utils.py index 7a7774be40..6982af639f 100644 --- a/studio/backend/utils/utils.py +++ b/studio/backend/utils/utils.py @@ -158,17 +158,31 @@ def format_error_message(error: Exception, model_name: str) -> str: return "Invalid HF token. Please check your token and try again." if ( - "memory" in error_str - or "cuda" in error_str - or "mlx" in error_str - or "out of memory" in error_str + "out of memory" in error_str + or "out of device memory" in error_str + or "out_of_device_memory" in error_str # ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY + or "out_of_host_memory" in error_str # ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY + or "not enough memory" in error_str + or "cannot allocate memory" in error_str + or "memory allocation failed" in error_str + or "cublas_status_alloc_failed" in error_str # cuBLAS workspace OOM + or ("cuda error" in error_str and "alloc" in error_str) + or ("xpu" in error_str and ("alloc" in error_str or "memory" in error_str)) + or isinstance(error, MemoryError) + or ("mlx" in error_str and ("memory" in error_str or "allocate" in error_str)) ): + # Resolve get_device() at call time (not import time) so tests that + # monkey-patch utils.hardware.get_device after this module is loaded + # still see the patched backend. from utils.hardware import get_device device = get_device() - device_label = {"cuda": "GPU", "mlx": "Apple Silicon GPU", "cpu": "system"}.get( - device.value, "GPU" - ) + device_label = { + "cuda": "GPU", + "xpu": "Intel GPU", + "mlx": "Apple Silicon GPU", + "cpu": "system", + }.get(device.value, "GPU") return f"Not enough {device_label} memory to load '{model_short}'. Try a smaller model or free memory." # Generic fallback diff --git a/unsloth/models/gemma.py b/unsloth/models/gemma.py index 34655c5735..d9f6ab7b83 100644 --- a/unsloth/models/gemma.py +++ b/unsloth/models/gemma.py @@ -284,16 +284,16 @@ def __init__( for device in range(DEVICE_COUNT): self._set_cos_sin_cache( seq_len = self.current_rope_size, - device = torch.device(device), + device = torch.device(DEVICE_TYPE_TORCH, device), dtype = torch.get_default_dtype(), ) # dummy so that patch_utils doesn't fail for now self.cos_cached = torch.empty( - 1, device = torch.cuda.current_device(), dtype = torch.get_default_dtype() + 1, device = get_current_device(), dtype = torch.get_default_dtype() ) self.sin_cached = torch.empty( - 1, device = torch.cuda.current_device(), dtype = torch.get_default_dtype() + 1, device = get_current_device(), dtype = torch.get_default_dtype() ) def _set_cos_sin_cache(self, seq_len, device, dtype): @@ -341,7 +341,7 @@ def get_cached( device_index = None, ): if device_index is None: - device_index = torch.cuda.current_device() + device_index = get_current_device() return self.multi_gpu_cos_cached[device_index], self.multi_gpu_sin_cached[device_index] def extend_rope_embedding(self, x, seq_len): @@ -351,7 +351,9 @@ def extend_rope_embedding(self, x, seq_len): self.current_rope_size = math.ceil(seq_len / 8192) * 8192 for device in range(DEVICE_COUNT): self._set_cos_sin_cache( - self.current_rope_size, device = torch.device(device), dtype = x.dtype + self.current_rope_size, + device = torch.device(DEVICE_TYPE_TORCH, device), + dtype = x.dtype, ) @@ -474,5 +476,5 @@ def post_patch( for _ in range(3): gc.collect() - torch.cuda.empty_cache() + clean_gpu_cache() return model, tokenizer