Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
b759a03
Enable studio for Intel GPU (XPU / Level Zero)
danielhanchen Apr 11, 2026
a1c2c4b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 11, 2026
7131356
fix(gemma): Replace hardcoded CUDA calls for XPU support
cheehook Apr 15, 2026
6c55664
Address review feedback for PR #4724: hybrid-host CVD preservation, F…
danielhanchen Apr 16, 2026
a293d25
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2026
b8b7d47
Round 2 review fixes: idle GPUs, hybrid hint opt-in, relative ordinal…
danielhanchen Apr 16, 2026
eebf077
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2026
3d579d9
Round 3 review fixes: FLAT gpu_ids contract, FORCE_XPU opt-in, wildca…
danielhanchen Apr 16, 2026
cc04baa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2026
90ffb22
Round 4 review fixes: replace silent excepts with debug logging
danielhanchen Apr 16, 2026
b3ace02
Round 5 review fixes: FLAT ID contract, hybrid telemetry, OOM matcher
danielhanchen Apr 16, 2026
24b2a30
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2026
4a0fc3d
Round 6 review fixes: XPU device_map and telemetry index_kind
danielhanchen Apr 16, 2026
3fa23d9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2026
a57adb2
Round 7 review fixes: enable XPU FLAT auto-select and placement
danielhanchen Apr 16, 2026
95ac005
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 16, 2026
c1d9249
Round 8 review fixes: preserve inherited ZE_AFFINITY_MASK
danielhanchen Apr 16, 2026
3756b5c
Round 9 Gemini fix: skip empty tokens when parsing CUDA_VISIBLE_DEVICES
danielhanchen Apr 16, 2026
efef89a
Round 10 review fixes: revert ordinal synthesis, use HF balanced instead
danielhanchen Apr 16, 2026
f32a546
Trim verbose code comments across hardware.py and llama_cpp.py
danielhanchen Apr 16, 2026
b3844e0
Merge branch 'main' into zhenyuan_enable_studio
rolandtannous Apr 23, 2026
f4dc510
merge: resolve conflicts from leizhenyuan/zhenyuan_enable_studio (Int…
LeoBorcherding Jun 8, 2026
ce2639f
Merge remote-tracking branch 'cheehook/fix-gemma-xpu-error' into stag…
LeoBorcherding Jun 8, 2026
c0cd40c
merge: sync with unslothai/unsloth main, resolve upstream conflicts
LeoBorcherding Jun 8, 2026
edd8c74
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2026
1950cfe
fix: remove duplicated llama-server launch block from inside hf_repo …
LeoBorcherding Jun 8, 2026
22e7f72
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2026
5f1066b
tests: restore lint-required test names for XPU selection behaviour
LeoBorcherding Jun 8, 2026
f2b763c
fix: re-apply nvidia_eligible and CVD fixes after pre-commit.ci reformat
LeoBorcherding Jun 8, 2026
bf4b3bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion studio/backend/core/inference/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from transformers import TextStreamer
from peft import PeftModel, PeftModelForCausalLM

import contextlib
import json
import sys
import torch
Expand Down Expand Up @@ -1670,8 +1671,30 @@ def _generate_dac(
+ text
+ "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
)

with torch.inference_mode():
with torch.amp.autocast("cuda", dtype = model.dtype):
# Derive the autocast device from the loaded model, not from the
# global backend: a CPU-fallback DAC on an XPU/CUDA host must not
# open a GPU autocast context around CPU tensors.
device_type = (
model.device.type
if hasattr(model.device, "type")
else str(model.device).split(":", 1)[0]
)
# Clamp to autocast-supported backends so exotic devices
# (e.g. "meta" during accelerate offloaded loading) do not raise.
# MPS is autocast-supported since torch 2.3, keep it in the set.
if device_type not in ("cuda", "xpu", "mps", "cpu"):
device_type = "cpu"
# CPU and XPU autocast only accept bfloat16/float16. For a
# float32 model, skip autocast entirely to avoid raising or
# producing a warning on every generate call.
autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16)
if device_type in ("cpu", "xpu") and not autocast_dtype_supported:
autocast_ctx = contextlib.nullcontext()
else:
autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype)
with autocast_ctx:
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
generated = model.generate(
**inputs,
Expand Down
155 changes: 74 additions & 81 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
RENDER_HTML_REPEAT_NUDGE,
parse_tool_calls_from_text as _shared_parse_tool_calls_from_text,
)
from utils.hardware import clear_gpu_cache

logger = get_logger(__name__)

Expand Down Expand Up @@ -1245,23 +1246,28 @@ def _amd_apu_wants_unified_memory() -> bool:

@staticmethod
def _get_gpu_free_memory() -> list[tuple[int, int]]:
"""Query free memory per GPU.

Order:
1. ``nvidia-smi`` (NVIDIA CUDA hosts) -- respects
``CUDA_VISIBLE_DEVICES``.
2. ``torch.cuda.mem_get_info`` -- universal fallback that
works on AMD ROCm too because the HIP runtime
reuses the entire ``torch.cuda.*`` namespace. Covers the
AMD case for issue #5106 (nvidia-smi-only probe silently
returned [] on AMD hosts) and also rescues NVIDIA hosts
where ``nvidia-smi`` is missing from PATH.

Returns list of (gpu_index, free_mib) sorted by index. Empty
list if no supported GPU is reachable.
"""Query free memory per visible GPU, backend-aware.

Returns list of ``(gpu_index, free_mib)`` sorted by index. The index
space matches whatever the active backend exposes: physical
``nvidia-smi`` indices on NVIDIA; parent-visible numeric IDs on
AMD/ROCm and Intel XPU (via Studio's hardware telemetry layer).
Returns an empty list if no per-GPU free-memory data is available,
which lets the caller fall through to a non-placement launch path.
"""
# ── NVIDIA via nvidia-smi ────────────────────────────────────
import os

from utils.hardware import get_device
from utils.hardware.hardware import DeviceType
import utils.hardware.hardware as _hw_mod

# Fast path: NVIDIA / nvidia-smi. Skip it only when the backend is known to
# be XPU or ROCm; still try it for CUDA, CPU-only, or undetected backends.
_detected = get_device()
nvidia_eligible = _detected != DeviceType.XPU and not getattr(_hw_mod, "IS_ROCM", False)
try:
if not nvidia_eligible:
raise FileNotFoundError # skip to generic telemetry path
result = subprocess.run(
[
"nvidia-smi",
Expand All @@ -1275,7 +1281,9 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
**_windows_hidden_subprocess_kwargs(),
)
if result.returncode == 0:
allowed: Optional[set[int]] = None
# Filter nvidia-smi output by CUDA_VISIBLE_DEVICES.
# Skip empty tokens so trailing commas don't disable the filter.
allowed = None
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
if cvd is not None:
try:
Expand All @@ -1286,8 +1294,9 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
# filtered out, matching the codebase convention.
allowed = set(int(x.strip()) for x in cvd.split(",") if x.strip())
except ValueError:
pass
gpus: list[tuple[int, int]] = []
pass # Non-numeric (e.g., "GPU-uuid"), ignore filter

gpus = []
for line in result.stdout.strip().splitlines():
parts = line.split(",")
if len(parts) == 2:
Expand All @@ -1296,71 +1305,55 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
if allowed is not None and idx not in allowed:
continue
gpus.append((idx, free_mib))
# Match the docstring's sort-by-id guarantee. nvidia-smi
# almost always returns sorted output, but driver order
# is not formally guaranteed.
gpus.sort(key = lambda g: g[0])
if gpus:
return gpus
return sorted(gpus, key = lambda item: item[0])
except FileNotFoundError:
pass # nvidia-smi not on PATH — fall through to generic path
except Exception as e:
logger.debug(f"nvidia-smi probe failed: {e}")
logger.debug(f"nvidia-smi free-memory query failed: {e}")

# ── Torch fallback (covers AMD ROCm and missing nvidia-smi) ──
# Generic path: ROCm, XPU, or nvidia-smi absent/failed.
try:
import torch

if not hasattr(torch, "cuda") or not torch.cuda.is_available():
return []
if not hasattr(torch.cuda, "mem_get_info"):
return []
# torch.cuda enumerates GPUs RELATIVE to the visibility mask.
# On NVIDIA builds the mask is CUDA_VISIBLE_DEVICES; on AMD
# ROCm builds it is HIP_VISIBLE_DEVICES (or ROCR_VISIBLE_DEVICES
# if HIP is unset). Downstream we feed these IDs back into the
# llama-server subprocess as CVD, so we must translate visible
# ordinals back to physical indices first; otherwise launching
# with ``CUDA_VISIBLE_DEVICES=2,3`` would get rewritten to
# ``CUDA_VISIBLE_DEVICES=0,1`` and target the wrong GPUs.
physical_ids: Optional[list[int]] = None
# Match the codebase convention in
# ``utils/hardware/hardware.py::_get_parent_visible_gpu_spec``:
# treat an explicitly empty mask (``HIP_VISIBLE_DEVICES=""``)
# as "set to no GPUs" rather than falling through to the next
# var. ``or`` would coerce empty string to falsy and silently
# promote the wrong source.
if getattr(torch.version, "hip", None) is not None:
hip_v = os.environ.get("HIP_VISIBLE_DEVICES")
rocr_v = os.environ.get("ROCR_VISIBLE_DEVICES")
cvd = (
hip_v
if hip_v is not None
else rocr_v
if rocr_v is not None
else os.environ.get("CUDA_VISIBLE_DEVICES")
from utils.hardware import get_visible_gpu_utilization

utilization = get_visible_gpu_utilization()

# Relative ordinals are not safe to round-trip into
# visibility env vars. Return [] so llama-server inherits
# the parent's mask unchanged.
if utilization.get("index_kind") not in (None, "physical"):
logger.debug(
"Skipping GPU placement: telemetry reports index_kind=%r "
"(not reusable for placement)",
utilization.get("index_kind"),
)
else:
cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
if cvd is not None:
try:
# Empty mask (CVD="") yields an empty list so the
# below loop produces no GPUs, consistent with the
# nvidia-smi path and utils/hardware/hardware.py.
physical_ids = [int(x.strip()) for x in cvd.split(",") if x.strip()]
except ValueError:
physical_ids = None
gpus = []
for ordinal in range(torch.cuda.device_count()):
free_bytes, _total_bytes = torch.cuda.mem_get_info(ordinal)
idx = (
physical_ids[ordinal]
if physical_ids is not None and ordinal < len(physical_ids)
else ordinal
)
gpus.append((idx, free_bytes // (1024 * 1024)))
# Match the nvidia-smi path's docstring guarantee of sorted-by-id.
return sorted(gpus, key = lambda g: g[0])
return []

gpus: list[tuple[int, int]] = []
for device in utilization.get("devices", []) or []:
index = device.get("index")

# Use explicit ``is None`` checks -- ``or`` would treat an
# idle GPU with vram_used_gb == 0.0 as missing telemetry and
# silently drop a perfectly valid free card.
total_gb = device.get("vram_total_gb")
if total_gb is None:
total_gb = device.get("total_gb")

used_gb = device.get("vram_used_gb")
if used_gb is None:
used_gb = device.get("used_gb")

if index is None or total_gb is None or used_gb is None:
# Missing telemetry for this device -- skip rather than
# invent a free-memory number that drives placement.
continue

free_mib = max(int((float(total_gb) - float(used_gb)) * 1024), 0)
gpus.append((int(index), free_mib))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Pin GGUF XPU selections with ZE_AFFINITY_MASK

When this generic path runs on Intel XPU and telemetry reports physical indices, it now returns a non-empty GPU list that drives the gpu_indices selection. However, the later llama-server launch only writes CUDA_VISIBLE_DEVICES (plus HIP_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES on ROCm) and never ZE_AFFINITY_MASK. On multi-XPU COMPOSITE setups, Studio can therefore decide to pin a specific Intel GPU while the child process still sees the parent's XPU visibility and may run on the wrong GPU or on all XPUs. Either avoid returning XPU placement here, or propagate the selection through ZE_AFFINITY_MASK before spawning llama-server.

Useful? React with 👍 / 👎.

return sorted(gpus, key = lambda item: item[0])
except Exception as e:
logger.debug(f"torch GPU probe failed: {e}")
logger.debug(f"Generic GPU free-memory query failed: {e}")
return []

# Skip the wait when the last kill is older than this; the GPU
Expand Down Expand Up @@ -3849,10 +3842,7 @@ def unload_model(self) -> bool:
if LlamaCppBackend._codec_mgr is not None:
LlamaCppBackend._codec_mgr.unload()
LlamaCppBackend._codec_mgr = None
import torch

if torch.cuda.is_available():
torch.cuda.empty_cache()
clear_gpu_cache()
return True

def _kill_process(self):
Expand Down Expand Up @@ -5439,6 +5429,7 @@ def init_audio_codec(self, audio_type: str) -> None:
if LlamaCppBackend._codec_mgr is None:
LlamaCppBackend._codec_mgr = AudioCodecManager()

# Audio codecs are only validated on CUDA; stay on CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_path = None

Expand Down Expand Up @@ -5501,6 +5492,8 @@ def generate_audio_response(
else None
)

# Match init_audio_codec: stay on CPU for non-CUDA hosts until the
# codec path is validated on XPU.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
Expand Down
19 changes: 16 additions & 3 deletions studio/backend/core/training/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1505,6 +1505,10 @@ def _preprocess_snac_dataset(

SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
SNAC_SAMPLE_RATE = 24000

# SNAC codec has not been validated on Intel XPU yet; keep the
# existing CPU fallback for non-CUDA hosts until an XPU-specific
# path is added.
device = "cuda" if torch.cuda.is_available() else "cpu"
max_length = self.max_seq_length or 2048
tokenizer = self.tokenizer
Expand Down Expand Up @@ -1666,7 +1670,8 @@ def _preprocess_snac_dataset(
del snac_model

gc.collect()
torch.cuda.empty_cache()

clear_gpu_cache()
self._cuda_audio_used = True

if not processed_examples:
Expand All @@ -1692,6 +1697,10 @@ def _preprocess_bicodec_dataset(
import numpy as np
import torchaudio.transforms as T

import subprocess

# Spark-TTS BiCodec has not been validated on Intel XPU; keep the
# existing CPU fallback for non-CUDA hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
Expand Down Expand Up @@ -1880,7 +1889,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
del audio_tokenizer

gc.collect()
torch.cuda.empty_cache()

clear_gpu_cache()
self._cuda_audio_used = True

if not processed_examples:
Expand Down Expand Up @@ -1916,6 +1926,8 @@ def _preprocess_dac_dataset(
from datasets import Dataset as HFDataset
from utils.paths import ensure_dir, tmp_root

# OuteTTS DAC/Whisper preprocess has not been validated on Intel
# XPU; keep the existing CPU fallback for non-CUDA hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Clone OuteTTS repo (same as audio_codecs._load_dac)
Expand Down Expand Up @@ -2087,7 +2099,8 @@ def _preprocess_dac_dataset(
del prompt_processor

gc.collect()
torch.cuda.empty_cache()

clear_gpu_cache()
self._cuda_audio_used = True

if not processed_examples:
Expand Down
10 changes: 9 additions & 1 deletion studio/backend/models/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,15 @@ def normalize_blank_chat_template_override(cls, value: Optional[str]) -> Optiona
)
gpu_ids: Optional[List[int]] = Field(
None,
description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries. Not supported for GGUF models.",
description = (
"Physical GPU indices to use, for example [0, 1]. Omit or pass "
"[] to use automatic selection. Explicit gpu_ids are unsupported "
"when the parent visibility mask uses non-numeric or subdevice "
"entries -- this includes CUDA_VISIBLE_DEVICES with UUID/MIG "
"entries on NVIDIA, and ZE_AFFINITY_MASK with subdevice tokens "
"(e.g. '0.0,0.1') or FLAT-hierarchy (default) tile handles on "
"Intel XPU. Not supported for GGUF models."
),
)
speculative_type: Optional[str] = Field(
None,
Expand Down
10 changes: 9 additions & 1 deletion studio/backend/models/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,15 @@ def _check_lora_dropout(cls, v: float) -> float:
# GPU selection
gpu_ids: Optional[List[int]] = Field(
None,
description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries.",
description = (
"Physical GPU indices to use, for example [0, 1]. Omit or pass "
"[] to use automatic selection. Explicit gpu_ids are unsupported "
"when the parent visibility mask uses non-numeric or subdevice "
"entries -- this includes CUDA_VISIBLE_DEVICES with UUID/MIG "
"entries on NVIDIA, and ZE_AFFINITY_MASK with subdevice tokens "
"(e.g. '0.0,0.1') or FLAT-hierarchy (default) tile handles on "
"Intel XPU."
),
)

@model_validator(mode = "after")
Expand Down
Loading
Loading