Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion studio/backend/core/inference/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from transformers import TextStreamer
from peft import PeftModel, PeftModelForCausalLM

import contextlib
import json
import sys
import torch
Expand Down Expand Up @@ -1646,8 +1647,30 @@ def _generate_dac(
+ text
+ "<|text_end|>\n<|audio_start|><|global_features_start|>\n"
)

with torch.inference_mode():
with torch.amp.autocast("cuda", dtype = model.dtype):
# Derive the autocast device from the loaded model, not from the
# global backend: a CPU-fallback DAC on an XPU/CUDA host must not
# open a GPU autocast context around CPU tensors.
device_type = (
model.device.type
if hasattr(model.device, "type")
else str(model.device).split(":", 1)[0]
)
# Clamp to autocast-supported backends so exotic devices
# (e.g. "meta" during accelerate offloaded loading) do not raise.
# MPS is autocast-supported since torch 2.3, keep it in the set.
if device_type not in ("cuda", "xpu", "mps", "cpu"):
device_type = "cpu"
# CPU and XPU autocast only accept bfloat16/float16. For a
# float32 model, skip autocast entirely to avoid raising or
# producing a warning on every generate call.
autocast_dtype_supported = model.dtype in (torch.bfloat16, torch.float16)
if device_type in ("cpu", "xpu") and not autocast_dtype_supported:
autocast_ctx = contextlib.nullcontext()
else:
autocast_ctx = torch.amp.autocast(device_type, dtype = model.dtype)
with autocast_ctx:
inputs = tokenizer([prompt], return_tensors = "pt").to(model.device)
generated = model.generate(
**inputs,
Expand Down
29 changes: 23 additions & 6 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@

import httpx

from utils.hardware import clear_gpu_cache

logger = get_logger(__name__)

# ── Pre-compiled patterns for plan-without-action re-prompt ──
Expand Down Expand Up @@ -1512,9 +1514,20 @@ def load_model(
f"{new_ld}:{existing_ld}" if existing_ld else new_ld
)

# Pin to selected GPU(s) via CUDA_VISIBLE_DEVICES
# Pin to selected GPU(s) via the backend-appropriate visibility
# env var: CUDA_VISIBLE_DEVICES on NVIDIA/ROCm, ZE_AFFINITY_MASK
# on Intel XPU (llama-server's SYCL build reads ZE_AFFINITY_MASK,
# not CUDA_VISIBLE_DEVICES).
if gpu_indices is not None:
env["CUDA_VISIBLE_DEVICES"] = ",".join(str(i) for i in gpu_indices)
from utils.hardware import get_device
from utils.hardware.hardware import DeviceType

mask = ",".join(str(i) for i in gpu_indices)
if get_device() == DeviceType.XPU:
env["ZE_AFFINITY_MASK"] = mask
env.pop("CUDA_VISIBLE_DEVICES", None)
else:
env["CUDA_VISIBLE_DEVICES"] = mask

self._stdout_lines = []
self._process = subprocess.Popen(
Expand Down Expand Up @@ -1625,10 +1638,7 @@ def unload_model(self) -> bool:
if LlamaCppBackend._codec_mgr is not None:
LlamaCppBackend._codec_mgr.unload()
LlamaCppBackend._codec_mgr = None
import torch

if torch.cuda.is_available():
torch.cuda.empty_cache()
clear_gpu_cache()
return True

def _kill_process(self):
Expand Down Expand Up @@ -3261,6 +3271,11 @@ def init_audio_codec(self, audio_type: str) -> None:
if LlamaCppBackend._codec_mgr is None:
LlamaCppBackend._codec_mgr = AudioCodecManager()

# Preserve the pre-PR CPU fallback on non-CUDA hosts: the SNAC /
# BiCodec / DAC codecs are not yet validated on Intel XPU, so
# only promote to a GPU device when CUDA is actually available.
# A follow-up can extend this once an XPU-specific codec path is
# added.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_path = None

Expand Down Expand Up @@ -3333,6 +3348,8 @@ def generate_audio_response(
else None
)

# Match init_audio_codec: stay on CPU for non-CUDA hosts until the
# codec path is validated on XPU.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
Expand Down
17 changes: 14 additions & 3 deletions studio/backend/core/training/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,10 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):

SNAC_MODEL_NAME = "hubertsiuzdak/snac_24khz"
SNAC_SAMPLE_RATE = 24000

# SNAC codec has not been validated on Intel XPU yet; keep the
# pre-PR CPU fallback for non-CUDA hosts until an XPU-specific
# path is added.
device = "cuda" if torch.cuda.is_available() else "cpu"
max_length = self.max_seq_length or 2048
tokenizer = self.tokenizer
Expand Down Expand Up @@ -1716,7 +1720,8 @@ def _preprocess_snac_dataset(self, dataset, custom_format_mapping = None):
import gc

gc.collect()
torch.cuda.empty_cache()

clear_gpu_cache()
self._cuda_audio_used = True

if not processed_examples:
Expand Down Expand Up @@ -1744,6 +1749,8 @@ def _preprocess_bicodec_dataset(self, dataset, custom_format_mapping = None):

import subprocess

# Spark-TTS BiCodec has not been validated on Intel XPU; keep the
# pre-PR CPU fallback for non-CUDA hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The sparktts Python package lives in the SparkAudio/Spark-TTS GitHub repo,
Expand Down Expand Up @@ -1944,7 +1951,8 @@ def extract_wav2vec2_features(wavs: torch.Tensor) -> torch.Tensor:
import gc

gc.collect()
torch.cuda.empty_cache()

clear_gpu_cache()
self._cuda_audio_used = True

if not processed_examples:
Expand Down Expand Up @@ -1979,6 +1987,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
from datasets import Dataset as HFDataset
from utils.paths import ensure_dir, tmp_root

# OuteTTS DAC/Whisper preprocess has not been validated on Intel
# XPU; keep the pre-PR CPU fallback for non-CUDA hosts.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Clone OuteTTS repo (same as audio_codecs._load_dac)
Expand Down Expand Up @@ -2157,7 +2167,8 @@ def _preprocess_dac_dataset(self, dataset, custom_format_mapping = None):
import gc

gc.collect()
torch.cuda.empty_cache()

clear_gpu_cache()
self._cuda_audio_used = True

if not processed_examples:
Expand Down
78 changes: 66 additions & 12 deletions studio/backend/tests/test_gpu_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ def test_explicit_ids_are_rejected_for_uuid_parent_visibility(self):
patch("utils.hardware.hardware.get_physical_gpu_count", return_value = 8),
):
with self.assertRaisesRegex(
ValueError, "unsupported when CUDA_VISIBLE_DEVICES uses UUID/MIG"
ValueError,
"unsupported when CUDA_VISIBLE_DEVICES uses non-numeric or subdevice",
):
resolve_requested_gpu_ids([1])

Expand Down Expand Up @@ -711,12 +712,14 @@ def start(self):


class TestRouteErrors(unittest.TestCase):
def test_prepare_gpu_selection_rejects_gpu_ids_on_non_cuda_backend(self):
def test_prepare_gpu_selection_rejects_gpu_ids_on_non_accelerator_backend(self):
with patch("utils.hardware.hardware.get_device", return_value = DeviceType.CPU):
with self.assertRaises(ValueError) as exc_info:
prepare_gpu_selection([0], model_name = "unsloth/test")

self.assertIn("only supported on CUDA devices", str(exc_info.exception))
self.assertIn(
"only supported on CUDA and Intel XPU", str(exc_info.exception)
)

def test_inference_route_rejects_gpu_ids_for_gguf(self):
inference_route = _load_route_module(
Expand Down Expand Up @@ -1089,15 +1092,66 @@ def test_auto_select_falls_back_when_estimate_unavailable(self):
self.assertEqual(metadata["selection_mode"], "fallback_all")


class TestXpuRejection(_GpuCacheResetMixin, unittest.TestCase):
def test_auto_select_returns_non_cuda_for_xpu(self):
with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU):
class TestXpuSelection(_GpuCacheResetMixin, unittest.TestCase):
def test_auto_select_supports_xpu(self):
with (
patch(
"utils.hardware.hardware.get_device", return_value = DeviceType.XPU
),
patch(
"utils.hardware.hardware.estimate_required_model_memory_gb",
return_value = (1.0, {}),
),
patch(
"utils.hardware.hardware.get_visible_gpu_utilization",
return_value = {
"devices": [
{"index": 0, "vram_total_gb": 8, "vram_used_gb": 1},
]
},
),
patch(
"utils.hardware.hardware._get_parent_visible_gpu_spec",
return_value = {
"raw": None,
"numeric_ids": [0],
"supports_explicit_gpu_ids": True,
},
),
patch(
"utils.hardware.hardware.get_parent_visible_gpu_ids",
return_value = [0],
),
):
selected, metadata = auto_select_gpu_ids("unsloth/test")

self.assertIsNone(selected)
self.assertEqual(metadata["selection_mode"], "non_cuda")
self.assertEqual(selected, [0])
self.assertEqual(metadata["selection_mode"], "auto")

def test_prepare_gpu_selection_rejects_explicit_ids_on_xpu(self):
with patch("utils.hardware.hardware.get_device", return_value = DeviceType.XPU):
with self.assertRaisesRegex(ValueError, "only supported on CUDA"):
prepare_gpu_selection([0], model_name = "unsloth/test")
def test_prepare_gpu_selection_accepts_explicit_ids_on_xpu(self):
with (
patch(
"utils.hardware.hardware.get_device", return_value = DeviceType.XPU
),
patch(
"utils.hardware.hardware._get_parent_visible_gpu_spec",
return_value = {
"raw": "0",
"numeric_ids": [0],
"supports_explicit_gpu_ids": True,
},
),
patch(
"utils.hardware.hardware.get_parent_visible_gpu_ids",
return_value = [0],
),
patch(
"utils.hardware.hardware.get_physical_gpu_count", return_value = 1
),
):
selected, metadata = prepare_gpu_selection(
[0], model_name = "unsloth/test"
)

self.assertEqual(selected, [0])
self.assertEqual(metadata["selection_mode"], "explicit")
4 changes: 2 additions & 2 deletions studio/backend/tests/test_gpu_selection_sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,14 +302,14 @@ def test_two_gpus_needed(self):
# 35GB (first) + 30*0.85 (second) = 60.5GB > 50GB
self.assertEqual(len(selected), 2)

def test_non_cuda_returns_none(self):
def test_non_accelerator_returns_none(self):
from utils.hardware.hardware import auto_select_gpu_ids
import utils.hardware.hardware as hw

with patch.object(hw, "get_device", return_value = hw.DeviceType.CPU):
selected, meta = auto_select_gpu_ids("test/model")
self.assertIsNone(selected)
self.assertEqual(meta["selection_mode"], "non_cuda")
self.assertEqual(meta["selection_mode"], "non_accelerator")


class TestGetDeviceMap(unittest.TestCase):
Expand Down
2 changes: 2 additions & 0 deletions studio/backend/utils/hardware/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
estimate_required_model_memory_gb,
auto_select_gpu_ids,
prepare_gpu_selection,
get_torch_device_str,
safe_num_proc,
safe_thread_num_proc,
dataset_map_num_proc,
Expand Down Expand Up @@ -70,6 +71,7 @@
"estimate_required_model_memory_gb",
"auto_select_gpu_ids",
"prepare_gpu_selection",
"get_torch_device_str",
"safe_num_proc",
"safe_thread_num_proc",
"dataset_map_num_proc",
Expand Down
Loading
Loading