From 7fa0192bb0ce16d7120511e2450031fff6d912e8 Mon Sep 17 00:00:00 2001 From: Eda Z Date: Tue, 31 Mar 2026 08:37:33 +0000 Subject: [PATCH 01/55] Add ROCm detection to install.sh and expand shell tests Add AMD ROCm GPU detection to get_torch_index_url() in install.sh. When nvidia-smi is not found, probe for ROCm via amd-smi, /opt/rocm version file, hipconfig, dpkg-query, and rpm. Includes validation guard for malformed _rocm_tag, Debian epoch prefix stripping, ROCm 7.2+ cap to rocm7.1 index, bitsandbytes AMD install, and status messaging. Shell tests expanded to 23 cases. Co-authored-by: Daniel Han --- install.sh | 54 +++++++++++- tests/sh/test_get_torch_index_url.sh | 119 ++++++++++++++++++++++++++- 2 files changed, 169 insertions(+), 4 deletions(-) diff --git a/install.sh b/install.sh index 9ea80bc161..b8f49abf51 100755 --- a/install.sh +++ b/install.sh @@ -982,7 +982,44 @@ get_torch_index_url() { elif [ -x "/usr/bin/nvidia-smi" ]; then _smi="/usr/bin/nvidia-smi" fi - if [ -z "$_smi" ]; then echo "$_base/cpu"; return; fi + if [ -z "$_smi" ]; then + # No NVIDIA GPU -- check for AMD ROCm + _rocm_tag="" + _rocm_tag=$({ command -v amd-smi >/dev/null 2>&1 && \ + amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ + 'NF>1{gsub(/[^0-9.]/, "", $2); split($2,a,"."); print "rocm"a[1]"."a[2]; ok=1; exit} END{exit !ok}'; } || \ + { [ -r /opt/rocm/.info/version ] && \ + awk -F. 
'{print "rocm"$1"."$2; exit}' /opt/rocm/.info/version; } || \ + { command -v hipconfig >/dev/null 2>&1 && \ + hipconfig --version 2>/dev/null | awk 'NR==1{split($1,a,"."); if(a[1]+0>0) print "rocm"a[1]"."a[2]}'; } || \ + { command -v dpkg-query >/dev/null 2>&1 && \ + ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \ + [ -n "$ver" ] && \ + printf '%s\n' "$ver" | sed 's/^[0-9]*://' | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; } || \ + { command -v rpm >/dev/null 2>&1 && \ + ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \ + [ -n "$ver" ] && \ + printf '%s\n' "$ver" | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; }) 2>/dev/null + # Validate _rocm_tag: must match "rocmX.Y" with leading digits + case "$_rocm_tag" in + rocm[0-9]*.[0-9]*) : ;; # valid + *) _rocm_tag="" ;; # reject malformed (empty version, garbled output) + esac + if [ -n "$_rocm_tag" ]; then + # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds (<2.11.0). + # Fall back to rocm7.1 index which has torch 2.10.0. + # TODO: uncomment the next line when torch upper bound is bumped to >=2.11.0 + # echo "$_base/$_rocm_tag"; return + case "$_rocm_tag" in + rocm7.2*|rocm7.3*|rocm7.4*|rocm7.5*|rocm8*|rocm9*) + echo "$_base/rocm7.1" ;; + *) + echo "$_base/$_rocm_tag" ;; + esac + return + fi + echo "$_base/cpu"; return + fi # Parse CUDA version from nvidia-smi output (POSIX-safe, no grep -P) _cuda_ver=$(LC_ALL=C $_smi 2>/dev/null \ | sed -n 's/.*CUDA Version:[[:space:]]*\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' \ @@ -1007,13 +1044,19 @@ case "$TORCH_INDEX_URL" in */cpu) if [ "$SKIP_TORCH" = false ] && [ "$OS" != "macos" ]; then echo "" - echo " NOTE: No NVIDIA GPU detected (nvidia-smi not found)." + echo " NOTE: No GPU detected (nvidia-smi and ROCm not found)." echo " Installing CPU-only PyTorch. 
If you only need GGUF chat/inference," echo " re-run with --no-torch for a faster, lighter install:" echo " curl -fsSL https://unsloth.ai/install.sh | sh -s -- --no-torch" + echo " AMD ROCm users: see https://docs.unsloth.ai/get-started/install-and-update/amd" echo "" fi ;; + */rocm*) + echo "" + echo " AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)" + echo "" + ;; esac # ── Install unsloth directly into the venv (no activation needed) ── @@ -1051,6 +1094,13 @@ elif [ -n "$TORCH_INDEX_URL" ]; then substep "installing PyTorch ($TORCH_INDEX_URL)..." run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "torch>=2.4,<2.11.0" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" + # AMD ROCm: install bitsandbytes with AMD support + case "$TORCH_INDEX_URL" in + */rocm*) + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" + ;; + esac fi # Fresh: Step 2 - install unsloth, preserving pre-installed torch substep "installing unsloth (this may take a few minutes)..." 
diff --git a/tests/sh/test_get_torch_index_url.sh b/tests/sh/test_get_torch_index_url.sh index 6387922712..81da79aa32 100755 --- a/tests/sh/test_get_torch_index_url.sh +++ b/tests/sh/test_get_torch_index_url.sh @@ -45,10 +45,23 @@ MOCK echo "$_dir" } +# Helper: create a mock amd-smi that prints a given ROCm version string +make_mock_amd_smi() { + _dir=$(mktemp -d) + cat > "$_dir/amd-smi" </dev/null || true) [ -n "$_real" ] && ln -sf "$_real" "$_TOOLS_DIR/$_cmd" done @@ -119,6 +132,108 @@ _result=$(run_func "$_dir") assert_eq "unparseable -> cu126" "https://download.pytorch.org/whl/cu126" "$_result" rm -rf "$_dir" +# 9) ROCm 6.3 (no nvidia-smi) -> rocm6.3 +_dir=$(make_mock_amd_smi "6.3") +_result=$(run_func "$_dir") +assert_eq "ROCm 6.3 -> rocm6.3" "https://download.pytorch.org/whl/rocm6.3" "$_result" +rm -rf "$_dir" + +# 10) ROCm 7.1 (no nvidia-smi) -> rocm7.1 +_dir=$(make_mock_amd_smi "7.1") +_result=$(run_func "$_dir") +assert_eq "ROCm 7.1 -> rocm7.1" "https://download.pytorch.org/whl/rocm7.1" "$_result" +rm -rf "$_dir" + +# 11) ROCm 7.2 (no nvidia-smi) -> rocm7.1 (capped due to torch <2.11.0) +_dir=$(make_mock_amd_smi "7.2") +_result=$(run_func "$_dir") +assert_eq "ROCm 7.2 -> rocm7.1 (capped)" "https://download.pytorch.org/whl/rocm7.1" "$_result" +rm -rf "$_dir" + +# 12) Both nvidia-smi and amd-smi present -> CUDA takes precedence +_cuda_dir=$(make_mock_smi "12.6") +_amd_dir=$(make_mock_amd_smi "6.3") +_combined_dir=$(mktemp -d) +ln -sf "$_cuda_dir/nvidia-smi" "$_combined_dir/nvidia-smi" +ln -sf "$_amd_dir/amd-smi" "$_combined_dir/amd-smi" +_result=$(run_func "$_combined_dir") +assert_eq "CUDA+ROCm -> CUDA precedence" "https://download.pytorch.org/whl/cu126" "$_result" +rm -rf "$_cuda_dir" "$_amd_dir" "$_combined_dir" + +# 13) No nvidia-smi, no amd-smi -> cpu (duplicate of test 1, confirms ROCm didn't break it) +_result=$(run_func "none") +assert_eq "no GPU -> cpu" "https://download.pytorch.org/whl/cpu" "$_result" + +# 14) ROCm 6.1 (no nvidia-smi) -> rocm6.1 
+_dir=$(make_mock_amd_smi "6.1") +_result=$(run_func "$_dir") +assert_eq "ROCm 6.1 -> rocm6.1" "https://download.pytorch.org/whl/rocm6.1" "$_result" +rm -rf "$_dir" + +# 15) ROCm 6.4 (no nvidia-smi) -> rocm6.4 +_dir=$(make_mock_amd_smi "6.4") +_result=$(run_func "$_dir") +assert_eq "ROCm 6.4 -> rocm6.4" "https://download.pytorch.org/whl/rocm6.4" "$_result" +rm -rf "$_dir" + +# 16) ROCm 7.0 (no nvidia-smi) -> rocm7.0 +_dir=$(make_mock_amd_smi "7.0") +_result=$(run_func "$_dir") +assert_eq "ROCm 7.0 -> rocm7.0" "https://download.pytorch.org/whl/rocm7.0" "$_result" +rm -rf "$_dir" + +# 17) ROCm 8.0 (future, no nvidia-smi) -> rocm7.1 (capped) +_dir=$(make_mock_amd_smi "8.0") +_result=$(run_func "$_dir") +assert_eq "ROCm 8.0 -> rocm7.1 (capped)" "https://download.pytorch.org/whl/rocm7.1" "$_result" +rm -rf "$_dir" + +# 18) Malformed amd-smi output (empty version field) -> cpu +_dir=$(mktemp -d) +cat > "$_dir/amd-smi" <<'MOCK' +#!/bin/sh +echo "AMDSMI Tool: 25.0.1 | AMDSMI Library version: 25.0.1.0 | ROCm version: " +MOCK +chmod +x "$_dir/amd-smi" +_result=$(run_func "$_dir") +assert_eq "empty amd-smi version -> cpu" "https://download.pytorch.org/whl/cpu" "$_result" +rm -rf "$_dir" + +# 19) amd-smi with "N/A" version -> cpu +_dir=$(mktemp -d) +cat > "$_dir/amd-smi" <<'MOCK' +#!/bin/sh +echo "AMDSMI Tool: 25.0.1 | AMDSMI Library version: 25.0.1.0 | ROCm version: N/A" +MOCK +chmod +x "$_dir/amd-smi" +_result=$(run_func "$_dir") +assert_eq "N/A amd-smi version -> cpu" "https://download.pytorch.org/whl/cpu" "$_result" +rm -rf "$_dir" + +# 20) ROCm version with trailing text (e.g. 
"6.3.1-beta") -> rocm6.3 +_dir=$(make_mock_amd_smi "6.3.1-beta") +_result=$(run_func "$_dir") +assert_eq "ROCm 6.3.1-beta -> rocm6.3" "https://download.pytorch.org/whl/rocm6.3" "$_result" +rm -rf "$_dir" + +# 22) CUDA 12.6 still works after ROCm changes (regression check) +_dir=$(make_mock_smi "12.6") +_result=$(run_func "$_dir") +assert_eq "CUDA 12.6 regression -> cu126" "https://download.pytorch.org/whl/cu126" "$_result" +rm -rf "$_dir" + +# 23) CUDA 13.0 still works after ROCm changes (regression check) +_dir=$(make_mock_smi "13.0") +_result=$(run_func "$_dir") +assert_eq "CUDA 13.0 regression -> cu130" "https://download.pytorch.org/whl/cu130" "$_result" +rm -rf "$_dir" + +# 24) CUDA 12.8 still works after ROCm changes (regression check) +_dir=$(make_mock_smi "12.8") +_result=$(run_func "$_dir") +assert_eq "CUDA 12.8 regression -> cu128" "https://download.pytorch.org/whl/cu128" "$_result" +rm -rf "$_dir" + rm -f "$_FUNC_FILE" rm -rf "$_FAKE_SMI_DIR" rm -rf "$_TOOLS_DIR" From f3cc7585f15d3142afdb55cb4a1e72598064fb89 Mon Sep 17 00:00:00 2001 From: GoldenGrapeGentleman Date: Tue, 31 Mar 2026 08:37:41 +0000 Subject: [PATCH 02/55] Add ROCm torch reinstall support to install_python_stack.py Add _detect_rocm_version() and _ensure_rocm_torch() to detect when a Linux host has ROCm but the venv received CPU-only torch, and reinstall with the correct ROCm wheels. Covers ROCm 6.0 through 7.1 with a 30-second timeout on the torch GPU probe subprocess. 
Co-authored-by: Daniel Han --- studio/install_python_stack.py | 124 +++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index f2981ea665..1f17a2e79c 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -25,6 +25,120 @@ IS_MACOS = sys.platform == "darwin" IS_MAC_INTEL = IS_MACOS and platform.machine() == "x86_64" +# ── ROCm / AMD GPU support ───────────────────────────────────────────────────── +# Mapping from detected ROCm (major, minor) to the best PyTorch wheel tag on +# download.pytorch.org. Entries are checked newest-first (>=). +# ROCm 7.2 only has torch 2.11.0 on download.pytorch.org, which exceeds the +# current torch upper bound (<2.11.0). Fall back to rocm7.1 (torch 2.10.0). +# TODO: uncomment rocm7.2 when torch upper bound is bumped to >=2.11.0 +_ROCM_TORCH_INDEX: dict[tuple[int, int], str] = { + # (7, 2): "rocm7.2", # torch 2.11.0 -- requires torch>=2.11 + (7, 1): "rocm7.1", + (7, 0): "rocm7.0", + (6, 4): "rocm6.4", + (6, 3): "rocm6.3", + (6, 2): "rocm6.2", + (6, 1): "rocm6.1", + (6, 0): "rocm6.0", +} +_PYTORCH_WHL_BASE = "https://download.pytorch.org/whl" + + +def _detect_rocm_version() -> tuple[int, int] | None: + """Return (major, minor) of the installed ROCm stack, or None.""" + # Check /opt/rocm/.info/version or ROCM_PATH equivalent + rocm_root = os.environ.get("ROCM_PATH", "/opt/rocm") + for path in ( + os.path.join(rocm_root, ".info", "version"), + os.path.join(rocm_root, "lib", "rocm_version"), + ): + try: + parts = open(path).read().strip().split("-")[0].split(".") + return int(parts[0]), int(parts[1]) + except Exception: + pass + + # Try hipconfig --version (outputs bare version like "6.3.21234.2") + hipconfig = shutil.which("hipconfig") + if hipconfig: + try: + result = subprocess.run( + [hipconfig, "--version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + timeout = 5, + ) + if result.returncode == 0: + 
raw = result.stdout.decode().strip().split("\n")[0] + parts = raw.split(".") + if len(parts) >= 2 and parts[0].isdigit(): + return int(parts[0]), int(parts[1]) + except Exception: + pass + + return None + + +def _ensure_rocm_torch() -> None: + """Reinstall torch with ROCm wheels when the venv received CPU-only torch. + + Runs only on Linux hosts where ROCm is installed. No-op when torch already + links against HIP (ROCm) or CUDA (NVIDIA). Skips on Windows/macOS. + Uses pip_install() to respect uv, constraints, and --python targeting. + """ + rocm_root = os.environ.get("ROCM_PATH", "/opt/rocm") + if not os.path.isdir(rocm_root) and not shutil.which("hipcc"): + return # no ROCm toolchain + + ver = _detect_rocm_version() + if ver is None: + print(" ROCm detected but version unreadable -- skipping torch reinstall") + return + + # Skip if torch is already GPU-enabled (HIP or CUDA) + probe = subprocess.run( + [ + sys.executable, + "-c", + "import torch; print(torch.version.hip or torch.version.cuda or '')", + ], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + timeout = 30, + ) + if probe.returncode == 0 and probe.stdout.decode().strip(): + return # torch already GPU-enabled + + # Select best matching wheel tag (newest ROCm version <= installed) + tag = next( + (t for (maj, mn), t in _ROCM_TORCH_INDEX.items() if ver >= (maj, mn)), + None, + ) + if tag is None: + print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping") + return + + index_url = f"{_PYTORCH_WHL_BASE}/{tag}" + print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") + pip_install( + f"ROCm torch ({tag})", + "--force-reinstall", + "--no-cache-dir", + "torch", + "torchvision", + "torchaudio", + "--index-url", + index_url, + constrain = False, + ) + # Also install bitsandbytes for AMD + pip_install( + "bitsandbytes (AMD)", + "--no-cache-dir", + "bitsandbytes>=0.49.1", + constrain = False, + ) + def _infer_no_torch() -> bool: """Determine whether to run in no-torch (GGUF-only) 
mode. @@ -414,6 +528,9 @@ def install_python_stack() -> int: base_total = 10 if IS_WINDOWS else 11 if IS_MACOS: base_total -= 1 # triton step is skipped on macOS + # ROCm torch check step (Linux only, non-macOS, non-no-torch) + if not IS_WINDOWS and not IS_MACOS and not NO_TORCH: + base_total += 1 _TOTAL = (base_total - 1) if skip_base else base_total # 1. Try to use uv for faster installs (must happen before pip upgrade @@ -537,6 +654,13 @@ def install_python_stack() -> int: req = REQ_ROOT / "base.txt", ) + # 2b. AMD ROCm: reinstall torch with HIP wheels if the host has ROCm but the + # venv received CPU-only torch (common when pip resolves torch from PyPI). + # Must come immediately after base packages so torch is present for inspection. + if not IS_WINDOWS and not IS_MACOS and not NO_TORCH: + _progress("ROCm torch check") + _ensure_rocm_torch() + # 3. Extra dependencies _progress("unsloth extras") pip_install( From 062e25fe7b4a66cb54562eab90a3cd691efe7011 Mon Sep 17 00:00:00 2001 From: Eda Z Date: Tue, 31 Mar 2026 08:37:49 +0000 Subject: [PATCH 03/55] Add ROCm support to llama.cpp prebuilt installer Add has_rocm field to HostInfo, extend detect_host() to probe for ROCm via hipcc/amd-smi/rocm-smi/ROCM_PATH, and route ROCm hosts to upstream prebuilts (Linux ROCm 7.2 prebuilt with source fallback, Windows HIP prebuilt with CPU fallback). Add linux-rocm and windows-hip install kinds to runtime_patterns_for_choice(). 
Co-authored-by: Daniel Han --- studio/install_llama_prebuilt.py | 58 ++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 516dc4b6a4..cbc8f6d15f 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -88,6 +88,7 @@ class HostInfo: visible_cuda_devices: str | None has_physical_nvidia: bool has_usable_nvidia: bool + has_rocm: bool = False @dataclass @@ -1430,6 +1431,18 @@ def detect_host() -> HostInfo: except Exception: pass + # Detect AMD ROCm (HIP) + has_rocm = False + if not is_macos: + rocm_hints = [ + shutil.which("hipcc"), + shutil.which("amd-smi"), + shutil.which("rocm-smi"), + ] + rocm_paths = ["/opt/rocm", os.environ.get("ROCM_PATH", "")] + if any(rocm_hints) or any(os.path.isdir(p) for p in rocm_paths if p): + has_rocm = True + return HostInfo( system = system, machine = machine, @@ -1444,6 +1457,7 @@ def detect_host() -> HostInfo: visible_cuda_devices = visible_cuda_devices, has_physical_nvidia = has_physical_nvidia, has_usable_nvidia = has_usable_nvidia, + has_rocm = has_rocm, ) @@ -1724,6 +1738,30 @@ def resolve_linux_cuda_choice( def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice: upstream_assets = github_release_assets(UPSTREAM_REPO, llama_tag) if host.is_linux and host.is_x86_64: + # AMD ROCm: try upstream ROCm prebuilt first, then fall back to source build. + # Source build (via setup.sh) compiles with -DGGML_HIP=ON and auto-detects + # the exact GPU target via rocminfo, which is more reliable for consumer + # GPUs (e.g. gfx1151) that may not be in the prebuilt. 
+ if host.has_rocm and not host.has_usable_nvidia: + rocm_name = f"llama-{llama_tag}-bin-ubuntu-rocm-7.2-x64.tar.gz" + if rocm_name in upstream_assets: + log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") + log("Note: prebuilt is compiled for ROCm 7.2; if your ROCm version differs, " + "this may fail preflight and fall back to a source build (safe)") + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = rocm_name, + url = upstream_assets[rocm_name], + source_label = "upstream", + install_kind = "linux-rocm", + ) + # No ROCm prebuilt available -- fall back to source build + raise PrebuiltFallback( + "AMD ROCm detected but no upstream ROCm prebuilt found; " + "falling back to source build with HIP support" + ) + upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Linux CPU asset was not found") @@ -1743,6 +1781,21 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice return attempts[0] raise PrebuiltFallback("no compatible Windows CUDA asset was found") + # AMD ROCm on Windows: try HIP prebuilt + if host.has_rocm: + hip_name = f"llama-{llama_tag}-bin-win-hip-radeon-x64.zip" + if hip_name in upstream_assets: + log(f"AMD ROCm detected on Windows -- trying upstream HIP prebuilt {hip_name}") + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = hip_name, + url = upstream_assets[hip_name], + source_label = "upstream", + install_kind = "windows-hip", + ) + log("AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU") + upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Windows CPU asset was not found") @@ -2121,7 +2174,7 @@ def overlay_directory_for_choice( def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: - if choice.install_kind in {"linux-cpu", "linux-cuda"}: + if choice.install_kind in 
{"linux-cpu", "linux-cuda", "linux-rocm"}: return [ "llama-server", "llama-quantize", @@ -2131,11 +2184,12 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: "libmtmd.so*", "libggml-cpu-*.so*", "libggml-cuda.so*", + "libggml-hip.so*", "libggml-rpc.so*", ] if choice.install_kind in {"macos-arm64", "macos-x64"}: return ["llama-server", "llama-quantize", "lib*.dylib"] - if choice.install_kind in {"windows-cpu", "windows-cuda"}: + if choice.install_kind in {"windows-cpu", "windows-cuda", "windows-hip"}: return ["*.exe", "*.dll"] raise PrebuiltFallback( f"unsupported install kind for runtime overlay: {choice.install_kind}" From 450f5de5070e800bfd93b1716a334e1df9bd2192 Mon Sep 17 00:00:00 2001 From: GoldenGrapeGentleman Date: Tue, 31 Mar 2026 08:37:57 +0000 Subject: [PATCH 04/55] Add IS_ROCM hardware flag and fix AMD error message Add IS_ROCM flag to hardware.py detect_hardware() (set when torch.version.hip is present, DeviceType stays CUDA). Export IS_ROCM from __init__.py. Add "rocm" key to get_package_versions(). Replace "We do not support AMD" error in tokenizer_utils.py with a helpful message pointing to ROCm installation docs. 
Co-authored-by: Daniel Han --- studio/backend/utils/hardware/__init__.py | 2 ++ studio/backend/utils/hardware/hardware.py | 21 ++++++++++++++++----- unsloth/tokenizer_utils.py | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index aaa0452406..b9b61cdcfe 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -9,6 +9,7 @@ DeviceType, DEVICE, CHAT_ONLY, + IS_ROCM, detect_hardware, get_device, is_apple_silicon, @@ -49,6 +50,7 @@ "DeviceType", "DEVICE", "CHAT_ONLY", + "IS_ROCM", "detect_hardware", "get_device", "is_apple_silicon", diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 742e8f6b7e..d5de59a592 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -43,6 +43,7 @@ class DeviceType(str, Enum): DEVICE: Optional[DeviceType] = None CHAT_ONLY: bool = True # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.) +IS_ROCM: bool = False # True when running on AMD ROCm (HIP) -- display/logging only # ========== Detection ========== @@ -85,10 +86,11 @@ def detect_hardware() -> DeviceType: 2. MLX (Apple Silicon via MLX framework) 3. CPU (fallback) """ - global DEVICE, CHAT_ONLY - CHAT_ONLY = True # reset -- only CUDA sets it to False + global DEVICE, CHAT_ONLY, IS_ROCM + CHAT_ONLY = True # reset -- only CUDA/ROCm sets it to False + IS_ROCM = False - # --- CUDA: try PyTorch --- + # --- CUDA / ROCm: try PyTorch --- if _has_torch(): import torch @@ -96,7 +98,14 @@ def detect_hardware() -> DeviceType: DEVICE = DeviceType.CUDA CHAT_ONLY = False device_name = torch.cuda.get_device_properties(0).name - print(f"Hardware detected: CUDA — {device_name}") + + # Distinguish AMD ROCm (HIP) from NVIDIA CUDA for display purposes. + # DeviceType stays CUDA since torch.cuda.* works on ROCm via HIP. 
+ if getattr(torch.version, "hip", None) is not None: + IS_ROCM = True + print(f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}") + else: + print(f"Hardware detected: CUDA -- {device_name}") return DEVICE # --- XPU: Intel GPU --- @@ -315,13 +324,15 @@ def get_package_versions() -> Dict[str, Optional[str]]: except PackageNotFoundError: versions[name] = None - # CUDA toolkit version bundled with torch + # GPU runtime version bundled with torch try: import torch versions["cuda"] = getattr(torch.version, "cuda", None) + versions["rocm"] = getattr(torch.version, "hip", None) except Exception: versions["cuda"] = None + versions["rocm"] = None return versions diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 8be6bb5a5a..07949cd32e 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1103,7 +1103,7 @@ def patch_sft_trainer_tokenizer(): " a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n" "except:\n" " if not torch.cuda.is_available():\n" - " raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n" + " raise RuntimeError('Unsloth: No GPU detected. AMD ROCm users: install ROCm-enabled PyTorch -- see https://docs.unsloth.ai/get-started/install-and-update/amd')\n" "if ((a - PRE_CHECK) >= 1).sum() > 1:\n" " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" "for _ in range(3):\n" From f6c2eb863f47e7654c0e5305c752165fe15d199f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 08:38:05 +0000 Subject: [PATCH 05/55] Add comprehensive ROCm support test suite (68 tests) Add tests/studio/install/test_rocm_support.py covering all ROCm code paths across install_llama_prebuilt.py, install_python_stack.py, hardware.py, tokenizer_utils.py, and install.sh. All tests use mocks and run without AMD hardware. 
Covers: asset selection (11), runtime patterns (5), HostInfo (4), ROCm version detection (9), torch reinstall (9), index mapping (8), hardware flag (8), tokenizer message (2), install.sh structure (10), and live regression (1). --- tests/studio/install/test_rocm_support.py | 838 ++++++++++++++++++++++ 1 file changed, 838 insertions(+) create mode 100644 tests/studio/install/test_rocm_support.py diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py new file mode 100644 index 0000000000..560edf7a16 --- /dev/null +++ b/tests/studio/install/test_rocm_support.py @@ -0,0 +1,838 @@ +"""Tests for AMD ROCm support across install pathways. + +Verifies that ROCm detection and installation logic works correctly +WITHOUT breaking existing CUDA, CPU, macOS, and Windows pathways. +All tests use mocks -- no AMD hardware required. +""" + +import importlib.util +import os +import subprocess +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch, PropertyMock + +import pytest + + +# ── Load modules under test ────────────────────────────────────────────────── + +PACKAGE_ROOT = Path(__file__).resolve().parents[3] + +# install_llama_prebuilt.py +_PREBUILT_PATH = PACKAGE_ROOT / "studio" / "install_llama_prebuilt.py" +_PREBUILT_SPEC = importlib.util.spec_from_file_location( + "studio_install_llama_prebuilt", _PREBUILT_PATH +) +assert _PREBUILT_SPEC is not None and _PREBUILT_SPEC.loader is not None +prebuilt_mod = importlib.util.module_from_spec(_PREBUILT_SPEC) +sys.modules[_PREBUILT_SPEC.name] = prebuilt_mod +_PREBUILT_SPEC.loader.exec_module(prebuilt_mod) + +HostInfo = prebuilt_mod.HostInfo +AssetChoice = prebuilt_mod.AssetChoice +PrebuiltFallback = prebuilt_mod.PrebuiltFallback +resolve_upstream_asset_choice = prebuilt_mod.resolve_upstream_asset_choice +runtime_patterns_for_choice = prebuilt_mod.runtime_patterns_for_choice + +# install_python_stack.py +_STACK_PATH = PACKAGE_ROOT / "studio" / 
"install_python_stack.py" +_STACK_SPEC = importlib.util.spec_from_file_location( + "studio_install_python_stack", _STACK_PATH +) +assert _STACK_SPEC is not None and _STACK_SPEC.loader is not None +stack_mod = importlib.util.module_from_spec(_STACK_SPEC) +sys.modules[_STACK_SPEC.name] = stack_mod +_STACK_SPEC.loader.exec_module(stack_mod) + +_detect_rocm_version = stack_mod._detect_rocm_version +_ensure_rocm_torch = stack_mod._ensure_rocm_torch +_ROCM_TORCH_INDEX = stack_mod._ROCM_TORCH_INDEX + + +# ── Helper: build HostInfo for different scenarios ────────────────────────── + +def nvidia_host(**overrides) -> HostInfo: + """NVIDIA Linux x86_64 host.""" + defaults = dict( + system = "Linux", machine = "x86_64", + is_windows = False, is_linux = True, is_macos = False, + is_x86_64 = True, is_arm64 = False, + nvidia_smi = "/usr/bin/nvidia-smi", + driver_cuda_version = (12, 6), + compute_caps = ["89"], + visible_cuda_devices = None, + has_physical_nvidia = True, + has_usable_nvidia = True, + has_rocm = False, + ) + defaults.update(overrides) + return HostInfo(**defaults) + + +def rocm_host(**overrides) -> HostInfo: + """AMD ROCm Linux x86_64 host (no NVIDIA).""" + defaults = dict( + system = "Linux", machine = "x86_64", + is_windows = False, is_linux = True, is_macos = False, + is_x86_64 = True, is_arm64 = False, + nvidia_smi = None, + driver_cuda_version = None, + compute_caps = [], + visible_cuda_devices = None, + has_physical_nvidia = False, + has_usable_nvidia = False, + has_rocm = True, + ) + defaults.update(overrides) + return HostInfo(**defaults) + + +def cpu_host(**overrides) -> HostInfo: + """CPU-only Linux x86_64 host.""" + defaults = dict( + system = "Linux", machine = "x86_64", + is_windows = False, is_linux = True, is_macos = False, + is_x86_64 = True, is_arm64 = False, + nvidia_smi = None, + driver_cuda_version = None, + compute_caps = [], + visible_cuda_devices = None, + has_physical_nvidia = False, + has_usable_nvidia = False, + has_rocm = False, + ) + 
defaults.update(overrides) + return HostInfo(**defaults) + + +def macos_host(**overrides) -> HostInfo: + """macOS arm64 host.""" + defaults = dict( + system = "Darwin", machine = "arm64", + is_windows = False, is_linux = False, is_macos = True, + is_x86_64 = False, is_arm64 = True, + nvidia_smi = None, + driver_cuda_version = None, + compute_caps = [], + visible_cuda_devices = None, + has_physical_nvidia = False, + has_usable_nvidia = False, + has_rocm = False, + ) + defaults.update(overrides) + return HostInfo(**defaults) + + +def windows_host(**overrides) -> HostInfo: + """Windows x86_64 host.""" + defaults = dict( + system = "Windows", machine = "amd64", + is_windows = True, is_linux = False, is_macos = False, + is_x86_64 = True, is_arm64 = False, + nvidia_smi = None, + driver_cuda_version = None, + compute_caps = [], + visible_cuda_devices = None, + has_physical_nvidia = False, + has_usable_nvidia = False, + has_rocm = False, + ) + defaults.update(overrides) + return HostInfo(**defaults) + + +def windows_rocm_host(**overrides) -> HostInfo: + """Windows x86_64 host with ROCm.""" + defaults = dict( + system = "Windows", machine = "amd64", + is_windows = True, is_linux = False, is_macos = False, + is_x86_64 = True, is_arm64 = False, + nvidia_smi = None, + driver_cuda_version = None, + compute_caps = [], + visible_cuda_devices = None, + has_physical_nvidia = False, + has_usable_nvidia = False, + has_rocm = True, + ) + defaults.update(overrides) + return HostInfo(**defaults) + + +# ── Upstream asset fixture ─────────────────────────────────────────────────── + +LLAMA_TAG = "b8508" + +UPSTREAM_ASSETS = { + f"llama-{LLAMA_TAG}-bin-ubuntu-x64.tar.gz": f"https://example.com/{LLAMA_TAG}-linux-cpu.tar.gz", + f"llama-{LLAMA_TAG}-bin-ubuntu-rocm-7.2-x64.tar.gz": f"https://example.com/{LLAMA_TAG}-linux-rocm.tar.gz", + f"llama-{LLAMA_TAG}-bin-win-cpu-x64.zip": f"https://example.com/{LLAMA_TAG}-win-cpu.zip", + f"llama-{LLAMA_TAG}-bin-win-cuda-12.4-x64.zip": 
f"https://example.com/{LLAMA_TAG}-win-cuda.zip", + f"llama-{LLAMA_TAG}-bin-win-hip-radeon-x64.zip": f"https://example.com/{LLAMA_TAG}-win-hip.zip", + f"llama-{LLAMA_TAG}-bin-macos-arm64.tar.gz": f"https://example.com/{LLAMA_TAG}-macos-arm64.tar.gz", + f"llama-{LLAMA_TAG}-bin-macos-x64.tar.gz": f"https://example.com/{LLAMA_TAG}-macos-x64.tar.gz", +} + + +# ============================================================================= +# TEST: install_llama_prebuilt.py -- resolve_upstream_asset_choice +# ============================================================================= + +class TestResolveUpstreamAssetChoice: + """Verify that the asset selection logic picks the right binary for each platform.""" + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_nvidia_linux_gets_cpu_asset(self, mock_assets): + """NVIDIA host should NOT hit the ROCm path -- gets CPU asset (CUDA handled elsewhere).""" + host = nvidia_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "linux-cpu" + assert "ubuntu-x64" in choice.name + assert "rocm" not in choice.name + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_rocm_linux_gets_rocm_prebuilt(self, mock_assets): + """AMD ROCm Linux host should get the ROCm prebuilt.""" + host = rocm_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "linux-rocm" + assert "rocm" in choice.name + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_cpu_linux_gets_cpu_asset(self, mock_assets): + """CPU-only Linux host should get CPU asset.""" + host = cpu_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "linux-cpu" + assert "ubuntu-x64" in choice.name + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def 
test_macos_arm64_gets_macos_asset(self, mock_assets): + """macOS arm64 host should get macOS asset.""" + host = macos_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "macos-arm64" + assert "macos-arm64" in choice.name + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_windows_cpu_gets_cpu_asset(self, mock_assets): + """Windows CPU-only host should get Windows CPU asset.""" + host = windows_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "windows-cpu" + assert "win-cpu" in choice.name + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_windows_rocm_gets_hip_asset(self, mock_assets): + """Windows ROCm host should get Windows HIP asset.""" + host = windows_rocm_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "windows-hip" + assert "hip" in choice.name + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_mixed_nvidia_rocm_prefers_nvidia(self, mock_assets): + """Host with both NVIDIA and ROCm should use NVIDIA (CPU path here, CUDA elsewhere).""" + host = nvidia_host(has_rocm = True) + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + # NVIDIA hosts go through the normal path (CUDA handled by resolve_linux_cuda_choice) + assert choice.install_kind == "linux-cpu" + assert "rocm" not in choice.name + + @patch.object(prebuilt_mod, "github_release_assets") + def test_rocm_linux_no_prebuilt_falls_back(self, mock_assets): + """AMD ROCm host should fall back to source build when no ROCm prebuilt exists.""" + # Remove the ROCm asset from available assets + assets_without_rocm = {k: v for k, v in UPSTREAM_ASSETS.items() if "rocm" not in k} + mock_assets.return_value = assets_without_rocm + host = rocm_host() + with pytest.raises(PrebuiltFallback, match = "ROCm detected"): + 
resolve_upstream_asset_choice(host, LLAMA_TAG) + + @patch.object(prebuilt_mod, "github_release_assets") + def test_windows_rocm_no_hip_falls_to_cpu(self, mock_assets): + """Windows+ROCm with HIP prebuilt missing should fall through to CPU.""" + assets_no_hip = {k: v for k, v in UPSTREAM_ASSETS.items() if "hip" not in k} + mock_assets.return_value = assets_no_hip + host = windows_rocm_host() + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "windows-cpu" + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_macos_rocm_impossible_has_rocm_false(self, mock_assets): + """macOS host should never have has_rocm=True in practice; verify it gets macOS asset.""" + host = macos_host(has_rocm = True) + choice = resolve_upstream_asset_choice(host, LLAMA_TAG) + assert choice.install_kind == "macos-arm64" + + @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) + def test_linux_aarch64_rocm_gets_prebuilt_fallback(self, mock_assets): + """Linux aarch64 with ROCm -- no x86_64 match, should raise PrebuiltFallback.""" + host = rocm_host(machine = "aarch64", is_x86_64 = False, is_arm64 = True) + with pytest.raises(PrebuiltFallback): + resolve_upstream_asset_choice(host, LLAMA_TAG) + + +# ============================================================================= +# TEST: install_llama_prebuilt.py -- runtime_patterns_for_choice +# ============================================================================= + +class TestRuntimePatterns: + """Verify runtime file patterns for all install kinds.""" + + def test_linux_cpu_patterns(self): + choice = AssetChoice(repo = "", tag = "", name = "", url = "", + source_label = "", install_kind = "linux-cpu") + patterns = runtime_patterns_for_choice(choice) + assert "llama-server" in patterns + assert "llama-quantize" in patterns + + def test_linux_cuda_patterns(self): + choice = AssetChoice(repo = "", tag = "", name = "", url = 
"", + source_label = "", install_kind = "linux-cuda") + patterns = runtime_patterns_for_choice(choice) + assert "libggml-cuda.so*" in patterns + + def test_linux_rocm_patterns(self): + choice = AssetChoice(repo = "", tag = "", name = "", url = "", + source_label = "", install_kind = "linux-rocm") + patterns = runtime_patterns_for_choice(choice) + assert "libggml-hip.so*" in patterns + assert "llama-server" in patterns + + def test_windows_hip_patterns(self): + choice = AssetChoice(repo = "", tag = "", name = "", url = "", + source_label = "", install_kind = "windows-hip") + patterns = runtime_patterns_for_choice(choice) + assert "*.exe" in patterns + assert "*.dll" in patterns + + def test_macos_patterns(self): + choice = AssetChoice(repo = "", tag = "", name = "", url = "", + source_label = "", install_kind = "macos-arm64") + patterns = runtime_patterns_for_choice(choice) + assert "lib*.dylib" in patterns + + +# ============================================================================= +# TEST: install_llama_prebuilt.py -- HostInfo.has_rocm field +# ============================================================================= + +class TestHostInfoRocm: + """Verify has_rocm field does not affect other HostInfo behavior.""" + + def test_has_rocm_default_false(self): + host = HostInfo( + system = "Linux", machine = "x86_64", + is_windows = False, is_linux = True, is_macos = False, + is_x86_64 = True, is_arm64 = False, + nvidia_smi = None, driver_cuda_version = None, + compute_caps = [], visible_cuda_devices = None, + has_physical_nvidia = False, has_usable_nvidia = False, + ) + assert host.has_rocm is False + + def test_has_rocm_explicit_true(self): + host = rocm_host() + assert host.has_rocm is True + + def test_nvidia_host_no_rocm(self): + host = nvidia_host() + assert host.has_rocm is False + assert host.has_usable_nvidia is True + + def test_detect_host_with_rocm_path_env(self): + """detect_host() checks ROCM_PATH env var for ROCm detection.""" + # Verify the 
detect_host function source references ROCM_PATH + import inspect + source = inspect.getsource(prebuilt_mod.detect_host) + assert "ROCM_PATH" in source or "rocm" in source.lower() + + +# ============================================================================= +# TEST: install_python_stack.py -- _detect_rocm_version +# ============================================================================= + +class TestDetectRocmVersion: + """Verify ROCm version detection from various sources.""" + + def test_no_rocm_returns_none(self, tmp_path): + """No ROCm installed should return None.""" + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): + with patch("shutil.which", return_value = None): + result = _detect_rocm_version() + assert result is None + + def test_version_from_file(self, tmp_path): + """Reads version from /opt/rocm/.info/version.""" + info_dir = tmp_path / ".info" + info_dir.mkdir() + (info_dir / "version").write_text("7.1.0-12345\n") + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): + result = _detect_rocm_version() + assert result == (7, 1) + + def test_version_62(self, tmp_path): + """Reads ROCm 6.2 version.""" + info_dir = tmp_path / ".info" + info_dir.mkdir() + (info_dir / "version").write_text("6.2.0\n") + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): + result = _detect_rocm_version() + assert result == (6, 2) + + def test_hipconfig_fallback(self, tmp_path): + """Falls back to hipconfig --version when file not found.""" + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): + mock_result = MagicMock() + mock_result.returncode = 0 + mock_result.stdout = b"6.3.21234.2\n" + with patch("shutil.which", return_value = "/usr/bin/hipconfig"): + with patch("subprocess.run", return_value = mock_result): + result = _detect_rocm_version() + assert result == (6, 3) + + def test_empty_version_file(self, tmp_path): + """Empty version file should return None.""" + info_dir = tmp_path / ".info" + 
info_dir.mkdir() + (info_dir / "version").write_text("") + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): + with patch("shutil.which", return_value = None): + result = _detect_rocm_version() + assert result is None + + def test_version_with_epoch_prefix(self, tmp_path): + """Debian epoch prefix (2:6.2.0) -- version file has no epoch, so should parse.""" + info_dir = tmp_path / ".info" + info_dir.mkdir() + # Version files don't typically have epoch prefix, but lib/rocm_version might + (info_dir / "version").write_text("6.2.0\n") + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): + result = _detect_rocm_version() + assert result == (6, 2) + + def test_multiple_version_sources_first_wins(self, tmp_path): + """When both .info/version and lib/rocm_version exist, first found wins.""" + info_dir = tmp_path / ".info" + info_dir.mkdir() + (info_dir / "version").write_text("7.1.0\n") + lib_dir = tmp_path / "lib" + lib_dir.mkdir() + (lib_dir / "rocm_version").write_text("6.3.0\n") + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): + result = _detect_rocm_version() + assert result == (7, 1) # .info/version checked first + + def test_hipconfig_multiline_output(self, tmp_path): + """hipconfig with multi-line output -- should use first line.""" + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): + mock_result = MagicMock() + mock_result.returncode = 0 + mock_result.stdout = b"6.3.21234.2\nSome extra info\n" + with patch("shutil.which", return_value = "/usr/bin/hipconfig"): + with patch("subprocess.run", return_value = mock_result): + result = _detect_rocm_version() + assert result == (6, 3) + + def test_hipconfig_timeout(self, tmp_path): + """hipconfig that times out should return None.""" + with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): + with patch("shutil.which", return_value = "/usr/bin/hipconfig"): + with patch("subprocess.run", side_effect = subprocess.TimeoutExpired("hipconfig", 5)): + 
result = _detect_rocm_version() + assert result is None + + +# ============================================================================= +# TEST: install_python_stack.py -- _ensure_rocm_torch +# ============================================================================= + +class TestEnsureRocmTorch: + """Verify ROCm torch reinstall logic.""" + + @patch.object(stack_mod, "pip_install") + def test_no_rocm_skips(self, mock_pip): + """No ROCm toolchain should skip entirely.""" + with patch("os.path.isdir", return_value = False): + with patch("shutil.which", return_value = None): + _ensure_rocm_torch() + mock_pip.assert_not_called() + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) + def test_torch_already_has_cuda_skips(self, mock_ver, mock_pip): + """If torch already has CUDA, should skip ROCm reinstall.""" + mock_probe = MagicMock() + mock_probe.returncode = 0 + mock_probe.stdout = b"12.6\n" # CUDA version string + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", return_value = mock_probe): + _ensure_rocm_torch() + mock_pip.assert_not_called() + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) + def test_torch_already_has_hip_skips(self, mock_ver, mock_pip): + """If torch already has HIP, should skip ROCm reinstall.""" + mock_probe = MagicMock() + mock_probe.returncode = 0 + mock_probe.stdout = b"7.1.12345\n" # HIP version string + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", return_value = mock_probe): + _ensure_rocm_torch() + mock_pip.assert_not_called() + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) + def test_cpu_torch_gets_rocm_reinstall(self, mock_ver, mock_pip): + """CPU-only torch on ROCm host should trigger reinstall.""" + mock_probe = MagicMock() + mock_probe.returncode = 0 + 
mock_probe.stdout = b"\n" # empty = no GPU backend + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", return_value = mock_probe): + _ensure_rocm_torch() + # Should call pip_install twice: once for torch, once for bitsandbytes + assert mock_pip.call_count == 2 + torch_call = mock_pip.call_args_list[0] + assert "rocm7.1" in str(torch_call) + bnb_call = mock_pip.call_args_list[1] + assert "bitsandbytes" in str(bnb_call) + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = (6, 3)) + def test_rocm_63_selects_correct_tag(self, mock_ver, mock_pip): + """ROCm 6.3 should select rocm6.3 tag.""" + mock_probe = MagicMock() + mock_probe.returncode = 0 + mock_probe.stdout = b"\n" + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", return_value = mock_probe): + _ensure_rocm_torch() + torch_call = mock_pip.call_args_list[0] + assert "rocm6.3" in str(torch_call) + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = (5, 0)) + def test_old_rocm_skips(self, mock_ver, mock_pip): + """ROCm version too old (below 6.0) should skip.""" + mock_probe = MagicMock() + mock_probe.returncode = 0 + mock_probe.stdout = b"\n" + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", return_value = mock_probe): + _ensure_rocm_torch() + mock_pip.assert_not_called() + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = None) + def test_version_unreadable_prints_warning(self, mock_ver, mock_pip, capsys): + """ROCm detected but version unreadable should print warning and skip.""" + with patch("os.path.isdir", return_value = True): + _ensure_rocm_torch() + mock_pip.assert_not_called() + captured = capsys.readouterr() + assert "unreadable" in captured.out + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", 
return_value = (7, 2)) + def test_rocm_72_selects_71_tag(self, mock_ver, mock_pip): + """ROCm 7.2 should select rocm7.1 tag (capped, not in mapping).""" + mock_probe = MagicMock() + mock_probe.returncode = 0 + mock_probe.stdout = b"\n" + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", return_value = mock_probe): + _ensure_rocm_torch() + torch_call = mock_pip.call_args_list[0] + assert "rocm7.1" in str(torch_call) + + @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) + def test_probe_timeout_handled(self, mock_ver, mock_pip): + """Probe subprocess timeout should be handled gracefully.""" + with patch("os.path.isdir", return_value = True): + with patch("subprocess.run", side_effect = subprocess.TimeoutExpired("python", 30)): + # Should not crash -- timeout on probe means torch not importable + # The function will get an exception from subprocess.run and + # proceed to reinstall + try: + _ensure_rocm_torch() + except subprocess.TimeoutExpired: + pass # Acceptable -- the fix is about the timeout being set + + +# ============================================================================= +# TEST: install_python_stack.py -- _ROCM_TORCH_INDEX mapping +# ============================================================================= + +class TestRocmTorchIndex: + """Verify the ROCm version -> torch index tag mapping.""" + + def test_mapping_is_sorted_descending(self): + """Keys should be in descending order for the next() iteration to work.""" + keys = list(_ROCM_TORCH_INDEX.keys()) + assert keys == sorted(keys, reverse = True) + + def test_rocm_72_not_in_mapping(self): + """ROCm 7.2 should NOT be in the active mapping (torch 2.11.0 exceeds bound).""" + assert (7, 2) not in _ROCM_TORCH_INDEX + + def test_rocm_71_maps_correctly(self): + assert _ROCM_TORCH_INDEX[(7, 1)] == "rocm7.1" + + def test_rocm_63_maps_correctly(self): + assert _ROCM_TORCH_INDEX[(6, 3)] == "rocm6.3" + + def 
test_rocm_60_maps_correctly(self): + assert _ROCM_TORCH_INDEX[(6, 0)] == "rocm6.0" + + def test_all_tags_use_download_pytorch(self): + """All tags should be for download.pytorch.org, not repo.radeon.com.""" + for tag in _ROCM_TORCH_INDEX.values(): + assert tag.startswith("rocm") + assert "radeon" not in tag + + def test_newer_rocm_selects_best_match(self): + """ROCm 7.2 (not in map) should select rocm7.1 via >= comparison.""" + ver = (7, 2) + tag = next( + (t for (maj, mn), t in _ROCM_TORCH_INDEX.items() if ver >= (maj, mn)), + None, + ) + assert tag == "rocm7.1" + + def test_rocm_64_selects_64(self): + ver = (6, 4) + tag = next( + (t for (maj, mn), t in _ROCM_TORCH_INDEX.items() if ver >= (maj, mn)), + None, + ) + assert tag == "rocm6.4" + + +# ============================================================================= +# TEST: hardware.py -- IS_ROCM flag and detect_hardware +# ============================================================================= + +class TestHardwareRocmFlag: + """Verify IS_ROCM flag behavior without importing the full hardware module.""" + + def test_hardware_py_has_is_rocm(self): + """hardware.py should define IS_ROCM.""" + hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + source = hw_path.read_text() + assert "IS_ROCM: bool = False" in source + + def test_hardware_py_sets_is_rocm_on_hip(self): + """detect_hardware() should set IS_ROCM when torch.version.hip is set.""" + hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + source = hw_path.read_text() + assert 'torch.version, "hip"' in source or "torch.version.hip" in source + + def test_hardware_py_still_returns_cuda_for_rocm(self): + """DeviceType should remain CUDA even on ROCm -- no DeviceType.ROCM.""" + hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + source = hw_path.read_text() + # Ensure ROCM is NOT a DeviceType member + enum_section = source.split("class 
DeviceType")[1].split("\n\n")[0] + assert "ROCM" not in enum_section + + def test_hardware_py_has_rocm_in_package_versions(self): + """get_package_versions() should include 'rocm' key.""" + hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + source = hw_path.read_text() + assert '"rocm"' in source + + def test_hardware_py_device_type_cuda_references_intact(self): + """All existing DeviceType.CUDA references should still be present.""" + hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + source = hw_path.read_text() + # Key functions that must still reference DeviceType.CUDA + assert "DeviceType.CUDA" in source + assert "DEVICE = DeviceType.CUDA" in source + + def test_is_rocm_exported_from_init(self): + """IS_ROCM should be exported from hardware __init__.py.""" + init_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" + source = init_path.read_text() + assert "IS_ROCM" in source + + def test_is_rocm_in_all_list(self): + """IS_ROCM should be in __all__ list in __init__.py.""" + init_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" + source = init_path.read_text() + # Extract __all__ section + assert '"IS_ROCM"' in source + + def test_get_package_versions_returns_rocm_key(self): + """get_package_versions() source should return both 'cuda' and 'rocm' keys.""" + hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + source = hw_path.read_text() + # Find the get_package_versions function body + func_start = source.find("def get_package_versions") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert '"cuda"' in func_body + assert '"rocm"' in func_body + + +# ============================================================================= +# TEST: tokenizer_utils.py -- error message +# ============================================================================= + +class 
TestTokenizerErrorMessage: + """Verify the AMD error message is updated.""" + + def test_no_old_amd_message(self): + """Old 'We do not support AMD' message should be gone.""" + tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py" + source = tu_path.read_text() + assert "We do not support AMD" not in source + + def test_new_message_has_docs_link(self): + """New message should point to Unsloth AMD docs.""" + tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py" + source = tu_path.read_text() + assert "docs.unsloth.ai" in source or "No GPU detected" in source + + +# ============================================================================= +# TEST: install.sh -- structural checks +# ============================================================================= + +class TestInstallShStructure: + """Verify install.sh structural properties without running it.""" + + def test_no_here_strings(self): + """install.sh must not use <<< (not POSIX).""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + # <<< is bash-only; breaks dash + for i, line in enumerate(source.splitlines(), 1): + stripped = line.lstrip() + if stripped.startswith("#"): + continue + assert "<<<" not in line, f"install.sh:{i} uses non-POSIX <<< here-string" + + def test_rocm_detection_present(self): + """install.sh should have ROCm detection in get_torch_index_url.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + assert "amd-smi" in source + assert "rocm" in source.lower() + + def test_cuda_precedence(self): + """ROCm detection should only run when nvidia-smi is absent.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + # The ROCm block should be inside the "if [ -z "$_smi" ]" branch + smi_block_start = source.find('if [ -z "$_smi" ]') + rocm_block_start = source.find("amd-smi") + assert smi_block_start < rocm_block_start, \ + "ROCm detection should be inside the 'no nvidia-smi' branch" + + def 
test_bitsandbytes_amd_install(self): + """install.sh should install bitsandbytes for AMD when ROCm detected.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + assert "bitsandbytes" in source + assert "rocm*)" in source # case pattern for ROCm URLs + + def test_cpu_hint_mentions_amd(self): + """CPU-only hint should mention AMD ROCm.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + assert "ROCm" in source + + def test_rocm72_capped_to_71(self): + """ROCm 7.2+ should fall back to rocm7.1 index.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + assert "rocm7.2" in source # case pattern + assert 'echo "$_base/rocm7.1"' in source # fallback + + def test_rocm_tag_validation_guard_exists(self): + """install.sh should validate _rocm_tag with a case guard.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + assert 'rocm[0-9]*.[0-9]*)' in source + assert '_rocm_tag=""' in source # rejection path + + def test_dpkg_epoch_handling(self): + """install.sh should strip Debian epoch prefix from dpkg-query output.""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + assert "sed 's/^[0-9]*://' " in source or "sed 's/^[0-9]*://'" in source + + def test_no_double_bracket_in_rocm_block(self): + """ROCm detection block should not use [[ ]] (bash-only, not POSIX). + Note: [[:space:]], [[:digit:]] etc. 
are valid POSIX character classes, not bash [[ ]].""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + func_start = source.find("get_torch_index_url()") + func_end = source.find("\n}", func_start) + func_body = source[func_start:func_end] + import re + for i, line in enumerate(func_body.splitlines(), 1): + stripped = line.lstrip() + if stripped.startswith("#"): + continue + # Remove POSIX character classes [[:foo:]] before checking for [[ ]] + cleaned = re.sub(r'\[\[:[a-z]+:\]\]', '', line) + assert "[[" not in cleaned, f"get_torch_index_url line {i} uses non-POSIX [[" + + def test_no_arithmetic_expansion_in_rocm_block(self): + """ROCm detection block should not use (( )) (bash-only).""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + func_start = source.find("get_torch_index_url()") + func_end = source.find("\n}", func_start) + func_body = source[func_start:func_end] + for i, line in enumerate(func_body.splitlines(), 1): + stripped = line.lstrip() + if stripped.startswith("#"): + continue + assert "((" not in line or "))" not in line or "$(()" in line, \ + f"get_torch_index_url line {i} may use non-POSIX (( ))" + + def test_macos_returns_cpu_before_rocm_check(self): + """macOS should return CPU immediately (before any ROCm check).""" + sh_path = PACKAGE_ROOT / "install.sh" + source = sh_path.read_text() + func_start = source.find("get_torch_index_url()") + func_body = source[func_start:] + darwin_pos = func_body.find("Darwin") + rocm_pos = func_body.find("amd-smi") + assert darwin_pos < rocm_pos, "macOS check should come before ROCm detection" + + +# ============================================================================= +# TEST: Live regression on current host (NVIDIA B200 expected) +# ============================================================================= + +class TestLiveRegression: + """Live checks that run on the actual host -- skip if no NVIDIA GPU.""" + + def 
test_get_torch_index_url_returns_cuda_on_nvidia(self): + """On an NVIDIA machine, get_torch_index_url should return a CUDA URL.""" + import shutil + if not shutil.which("nvidia-smi"): + pytest.skip("No nvidia-smi available") + sh_path = PACKAGE_ROOT / "install.sh" + # Extract just the function (don't source the whole installer) + result = subprocess.run( + ["bash", "-c", + f"eval \"$(sed -n '/^get_torch_index_url()/,/^}}/p' '{sh_path}')\"; " + "get_torch_index_url"], + capture_output = True, text = True, timeout = 30, + ) + if result.returncode != 0: + pytest.skip("Could not extract get_torch_index_url for live test") + url = result.stdout.strip() + assert "cu1" in url or "cuda" in url.lower(), f"Expected CUDA URL, got: {url}" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) From 7290199c26792918e00c326117e84c52832500db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 08:38:56 +0000 Subject: [PATCH 06/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/utils/hardware/hardware.py | 4 +- studio/install_llama_prebuilt.py | 14 +- tests/studio/install/test_rocm_support.py | 206 ++++++++++++++++------ 3 files changed, 162 insertions(+), 62 deletions(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index d5de59a592..244ef3fe98 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -103,7 +103,9 @@ def detect_hardware() -> DeviceType: # DeviceType stays CUDA since torch.cuda.* works on ROCm via HIP. 
if getattr(torch.version, "hip", None) is not None: IS_ROCM = True - print(f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}") + print( + f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}" + ) else: print(f"Hardware detected: CUDA -- {device_name}") return DEVICE diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index cbc8f6d15f..4fbd87850d 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1746,8 +1746,10 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice rocm_name = f"llama-{llama_tag}-bin-ubuntu-rocm-7.2-x64.tar.gz" if rocm_name in upstream_assets: log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") - log("Note: prebuilt is compiled for ROCm 7.2; if your ROCm version differs, " - "this may fail preflight and fall back to a source build (safe)") + log( + "Note: prebuilt is compiled for ROCm 7.2; if your ROCm version differs, " + "this may fail preflight and fall back to a source build (safe)" + ) return AssetChoice( repo = UPSTREAM_REPO, tag = llama_tag, @@ -1785,7 +1787,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice if host.has_rocm: hip_name = f"llama-{llama_tag}-bin-win-hip-radeon-x64.zip" if hip_name in upstream_assets: - log(f"AMD ROCm detected on Windows -- trying upstream HIP prebuilt {hip_name}") + log( + f"AMD ROCm detected on Windows -- trying upstream HIP prebuilt {hip_name}" + ) return AssetChoice( repo = UPSTREAM_REPO, tag = llama_tag, @@ -1794,7 +1798,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice source_label = "upstream", install_kind = "windows-hip", ) - log("AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU") + log( + "AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU" + ) upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in 
upstream_assets: diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index 560edf7a16..62ceece457 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -52,12 +52,17 @@ # ── Helper: build HostInfo for different scenarios ────────────────────────── + def nvidia_host(**overrides) -> HostInfo: """NVIDIA Linux x86_64 host.""" defaults = dict( - system = "Linux", machine = "x86_64", - is_windows = False, is_linux = True, is_macos = False, - is_x86_64 = True, is_arm64 = False, + system = "Linux", + machine = "x86_64", + is_windows = False, + is_linux = True, + is_macos = False, + is_x86_64 = True, + is_arm64 = False, nvidia_smi = "/usr/bin/nvidia-smi", driver_cuda_version = (12, 6), compute_caps = ["89"], @@ -73,9 +78,13 @@ def nvidia_host(**overrides) -> HostInfo: def rocm_host(**overrides) -> HostInfo: """AMD ROCm Linux x86_64 host (no NVIDIA).""" defaults = dict( - system = "Linux", machine = "x86_64", - is_windows = False, is_linux = True, is_macos = False, - is_x86_64 = True, is_arm64 = False, + system = "Linux", + machine = "x86_64", + is_windows = False, + is_linux = True, + is_macos = False, + is_x86_64 = True, + is_arm64 = False, nvidia_smi = None, driver_cuda_version = None, compute_caps = [], @@ -91,9 +100,13 @@ def rocm_host(**overrides) -> HostInfo: def cpu_host(**overrides) -> HostInfo: """CPU-only Linux x86_64 host.""" defaults = dict( - system = "Linux", machine = "x86_64", - is_windows = False, is_linux = True, is_macos = False, - is_x86_64 = True, is_arm64 = False, + system = "Linux", + machine = "x86_64", + is_windows = False, + is_linux = True, + is_macos = False, + is_x86_64 = True, + is_arm64 = False, nvidia_smi = None, driver_cuda_version = None, compute_caps = [], @@ -109,9 +122,13 @@ def cpu_host(**overrides) -> HostInfo: def macos_host(**overrides) -> HostInfo: """macOS arm64 host.""" defaults = dict( - system = "Darwin", machine = "arm64", - 
is_windows = False, is_linux = False, is_macos = True, - is_x86_64 = False, is_arm64 = True, + system = "Darwin", + machine = "arm64", + is_windows = False, + is_linux = False, + is_macos = True, + is_x86_64 = False, + is_arm64 = True, nvidia_smi = None, driver_cuda_version = None, compute_caps = [], @@ -127,9 +144,13 @@ def macos_host(**overrides) -> HostInfo: def windows_host(**overrides) -> HostInfo: """Windows x86_64 host.""" defaults = dict( - system = "Windows", machine = "amd64", - is_windows = True, is_linux = False, is_macos = False, - is_x86_64 = True, is_arm64 = False, + system = "Windows", + machine = "amd64", + is_windows = True, + is_linux = False, + is_macos = False, + is_x86_64 = True, + is_arm64 = False, nvidia_smi = None, driver_cuda_version = None, compute_caps = [], @@ -145,9 +166,13 @@ def windows_host(**overrides) -> HostInfo: def windows_rocm_host(**overrides) -> HostInfo: """Windows x86_64 host with ROCm.""" defaults = dict( - system = "Windows", machine = "amd64", - is_windows = True, is_linux = False, is_macos = False, - is_x86_64 = True, is_arm64 = False, + system = "Windows", + machine = "amd64", + is_windows = True, + is_linux = False, + is_macos = False, + is_x86_64 = True, + is_arm64 = False, nvidia_smi = None, driver_cuda_version = None, compute_caps = [], @@ -179,6 +204,7 @@ def windows_rocm_host(**overrides) -> HostInfo: # TEST: install_llama_prebuilt.py -- resolve_upstream_asset_choice # ============================================================================= + class TestResolveUpstreamAssetChoice: """Verify that the asset selection logic picks the right binary for each platform.""" @@ -244,7 +270,9 @@ def test_mixed_nvidia_rocm_prefers_nvidia(self, mock_assets): def test_rocm_linux_no_prebuilt_falls_back(self, mock_assets): """AMD ROCm host should fall back to source build when no ROCm prebuilt exists.""" # Remove the ROCm asset from available assets - assets_without_rocm = {k: v for k, v in UPSTREAM_ASSETS.items() if "rocm" 
not in k} + assets_without_rocm = { + k: v for k, v in UPSTREAM_ASSETS.items() if "rocm" not in k + } mock_assets.return_value = assets_without_rocm host = rocm_host() with pytest.raises(PrebuiltFallback, match = "ROCm detected"): @@ -278,39 +306,55 @@ def test_linux_aarch64_rocm_gets_prebuilt_fallback(self, mock_assets): # TEST: install_llama_prebuilt.py -- runtime_patterns_for_choice # ============================================================================= + class TestRuntimePatterns: """Verify runtime file patterns for all install kinds.""" def test_linux_cpu_patterns(self): - choice = AssetChoice(repo = "", tag = "", name = "", url = "", - source_label = "", install_kind = "linux-cpu") + choice = AssetChoice( + repo = "", tag = "", name = "", url = "", source_label = "", install_kind = "linux-cpu" + ) patterns = runtime_patterns_for_choice(choice) assert "llama-server" in patterns assert "llama-quantize" in patterns def test_linux_cuda_patterns(self): - choice = AssetChoice(repo = "", tag = "", name = "", url = "", - source_label = "", install_kind = "linux-cuda") + choice = AssetChoice( + repo = "", tag = "", name = "", url = "", source_label = "", install_kind = "linux-cuda" + ) patterns = runtime_patterns_for_choice(choice) assert "libggml-cuda.so*" in patterns def test_linux_rocm_patterns(self): - choice = AssetChoice(repo = "", tag = "", name = "", url = "", - source_label = "", install_kind = "linux-rocm") + choice = AssetChoice( + repo = "", tag = "", name = "", url = "", source_label = "", install_kind = "linux-rocm" + ) patterns = runtime_patterns_for_choice(choice) assert "libggml-hip.so*" in patterns assert "llama-server" in patterns def test_windows_hip_patterns(self): - choice = AssetChoice(repo = "", tag = "", name = "", url = "", - source_label = "", install_kind = "windows-hip") + choice = AssetChoice( + repo = "", + tag = "", + name = "", + url = "", + source_label = "", + install_kind = "windows-hip", + ) patterns = 
runtime_patterns_for_choice(choice) assert "*.exe" in patterns assert "*.dll" in patterns def test_macos_patterns(self): - choice = AssetChoice(repo = "", tag = "", name = "", url = "", - source_label = "", install_kind = "macos-arm64") + choice = AssetChoice( + repo = "", + tag = "", + name = "", + url = "", + source_label = "", + install_kind = "macos-arm64", + ) patterns = runtime_patterns_for_choice(choice) assert "lib*.dylib" in patterns @@ -319,17 +363,25 @@ def test_macos_patterns(self): # TEST: install_llama_prebuilt.py -- HostInfo.has_rocm field # ============================================================================= + class TestHostInfoRocm: """Verify has_rocm field does not affect other HostInfo behavior.""" def test_has_rocm_default_false(self): host = HostInfo( - system = "Linux", machine = "x86_64", - is_windows = False, is_linux = True, is_macos = False, - is_x86_64 = True, is_arm64 = False, - nvidia_smi = None, driver_cuda_version = None, - compute_caps = [], visible_cuda_devices = None, - has_physical_nvidia = False, has_usable_nvidia = False, + system = "Linux", + machine = "x86_64", + is_windows = False, + is_linux = True, + is_macos = False, + is_x86_64 = True, + is_arm64 = False, + nvidia_smi = None, + driver_cuda_version = None, + compute_caps = [], + visible_cuda_devices = None, + has_physical_nvidia = False, + has_usable_nvidia = False, ) assert host.has_rocm is False @@ -346,6 +398,7 @@ def test_detect_host_with_rocm_path_env(self): """detect_host() checks ROCM_PATH env var for ROCm detection.""" # Verify the detect_host function source references ROCM_PATH import inspect + source = inspect.getsource(prebuilt_mod.detect_host) assert "ROCM_PATH" in source or "rocm" in source.lower() @@ -354,6 +407,7 @@ def test_detect_host_with_rocm_path_env(self): # TEST: install_python_stack.py -- _detect_rocm_version # ============================================================================= + class TestDetectRocmVersion: """Verify ROCm version 
detection from various sources.""" @@ -440,7 +494,10 @@ def test_hipconfig_timeout(self, tmp_path): """hipconfig that times out should return None.""" with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): with patch("shutil.which", return_value = "/usr/bin/hipconfig"): - with patch("subprocess.run", side_effect = subprocess.TimeoutExpired("hipconfig", 5)): + with patch( + "subprocess.run", + side_effect = subprocess.TimeoutExpired("hipconfig", 5), + ): result = _detect_rocm_version() assert result is None @@ -449,6 +506,7 @@ def test_hipconfig_timeout(self, tmp_path): # TEST: install_python_stack.py -- _ensure_rocm_torch # ============================================================================= + class TestEnsureRocmTorch: """Verify ROCm torch reinstall logic.""" @@ -554,7 +612,9 @@ def test_rocm_72_selects_71_tag(self, mock_ver, mock_pip): def test_probe_timeout_handled(self, mock_ver, mock_pip): """Probe subprocess timeout should be handled gracefully.""" with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", side_effect = subprocess.TimeoutExpired("python", 30)): + with patch( + "subprocess.run", side_effect = subprocess.TimeoutExpired("python", 30) + ): # Should not crash -- timeout on probe means torch not importable # The function will get an exception from subprocess.run and # proceed to reinstall @@ -568,6 +628,7 @@ def test_probe_timeout_handled(self, mock_ver, mock_pip): # TEST: install_python_stack.py -- _ROCM_TORCH_INDEX mapping # ============================================================================= + class TestRocmTorchIndex: """Verify the ROCm version -> torch index tag mapping.""" @@ -617,24 +678,31 @@ def test_rocm_64_selects_64(self): # TEST: hardware.py -- IS_ROCM flag and detect_hardware # ============================================================================= + class TestHardwareRocmFlag: """Verify IS_ROCM flag behavior without importing the full hardware module.""" def 
test_hardware_py_has_is_rocm(self): """hardware.py should define IS_ROCM.""" - hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) source = hw_path.read_text() assert "IS_ROCM: bool = False" in source def test_hardware_py_sets_is_rocm_on_hip(self): """detect_hardware() should set IS_ROCM when torch.version.hip is set.""" - hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) source = hw_path.read_text() assert 'torch.version, "hip"' in source or "torch.version.hip" in source def test_hardware_py_still_returns_cuda_for_rocm(self): """DeviceType should remain CUDA even on ROCm -- no DeviceType.ROCM.""" - hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) source = hw_path.read_text() # Ensure ROCM is NOT a DeviceType member enum_section = source.split("class DeviceType")[1].split("\n\n")[0] @@ -642,13 +710,17 @@ def test_hardware_py_still_returns_cuda_for_rocm(self): def test_hardware_py_has_rocm_in_package_versions(self): """get_package_versions() should include 'rocm' key.""" - hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) source = hw_path.read_text() assert '"rocm"' in source def test_hardware_py_device_type_cuda_references_intact(self): """All existing DeviceType.CUDA references should still be present.""" - hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) source = hw_path.read_text() # Key functions that must still reference 
DeviceType.CUDA assert "DeviceType.CUDA" in source @@ -656,24 +728,30 @@ def test_hardware_py_device_type_cuda_references_intact(self): def test_is_rocm_exported_from_init(self): """IS_ROCM should be exported from hardware __init__.py.""" - init_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" + init_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" + ) source = init_path.read_text() assert "IS_ROCM" in source def test_is_rocm_in_all_list(self): """IS_ROCM should be in __all__ list in __init__.py.""" - init_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" + init_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" + ) source = init_path.read_text() # Extract __all__ section assert '"IS_ROCM"' in source def test_get_package_versions_returns_rocm_key(self): """get_package_versions() source should return both 'cuda' and 'rocm' keys.""" - hw_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) source = hw_path.read_text() # Find the get_package_versions function body func_start = source.find("def get_package_versions") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert '"cuda"' in func_body assert '"rocm"' in func_body @@ -682,6 +760,7 @@ def test_get_package_versions_returns_rocm_key(self): # TEST: tokenizer_utils.py -- error message # ============================================================================= + class TestTokenizerErrorMessage: """Verify the AMD error message is updated.""" @@ -702,6 +781,7 @@ def test_new_message_has_docs_link(self): # TEST: install.sh -- structural checks # ============================================================================= + class TestInstallShStructure: """Verify 
install.sh structural properties without running it.""" @@ -730,8 +810,9 @@ def test_cuda_precedence(self): # The ROCm block should be inside the "if [ -z "$_smi" ]" branch smi_block_start = source.find('if [ -z "$_smi" ]') rocm_block_start = source.find("amd-smi") - assert smi_block_start < rocm_block_start, \ - "ROCm detection should be inside the 'no nvidia-smi' branch" + assert ( + smi_block_start < rocm_block_start + ), "ROCm detection should be inside the 'no nvidia-smi' branch" def test_bitsandbytes_amd_install(self): """install.sh should install bitsandbytes for AMD when ROCm detected.""" @@ -757,7 +838,7 @@ def test_rocm_tag_validation_guard_exists(self): """install.sh should validate _rocm_tag with a case guard.""" sh_path = PACKAGE_ROOT / "install.sh" source = sh_path.read_text() - assert 'rocm[0-9]*.[0-9]*)' in source + assert "rocm[0-9]*.[0-9]*)" in source assert '_rocm_tag=""' in source # rejection path def test_dpkg_epoch_handling(self): @@ -775,13 +856,16 @@ def test_no_double_bracket_in_rocm_block(self): func_end = source.find("\n}", func_start) func_body = source[func_start:func_end] import re + for i, line in enumerate(func_body.splitlines(), 1): stripped = line.lstrip() if stripped.startswith("#"): continue # Remove POSIX character classes [[:foo:]] before checking for [[ ]] - cleaned = re.sub(r'\[\[:[a-z]+:\]\]', '', line) - assert "[[" not in cleaned, f"get_torch_index_url line {i} uses non-POSIX [[" + cleaned = re.sub(r"\[\[:[a-z]+:\]\]", "", line) + assert ( + "[[" not in cleaned + ), f"get_torch_index_url line {i} uses non-POSIX [[" def test_no_arithmetic_expansion_in_rocm_block(self): """ROCm detection block should not use (( )) (bash-only).""" @@ -794,8 +878,9 @@ def test_no_arithmetic_expansion_in_rocm_block(self): stripped = line.lstrip() if stripped.startswith("#"): continue - assert "((" not in line or "))" not in line or "$(()" in line, \ - f"get_torch_index_url line {i} may use non-POSIX (( ))" + assert ( + "((" not in line or "))" 
not in line or "$(()" in line + ), f"get_torch_index_url line {i} may use non-POSIX (( ))" def test_macos_returns_cpu_before_rocm_check(self): """macOS should return CPU immediately (before any ROCm check).""" @@ -812,21 +897,28 @@ def test_macos_returns_cpu_before_rocm_check(self): # TEST: Live regression on current host (NVIDIA B200 expected) # ============================================================================= + class TestLiveRegression: """Live checks that run on the actual host -- skip if no NVIDIA GPU.""" def test_get_torch_index_url_returns_cuda_on_nvidia(self): """On an NVIDIA machine, get_torch_index_url should return a CUDA URL.""" import shutil + if not shutil.which("nvidia-smi"): pytest.skip("No nvidia-smi available") sh_path = PACKAGE_ROOT / "install.sh" # Extract just the function (don't source the whole installer) result = subprocess.run( - ["bash", "-c", - f"eval \"$(sed -n '/^get_torch_index_url()/,/^}}/p' '{sh_path}')\"; " - "get_torch_index_url"], - capture_output = True, text = True, timeout = 30, + [ + "bash", + "-c", + f"eval \"$(sed -n '/^get_torch_index_url()/,/^}}/p' '{sh_path}')\"; " + "get_torch_index_url", + ], + capture_output = True, + text = True, + timeout = 30, ) if result.returncode != 0: pytest.skip("Could not extract get_torch_index_url for live test") From 2cb0b52be7e6fd3a35cf542b633630396d8c48ce Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 08:58:03 +0000 Subject: [PATCH 07/55] Harden ROCm support: probe error handling, version cap, validation Address review findings from 8 independent reviewers: - Wrap _ensure_rocm_torch() torch probe in try/except for TimeoutExpired and OSError so a hung or broken torch import does not crash the installer (8/8 reviewers flagged this) - Add torch>=2.4,<2.11.0 version cap to the ROCm reinstall path to prevent installing unsupported torch 2.11.0 from the rocm7.1 index - Use with-statement for file reads in _detect_rocm_version() to avoid resource leaks - Handle 
ROCM_PATH="" correctly (use `or "/opt/rocm"` instead of default parameter to avoid relative path resolution) - Strengthen shell validation guard from rocm[0-9] to rocm[1-9] to reject rocm0.x tags that would produce nonexistent PyTorch index URLs - Switch shell version cap from blocklist to allowlist (rocm6.*|rocm7.0* |rocm7.1* pass through, everything else caps to rocm7.1) so future ROCm 10+ does not fall through to a nonexistent index - Add sorted() to _ROCM_TORCH_INDEX lookup for defensive ordering - Fix test_probe_timeout_handled: replace zero-assertion test with proper assertions verifying reinstall proceeds after timeout --- install.sh | 13 ++++---- studio/install_python_stack.py | 40 ++++++++++++++--------- tests/studio/install/test_rocm_support.py | 22 ++++++------- 3 files changed, 41 insertions(+), 34 deletions(-) diff --git a/install.sh b/install.sh index b8f49abf51..85268b14a4 100755 --- a/install.sh +++ b/install.sh @@ -1000,10 +1000,10 @@ get_torch_index_url() { ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \ [ -n "$ver" ] && \ printf '%s\n' "$ver" | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; }) 2>/dev/null - # Validate _rocm_tag: must match "rocmX.Y" with leading digits + # Validate _rocm_tag: must match "rocmX.Y" with major >= 1 case "$_rocm_tag" in - rocm[0-9]*.[0-9]*) : ;; # valid - *) _rocm_tag="" ;; # reject malformed (empty version, garbled output) + rocm[1-9]*.[0-9]*) : ;; # valid (major >= 1) + *) _rocm_tag="" ;; # reject malformed (empty, garbled, or major=0) esac if [ -n "$_rocm_tag" ]; then # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds (<2.11.0). 
@@ -1011,10 +1011,11 @@ get_torch_index_url() { # TODO: uncomment the next line when torch upper bound is bumped to >=2.11.0 # echo "$_base/$_rocm_tag"; return case "$_rocm_tag" in - rocm7.2*|rocm7.3*|rocm7.4*|rocm7.5*|rocm8*|rocm9*) - echo "$_base/rocm7.1" ;; - *) + rocm6.*|rocm7.0*|rocm7.1*) echo "$_base/$_rocm_tag" ;; + *) + # ROCm 7.2+ (including future 10.x+): cap to rocm7.1 + echo "$_base/rocm7.1" ;; esac return fi diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 1f17a2e79c..16b9877da9 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -47,13 +47,14 @@ def _detect_rocm_version() -> tuple[int, int] | None: """Return (major, minor) of the installed ROCm stack, or None.""" # Check /opt/rocm/.info/version or ROCM_PATH equivalent - rocm_root = os.environ.get("ROCM_PATH", "/opt/rocm") + rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" for path in ( os.path.join(rocm_root, ".info", "version"), os.path.join(rocm_root, "lib", "rocm_version"), ): try: - parts = open(path).read().strip().split("-")[0].split(".") + with open(path) as fh: + parts = fh.read().strip().split("-")[0].split(".") return int(parts[0]), int(parts[1]) except Exception: pass @@ -86,7 +87,7 @@ def _ensure_rocm_torch() -> None: links against HIP (ROCm) or CUDA (NVIDIA). Skips on Windows/macOS. Uses pip_install() to respect uv, constraints, and --python targeting. 
""" - rocm_root = os.environ.get("ROCM_PATH", "/opt/rocm") + rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" if not os.path.isdir(rocm_root) and not shutil.which("hipcc"): return # no ROCm toolchain @@ -96,22 +97,29 @@ def _ensure_rocm_torch() -> None: return # Skip if torch is already GPU-enabled (HIP or CUDA) - probe = subprocess.run( - [ - sys.executable, - "-c", - "import torch; print(torch.version.hip or torch.version.cuda or '')", - ], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - timeout = 30, - ) - if probe.returncode == 0 and probe.stdout.decode().strip(): + try: + probe = subprocess.run( + [ + sys.executable, + "-c", + "import torch; print(torch.version.hip or torch.version.cuda or '')", + ], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + timeout = 30, + ) + except (OSError, subprocess.TimeoutExpired): + probe = None + if probe is not None and probe.returncode == 0 and probe.stdout.decode().strip(): return # torch already GPU-enabled # Select best matching wheel tag (newest ROCm version <= installed) tag = next( - (t for (maj, mn), t in _ROCM_TORCH_INDEX.items() if ver >= (maj, mn)), + ( + t + for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) + if ver >= (maj, mn) + ), None, ) if tag is None: @@ -124,7 +132,7 @@ def _ensure_rocm_torch() -> None: f"ROCm torch ({tag})", "--force-reinstall", "--no-cache-dir", - "torch", + "torch>=2.4,<2.11.0", "torchvision", "torchaudio", "--index-url", diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index 62ceece457..5c45fb2ca9 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -609,19 +609,16 @@ def test_rocm_72_selects_71_tag(self, mock_ver, mock_pip): @patch.object(stack_mod, "pip_install") @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_probe_timeout_handled(self, mock_ver, mock_pip): - """Probe subprocess timeout should be 
handled gracefully.""" + def test_probe_timeout_triggers_reinstall(self, mock_ver, mock_pip): + """Probe subprocess timeout should not crash; should proceed to reinstall.""" with patch("os.path.isdir", return_value = True): with patch( "subprocess.run", side_effect = subprocess.TimeoutExpired("python", 30) ): - # Should not crash -- timeout on probe means torch not importable - # The function will get an exception from subprocess.run and - # proceed to reinstall - try: - _ensure_rocm_torch() - except subprocess.TimeoutExpired: - pass # Acceptable -- the fix is about the timeout being set + _ensure_rocm_torch() + # If probe times out, the function should treat torch as unusable and reinstall + assert mock_pip.call_count == 2 + assert "rocm7.1" in str(mock_pip.call_args_list[0]) # ============================================================================= @@ -831,14 +828,15 @@ def test_rocm72_capped_to_71(self): """ROCm 7.2+ should fall back to rocm7.1 index.""" sh_path = PACKAGE_ROOT / "install.sh" source = sh_path.read_text() - assert "rocm7.2" in source # case pattern - assert 'echo "$_base/rocm7.1"' in source # fallback + assert 'echo "$_base/rocm7.1"' in source # fallback for unknown versions + # Allowlisted versions should pass through directly + assert "rocm6.*|rocm7.0*|rocm7.1*)" in source def test_rocm_tag_validation_guard_exists(self): """install.sh should validate _rocm_tag with a case guard.""" sh_path = PACKAGE_ROOT / "install.sh" source = sh_path.read_text() - assert "rocm[0-9]*.[0-9]*)" in source + assert "rocm[1-9]*.[0-9]*)" in source assert '_rocm_tag=""' in source # rejection path def test_dpkg_epoch_handling(self): From 56098e5d02471f7e803f5647b9ef72107607daff Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 09:09:02 +0000 Subject: [PATCH 08/55] Clean up rocm_paths list construction in detect_host() Filter None from the ROCM_PATH env var lookup at list construction time instead of relying on the inline `if p` guard in the any() 
call. --- studio/install_llama_prebuilt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 4fbd87850d..43f3362b81 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1439,8 +1439,8 @@ def detect_host() -> HostInfo: shutil.which("amd-smi"), shutil.which("rocm-smi"), ] - rocm_paths = ["/opt/rocm", os.environ.get("ROCM_PATH", "")] - if any(rocm_hints) or any(os.path.isdir(p) for p in rocm_paths if p): + rocm_paths = [p for p in ("/opt/rocm", os.environ.get("ROCM_PATH")) if p] + if any(rocm_hints) or any(os.path.isdir(p) for p in rocm_paths): has_rocm = True return HostInfo( From 9e33c25eac8368eb19e1866b0ecf1d5e787207db Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 09:25:56 +0000 Subject: [PATCH 09/55] Require actual AMD GPU presence before selecting ROCm paths All 8 reviewers across 2 cycles independently flagged that ROCm detection used toolkit/filesystem hints (hipcc, /opt/rocm, rocm-core) as a proxy for GPU presence, which would misroute CPU-only or NVIDIA hosts that happen to have ROCm tools installed. 
Now all 3 detection points (install.sh, install_python_stack.py, install_llama_prebuilt.py) probe for an actual AMD GPU before entering the ROCm path: - install.sh: check rocminfo for gfx* GPU names, or amd-smi list for device rows, before version detection - install_python_stack.py: new _has_rocm_gpu() function probes rocminfo and amd-smi list before _ensure_rocm_torch() proceeds - install_llama_prebuilt.py: detect_host() probes rocminfo/amd-smi list instead of just checking tool existence or directory paths Also: - Shell test mock amd-smi now handles "list" subcommand - Python tests updated to mock _has_rocm_gpu where needed - Added test_no_gpu_with_rocm_tools_skips to verify the new guard - Test index lookups now use sorted() to match production code --- install.sh | 15 +++++++- studio/install_llama_prebuilt.py | 25 ++++++++----- studio/install_python_stack.py | 32 +++++++++++++++- tests/sh/test_get_torch_index_url.sh | 11 +++++- tests/studio/install/test_rocm_support.py | 45 ++++++++++++++++++----- 5 files changed, 105 insertions(+), 23 deletions(-) diff --git a/install.sh b/install.sh index 85268b14a4..4d121bf331 100755 --- a/install.sh +++ b/install.sh @@ -983,7 +983,20 @@ get_torch_index_url() { _smi="/usr/bin/nvidia-smi" fi if [ -z "$_smi" ]; then - # No NVIDIA GPU -- check for AMD ROCm + # No NVIDIA GPU -- check for AMD ROCm GPU + # First confirm an actual AMD GPU is present (not just ROCm tools installed) + _has_rocm_gpu=false + if command -v rocminfo >/dev/null 2>&1 && \ + rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[0-9]/{found=1} END{exit !found}'; then + _has_rocm_gpu=true + elif command -v amd-smi >/dev/null 2>&1 && \ + amd-smi list 2>/dev/null | awk 'NR>1 && NF{found=1} END{exit !found}'; then + _has_rocm_gpu=true + fi + if [ "$_has_rocm_gpu" != true ]; then + echo "$_base/cpu"; return + fi + # AMD GPU confirmed -- detect ROCm version _rocm_tag="" _rocm_tag=$({ command -v amd-smi >/dev/null 2>&1 && \ amd-smi version 2>/dev/null | awk -F'ROCm 
version: ' \ diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 43f3362b81..3b46373eb0 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1431,17 +1431,24 @@ def detect_host() -> HostInfo: except Exception: pass - # Detect AMD ROCm (HIP) + # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed has_rocm = False if not is_macos: - rocm_hints = [ - shutil.which("hipcc"), - shutil.which("amd-smi"), - shutil.which("rocm-smi"), - ] - rocm_paths = [p for p in ("/opt/rocm", os.environ.get("ROCM_PATH")) if p] - if any(rocm_hints) or any(os.path.isdir(p) for p in rocm_paths): - has_rocm = True + for _cmd, _marker in ( + (["rocminfo"], "gfx"), + (["amd-smi", "list"], None), + ): + _exe = shutil.which(_cmd[0]) + if not _exe: + continue + try: + _result = run_capture([_exe, *_cmd[1:]], timeout = 10) + except Exception: + continue + if _result.returncode == 0 and _result.stdout.strip(): + if _marker is None or _marker in _result.stdout.lower(): + has_rocm = True + break return HostInfo( system = system, diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 16b9877da9..48d95c9b03 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -80,16 +80,44 @@ def _detect_rocm_version() -> tuple[int, int] | None: return None +def _has_rocm_gpu() -> bool: + """Return True only if an actual AMD GPU is visible (not just ROCm tools installed).""" + for cmd, marker in ( + (["rocminfo"], "gfx"), + (["amd-smi", "list"], None), + ): + exe = shutil.which(cmd[0]) + if not exe: + continue + try: + result = subprocess.run( + [exe, *cmd[1:]], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 10, + ) + except Exception: + continue + if result.returncode == 0 and result.stdout.strip(): + if marker is None or marker in result.stdout.lower(): + return True + return False + + def _ensure_rocm_torch() -> None: """Reinstall 
torch with ROCm wheels when the venv received CPU-only torch. - Runs only on Linux hosts where ROCm is installed. No-op when torch already - links against HIP (ROCm) or CUDA (NVIDIA). Skips on Windows/macOS. + Runs only on Linux hosts where ROCm is installed and an AMD GPU is + present. No-op when torch already links against HIP (ROCm) or CUDA + (NVIDIA). Skips on Windows/macOS. Uses pip_install() to respect uv, constraints, and --python targeting. """ rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" if not os.path.isdir(rocm_root) and not shutil.which("hipcc"): return # no ROCm toolchain + if not _has_rocm_gpu(): + return # ROCm tools present but no AMD GPU ver = _detect_rocm_version() if ver is None: diff --git a/tests/sh/test_get_torch_index_url.sh b/tests/sh/test_get_torch_index_url.sh index 81da79aa32..adbdbe4e14 100755 --- a/tests/sh/test_get_torch_index_url.sh +++ b/tests/sh/test_get_torch_index_url.sh @@ -46,13 +46,22 @@ MOCK } # Helper: create a mock amd-smi that prints a given ROCm version string +# Supports both "amd-smi version" and "amd-smi list" subcommands so that +# the GPU presence check (amd-smi list) also succeeds in tests. 
make_mock_amd_smi() { _dir=$(mktemp -d) cat > "$_dir/amd-smi" <= comparison.""" ver = (7, 2) tag = next( - (t for (maj, mn), t in _ROCM_TORCH_INDEX.items() if ver >= (maj, mn)), + ( + t + for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) + if ver >= (maj, mn) + ), None, ) assert tag == "rocm7.1" @@ -665,7 +686,11 @@ def test_newer_rocm_selects_best_match(self): def test_rocm_64_selects_64(self): ver = (6, 4) tag = next( - (t for (maj, mn), t in _ROCM_TORCH_INDEX.items() if ver >= (maj, mn)), + ( + t + for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) + if ver >= (maj, mn) + ), None, ) assert tag == "rocm6.4" From 4286525c53a8837e173a8d129013cca82d7117d8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:26:10 +0000 Subject: [PATCH 10/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/studio/install/test_rocm_support.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index da2152530e..5955d79aec 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -593,7 +593,9 @@ def test_old_rocm_skips(self, mock_ver, mock_gpu, mock_pip): @patch.object(stack_mod, "pip_install") @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = None) - def test_version_unreadable_prints_warning(self, mock_ver, mock_gpu, mock_pip, capsys): + def test_version_unreadable_prints_warning( + self, mock_ver, mock_gpu, mock_pip, capsys + ): """ROCm detected but version unreadable should print warning and skip.""" with patch("os.path.isdir", return_value = True): _ensure_rocm_torch() From 7d6ac653746a4ce88f992c14ff79b99b92acb978 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 
09:37:07 +0000 Subject: [PATCH 11/55] Harden hipconfig version parsing and torch probe compatibility - Add parts[1].isdigit() check in hipconfig version parsing to handle versions like "6.3-HIP" where the minor component has non-numeric suffix (strip "-" prefix before int() conversion) - Use getattr() in torch probe subprocess to safely handle old or custom torch builds that may lack torch.version.hip/cuda attributes --- studio/install_python_stack.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 48d95c9b03..e3b282a667 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -72,8 +72,8 @@ def _detect_rocm_version() -> tuple[int, int] | None: if result.returncode == 0: raw = result.stdout.decode().strip().split("\n")[0] parts = raw.split(".") - if len(parts) >= 2 and parts[0].isdigit(): - return int(parts[0]), int(parts[1]) + if len(parts) >= 2 and parts[0].isdigit() and parts[1].split("-")[0].isdigit(): + return int(parts[0]), int(parts[1].split("-")[0]) except Exception: pass @@ -130,7 +130,7 @@ def _ensure_rocm_torch() -> None: [ sys.executable, "-c", - "import torch; print(torch.version.hip or torch.version.cuda or '')", + "import torch; v=torch.version; print(getattr(v,'hip','') or getattr(v,'cuda','') or '')", ], stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, From fd432354e5ef54a7a7866ef668c4d20408fe8e31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:37:38 +0000 Subject: [PATCH 12/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/install_python_stack.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index e3b282a667..8196b03ddb 100644 --- a/studio/install_python_stack.py +++ 
b/studio/install_python_stack.py @@ -72,7 +72,11 @@ def _detect_rocm_version() -> tuple[int, int] | None: if result.returncode == 0: raw = result.stdout.decode().strip().split("\n")[0] parts = raw.split(".") - if len(parts) >= 2 and parts[0].isdigit() and parts[1].split("-")[0].isdigit(): + if ( + len(parts) >= 2 + and parts[0].isdigit() + and parts[1].split("-")[0].isdigit() + ): return int(parts[0]), int(parts[1].split("-")[0]) except Exception: pass From 10ec0cdefabc484dbeed2178d63ed3f233397d4a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 09:51:58 +0000 Subject: [PATCH 13/55] Strengthen AMD GPU detection and add NVIDIA precedence guard - Change amd-smi list detection from any-non-empty-output to requiring "gpu" marker in output, matching the shell-side NR>1 check. Prevents false positives from header-only amd-smi list output. - Add nvidia-smi check at the top of _ensure_rocm_torch() so mixed AMD+NVIDIA hosts preserve NVIDIA precedence (matching install.sh and install_llama_prebuilt.py behavior). - Apply the same amd-smi marker fix to install_llama_prebuilt.py detect_host() for consistency. 
--- studio/install_llama_prebuilt.py | 4 ++-- studio/install_python_stack.py | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 3b46373eb0..fc3c65bc4b 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1436,7 +1436,7 @@ def detect_host() -> HostInfo: if not is_macos: for _cmd, _marker in ( (["rocminfo"], "gfx"), - (["amd-smi", "list"], None), + (["amd-smi", "list"], "gpu"), ): _exe = shutil.which(_cmd[0]) if not _exe: @@ -1446,7 +1446,7 @@ def detect_host() -> HostInfo: except Exception: continue if _result.returncode == 0 and _result.stdout.strip(): - if _marker is None or _marker in _result.stdout.lower(): + if _marker in _result.stdout.lower(): has_rocm = True break diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 8196b03ddb..461f83d9b9 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -88,7 +88,7 @@ def _has_rocm_gpu() -> bool: """Return True only if an actual AMD GPU is visible (not just ROCm tools installed).""" for cmd, marker in ( (["rocminfo"], "gfx"), - (["amd-smi", "list"], None), + (["amd-smi", "list"], "gpu"), ): exe = shutil.which(cmd[0]) if not exe: @@ -104,7 +104,7 @@ def _has_rocm_gpu() -> bool: except Exception: continue if result.returncode == 0 and result.stdout.strip(): - if marker is None or marker in result.stdout.lower(): + if marker in result.stdout.lower(): return True return False @@ -114,9 +114,13 @@ def _ensure_rocm_torch() -> None: Runs only on Linux hosts where ROCm is installed and an AMD GPU is present. No-op when torch already links against HIP (ROCm) or CUDA - (NVIDIA). Skips on Windows/macOS. + (NVIDIA). Skips on Windows/macOS and on mixed AMD+NVIDIA hosts + (NVIDIA takes precedence). Uses pip_install() to respect uv, constraints, and --python targeting. 
""" + # NVIDIA takes precedence on mixed hosts + if shutil.which("nvidia-smi"): + return rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" if not os.path.isdir(rocm_root) and not shutil.which("hipcc"): return # no ROCm toolchain From 726fab1f378652e39518633f8dabec47e6f00888 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 10:27:24 +0000 Subject: [PATCH 14/55] Add Windows-specific ROCm/HIP detection in detect_host() The previous detect_host() ROCm check used rocminfo and amd-smi list which are Linux-only tools. On Windows, has_rocm would always be False, making the Windows HIP prebuilt path at line 1794 unreachable. Now detect_host() uses platform-specific detection: - Linux: rocminfo (check for gfx GPU names) or amd-smi list - Windows: hipinfo.exe, amd-smi, or amdhip64.dll on PATH This allows Windows AMD users to get the HIP prebuilt binary instead of silently falling through to the CPU prebuilt. --- studio/install_llama_prebuilt.py | 12 +++++++++++- tests/studio/install/test_rocm_support.py | 7 +++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index fc3c65bc4b..abe31ebfff 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1433,7 +1433,7 @@ def detect_host() -> HostInfo: # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed has_rocm = False - if not is_macos: + if is_linux: for _cmd, _marker in ( (["rocminfo"], "gfx"), (["amd-smi", "list"], "gpu"), @@ -1449,6 +1449,16 @@ def detect_host() -> HostInfo: if _marker in _result.stdout.lower(): has_rocm = True break + elif is_windows: + # Windows: check for HIP runtime DLL or hipinfo tool + if shutil.which("hipinfo") or shutil.which("amd-smi"): + has_rocm = True + elif any( + Path(d).joinpath("amdhip64.dll").exists() + for d in os.environ.get("PATH", "").split(os.pathsep) + if d + ): + has_rocm = True return HostInfo( system = system, diff --git 
a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index 5955d79aec..fe76d920a5 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -403,6 +403,13 @@ def test_detect_host_with_rocm_path_env(self): source = inspect.getsource(prebuilt_mod.detect_host) assert "ROCM_PATH" in source or "rocm" in source.lower() + def test_detect_host_windows_rocm_detection(self): + """detect_host() source should have Windows-specific HIP detection.""" + import inspect + + source = inspect.getsource(prebuilt_mod.detect_host) + assert "hipinfo" in source or "amdhip64" in source + # ============================================================================= # TEST: install_python_stack.py -- _detect_rocm_version From f17e007cafac4c39b88dfad6769a413798db5dac Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 11:08:20 +0000 Subject: [PATCH 15/55] Add AMD ROCm gaps: Mamba/SSM source builds, GPU monitoring, Windows messaging, RDNA expansion - worker.py: Add HIP detection to causal-conv1d/mamba-ssm probe, check for hipcc before ROCm source builds, improve status messages and error reporting, add timeout and uv support for the source build fallback - amd.py: New AMD GPU monitoring module via amd-smi metric --json, mirroring nvidia.py structure (utilization, temperature, power, VRAM) - hardware.py: Branch to amd.py when IS_ROCM is True for GPU utilization, visible GPU queries, and physical GPU count - install_python_stack.py: Detect AMD GPUs on Windows and warn that ROCm-enabled PyTorch must be installed manually - kernels/utils.py: Expand is_rdna() to cover RDNA2 (gfx1030-1032), RDNA3 (gfx1102-1103), RDNA3.5 (gfx1150-1152) alongside existing entries - tests: Add 32 new tests covering all changes (95/95 pass) --- studio/backend/core/training/worker.py | 100 ++++-- studio/backend/utils/hardware/amd.py | 224 +++++++++++++ studio/backend/utils/hardware/hardware.py | 98 ++++-- 
studio/install_python_stack.py | 13 + tests/studio/install/test_rocm_support.py | 367 ++++++++++++++++++++++ unsloth/kernels/utils.py | 14 +- 6 files changed, 766 insertions(+), 50 deletions(-) create mode 100644 studio/backend/utils/hardware/amd.py diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 699cfe74f7..187f43b3e0 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -86,6 +86,7 @@ def _probe_causal_conv1d_env() -> dict[str, str] | None: "'python_tag': f'cp{sys.version_info.major}{sys.version_info.minor}', " "'torch_mm': torch_mm, " "'cuda_major': str(int(str(torch.version.cuda).split('.', 1)[0])) if torch.version.cuda else '', " + "'hip_version': str(torch.version.hip) if getattr(torch.version, 'hip', None) else '', " "'cxx11abi': str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()" "}))" ), @@ -237,25 +238,88 @@ def _install_package_wheel_first( else: logger.info("No published %s wheel found: %s", display_name, wheel_url) - _send_status(event_queue, f"Installing {display_name} from PyPI...") - pypi_cmd = [ - sys.executable, - "-m", - "pip", - "install", - "--no-build-isolation", - "--no-deps", - "--no-cache-dir", - f"{pypi_name}=={pypi_version}", - ] - result = _sp.run( - pypi_cmd, - stdout = _sp.PIPE, - stderr = _sp.STDOUT, - text = True, - ) + is_hip = env and env.get("hip_version") + if is_hip and not shutil.which("hipcc"): + logger.error( + "%s requires hipcc for source compilation on ROCm. 
" + "Install the ROCm HIP SDK: https://rocm.docs.amd.com", + display_name, + ) + _send_status( + event_queue, + f"{display_name}: hipcc not found (ROCm HIP SDK required)", + ) + return + + if is_hip: + _send_status( + event_queue, + f"Compiling {display_name} from source for ROCm " + "(this may take several minutes)...", + ) + else: + _send_status(event_queue, f"Installing {display_name} from PyPI...") + + # Prefer uv for faster dependency resolution when available + if shutil.which("uv"): + pypi_cmd = [ + "uv", "pip", "install", + "--python", sys.executable, + "--no-build-isolation", + "--no-deps", + f"{pypi_name}=={pypi_version}", + ] + else: + pypi_cmd = [ + sys.executable, + "-m", + "pip", + "install", + "--no-build-isolation", + "--no-deps", + "--no-cache-dir", + f"{pypi_name}=={pypi_version}", + ] + + # Source compilation on ROCm can take 5-10 minutes; use a generous timeout + timeout = 600 if is_hip else 300 + + try: + result = _sp.run( + pypi_cmd, + stdout = _sp.PIPE, + stderr = _sp.STDOUT, + text = True, + timeout = timeout, + ) + except _sp.TimeoutExpired: + logger.error( + "%s installation timed out after %ds", display_name, timeout, + ) + _send_status( + event_queue, + f"{display_name} installation timed out after {timeout}s", + ) + return + if result.returncode != 0: - logger.error("Failed to install %s from PyPI:\n%s", display_name, result.stdout) + if is_hip: + # Surface a clear error for ROCm source build failures + error_lines = (result.stdout or "").strip().splitlines() + snippet = "\n".join(error_lines[-5:]) if error_lines else "(no output)" + logger.error( + "Failed to compile %s for ROCm:\n%s", display_name, result.stdout, + ) + _send_status( + event_queue, + f"Failed to compile {display_name} for ROCm. 
" + "Check that hipcc and ROCm development headers are installed.\n" + f"{snippet}", + ) + else: + logger.error( + "Failed to install %s from PyPI:\n%s", display_name, result.stdout, + ) return logger.info("Installed %s from PyPI", display_name) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py new file mode 100644 index 0000000000..df4f83d5ca --- /dev/null +++ b/studio/backend/utils/hardware/amd.py @@ -0,0 +1,224 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +"""AMD GPU monitoring via amd-smi. + +Mirrors the nvidia.py module structure so hardware.py can swap backends +based on IS_ROCM. All functions return the same dict shapes as their +nvidia.py counterparts. +""" + +import json +import subprocess +from typing import Any, Optional + +from loggers import get_logger + +logger = get_logger(__name__) + + +def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[dict]: + """Run amd-smi with the given arguments and return parsed JSON, or None.""" + try: + result = subprocess.run( + ["amd-smi", *args, "--json"], + capture_output=True, + text=True, + timeout=timeout, + ) + except (OSError, subprocess.TimeoutExpired) as e: + logger.warning("amd-smi query failed: %s", e) + return None + if result.returncode != 0 or not result.stdout.strip(): + logger.warning("amd-smi returned code %d", result.returncode) + return None + try: + return json.loads(result.stdout) + except json.JSONDecodeError: + logger.warning("Failed to parse amd-smi JSON output") + return None + + +def _parse_numeric(value: Any) -> Optional[float]: + """Extract a numeric value from amd-smi output (may be str, int, float, or dict).""" + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + if isinstance(value, str): + # Strip units like "W", "C", "%", "MB" etc. 
+ cleaned = value.strip().rstrip("WCMBGb% ").strip() + if not cleaned or cleaned.lower() in ("n/a", "none", "unknown"): + return None + try: + return float(cleaned) + except (ValueError, TypeError): + return None + return None + + +def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: + """Extract standardized metrics from a single GPU's amd-smi data.""" + # amd-smi metric output structure varies by version; try common paths + usage = gpu_data.get("usage", gpu_data.get("gpu_activity", {})) + if isinstance(usage, dict): + gpu_util = _parse_numeric(usage.get("gfx_activity", usage.get("gpu_use_percent"))) + else: + gpu_util = _parse_numeric(usage) + + # Temperature + temp_data = gpu_data.get("temperature", {}) + if isinstance(temp_data, dict): + temp = _parse_numeric( + temp_data.get("edge", temp_data.get("temperature_edge", + temp_data.get("hotspot", temp_data.get("temperature_hotspot")))) + ) + else: + temp = _parse_numeric(temp_data) + + # Power + power_data = gpu_data.get("power", {}) + if isinstance(power_data, dict): + power_draw = _parse_numeric( + power_data.get("current_socket_power", + power_data.get("average_socket_power", + power_data.get("socket_power"))) + ) + power_limit = _parse_numeric( + power_data.get("power_cap", power_data.get("max_power_limit")) + ) + else: + power_draw = None + power_limit = None + + # VRAM + vram_data = gpu_data.get("vram", gpu_data.get("fb_memory_usage", {})) + if isinstance(vram_data, dict): + vram_used_bytes = _parse_numeric( + vram_data.get("vram_used", vram_data.get("used")) + ) + vram_total_bytes = _parse_numeric( + vram_data.get("vram_total", vram_data.get("total")) + ) + else: + vram_used_bytes = None + vram_total_bytes = None + + # Convert VRAM from bytes to MB if values are large (>10000 = likely bytes) + vram_used_mb = None + vram_total_mb = None + if vram_used_bytes is not None: + if vram_used_bytes > 100000: # Likely bytes + vram_used_mb = vram_used_bytes / (1024 * 1024) + else: # Likely already MB + 
vram_used_mb = vram_used_bytes + if vram_total_bytes is not None: + if vram_total_bytes > 100000: # Likely bytes + vram_total_mb = vram_total_bytes / (1024 * 1024) + else: # Likely already MB + vram_total_mb = vram_total_bytes + + # Build the standardized dict (same shape as nvidia._build_gpu_metrics) + vram_used_gb = round(vram_used_mb / 1024, 2) if vram_used_mb is not None else None + vram_total_gb = round(vram_total_mb / 1024, 2) if vram_total_mb is not None else None + vram_util = ( + round((vram_used_mb / vram_total_mb) * 100, 1) + if vram_used_mb is not None and vram_total_mb and vram_total_mb > 0 + else None + ) + power_util = ( + round((power_draw / power_limit) * 100, 1) + if power_draw is not None and power_limit and power_limit > 0 + else None + ) + + return { + "gpu_utilization_pct": gpu_util, + "temperature_c": temp, + "vram_used_gb": vram_used_gb, + "vram_total_gb": vram_total_gb, + "vram_utilization_pct": vram_util, + "power_draw_w": power_draw, + "power_limit_w": power_limit, + "power_utilization_pct": power_util, + } + + +def get_physical_gpu_count() -> Optional[int]: + """Return physical AMD GPU count via amd-smi, or None on failure.""" + data = _run_amd_smi("list") + if data is None: + return None + if isinstance(data, list): + return len(data) + # Some versions return a dict with a "gpu" key + gpus = data.get("gpu", data.get("gpus", [])) + if isinstance(gpus, list): + return len(gpus) + return None + + +def get_primary_gpu_utilization() -> dict[str, Any]: + """Return utilization metrics for the primary AMD GPU.""" + data = _run_amd_smi("metric", "-g", "0") + if data is None: + return {"available": False} + + # amd-smi may return a list with one entry or a dict + if isinstance(data, list): + if len(data) == 0: + return {"available": False} + gpu_data = data[0] + else: + gpu_data = data + + metrics = _extract_gpu_metrics(gpu_data) + metrics["available"] = True + return metrics + + +def get_visible_gpu_utilization( + parent_visible_ids: 
Optional[list[int]], + parent_cuda_visible_devices: Optional[str] = None, +) -> dict[str, Any]: + """Return utilization metrics for visible AMD GPUs.""" + if parent_visible_ids is None: + return { + "available": False, + "backend_cuda_visible_devices": parent_cuda_visible_devices, + "parent_visible_gpu_ids": [], + "devices": [], + "index_kind": "unresolved", + } + + data = _run_amd_smi("metric") + if data is None: + return { + "available": False, + "backend_cuda_visible_devices": parent_cuda_visible_devices, + "parent_visible_gpu_ids": parent_visible_ids or [], + "devices": [], + "index_kind": "physical", + } + + gpu_list = data if isinstance(data, list) else data.get("gpus", [data]) + visible_set = set(parent_visible_ids) + ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)} + + devices = [] + for idx, gpu_data in enumerate(gpu_list): + if idx not in visible_set: + continue + metrics = _extract_gpu_metrics(gpu_data) + metrics["index"] = idx + metrics["index_kind"] = "physical" + metrics["visible_ordinal"] = ordinal_map.get(idx, len(devices)) + devices.append(metrics) + + return { + "available": len(devices) > 0, + "backend_cuda_visible_devices": parent_cuda_visible_devices, + "parent_visible_gpu_ids": parent_visible_ids or [], + "devices": devices, + "index_kind": "physical", + } diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 7a3fad5994..049e250237 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -405,15 +405,26 @@ def get_gpu_utilization() -> Dict[str, Any]: device = get_device() if device == DeviceType.CUDA: - try: - from . import nvidia + if IS_ROCM: + try: + from . 
import amd - result = nvidia.get_primary_gpu_utilization() - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("nvidia-smi utilization query failed: %s", e) + result = amd.get_primary_gpu_utilization() + if result.get("available"): + result["backend"] = device.value + return result + except Exception as e: + logger.warning("amd-smi utilization query failed: %s", e) + else: + try: + from . import nvidia + + result = nvidia.get_primary_gpu_utilization() + if result.get("available"): + result["backend"] = device.value + return result + except Exception as e: + logger.warning("nvidia-smi utilization query failed: %s", e) mem = get_gpu_memory_info() if device != DeviceType.CPU and mem.get("available"): @@ -438,18 +449,32 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: if device == DeviceType.CUDA: parent_visible_spec = _get_parent_visible_gpu_spec() - try: - from . import nvidia + if IS_ROCM: + try: + from . import amd - result = nvidia.get_visible_gpu_utilization( - parent_visible_spec["numeric_ids"], - parent_cuda_visible_devices = parent_visible_spec["raw"], - ) - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("nvidia-smi visible GPU utilization query failed: %s", e) + result = amd.get_visible_gpu_utilization( + parent_visible_spec["numeric_ids"], + parent_cuda_visible_devices = parent_visible_spec["raw"], + ) + if result.get("available"): + result["backend"] = device.value + return result + except Exception as e: + logger.warning("amd-smi visible GPU utilization query failed: %s", e) + else: + try: + from . 
import nvidia + + result = nvidia.get_visible_gpu_utilization( + parent_visible_spec["numeric_ids"], + parent_cuda_visible_devices = parent_visible_spec["raw"], + ) + if result.get("available"): + result["backend"] = device.value + return result + except Exception as e: + logger.warning("nvidia-smi visible GPU utilization query failed: %s", e) # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) if device in (DeviceType.CUDA, DeviceType.XPU): @@ -1121,16 +1146,27 @@ def get_physical_gpu_count() -> int: device = get_device() if device == DeviceType.CUDA: - try: - from . import nvidia - - count = nvidia.get_physical_gpu_count() - if count is not None: - _physical_gpu_count = count - return _physical_gpu_count - except Exception: - pass - # nvidia-smi unavailable or failed — fall back to torch + if IS_ROCM: + try: + from . import amd + + count = amd.get_physical_gpu_count() + if count is not None: + _physical_gpu_count = count + return _physical_gpu_count + except Exception: + pass + else: + try: + from . import nvidia + + count = nvidia.get_physical_gpu_count() + if count is not None: + _physical_gpu_count = count + return _physical_gpu_count + except Exception: + pass + # SMI tool unavailable or failed -- fall back to torch count = _torch_get_physical_gpu_count() _physical_gpu_count = count if count is not None else 1 return _physical_gpu_count @@ -1153,8 +1189,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: device = get_device() if device in (DeviceType.CUDA, DeviceType.XPU): parent_visible_ids = get_parent_visible_gpu_ids() - # Try nvidia-smi first (NVIDIA only) - if device == DeviceType.CUDA: + # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm) + if device == DeviceType.CUDA and not IS_ROCM: try: from . 
import nvidia diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 461f83d9b9..dc6b3fe49e 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -705,6 +705,19 @@ def install_python_stack() -> int: _progress("ROCm torch check") _ensure_rocm_torch() + # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows. + # Detect and warn so users know manual steps are needed for GPU training. + if IS_WINDOWS and not NO_TORCH: + if shutil.which("hipinfo") or shutil.which("amd-smi"): + _safe_print( + _dim(" Note:"), + "AMD GPU detected on Windows. ROCm-enabled PyTorch must be", + ) + _safe_print( + " " * 8, + "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd", + ) + # 3. Extra dependencies _progress("unsloth extras") pip_install( diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index fe76d920a5..59c51da79a 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -6,6 +6,7 @@ """ import importlib.util +import json import os import subprocess import sys @@ -958,5 +959,371 @@ def test_get_torch_index_url_returns_cuda_on_nvidia(self): assert "cu1" in url or "cuda" in url.lower(), f"Expected CUDA URL, got: {url}" +# ============================================================================= +# TEST: worker.py -- ROCm Mamba/SSM source build path +# ============================================================================= + +# Load worker.py module +_WORKER_PATH = PACKAGE_ROOT / "studio" / "backend" / "core" / "training" / "worker.py" + + +class TestWorkerRocmMambaSsm: + """Verify worker.py Mamba/SSM install logic on ROCm.""" + + def test_probe_returns_hip_version_field(self): + """_probe_causal_conv1d_env probe script should include hip_version.""" + source = _WORKER_PATH.read_text() + assert "hip_version" in source + + def test_probe_script_has_getattr_hip(self): + """Probe 
script should use getattr for torch.version.hip (safe on CUDA).""" + source = _WORKER_PATH.read_text() + assert "getattr(torch.version, 'hip', None)" in source + + def test_direct_wheel_url_returns_none_without_cuda_major(self): + """_direct_wheel_url should return None when cuda_major is empty (ROCm).""" + # Load module for function access + _worker_spec = importlib.util.spec_from_file_location( + "test_worker", _WORKER_PATH + ) + assert _worker_spec is not None and _worker_spec.loader is not None + worker_mod = importlib.util.module_from_spec(_worker_spec) + + # Mock all the imports worker.py needs + sys.modules["structlog"] = MagicMock() + sys.modules["loggers"] = MagicMock() + sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + sys.modules["utils"] = MagicMock() + sys.modules["utils.hardware"] = MagicMock() + + try: + _worker_spec.loader.exec_module(worker_mod) + except Exception: + pytest.skip("Could not load worker module in test environment") + + env_rocm = { + "python_tag": "cp312", + "torch_mm": "2.6", + "cuda_major": "", + "hip_version": "7.1.12345", + "cxx11abi": "TRUE", + } + result = worker_mod._direct_wheel_url( + filename_prefix="causal_conv1d", + package_version="1.6.1", + release_tag="v1.6.1.post4", + release_base_url="https://github.com/Dao-AILab/causal-conv1d/releases/download", + env=env_rocm, + ) + assert result is None + + def test_hipcc_check_exists_in_source(self): + """worker.py should check for hipcc before ROCm source builds.""" + source = _WORKER_PATH.read_text() + assert "hipcc" in source + + def test_rocm_source_build_status_message(self): + """worker.py should send a specific status for ROCm source compilation.""" + source = _WORKER_PATH.read_text() + assert "Compiling" in source and "from source for ROCm" in source + + def test_rocm_build_failure_message(self): + """worker.py should send a clear error on ROCm build failure.""" + source = _WORKER_PATH.read_text() + assert "Failed to compile" in source and "for 
ROCm" in source + + def test_timeout_on_install(self): + """worker.py should have a timeout on pip install subprocess.""" + source = _WORKER_PATH.read_text() + assert "TimeoutExpired" in source + assert "timeout" in source + + +# ============================================================================= +# TEST: amd.py -- AMD GPU monitoring +# ============================================================================= + + +class TestAmdGpuMonitoring: + """Verify amd.py module structure and mock behavior.""" + + def test_amd_py_exists(self): + """amd.py should exist in the hardware directory.""" + amd_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" + ) + assert amd_path.exists() + + def test_amd_py_has_required_functions(self): + """amd.py should export the same function signatures as nvidia.py.""" + amd_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" + ) + source = amd_path.read_text() + assert "def get_physical_gpu_count" in source + assert "def get_primary_gpu_utilization" in source + assert "def get_visible_gpu_utilization" in source + + def test_amd_smi_json_parsing(self): + """Verify _extract_gpu_metrics parses amd-smi JSON correctly.""" + amd_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" + ) + _amd_spec = importlib.util.spec_from_file_location("test_amd", amd_path) + assert _amd_spec is not None and _amd_spec.loader is not None + amd_mod = importlib.util.module_from_spec(_amd_spec) + + sys.modules["loggers"] = MagicMock() + sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + + try: + _amd_spec.loader.exec_module(amd_mod) + except Exception: + pytest.skip("Could not load amd module in test environment") + + # Simulate amd-smi metric JSON output + gpu_data = { + "usage": {"gfx_activity": "85"}, + "temperature": {"edge": "72"}, + "power": { + "current_socket_power": "200.5", + "power_cap": "300", + }, + "vram": { + "vram_used": 8192, # 
MB + "vram_total": 16384, # MB + }, + } + metrics = amd_mod._extract_gpu_metrics(gpu_data) + assert metrics["gpu_utilization_pct"] == 85.0 + assert metrics["temperature_c"] == 72.0 + assert metrics["power_draw_w"] == 200.5 + assert metrics["power_limit_w"] == 300.0 + assert metrics["vram_used_gb"] == round(8192 / 1024, 2) + assert metrics["vram_total_gb"] == round(16384 / 1024, 2) + assert metrics["vram_utilization_pct"] is not None + assert metrics["power_utilization_pct"] is not None + + def test_amd_primary_gpu_with_mock(self): + """get_primary_gpu_utilization returns correct dict with mocked amd-smi.""" + amd_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" + ) + _amd_spec = importlib.util.spec_from_file_location("test_amd2", amd_path) + assert _amd_spec is not None and _amd_spec.loader is not None + amd_mod = importlib.util.module_from_spec(_amd_spec) + + sys.modules["loggers"] = MagicMock() + sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + + try: + _amd_spec.loader.exec_module(amd_mod) + except Exception: + pytest.skip("Could not load amd module") + + mock_json = json.dumps([{ + "usage": {"gfx_activity": "50"}, + "temperature": {"edge": "65"}, + "power": {"current_socket_power": "150", "power_cap": "250"}, + "vram": {"vram_used": 4096, "vram_total": 16384}, + }]) + mock_result = MagicMock() + mock_result.returncode = 0 + mock_result.stdout = mock_json + + with patch.object(subprocess, "run", return_value=mock_result): + result = amd_mod.get_primary_gpu_utilization() + assert result["available"] is True + assert result["gpu_utilization_pct"] == 50.0 + assert result["temperature_c"] == 65.0 + + def test_amd_smi_not_found_returns_unavailable(self): + """get_primary_gpu_utilization returns available=False when amd-smi is missing.""" + amd_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" + ) + _amd_spec = importlib.util.spec_from_file_location("test_amd3", amd_path) + assert 
_amd_spec is not None and _amd_spec.loader is not None + amd_mod = importlib.util.module_from_spec(_amd_spec) + + sys.modules["loggers"] = MagicMock() + sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + + try: + _amd_spec.loader.exec_module(amd_mod) + except Exception: + pytest.skip("Could not load amd module") + + with patch.object( + subprocess, "run", side_effect=OSError("amd-smi not found") + ): + result = amd_mod.get_primary_gpu_utilization() + assert result["available"] is False + + def test_amd_timeout_returns_unavailable(self): + """get_primary_gpu_utilization handles timeout gracefully.""" + amd_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" + ) + _amd_spec = importlib.util.spec_from_file_location("test_amd4", amd_path) + assert _amd_spec is not None and _amd_spec.loader is not None + amd_mod = importlib.util.module_from_spec(_amd_spec) + + sys.modules["loggers"] = MagicMock() + sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + + try: + _amd_spec.loader.exec_module(amd_mod) + except Exception: + pytest.skip("Could not load amd module") + + with patch.object( + subprocess, + "run", + side_effect=subprocess.TimeoutExpired("amd-smi", 5), + ): + result = amd_mod.get_primary_gpu_utilization() + assert result["available"] is False + + +# ============================================================================= +# TEST: hardware.py -- IS_ROCM branching to amd.py +# ============================================================================= + + +class TestHardwareAmdBranching: + """Verify hardware.py branches to amd.py when IS_ROCM is True.""" + + def test_hardware_imports_amd_module(self): + """hardware.py should import from amd module when IS_ROCM.""" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) + source = hw_path.read_text() + assert "from . 
import amd" in source + + def test_hardware_branches_on_is_rocm_for_utilization(self): + """get_gpu_utilization should check IS_ROCM before choosing backend.""" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) + source = hw_path.read_text() + # Find the get_gpu_utilization function + func_start = source.find("def get_gpu_utilization") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "IS_ROCM" in func_body + assert "amd.get_primary_gpu_utilization" in func_body + + def test_hardware_branches_on_is_rocm_for_visible(self): + """get_visible_gpu_utilization should check IS_ROCM.""" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) + source = hw_path.read_text() + func_start = source.find("def get_visible_gpu_utilization") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "IS_ROCM" in func_body + assert "amd.get_visible_gpu_utilization" in func_body + + def test_hardware_branches_on_is_rocm_for_physical_count(self): + """get_physical_gpu_count should try amd.py when IS_ROCM.""" + hw_path = ( + PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" + ) + source = hw_path.read_text() + func_start = source.find("def get_physical_gpu_count") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "IS_ROCM" in func_body + assert "amd.get_physical_gpu_count" in func_body + + +# ============================================================================= +# TEST: install_python_stack.py -- Windows AMD warning +# ============================================================================= + + +class TestWindowsRocmWarning: + """Verify Windows AMD GPU detection and warning message.""" + + def test_windows_amd_warning_in_source(self): + """install_python_stack.py should warn Windows AMD users.""" + source = _STACK_PATH.read_text() + assert "AMD GPU detected on Windows" in source + + def 
test_windows_amd_warning_checks_hipinfo_or_amdsmi(self): + """Warning should check for hipinfo or amd-smi.""" + source = _STACK_PATH.read_text() + assert "hipinfo" in source + assert "amd-smi" in source + + def test_windows_amd_warning_has_docs_link(self): + """Warning should include AMD docs link.""" + source = _STACK_PATH.read_text() + assert "docs.unsloth.ai/get-started/install-and-update/amd" in source + + +# ============================================================================= +# TEST: unsloth/kernels/utils.py -- is_rdna() expansion +# ============================================================================= + + +class TestIsRdnaExpansion: + """Verify is_rdna() covers RDNA2, RDNA3, RDNA3.5, RDNA4 architectures.""" + + def test_is_rdna_source_has_rdna2(self): + """is_rdna() should include RDNA2 architectures.""" + utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" + source = utils_path.read_text() + func_start = source.find("def is_rdna()") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "gfx1030" in func_body + assert "gfx1031" in func_body + assert "gfx1032" in func_body + + def test_is_rdna_source_has_rdna3(self): + """is_rdna() should include RDNA3 architectures.""" + utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" + source = utils_path.read_text() + func_start = source.find("def is_rdna()") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "gfx1100" in func_body + assert "gfx1101" in func_body + assert "gfx1102" in func_body + assert "gfx1103" in func_body + + def test_is_rdna_source_has_rdna35(self): + """is_rdna() should include RDNA3.5 architectures.""" + utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" + source = utils_path.read_text() + func_start = source.find("def is_rdna()") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "gfx1150" in func_body + assert "gfx1151" in func_body + assert "gfx1152" in 
func_body + + def test_is_rdna_source_has_rdna4(self): + """is_rdna() should include RDNA4 architectures.""" + utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" + source = utils_path.read_text() + func_start = source.find("def is_rdna()") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "gfx1200" in func_body + assert "gfx1201" in func_body + + def test_is_cdna_not_changed(self): + """is_cdna() should remain unchanged (no RDNA architectures added).""" + utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" + source = utils_path.read_text() + func_start = source.find("def is_cdna()") + func_body = source[func_start:source.find("\ndef ", func_start + 1)] + assert "gfx940" in func_body + assert "gfx941" in func_body + assert "gfx942" in func_body + assert "gfx950" in func_body + # RDNA architectures should NOT be in is_cdna + assert "gfx1030" not in func_body + assert "gfx1100" not in func_body + + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 90f2d5d238..049afac96d 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -88,10 +88,22 @@ def is_cdna(): @functools.lru_cache(1) def is_rdna(): - """Detect ROCm-supported RDNA consumer/workstation GPUs (RDNA3, RDNA4).""" + """Detect ROCm-supported RDNA consumer/workstation GPUs (RDNA2, RDNA3, RDNA3.5, RDNA4).""" return is_hip() and triton.runtime.driver.active.get_current_target().arch in ( + # RDNA2 (Navi 21-24) + "gfx1030", + "gfx1031", + "gfx1032", + # RDNA3 (Navi 31-33) "gfx1100", "gfx1101", + "gfx1102", + "gfx1103", + # RDNA3.5 (Strix Point / Strix Halo) + "gfx1150", + "gfx1151", + "gfx1152", + # RDNA4 (Navi 48-44) "gfx1200", "gfx1201", ) From 14823265d28c618f46ef000328faa1d7a323fce0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:08:36 +0000 Subject: [PATCH 16/55] [pre-commit.ci] auto 
fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/training/worker.py | 19 +++-- studio/backend/utils/hardware/amd.py | 30 +++++--- tests/studio/install/test_rocm_support.py | 84 ++++++++++------------- 3 files changed, 71 insertions(+), 62 deletions(-) diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 187f43b3e0..8554173d01 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -263,8 +263,11 @@ def _install_package_wheel_first( # Prefer uv for faster dependency resolution when available if shutil.which("uv"): pypi_cmd = [ - "uv", "pip", "install", - "--python", sys.executable, + "uv", + "pip", + "install", + "--python", + sys.executable, "--no-build-isolation", "--no-deps", f"{pypi_name}=={pypi_version}", @@ -294,7 +297,9 @@ def _install_package_wheel_first( ) except _sp.TimeoutExpired: logger.error( - "%s installation timed out after %ds", display_name, timeout, + "%s installation timed out after %ds", + display_name, + timeout, ) _send_status( event_queue, @@ -308,7 +313,9 @@ def _install_package_wheel_first( error_lines = (result.stdout or "").strip().splitlines() snippet = "\n".join(error_lines[-5:]) if error_lines else "(no output)" logger.error( - "Failed to compile %s for ROCm:\n%s", display_name, result.stdout, + "Failed to compile %s for ROCm:\n%s", + display_name, + result.stdout, ) _send_status( event_queue, @@ -318,7 +325,9 @@ def _install_package_wheel_first( ) else: logger.error( - "Failed to install %s from PyPI:\n%s", display_name, result.stdout, + "Failed to install %s from PyPI:\n%s", + display_name, + result.stdout, ) return diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index df4f83d5ca..8473ce8e51 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -22,9 +22,9 @@ def _run_amd_smi(*args: str, timeout: int = 5) -> 
Optional[dict]: try: result = subprocess.run( ["amd-smi", *args, "--json"], - capture_output=True, - text=True, - timeout=timeout, + capture_output = True, + text = True, + timeout = timeout, ) except (OSError, subprocess.TimeoutExpired) as e: logger.warning("amd-smi query failed: %s", e) @@ -62,7 +62,9 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: # amd-smi metric output structure varies by version; try common paths usage = gpu_data.get("usage", gpu_data.get("gpu_activity", {})) if isinstance(usage, dict): - gpu_util = _parse_numeric(usage.get("gfx_activity", usage.get("gpu_use_percent"))) + gpu_util = _parse_numeric( + usage.get("gfx_activity", usage.get("gpu_use_percent")) + ) else: gpu_util = _parse_numeric(usage) @@ -70,8 +72,13 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: temp_data = gpu_data.get("temperature", {}) if isinstance(temp_data, dict): temp = _parse_numeric( - temp_data.get("edge", temp_data.get("temperature_edge", - temp_data.get("hotspot", temp_data.get("temperature_hotspot")))) + temp_data.get( + "edge", + temp_data.get( + "temperature_edge", + temp_data.get("hotspot", temp_data.get("temperature_hotspot")), + ), + ) ) else: temp = _parse_numeric(temp_data) @@ -80,9 +87,10 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: power_data = gpu_data.get("power", {}) if isinstance(power_data, dict): power_draw = _parse_numeric( - power_data.get("current_socket_power", - power_data.get("average_socket_power", - power_data.get("socket_power"))) + power_data.get( + "current_socket_power", + power_data.get("average_socket_power", power_data.get("socket_power")), + ) ) power_limit = _parse_numeric( power_data.get("power_cap", power_data.get("max_power_limit")) @@ -120,7 +128,9 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: # Build the standardized dict (same shape as nvidia._build_gpu_metrics) vram_used_gb = round(vram_used_mb / 1024, 2) if vram_used_mb is not None else None - vram_total_gb = 
round(vram_total_mb / 1024, 2) if vram_total_mb is not None else None + vram_total_gb = ( + round(vram_total_mb / 1024, 2) if vram_total_mb is not None else None + ) vram_util = ( round((vram_used_mb / vram_total_mb) * 100, 1) if vram_used_mb is not None and vram_total_mb and vram_total_mb > 0 diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index 59c51da79a..ff71da6b25 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -992,7 +992,7 @@ def test_direct_wheel_url_returns_none_without_cuda_major(self): # Mock all the imports worker.py needs sys.modules["structlog"] = MagicMock() sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) sys.modules["utils"] = MagicMock() sys.modules["utils.hardware"] = MagicMock() @@ -1009,11 +1009,11 @@ def test_direct_wheel_url_returns_none_without_cuda_major(self): "cxx11abi": "TRUE", } result = worker_mod._direct_wheel_url( - filename_prefix="causal_conv1d", - package_version="1.6.1", - release_tag="v1.6.1.post4", - release_base_url="https://github.com/Dao-AILab/causal-conv1d/releases/download", - env=env_rocm, + filename_prefix = "causal_conv1d", + package_version = "1.6.1", + release_tag = "v1.6.1.post4", + release_base_url = "https://github.com/Dao-AILab/causal-conv1d/releases/download", + env = env_rocm, ) assert result is None @@ -1049,16 +1049,12 @@ class TestAmdGpuMonitoring: def test_amd_py_exists(self): """amd.py should exist in the hardware directory.""" - amd_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - ) + amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" assert amd_path.exists() def test_amd_py_has_required_functions(self): """amd.py should export the same function signatures as nvidia.py.""" - amd_path = ( - 
PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - ) + amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" source = amd_path.read_text() assert "def get_physical_gpu_count" in source assert "def get_primary_gpu_utilization" in source @@ -1066,15 +1062,13 @@ def test_amd_py_has_required_functions(self): def test_amd_smi_json_parsing(self): """Verify _extract_gpu_metrics parses amd-smi JSON correctly.""" - amd_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - ) + amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" _amd_spec = importlib.util.spec_from_file_location("test_amd", amd_path) assert _amd_spec is not None and _amd_spec.loader is not None amd_mod = importlib.util.module_from_spec(_amd_spec) sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) try: _amd_spec.loader.exec_module(amd_mod) @@ -1106,32 +1100,34 @@ def test_amd_smi_json_parsing(self): def test_amd_primary_gpu_with_mock(self): """get_primary_gpu_utilization returns correct dict with mocked amd-smi.""" - amd_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - ) + amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" _amd_spec = importlib.util.spec_from_file_location("test_amd2", amd_path) assert _amd_spec is not None and _amd_spec.loader is not None amd_mod = importlib.util.module_from_spec(_amd_spec) sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) try: _amd_spec.loader.exec_module(amd_mod) except Exception: pytest.skip("Could not load amd module") - mock_json = json.dumps([{ - "usage": {"gfx_activity": "50"}, - "temperature": {"edge": "65"}, - "power": 
{"current_socket_power": "150", "power_cap": "250"}, - "vram": {"vram_used": 4096, "vram_total": 16384}, - }]) + mock_json = json.dumps( + [ + { + "usage": {"gfx_activity": "50"}, + "temperature": {"edge": "65"}, + "power": {"current_socket_power": "150", "power_cap": "250"}, + "vram": {"vram_used": 4096, "vram_total": 16384}, + } + ] + ) mock_result = MagicMock() mock_result.returncode = 0 mock_result.stdout = mock_json - with patch.object(subprocess, "run", return_value=mock_result): + with patch.object(subprocess, "run", return_value = mock_result): result = amd_mod.get_primary_gpu_utilization() assert result["available"] is True assert result["gpu_utilization_pct"] == 50.0 @@ -1139,38 +1135,32 @@ def test_amd_primary_gpu_with_mock(self): def test_amd_smi_not_found_returns_unavailable(self): """get_primary_gpu_utilization returns available=False when amd-smi is missing.""" - amd_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - ) + amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" _amd_spec = importlib.util.spec_from_file_location("test_amd3", amd_path) assert _amd_spec is not None and _amd_spec.loader is not None amd_mod = importlib.util.module_from_spec(_amd_spec) sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) try: _amd_spec.loader.exec_module(amd_mod) except Exception: pytest.skip("Could not load amd module") - with patch.object( - subprocess, "run", side_effect=OSError("amd-smi not found") - ): + with patch.object(subprocess, "run", side_effect = OSError("amd-smi not found")): result = amd_mod.get_primary_gpu_utilization() assert result["available"] is False def test_amd_timeout_returns_unavailable(self): """get_primary_gpu_utilization handles timeout gracefully.""" - amd_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - ) + 
amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" _amd_spec = importlib.util.spec_from_file_location("test_amd4", amd_path) assert _amd_spec is not None and _amd_spec.loader is not None amd_mod = importlib.util.module_from_spec(_amd_spec) sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value=MagicMock()) + sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) try: _amd_spec.loader.exec_module(amd_mod) @@ -1180,7 +1170,7 @@ def test_amd_timeout_returns_unavailable(self): with patch.object( subprocess, "run", - side_effect=subprocess.TimeoutExpired("amd-smi", 5), + side_effect = subprocess.TimeoutExpired("amd-smi", 5), ): result = amd_mod.get_primary_gpu_utilization() assert result["available"] is False @@ -1210,7 +1200,7 @@ def test_hardware_branches_on_is_rocm_for_utilization(self): source = hw_path.read_text() # Find the get_gpu_utilization function func_start = source.find("def get_gpu_utilization") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "IS_ROCM" in func_body assert "amd.get_primary_gpu_utilization" in func_body @@ -1221,7 +1211,7 @@ def test_hardware_branches_on_is_rocm_for_visible(self): ) source = hw_path.read_text() func_start = source.find("def get_visible_gpu_utilization") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "IS_ROCM" in func_body assert "amd.get_visible_gpu_utilization" in func_body @@ -1232,7 +1222,7 @@ def test_hardware_branches_on_is_rocm_for_physical_count(self): ) source = hw_path.read_text() func_start = source.find("def get_physical_gpu_count") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "IS_ROCM" in func_body assert 
"amd.get_physical_gpu_count" in func_body @@ -1275,7 +1265,7 @@ def test_is_rdna_source_has_rdna2(self): utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" source = utils_path.read_text() func_start = source.find("def is_rdna()") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "gfx1030" in func_body assert "gfx1031" in func_body assert "gfx1032" in func_body @@ -1285,7 +1275,7 @@ def test_is_rdna_source_has_rdna3(self): utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" source = utils_path.read_text() func_start = source.find("def is_rdna()") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "gfx1100" in func_body assert "gfx1101" in func_body assert "gfx1102" in func_body @@ -1296,7 +1286,7 @@ def test_is_rdna_source_has_rdna35(self): utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" source = utils_path.read_text() func_start = source.find("def is_rdna()") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "gfx1150" in func_body assert "gfx1151" in func_body assert "gfx1152" in func_body @@ -1306,7 +1296,7 @@ def test_is_rdna_source_has_rdna4(self): utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" source = utils_path.read_text() func_start = source.find("def is_rdna()") - func_body = source[func_start:source.find("\ndef ", func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "gfx1200" in func_body assert "gfx1201" in func_body @@ -1315,7 +1305,7 @@ def test_is_cdna_not_changed(self): utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" source = utils_path.read_text() func_start = source.find("def is_cdna()") - func_body = source[func_start:source.find("\ndef ", 
func_start + 1)] + func_body = source[func_start : source.find("\ndef ", func_start + 1)] assert "gfx940" in func_body assert "gfx941" in func_body assert "gfx942" in func_body From 134638dd0ea321167c26ae5bd36b97e34b586f9b Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 11:32:00 +0000 Subject: [PATCH 17/55] Harden ROCm detection, fix VRAM heuristic, and expand RDNA2 coverage - Windows ROCm detection: validate actual GPU presence via hipinfo/amd-smi output markers instead of just checking tool existence on PATH - _ensure_rocm_torch: validate nvidia-smi actually reports a GPU before giving NVIDIA precedence (fixes AMD-only hosts with stale NVIDIA tools) - amd.py _parse_numeric: handle dict-shaped metric objects from newer amd-smi versions ({"value": 10, "unit": "W"}) and strip MiB/GiB units - amd.py VRAM heuristic: raise threshold from 100k to 10M to correctly handle MI300X (192 GB = 196608 MB) and other high-VRAM GPUs - amd.py visible GPU: use AMD-reported GPU IDs instead of enumerate index so non-dense sets like CUDA_VISIBLE_DEVICES=1,3 report correctly - install.sh: add ROCm <6.0 minimum version guard (no PyTorch wheels exist for older versions); fix rocm7.1* glob to not match rocm7.10+ - is_rdna: add gfx1033-1036 for RDNA2 mobile GPUs (RX 6600M etc.) 
- worker.py: increase ROCm source build timeout from 600s to 1800s; fix success log message for ROCm source builds - Tests: update mocks for _has_usable_nvidia_gpu, add RDNA2 target asserts --- install.sh | 6 ++- studio/backend/core/training/worker.py | 9 ++-- studio/backend/utils/hardware/amd.py | 32 ++++++++++---- studio/install_llama_prebuilt.py | 23 ++++++---- studio/install_python_stack.py | 24 +++++++++-- tests/studio/install/test_rocm_support.py | 51 +++++++++++++++-------- unsloth/kernels/utils.py | 4 ++ 7 files changed, 108 insertions(+), 41 deletions(-) diff --git a/install.sh b/install.sh index 4d121bf331..9df0bb7347 100755 --- a/install.sh +++ b/install.sh @@ -1019,12 +1019,16 @@ get_torch_index_url() { *) _rocm_tag="" ;; # reject malformed (empty, garbled, or major=0) esac if [ -n "$_rocm_tag" ]; then + # Minimum supported: ROCm 6.0 (no PyTorch wheels exist for older) + case "$_rocm_tag" in + rocm[1-5].*) echo "$_base/cpu"; return ;; + esac # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds (<2.11.0). # Fall back to rocm7.1 index which has torch 2.10.0. 
# TODO: uncomment the next line when torch upper bound is bumped to >=2.11.0 # echo "$_base/$_rocm_tag"; return case "$_rocm_tag" in - rocm6.*|rocm7.0*|rocm7.1*) + rocm6.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*) echo "$_base/$_rocm_tag" ;; *) # ROCm 7.2+ (including future 10.x+): cap to rocm7.1 diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 8554173d01..d4a4b6eade 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -284,8 +284,8 @@ def _install_package_wheel_first( f"{pypi_name}=={pypi_version}", ] - # Source compilation on ROCm can take 5-10 minutes; use a generous timeout - timeout = 600 if is_hip else 300 + # Source compilation on ROCm can take 10-30 minutes; use a generous timeout + timeout = 1800 if is_hip else 300 try: result = _sp.run( @@ -331,7 +331,10 @@ def _install_package_wheel_first( ) return - logger.info("Installed %s from PyPI", display_name) + if is_hip: + logger.info("Compiled and installed %s from source for ROCm", display_name) + else: + logger.info("Installed %s from PyPI", display_name) def _ensure_causal_conv1d_fast_path(event_queue: Any, model_name: str) -> None: diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 8473ce8e51..f8e156e773 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -17,7 +17,7 @@ logger = get_logger(__name__) -def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[dict]: +def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[Any]: """Run amd-smi with the given arguments and return parsed JSON, or None.""" try: result = subprocess.run( @@ -43,11 +43,17 @@ def _parse_numeric(value: Any) -> Optional[float]: """Extract a numeric value from amd-smi output (may be str, int, float, or dict).""" if value is None: return None + # Newer amd-smi versions emit {"value": 10, "unit": "W"} + if isinstance(value, dict): + return 
_parse_numeric(value.get("value")) if isinstance(value, (int, float)): - return float(value) + import math + f = float(value) + return f if math.isfinite(f) else None if isinstance(value, str): - # Strip units like "W", "C", "%", "MB" etc. - cleaned = value.strip().rstrip("WCMBGb% ").strip() + # Strip units like "W", "C", "%", "MB", "MiB", "GB", "GiB" etc. + import re + cleaned = re.sub(r'\s*[A-Za-z/%]+$', '', value.strip()) if not cleaned or cleaned.lower() in ("n/a", "none", "unknown"): return None try: @@ -112,16 +118,18 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: vram_used_bytes = None vram_total_bytes = None - # Convert VRAM from bytes to MB if values are large (>10000 = likely bytes) + # Convert VRAM from bytes to MB if values are very large. + # amd-smi typically reports in MB, but some versions report bytes. + # Threshold: 10 million -- no GPU has <10 MB, and even 10 TB = 10M MB. vram_used_mb = None vram_total_mb = None if vram_used_bytes is not None: - if vram_used_bytes > 100000: # Likely bytes + if vram_used_bytes > 10_000_000: # Likely bytes (>10M) vram_used_mb = vram_used_bytes / (1024 * 1024) else: # Likely already MB vram_used_mb = vram_used_bytes if vram_total_bytes is not None: - if vram_total_bytes > 100000: # Likely bytes + if vram_total_bytes > 10_000_000: # Likely bytes (>10M) vram_total_mb = vram_total_bytes / (1024 * 1024) else: # Likely already MB vram_total_mb = vram_total_bytes @@ -211,12 +219,18 @@ def get_visible_gpu_utilization( "index_kind": "physical", } - gpu_list = data if isinstance(data, list) else data.get("gpus", [data]) + gpu_list = data if isinstance(data, list) else data.get("gpus", data.get("gpu", [data])) visible_set = set(parent_visible_ids) ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)} devices = [] - for idx, gpu_data in enumerate(gpu_list): + for fallback_idx, gpu_data in enumerate(gpu_list): + # Use AMD-reported GPU ID when available, fall back to enumeration 
index + raw_id = gpu_data.get("gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx))) if isinstance(gpu_data, dict) else fallback_idx + try: + idx = int(raw_id) + except (TypeError, ValueError): + idx = fallback_idx if idx not in visible_set: continue metrics = _extract_gpu_metrics(gpu_data) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index abe31ebfff..2ea84eede5 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1450,15 +1450,22 @@ def detect_host() -> HostInfo: has_rocm = True break elif is_windows: - # Windows: check for HIP runtime DLL or hipinfo tool - if shutil.which("hipinfo") or shutil.which("amd-smi"): - has_rocm = True - elif any( - Path(d).joinpath("amdhip64.dll").exists() - for d in os.environ.get("PATH", "").split(os.pathsep) - if d + # Windows: validate actual AMD GPU presence (not just tool/DLL existence) + for _cmd, _marker in ( + (["hipinfo"], "gcnarchname"), + (["amd-smi", "list"], "gpu"), ): - has_rocm = True + _exe = shutil.which(_cmd[0]) + if not _exe: + continue + try: + _result = run_capture([_exe, *_cmd[1:]], timeout = 10) + except Exception: + continue + if _result.returncode == 0 and _result.stdout.strip(): + if _marker in _result.stdout.lower(): + has_rocm = True + break return HostInfo( system = system, diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index dc6b3fe49e..cba37886a2 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -109,6 +109,24 @@ def _has_rocm_gpu() -> bool: return False +def _has_usable_nvidia_gpu() -> bool: + """Return True only when nvidia-smi exists AND reports at least one GPU.""" + exe = shutil.which("nvidia-smi") + if not exe: + return False + try: + result = subprocess.run( + [exe, "-L"], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + text=True, + timeout=10, + ) + except Exception: + return False + return result.returncode == 0 and "GPU " in result.stdout + + 
def _ensure_rocm_torch() -> None: """Reinstall torch with ROCm wheels when the venv received CPU-only torch. @@ -118,8 +136,8 @@ def _ensure_rocm_torch() -> None: (NVIDIA takes precedence). Uses pip_install() to respect uv, constraints, and --python targeting. """ - # NVIDIA takes precedence on mixed hosts - if shutil.which("nvidia-smi"): + # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable + if _has_usable_nvidia_gpu(): return rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" if not os.path.isdir(rocm_root) and not shutil.which("hipcc"): @@ -707,7 +725,7 @@ def install_python_stack() -> int: # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows. # Detect and warn so users know manual steps are needed for GPU training. - if IS_WINDOWS and not NO_TORCH: + if IS_WINDOWS and not NO_TORCH and not _has_usable_nvidia_gpu(): if shutil.which("hipinfo") or shutil.which("amd-smi"): _safe_print( _dim(" Note:"), diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index ff71da6b25..eaa1ce2986 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -49,6 +49,7 @@ _detect_rocm_version = stack_mod._detect_rocm_version _ensure_rocm_torch = stack_mod._ensure_rocm_torch _has_rocm_gpu = stack_mod._has_rocm_gpu +_has_usable_nvidia_gpu = stack_mod._has_usable_nvidia_gpu _ROCM_TORCH_INDEX = stack_mod._ROCM_TORCH_INDEX @@ -396,20 +397,20 @@ def test_nvidia_host_no_rocm(self): assert host.has_rocm is False assert host.has_usable_nvidia is True - def test_detect_host_with_rocm_path_env(self): - """detect_host() checks ROCM_PATH env var for ROCm detection.""" - # Verify the detect_host function source references ROCM_PATH + def test_detect_host_has_rocm_detection_logic(self): + """detect_host() should have ROCm GPU detection logic.""" import inspect source = inspect.getsource(prebuilt_mod.detect_host) - assert "ROCM_PATH" in source or "rocm" in 
source.lower() + # Must probe for actual GPU, not just tool presence + assert "rocminfo" in source or "amd-smi" in source def test_detect_host_windows_rocm_detection(self): - """detect_host() source should have Windows-specific HIP detection.""" + """detect_host() source should have Windows-specific ROCm GPU detection.""" import inspect source = inspect.getsource(prebuilt_mod.detect_host) - assert "hipinfo" in source or "amdhip64" in source + assert "hipinfo" in source or "amd-smi" in source # ============================================================================= @@ -520,7 +521,8 @@ class TestEnsureRocmTorch: """Verify ROCm torch reinstall logic.""" @patch.object(stack_mod, "pip_install") - def test_no_rocm_skips(self, mock_pip): + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) + def test_no_rocm_skips(self, mock_nvidia, mock_pip): """No ROCm toolchain should skip entirely.""" with patch("os.path.isdir", return_value = False): with patch("shutil.which", return_value = None): @@ -528,9 +530,10 @@ def test_no_rocm_skips(self, mock_pip): mock_pip.assert_not_called() @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_torch_already_has_cuda_skips(self, mock_ver, mock_gpu, mock_pip): + def test_torch_already_has_cuda_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """If torch already has CUDA, should skip ROCm reinstall.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -541,9 +544,10 @@ def test_torch_already_has_cuda_skips(self, mock_ver, mock_gpu, mock_pip): mock_pip.assert_not_called() @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", 
return_value = (7, 1)) - def test_torch_already_has_hip_skips(self, mock_ver, mock_gpu, mock_pip): + def test_torch_already_has_hip_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """If torch already has HIP, should skip ROCm reinstall.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -554,9 +558,10 @@ def test_torch_already_has_hip_skips(self, mock_ver, mock_gpu, mock_pip): mock_pip.assert_not_called() @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_cpu_torch_gets_rocm_reinstall(self, mock_ver, mock_gpu, mock_pip): + def test_cpu_torch_gets_rocm_reinstall(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """CPU-only torch on ROCm host should trigger reinstall.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -572,9 +577,10 @@ def test_cpu_torch_gets_rocm_reinstall(self, mock_ver, mock_gpu, mock_pip): assert "bitsandbytes" in str(bnb_call) @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (6, 3)) - def test_rocm_63_selects_correct_tag(self, mock_ver, mock_gpu, mock_pip): + def test_rocm_63_selects_correct_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """ROCm 6.3 should select rocm6.3 tag.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -586,9 +592,10 @@ def test_rocm_63_selects_correct_tag(self, mock_ver, mock_gpu, mock_pip): assert "rocm6.3" in str(torch_call) @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (5, 0)) - def 
test_old_rocm_skips(self, mock_ver, mock_gpu, mock_pip): + def test_old_rocm_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """ROCm version too old (below 6.0) should skip.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -599,10 +606,11 @@ def test_old_rocm_skips(self, mock_ver, mock_gpu, mock_pip): mock_pip.assert_not_called() @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = None) def test_version_unreadable_prints_warning( - self, mock_ver, mock_gpu, mock_pip, capsys + self, mock_ver, mock_gpu, mock_nvidia, mock_pip, capsys ): """ROCm detected but version unreadable should print warning and skip.""" with patch("os.path.isdir", return_value = True): @@ -612,9 +620,10 @@ def test_version_unreadable_prints_warning( assert "unreadable" in captured.out @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 2)) - def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_pip): + def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """ROCm 7.2 should select rocm7.1 tag (capped, not in mapping).""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -626,9 +635,10 @@ def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_pip): assert "rocm7.1" in str(torch_call) @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_probe_timeout_triggers_reinstall(self, mock_ver, mock_gpu, mock_pip): + def 
test_probe_timeout_triggers_reinstall(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): """Probe subprocess timeout should not crash; should proceed to reinstall.""" with patch("os.path.isdir", return_value = True): with patch( @@ -640,8 +650,9 @@ def test_probe_timeout_triggers_reinstall(self, mock_ver, mock_gpu, mock_pip): assert "rocm7.1" in str(mock_pip.call_args_list[0]) @patch.object(stack_mod, "pip_install") + @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = False) - def test_no_gpu_with_rocm_tools_skips(self, mock_gpu, mock_pip): + def test_no_gpu_with_rocm_tools_skips(self, mock_gpu, mock_nvidia, mock_pip): """ROCm tools present but no actual AMD GPU should skip entirely.""" with patch("os.path.isdir", return_value = True): _ensure_rocm_torch() @@ -865,7 +876,9 @@ def test_rocm72_capped_to_71(self): source = sh_path.read_text() assert 'echo "$_base/rocm7.1"' in source # fallback for unknown versions # Allowlisted versions should pass through directly - assert "rocm6.*|rocm7.0*|rocm7.1*)" in source + assert "rocm6.*" in source + assert "rocm7.0" in source + assert "rocm7.1" in source def test_rocm_tag_validation_guard_exists(self): """install.sh should validate _rocm_tag with a case guard.""" @@ -1269,6 +1282,10 @@ def test_is_rdna_source_has_rdna2(self): assert "gfx1030" in func_body assert "gfx1031" in func_body assert "gfx1032" in func_body + assert "gfx1033" in func_body + assert "gfx1034" in func_body + assert "gfx1035" in func_body + assert "gfx1036" in func_body def test_is_rdna_source_has_rdna3(self): """is_rdna() should include RDNA3 architectures.""" diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 049afac96d..09b03a597b 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -94,6 +94,10 @@ def is_rdna(): "gfx1030", "gfx1031", "gfx1032", + "gfx1033", + "gfx1034", + "gfx1035", + "gfx1036", # RDNA3 (Navi 31-33) "gfx1100", 
"gfx1101", From 4dc1dcaaec0a5d76f99b912224d348098fa34c10 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:32:14 +0000 Subject: [PATCH 18/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/utils/hardware/amd.py | 16 +++++++++++++--- studio/install_python_stack.py | 8 ++++---- tests/studio/install/test_rocm_support.py | 20 +++++++++++++++----- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index f8e156e773..6bd8600be3 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -48,12 +48,14 @@ def _parse_numeric(value: Any) -> Optional[float]: return _parse_numeric(value.get("value")) if isinstance(value, (int, float)): import math + f = float(value) return f if math.isfinite(f) else None if isinstance(value, str): # Strip units like "W", "C", "%", "MB", "MiB", "GB", "GiB" etc. 
import re - cleaned = re.sub(r'\s*[A-Za-z/%]+$', '', value.strip()) + + cleaned = re.sub(r"\s*[A-Za-z/%]+$", "", value.strip()) if not cleaned or cleaned.lower() in ("n/a", "none", "unknown"): return None try: @@ -219,14 +221,22 @@ def get_visible_gpu_utilization( "index_kind": "physical", } - gpu_list = data if isinstance(data, list) else data.get("gpus", data.get("gpu", [data])) + gpu_list = ( + data if isinstance(data, list) else data.get("gpus", data.get("gpu", [data])) + ) visible_set = set(parent_visible_ids) ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)} devices = [] for fallback_idx, gpu_data in enumerate(gpu_list): # Use AMD-reported GPU ID when available, fall back to enumeration index - raw_id = gpu_data.get("gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx))) if isinstance(gpu_data, dict) else fallback_idx + raw_id = ( + gpu_data.get( + "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx)) + ) + if isinstance(gpu_data, dict) + else fallback_idx + ) try: idx = int(raw_id) except (TypeError, ValueError): diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index cba37886a2..ac0d70f726 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -117,10 +117,10 @@ def _has_usable_nvidia_gpu() -> bool: try: result = subprocess.run( [exe, "-L"], - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - text=True, - timeout=10, + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 10, ) except Exception: return False diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index eaa1ce2986..597dd8a0d9 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -533,7 +533,9 @@ def test_no_rocm_skips(self, mock_nvidia, mock_pip): @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, 
"_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_torch_already_has_cuda_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): + def test_torch_already_has_cuda_skips( + self, mock_ver, mock_gpu, mock_nvidia, mock_pip + ): """If torch already has CUDA, should skip ROCm reinstall.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -547,7 +549,9 @@ def test_torch_already_has_cuda_skips(self, mock_ver, mock_gpu, mock_nvidia, moc @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_torch_already_has_hip_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): + def test_torch_already_has_hip_skips( + self, mock_ver, mock_gpu, mock_nvidia, mock_pip + ): """If torch already has HIP, should skip ROCm reinstall.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -561,7 +565,9 @@ def test_torch_already_has_hip_skips(self, mock_ver, mock_gpu, mock_nvidia, mock @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_cpu_torch_gets_rocm_reinstall(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): + def test_cpu_torch_gets_rocm_reinstall( + self, mock_ver, mock_gpu, mock_nvidia, mock_pip + ): """CPU-only torch on ROCm host should trigger reinstall.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -580,7 +586,9 @@ def test_cpu_torch_gets_rocm_reinstall(self, mock_ver, mock_gpu, mock_nvidia, mo @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (6, 3)) - def test_rocm_63_selects_correct_tag(self, mock_ver, mock_gpu, 
mock_nvidia, mock_pip): + def test_rocm_63_selects_correct_tag( + self, mock_ver, mock_gpu, mock_nvidia, mock_pip + ): """ROCm 6.3 should select rocm6.3 tag.""" mock_probe = MagicMock() mock_probe.returncode = 0 @@ -638,7 +646,9 @@ def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip) @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_probe_timeout_triggers_reinstall(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): + def test_probe_timeout_triggers_reinstall( + self, mock_ver, mock_gpu, mock_nvidia, mock_pip + ): """Probe subprocess timeout should not crash; should proceed to reinstall.""" with patch("os.path.isdir", return_value = True): with patch( From 3881539868bc7ab467786c6f19b7a043ba20b205 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 11:51:17 +0000 Subject: [PATCH 19/55] Add HIP_VISIBLE_DEVICES support, unit-aware VRAM parsing, Windows GPU validation - hardware.py: check HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES on ROCm before falling back to CUDA_VISIBLE_DEVICES, so multi-GPU AMD setups with HIP-specific env vars report the correct visible device set - amd.py: add _parse_memory_mb() that reads "unit" from dict-shaped amd-smi JSON (e.g. 
{"value": 192, "unit": "GiB"}) and converts to MB correctly; fixes MI300X VRAM misreported as 0.19 GB instead of 192 GB - install_python_stack.py: Windows AMD warning now validates actual GPU presence via hipinfo/amd-smi output markers before printing - install_llama_prebuilt.py: restore amdhip64.dll fallback for Windows HIP detection after tool-based checks, so Windows HIP installs without CLI tools on PATH are still detected - hardware.py: fix IS_ROCM comment to accurately describe its role --- studio/backend/utils/hardware/amd.py | 61 +++++++++++++++-------- studio/backend/utils/hardware/hardware.py | 14 +++++- studio/install_llama_prebuilt.py | 9 +++- studio/install_python_stack.py | 24 ++++++++- 4 files changed, 83 insertions(+), 25 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 6bd8600be3..747c515d68 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -65,6 +65,40 @@ def _parse_numeric(value: Any) -> Optional[float]: return None +def _parse_memory_mb(value: Any) -> Optional[float]: + """Parse a memory value from amd-smi output and return MB. + + Handles bare numbers (assumed MB), dict-shaped values with units + ({"value": 192, "unit": "GiB"}), and byte-scale heuristic fallback. 
+ """ + unit = "" + raw_value = value + + if isinstance(value, dict): + unit = str(value.get("unit", "")).strip().lower() + raw_value = value.get("value") + + num = _parse_numeric(raw_value if isinstance(value, dict) else value) + if num is None: + return None + + # Explicit unit conversion + if "gib" in unit or "gb" in unit: + return num * 1024 + if "mib" in unit or "mb" in unit: + return num + if "kib" in unit or "kb" in unit: + return num / 1024 + if unit and ("b" in unit and "g" not in unit and "m" not in unit and "k" not in unit): + # Plain bytes + return num / (1024 * 1024) + + # No explicit unit -- heuristic: values > 10M are likely bytes + if num > 10_000_000: + return num / (1024 * 1024) + return num # Assume MB + + def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: """Extract standardized metrics from a single GPU's amd-smi data.""" # amd-smi metric output structure varies by version; try common paths @@ -107,34 +141,19 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: power_draw = None power_limit = None - # VRAM + # VRAM -- unit-aware parsing to handle varying amd-smi output formats. + # Newer amd-smi versions may return {"value": 192, "unit": "GiB"}. vram_data = gpu_data.get("vram", gpu_data.get("fb_memory_usage", {})) if isinstance(vram_data, dict): - vram_used_bytes = _parse_numeric( + vram_used_mb = _parse_memory_mb( vram_data.get("vram_used", vram_data.get("used")) ) - vram_total_bytes = _parse_numeric( + vram_total_mb = _parse_memory_mb( vram_data.get("vram_total", vram_data.get("total")) ) else: - vram_used_bytes = None - vram_total_bytes = None - - # Convert VRAM from bytes to MB if values are very large. - # amd-smi typically reports in MB, but some versions report bytes. - # Threshold: 10 million -- no GPU has <10 MB, and even 10 TB = 10M MB. 
- vram_used_mb = None - vram_total_mb = None - if vram_used_bytes is not None: - if vram_used_bytes > 10_000_000: # Likely bytes (>10M) - vram_used_mb = vram_used_bytes / (1024 * 1024) - else: # Likely already MB - vram_used_mb = vram_used_bytes - if vram_total_bytes is not None: - if vram_total_bytes > 10_000_000: # Likely bytes (>10M) - vram_total_mb = vram_total_bytes / (1024 * 1024) - else: # Likely already MB - vram_total_mb = vram_total_bytes + vram_used_mb = None + vram_total_mb = None # Build the standardized dict (same shape as nvidia._build_gpu_metrics) vram_used_gb = round(vram_used_mb / 1024, 2) if vram_used_mb is not None else None diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 049e250237..4e9214deda 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -43,7 +43,7 @@ class DeviceType(str, Enum): DEVICE: Optional[DeviceType] = None CHAT_ONLY: bool = True # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.) -IS_ROCM: bool = False # True when running on AMD ROCm (HIP) -- display/logging only +IS_ROCM: bool = False # True when running on AMD ROCm (HIP) -- routes GPU monitoring to amd.py # ========== Detection ========== @@ -567,7 +567,17 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: def _get_parent_visible_gpu_spec() -> Dict[str, Any]: - cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") + # ROCm uses HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES in addition to + # CUDA_VISIBLE_DEVICES (which HIP also respects). Check ROCm-specific + # env vars first so multi-GPU AMD setups are handled correctly. 
+ cuda_visible = None + if IS_ROCM: + cuda_visible = ( + os.environ.get("HIP_VISIBLE_DEVICES") + or os.environ.get("ROCR_VISIBLE_DEVICES") + ) + if cuda_visible is None: + cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_visible is None: return { diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 2ea84eede5..da2f47a04a 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1450,7 +1450,7 @@ def detect_host() -> HostInfo: has_rocm = True break elif is_windows: - # Windows: validate actual AMD GPU presence (not just tool/DLL existence) + # Windows: prefer active probes that validate GPU presence for _cmd, _marker in ( (["hipinfo"], "gcnarchname"), (["amd-smi", "list"], "gpu"), @@ -1466,6 +1466,13 @@ def detect_host() -> HostInfo: if _marker in _result.stdout.lower(): has_rocm = True break + # Fallback: HIP runtime DLL indicates a working HIP installation + if not has_rocm and any( + Path(d).joinpath("amdhip64.dll").exists() + for d in os.environ.get("PATH", "").split(os.pathsep) + if d + ): + has_rocm = True return HostInfo( system = system, diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index ac0d70f726..e884134a73 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -726,7 +726,29 @@ def install_python_stack() -> int: # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows. # Detect and warn so users know manual steps are needed for GPU training. 
if IS_WINDOWS and not NO_TORCH and not _has_usable_nvidia_gpu(): - if shutil.which("hipinfo") or shutil.which("amd-smi"): + # Validate actual AMD GPU presence (not just tool existence) + _win_amd_gpu = False + for _wcmd, _wmarker in ( + (["hipinfo"], "gcnarchname"), + (["amd-smi", "list"], "gpu"), + ): + _wexe = shutil.which(_wcmd[0]) + if not _wexe: + continue + try: + _wr = subprocess.run( + [_wexe, *_wcmd[1:]], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + text=True, + timeout=10, + ) + except Exception: + continue + if _wr.returncode == 0 and _wmarker in _wr.stdout.lower(): + _win_amd_gpu = True + break + if _win_amd_gpu: _safe_print( _dim(" Note:"), "AMD GPU detected on Windows. ROCm-enabled PyTorch must be", From 326a971da18257f9a952293dd4733e87e43cff32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 31 Mar 2026 11:51:49 +0000 Subject: [PATCH 20/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/utils/hardware/amd.py | 4 +++- studio/backend/utils/hardware/hardware.py | 9 +++++---- studio/install_python_stack.py | 8 ++++---- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 747c515d68..d71f1cd494 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -89,7 +89,9 @@ def _parse_memory_mb(value: Any) -> Optional[float]: return num if "kib" in unit or "kb" in unit: return num / 1024 - if unit and ("b" in unit and "g" not in unit and "m" not in unit and "k" not in unit): + if unit and ( + "b" in unit and "g" not in unit and "m" not in unit and "k" not in unit + ): # Plain bytes return num / (1024 * 1024) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 4e9214deda..a31a389753 100644 --- 
a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -43,7 +43,9 @@ class DeviceType(str, Enum): DEVICE: Optional[DeviceType] = None CHAT_ONLY: bool = True # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.) -IS_ROCM: bool = False # True when running on AMD ROCm (HIP) -- routes GPU monitoring to amd.py +IS_ROCM: bool = ( + False # True when running on AMD ROCm (HIP) -- routes GPU monitoring to amd.py +) # ========== Detection ========== @@ -572,9 +574,8 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]: # env vars first so multi-GPU AMD setups are handled correctly. cuda_visible = None if IS_ROCM: - cuda_visible = ( - os.environ.get("HIP_VISIBLE_DEVICES") - or os.environ.get("ROCR_VISIBLE_DEVICES") + cuda_visible = os.environ.get("HIP_VISIBLE_DEVICES") or os.environ.get( + "ROCR_VISIBLE_DEVICES" ) if cuda_visible is None: cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index e884134a73..9f3c3ddd04 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -738,10 +738,10 @@ def install_python_stack() -> int: try: _wr = subprocess.run( [_wexe, *_wcmd[1:]], - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - text=True, - timeout=10, + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 10, ) except Exception: continue From 2d55e770f064380b3e2c94a87c66e3552578d5c8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 11:58:03 +0000 Subject: [PATCH 21/55] Fix HIP_VISIBLE_DEVICES empty-string handling in GPU visibility spec Use explicit None checks instead of Python `or` operator when reading HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES, so that an empty string ("") is correctly honored as "no visible GPUs" rather than silently falling through to CUDA_VISIBLE_DEVICES on mixed ROCm+CUDA systems. 
--- studio/backend/utils/hardware/hardware.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index a31a389753..ff3ef92842 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -572,11 +572,16 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]: # ROCm uses HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES in addition to # CUDA_VISIBLE_DEVICES (which HIP also respects). Check ROCm-specific # env vars first so multi-GPU AMD setups are handled correctly. + # Use explicit None checks (not `or`) so empty string "" is honoured + # as "no visible GPUs" rather than falling through to CUDA_VISIBLE_DEVICES. cuda_visible = None if IS_ROCM: - cuda_visible = os.environ.get("HIP_VISIBLE_DEVICES") or os.environ.get( - "ROCR_VISIBLE_DEVICES" - ) + hip_vis = os.environ.get("HIP_VISIBLE_DEVICES") + rocr_vis = os.environ.get("ROCR_VISIBLE_DEVICES") + if hip_vis is not None: + cuda_visible = hip_vis + elif rocr_vis is not None: + cuda_visible = rocr_vis if cuda_visible is None: cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") From 478bc7f625bb1b88e0fc90ae866bf4efe04a0f08 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 11:58:49 +0000 Subject: [PATCH 22/55] Fix IS_ROCM test assertion for multi-line formatting --- tests/studio/install/test_rocm_support.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py index 597dd8a0d9..48831fd57b 100644 --- a/tests/studio/install/test_rocm_support.py +++ b/tests/studio/install/test_rocm_support.py @@ -741,7 +741,7 @@ def test_hardware_py_has_is_rocm(self): PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" ) source = hw_path.read_text() - assert "IS_ROCM: bool = False" in source + assert "IS_ROCM: bool" in source and "False" in source def 
test_hardware_py_sets_is_rocm_on_hip(self): """detect_hardware() should set IS_ROCM when torch.version.hip is set.""" From a0671107cf6402a5516053de23b4fa4a0cf84b12 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 31 Mar 2026 12:06:14 +0000 Subject: [PATCH 23/55] Cap torchvision/torchaudio versions, remove amdhip64.dll fallback, fix visible GPU count - Cap torchvision<0.26.0 and torchaudio<2.11.0 alongside torch<2.11.0 in both install.sh and install_python_stack.py to prevent resolver from selecting incompatible companion packages from ROCm wheel index - Remove amdhip64.dll fallback in Windows ROCm detection (DLL presence without hipinfo/amd-smi is not proof of GPU existence) - Fix get_visible_gpu_count() to use _get_parent_visible_gpu_spec() which respects HIP_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES on ROCm hosts --- install.sh | 2 +- studio/backend/utils/hardware/hardware.py | 17 ++++++++++------- studio/install_llama_prebuilt.py | 9 ++------- studio/install_python_stack.py | 4 ++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/install.sh b/install.sh index 9df0bb7347..56f6e8e9de 100755 --- a/install.sh +++ b/install.sh @@ -1110,7 +1110,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then substep "skipping PyTorch (--no-torch or Intel Mac x86_64)." "$C_WARN" else substep "installing PyTorch ($TORCH_INDEX_URL)..." 
- run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "torch>=2.4,<2.11.0" torchvision torchaudio \ + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "torch>=2.4,<2.11.0" "torchvision<0.26.0" "torchaudio<2.11.0" \ --index-url "$TORCH_INDEX_URL" # AMD ROCm: install bitsandbytes with AMD support case "$TORCH_INDEX_URL" in diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index ff3ef92842..40364f765e 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -1311,17 +1311,20 @@ def get_visible_gpu_count() -> int: if _visible_gpu_count is not None: return _visible_gpu_count - cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") - if cuda_visible is not None: - # "" means zero GPUs, "0" means 1, "0,1,2" means 3 - cuda_visible = cuda_visible.strip() - if cuda_visible == "" or cuda_visible == "-1": + # Use _get_parent_visible_gpu_spec() which already handles + # HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES on ROCm. 
+ visible_spec = _get_parent_visible_gpu_spec() + if visible_spec["raw"] is not None: + raw = visible_spec["raw"].strip() + if raw == "" or raw == "-1": _visible_gpu_count = 0 + elif visible_spec["numeric_ids"] is not None: + _visible_gpu_count = len(visible_spec["numeric_ids"]) else: - _visible_gpu_count = len([x for x in cuda_visible.split(",") if x.strip()]) + _visible_gpu_count = len([x for x in raw.split(",") if x.strip()]) return _visible_gpu_count - # CUDA_VISIBLE_DEVICES not set -- try torch, fall back to physical count + # No visibility env var set -- try torch, fall back to physical count try: import torch diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index da2f47a04a..423d8f6fd3 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -1466,13 +1466,8 @@ def detect_host() -> HostInfo: if _marker in _result.stdout.lower(): has_rocm = True break - # Fallback: HIP runtime DLL indicates a working HIP installation - if not has_rocm and any( - Path(d).joinpath("amdhip64.dll").exists() - for d in os.environ.get("PATH", "").split(os.pathsep) - if d - ): - has_rocm = True + # Note: amdhip64.dll presence alone is NOT treated as GPU evidence + # since the HIP SDK can be installed without an AMD GPU. 
return HostInfo( system = system, diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 9f3c3ddd04..d7f61e5580 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -187,8 +187,8 @@ def _ensure_rocm_torch() -> None: "--force-reinstall", "--no-cache-dir", "torch>=2.4,<2.11.0", - "torchvision", - "torchaudio", + "torchvision<0.26.0", + "torchaudio<2.11.0", "--index-url", index_url, constrain = False, From d1e38589ebcf96de8bb40e7cf1696148377ff5a8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 1 Apr 2026 06:48:10 +0000 Subject: [PATCH 24/55] Attribute is_rdna() RDNA2/3/3.5/4 expansion to PR #4428 The is_rdna() expansion to cover RDNA2 (gfx1030-1036), RDNA3 (gfx1100-1103), RDNA3.5 (gfx1150-1152), and RDNA4 (gfx1200-1201) architectures is based on the original work from PR #4428. Co-authored-by: GoldenGrapeGentleman Co-authored-by: billishyahao From d1de729f980fff170c13f7f521a474eb3221c5f9 Mon Sep 17 00:00:00 2001 From: Iswarya Alex <47045679+iswaryaalex@users.noreply.github.com> Date: Fri, 3 Apr 2026 04:25:08 -0700 Subject: [PATCH 25/55] Support AMD Radeon for studio (#4770) Co-authored-by: Iswarya Alex --- install.sh | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 148 insertions(+), 3 deletions(-) diff --git a/install.sh b/install.sh index 56f6e8e9de..56dc24a5d8 100755 --- a/install.sh +++ b/install.sh @@ -1055,8 +1055,109 @@ get_torch_index_url() { elif [ "$_major" -ge 11 ]; then echo "$_base/cu118" else echo "$_base/cpu"; fi } + +get_radeon_wheel_url() { + # Only meaningful on Linux. Picks a repo.radeon.com base URL whose listing + # contains torch wheels. Tries paths like rocm-rel-7.2.1/, rocm-rel-7.2/, + # rocm-rel-7.1.1/, rocm-rel-7.1/ (AMD publishes both M.m and M.m.p dirs). 
+ case "$(uname -s)" in Linux) ;; *) echo ""; return ;; esac + + # Detect full X.Y.Z version -- try amd-smi first, then /opt/rocm/.info/version, then hipconfig + _full_ver="" + _full_ver=$({ command -v amd-smi >/dev/null 2>&1 && \ + amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ + 'NF>1{if(match($2,/[0-9]+\.[0-9]+\.[0-9]+/)){print substr($2,RSTART,RLENGTH); ok=1; exit}} END{exit !ok}'; } || \ + { [ -r /opt/rocm/.info/version ] && \ + awk -F'[.-]' 'NF>=3{print $1"."$2"."$3; exit}' /opt/rocm/.info/version; } || \ + { command -v hipconfig >/dev/null 2>&1 && \ + hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]+\.[0-9]+\.[0-9]/{print $1}'; }) 2>/dev/null + + # Validate: must be X.Y.Z with X >= 1 + case "$_full_ver" in + [1-9]*.*[0-9].*[0-9]*) : ;; + *) echo ""; return ;; + esac + echo "https://repo.radeon.com/rocm/manylinux/rocm-rel-${_full_ver}/" +} + +# ── Radeon repo wheel selection helpers ────────────────────────────────────── +# Fetches the Radeon repo directory listing once into _RADEON_LISTING (global). +# _RADEON_PYTAG holds the CPython tag for the running interpreter (e.g. cp312). +# _RADEON_BASE_URL holds the base URL for relative-href resolution. +_RADEON_LISTING="" +_RADEON_PYTAG="" +_RADEON_BASE_URL="" + +_radeon_fetch_listing() { + # Usage: _radeon_fetch_listing BASE_URL + # Populates _RADEON_LISTING, _RADEON_PYTAG, _RADEON_BASE_URL. 
+ _RADEON_BASE_URL="$1" + _RADEON_PYTAG=$("$_VENV_PY" -c " +import sys +print('cp{}{}'.format(sys.version_info.major, sys.version_info.minor)) +" 2>/dev/null) || return 1 + if command -v curl >/dev/null 2>&1; then + _RADEON_LISTING=$(curl -fsSL --max-time 20 "$_RADEON_BASE_URL" 2>/dev/null) + elif command -v wget >/dev/null 2>&1; then + _RADEON_LISTING=$(wget -qO- --timeout=20 "$_RADEON_BASE_URL" 2>/dev/null) + fi + [ -n "$_RADEON_LISTING" ] || return 1 +} + +_pick_radeon_wheel() { + # Usage: _pick_radeon_wheel PACKAGE_NAME + # Scans $_RADEON_LISTING for the newest wheel whose filename starts exactly + # with PACKAGE_NAME- and matches _RADEON_PYTAG + linux_x86_64. + # Prints the full URL (resolving relative hrefs against _RADEON_BASE_URL). + _pkg="$1" + [ -n "$_RADEON_LISTING" ] || return 1 + [ -n "$_RADEON_PYTAG" ] || return 1 + _tag="$_RADEON_PYTAG" + _href=$(printf '%s\n' "$_RADEON_LISTING" \ + | grep -o 'href="[^"]*"' \ + | sed 's/href="//;s/"//' \ + | awk -F/ -v pkg="$_pkg" -v tag="$_tag" ' + { + base = $NF + sub(/[?#].*/, "", base) # strip query / fragment + prefix = pkg "-" + suffix = "-" tag "-" tag "-linux_x86_64.whl" + if (substr(base, 1, length(prefix)) == prefix && + substr(base, length(base) - length(suffix) + 1) == suffix) + print $0 + }' \ + | sort -V \ + | tail -1) + [ -z "$_href" ] && return 1 + case "$_href" in + http*) printf '%s\n' "$_href" ;; + *) printf '%s\n' "${_RADEON_BASE_URL%/}/${_href#/}" ;; + esac +} + TORCH_INDEX_URL=$(get_torch_index_url) +# Auto-detect GPU for AMD ROCm based +# get_torch_index_url must have chosen */rocm* +# (gfx in rocminfo or amd-smi list). Then require rocminfo "Marketing Name:.*Radeon". 
+case "$TORCH_INDEX_URL" in + */rocm*) + _amd_gpu_here=false + _amd_gpu_radeon=false + if command -v rocminfo >/dev/null 2>&1 && \ + rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[0-9]/{found=1} END{exit !found}'; then + _amd_gpu_here=true + elif command -v amd-smi >/dev/null 2>&1 && \ + amd-smi list 2>/dev/null | awk 'NR>1 && NF{found=1} END{exit !found}'; then + _amd_gpu_here=true + fi + if [ "$_amd_gpu_here" = true ] && command -v rocminfo >/dev/null 2>&1 && \ + rocminfo 2>/dev/null | grep -q 'Marketing Name:.*Radeon'; then + _amd_gpu_radeon=true + fi + ;; +esac + # ── Print CPU-only hint when no GPU detected ── case "$TORCH_INDEX_URL" in */cpu) @@ -1072,7 +1173,11 @@ case "$TORCH_INDEX_URL" in ;; */rocm*) echo "" - echo " AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)" + if [ "$_amd_gpu_radeon" = true ]; then + echo " AMD Radeon + ROCm detected -- installing PyTorch wheels from repo.radeon.com" + else + echo " AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)" + fi echo "" ;; esac @@ -1108,6 +1213,46 @@ elif [ -n "$TORCH_INDEX_URL" ]; then # Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac) if [ "$SKIP_TORCH" = true ]; then substep "skipping PyTorch (--no-torch or Intel Mac x86_64)." "$C_WARN" + elif [ "$_amd_gpu_radeon" = true ]; then + _radeon_url=$(get_radeon_wheel_url) + if [ -n "$_radeon_url" ]; then + _radeon_listing_ok=false + if _radeon_fetch_listing "$_radeon_url" 2>/dev/null; then + _radeon_listing_ok=true + else + # Try shorter X.Y path (AMD publishes both X.Y.Z and X.Y dirs) + _radeon_url_short=$(printf '%s\n' "$_radeon_url" \ + | sed 's|rocm-rel-\([0-9]*\)\.\([0-9]*\)\.[0-9]*/|rocm-rel-\1.\2/|') + if [ "$_radeon_url_short" != "$_radeon_url" ] && \ + _radeon_fetch_listing "$_radeon_url_short" 2>/dev/null; then + _radeon_listing_ok=true + fi + fi + + if [ "$_radeon_listing_ok" = true ]; then + substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." 
+ _torch_arg="torch"; _tv_arg="torchvision"; _ta_arg="torchaudio"; _tri_arg="" + _torch_whl=$(_pick_radeon_wheel "torch" 2>/dev/null) && _torch_arg="$_torch_whl" + _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) && _tv_arg="$_tv_whl" + _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) && _ta_arg="$_ta_whl" + _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) && _tri_arg="$_tri_whl" + run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ + --find-links "$_RADEON_BASE_URL" \ + "$_tri_arg" "$_torch_arg" "$_tv_arg" "$_ta_arg" + substep "installing bitsandbytes for AMD Radeon..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" \ + "bitsandbytes>=0.49.1" + else + substep "[WARN] Radeon repo unavailable; falling back to CPU-only PyTorch" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + "torch>=2.4,<2.11.0" "torchvision<0.26.0" "torchaudio<2.11.0" \ + --index-url "${TORCH_INDEX_URL%/*}/cpu" + fi + else + substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to CPU-only PyTorch" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "torch>=2.4,<2.11.0" "torchvision<0.26.0" "torchaudio<2.11.0" \ + --index-url "${TORCH_INDEX_URL%/*}/cpu" + fi else substep "installing PyTorch ($TORCH_INDEX_URL)..." run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "torch>=2.4,<2.11.0" "torchvision<0.26.0" "torchaudio<2.11.0" \ @@ -1121,7 +1266,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then esac fi # Fresh: Step 2 - install unsloth, preserving pre-installed torch - substep "installing unsloth (this may take a few minutes)..." + substep "installing unsloth (this may take a few minutes)..." if [ "$SKIP_TORCH" = true ]; then # No-torch: install unsloth + unsloth-zoo with --no-deps, then # runtime deps (typer, safetensors, transformers, etc.) with --no-deps. 
@@ -1292,4 +1437,4 @@ else substep "source ${VENV_DIR}/bin/activate" substep "unsloth studio -H 0.0.0.0 -p 8888" echo "" -fi +fi \ No newline at end of file From f9a738ab9ca0228ebab3277f3bff40cbfae23376 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 11:36:15 +0000 Subject: [PATCH 26/55] Remove ROCm test files from main PR Move test_rocm_support.py and shell test additions to a separate PR to keep the main ROCm support PR focused on implementation changes. --- tests/sh/test_get_torch_index_url.sh | 128 +- tests/studio/install/test_rocm_support.py | 1346 --------------------- 2 files changed, 2 insertions(+), 1472 deletions(-) delete mode 100644 tests/studio/install/test_rocm_support.py diff --git a/tests/sh/test_get_torch_index_url.sh b/tests/sh/test_get_torch_index_url.sh index adbdbe4e14..6387922712 100755 --- a/tests/sh/test_get_torch_index_url.sh +++ b/tests/sh/test_get_torch_index_url.sh @@ -45,32 +45,10 @@ MOCK echo "$_dir" } -# Helper: create a mock amd-smi that prints a given ROCm version string -# Supports both "amd-smi version" and "amd-smi list" subcommands so that -# the GPU presence check (amd-smi list) also succeeds in tests. 
-make_mock_amd_smi() { - _dir=$(mktemp -d) - cat > "$_dir/amd-smi" </dev/null || true) [ -n "$_real" ] && ln -sf "$_real" "$_TOOLS_DIR/$_cmd" done @@ -141,108 +119,6 @@ _result=$(run_func "$_dir") assert_eq "unparseable -> cu126" "https://download.pytorch.org/whl/cu126" "$_result" rm -rf "$_dir" -# 9) ROCm 6.3 (no nvidia-smi) -> rocm6.3 -_dir=$(make_mock_amd_smi "6.3") -_result=$(run_func "$_dir") -assert_eq "ROCm 6.3 -> rocm6.3" "https://download.pytorch.org/whl/rocm6.3" "$_result" -rm -rf "$_dir" - -# 10) ROCm 7.1 (no nvidia-smi) -> rocm7.1 -_dir=$(make_mock_amd_smi "7.1") -_result=$(run_func "$_dir") -assert_eq "ROCm 7.1 -> rocm7.1" "https://download.pytorch.org/whl/rocm7.1" "$_result" -rm -rf "$_dir" - -# 11) ROCm 7.2 (no nvidia-smi) -> rocm7.1 (capped due to torch <2.11.0) -_dir=$(make_mock_amd_smi "7.2") -_result=$(run_func "$_dir") -assert_eq "ROCm 7.2 -> rocm7.1 (capped)" "https://download.pytorch.org/whl/rocm7.1" "$_result" -rm -rf "$_dir" - -# 12) Both nvidia-smi and amd-smi present -> CUDA takes precedence -_cuda_dir=$(make_mock_smi "12.6") -_amd_dir=$(make_mock_amd_smi "6.3") -_combined_dir=$(mktemp -d) -ln -sf "$_cuda_dir/nvidia-smi" "$_combined_dir/nvidia-smi" -ln -sf "$_amd_dir/amd-smi" "$_combined_dir/amd-smi" -_result=$(run_func "$_combined_dir") -assert_eq "CUDA+ROCm -> CUDA precedence" "https://download.pytorch.org/whl/cu126" "$_result" -rm -rf "$_cuda_dir" "$_amd_dir" "$_combined_dir" - -# 13) No nvidia-smi, no amd-smi -> cpu (duplicate of test 1, confirms ROCm didn't break it) -_result=$(run_func "none") -assert_eq "no GPU -> cpu" "https://download.pytorch.org/whl/cpu" "$_result" - -# 14) ROCm 6.1 (no nvidia-smi) -> rocm6.1 -_dir=$(make_mock_amd_smi "6.1") -_result=$(run_func "$_dir") -assert_eq "ROCm 6.1 -> rocm6.1" "https://download.pytorch.org/whl/rocm6.1" "$_result" -rm -rf "$_dir" - -# 15) ROCm 6.4 (no nvidia-smi) -> rocm6.4 -_dir=$(make_mock_amd_smi "6.4") -_result=$(run_func "$_dir") -assert_eq "ROCm 6.4 -> rocm6.4" 
"https://download.pytorch.org/whl/rocm6.4" "$_result" -rm -rf "$_dir" - -# 16) ROCm 7.0 (no nvidia-smi) -> rocm7.0 -_dir=$(make_mock_amd_smi "7.0") -_result=$(run_func "$_dir") -assert_eq "ROCm 7.0 -> rocm7.0" "https://download.pytorch.org/whl/rocm7.0" "$_result" -rm -rf "$_dir" - -# 17) ROCm 8.0 (future, no nvidia-smi) -> rocm7.1 (capped) -_dir=$(make_mock_amd_smi "8.0") -_result=$(run_func "$_dir") -assert_eq "ROCm 8.0 -> rocm7.1 (capped)" "https://download.pytorch.org/whl/rocm7.1" "$_result" -rm -rf "$_dir" - -# 18) Malformed amd-smi output (empty version field) -> cpu -_dir=$(mktemp -d) -cat > "$_dir/amd-smi" <<'MOCK' -#!/bin/sh -echo "AMDSMI Tool: 25.0.1 | AMDSMI Library version: 25.0.1.0 | ROCm version: " -MOCK -chmod +x "$_dir/amd-smi" -_result=$(run_func "$_dir") -assert_eq "empty amd-smi version -> cpu" "https://download.pytorch.org/whl/cpu" "$_result" -rm -rf "$_dir" - -# 19) amd-smi with "N/A" version -> cpu -_dir=$(mktemp -d) -cat > "$_dir/amd-smi" <<'MOCK' -#!/bin/sh -echo "AMDSMI Tool: 25.0.1 | AMDSMI Library version: 25.0.1.0 | ROCm version: N/A" -MOCK -chmod +x "$_dir/amd-smi" -_result=$(run_func "$_dir") -assert_eq "N/A amd-smi version -> cpu" "https://download.pytorch.org/whl/cpu" "$_result" -rm -rf "$_dir" - -# 20) ROCm version with trailing text (e.g. 
"6.3.1-beta") -> rocm6.3 -_dir=$(make_mock_amd_smi "6.3.1-beta") -_result=$(run_func "$_dir") -assert_eq "ROCm 6.3.1-beta -> rocm6.3" "https://download.pytorch.org/whl/rocm6.3" "$_result" -rm -rf "$_dir" - -# 22) CUDA 12.6 still works after ROCm changes (regression check) -_dir=$(make_mock_smi "12.6") -_result=$(run_func "$_dir") -assert_eq "CUDA 12.6 regression -> cu126" "https://download.pytorch.org/whl/cu126" "$_result" -rm -rf "$_dir" - -# 23) CUDA 13.0 still works after ROCm changes (regression check) -_dir=$(make_mock_smi "13.0") -_result=$(run_func "$_dir") -assert_eq "CUDA 13.0 regression -> cu130" "https://download.pytorch.org/whl/cu130" "$_result" -rm -rf "$_dir" - -# 24) CUDA 12.8 still works after ROCm changes (regression check) -_dir=$(make_mock_smi "12.8") -_result=$(run_func "$_dir") -assert_eq "CUDA 12.8 regression -> cu128" "https://download.pytorch.org/whl/cu128" "$_result" -rm -rf "$_dir" - rm -f "$_FUNC_FILE" rm -rf "$_FAKE_SMI_DIR" rm -rf "$_TOOLS_DIR" diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py deleted file mode 100644 index 48831fd57b..0000000000 --- a/tests/studio/install/test_rocm_support.py +++ /dev/null @@ -1,1346 +0,0 @@ -"""Tests for AMD ROCm support across install pathways. - -Verifies that ROCm detection and installation logic works correctly -WITHOUT breaking existing CUDA, CPU, macOS, and Windows pathways. -All tests use mocks -- no AMD hardware required. 
-""" - -import importlib.util -import json -import os -import subprocess -import sys -from pathlib import Path -from unittest.mock import MagicMock, patch, PropertyMock - -import pytest - - -# ── Load modules under test ────────────────────────────────────────────────── - -PACKAGE_ROOT = Path(__file__).resolve().parents[3] - -# install_llama_prebuilt.py -_PREBUILT_PATH = PACKAGE_ROOT / "studio" / "install_llama_prebuilt.py" -_PREBUILT_SPEC = importlib.util.spec_from_file_location( - "studio_install_llama_prebuilt", _PREBUILT_PATH -) -assert _PREBUILT_SPEC is not None and _PREBUILT_SPEC.loader is not None -prebuilt_mod = importlib.util.module_from_spec(_PREBUILT_SPEC) -sys.modules[_PREBUILT_SPEC.name] = prebuilt_mod -_PREBUILT_SPEC.loader.exec_module(prebuilt_mod) - -HostInfo = prebuilt_mod.HostInfo -AssetChoice = prebuilt_mod.AssetChoice -PrebuiltFallback = prebuilt_mod.PrebuiltFallback -resolve_upstream_asset_choice = prebuilt_mod.resolve_upstream_asset_choice -runtime_patterns_for_choice = prebuilt_mod.runtime_patterns_for_choice - -# install_python_stack.py -_STACK_PATH = PACKAGE_ROOT / "studio" / "install_python_stack.py" -_STACK_SPEC = importlib.util.spec_from_file_location( - "studio_install_python_stack", _STACK_PATH -) -assert _STACK_SPEC is not None and _STACK_SPEC.loader is not None -stack_mod = importlib.util.module_from_spec(_STACK_SPEC) -sys.modules[_STACK_SPEC.name] = stack_mod -_STACK_SPEC.loader.exec_module(stack_mod) - -_detect_rocm_version = stack_mod._detect_rocm_version -_ensure_rocm_torch = stack_mod._ensure_rocm_torch -_has_rocm_gpu = stack_mod._has_rocm_gpu -_has_usable_nvidia_gpu = stack_mod._has_usable_nvidia_gpu -_ROCM_TORCH_INDEX = stack_mod._ROCM_TORCH_INDEX - - -# ── Helper: build HostInfo for different scenarios ────────────────────────── - - -def nvidia_host(**overrides) -> HostInfo: - """NVIDIA Linux x86_64 host.""" - defaults = dict( - system = "Linux", - machine = "x86_64", - is_windows = False, - is_linux = True, - is_macos = 
False, - is_x86_64 = True, - is_arm64 = False, - nvidia_smi = "/usr/bin/nvidia-smi", - driver_cuda_version = (12, 6), - compute_caps = ["89"], - visible_cuda_devices = None, - has_physical_nvidia = True, - has_usable_nvidia = True, - has_rocm = False, - ) - defaults.update(overrides) - return HostInfo(**defaults) - - -def rocm_host(**overrides) -> HostInfo: - """AMD ROCm Linux x86_64 host (no NVIDIA).""" - defaults = dict( - system = "Linux", - machine = "x86_64", - is_windows = False, - is_linux = True, - is_macos = False, - is_x86_64 = True, - is_arm64 = False, - nvidia_smi = None, - driver_cuda_version = None, - compute_caps = [], - visible_cuda_devices = None, - has_physical_nvidia = False, - has_usable_nvidia = False, - has_rocm = True, - ) - defaults.update(overrides) - return HostInfo(**defaults) - - -def cpu_host(**overrides) -> HostInfo: - """CPU-only Linux x86_64 host.""" - defaults = dict( - system = "Linux", - machine = "x86_64", - is_windows = False, - is_linux = True, - is_macos = False, - is_x86_64 = True, - is_arm64 = False, - nvidia_smi = None, - driver_cuda_version = None, - compute_caps = [], - visible_cuda_devices = None, - has_physical_nvidia = False, - has_usable_nvidia = False, - has_rocm = False, - ) - defaults.update(overrides) - return HostInfo(**defaults) - - -def macos_host(**overrides) -> HostInfo: - """macOS arm64 host.""" - defaults = dict( - system = "Darwin", - machine = "arm64", - is_windows = False, - is_linux = False, - is_macos = True, - is_x86_64 = False, - is_arm64 = True, - nvidia_smi = None, - driver_cuda_version = None, - compute_caps = [], - visible_cuda_devices = None, - has_physical_nvidia = False, - has_usable_nvidia = False, - has_rocm = False, - ) - defaults.update(overrides) - return HostInfo(**defaults) - - -def windows_host(**overrides) -> HostInfo: - """Windows x86_64 host.""" - defaults = dict( - system = "Windows", - machine = "amd64", - is_windows = True, - is_linux = False, - is_macos = False, - is_x86_64 = 
True, - is_arm64 = False, - nvidia_smi = None, - driver_cuda_version = None, - compute_caps = [], - visible_cuda_devices = None, - has_physical_nvidia = False, - has_usable_nvidia = False, - has_rocm = False, - ) - defaults.update(overrides) - return HostInfo(**defaults) - - -def windows_rocm_host(**overrides) -> HostInfo: - """Windows x86_64 host with ROCm.""" - defaults = dict( - system = "Windows", - machine = "amd64", - is_windows = True, - is_linux = False, - is_macos = False, - is_x86_64 = True, - is_arm64 = False, - nvidia_smi = None, - driver_cuda_version = None, - compute_caps = [], - visible_cuda_devices = None, - has_physical_nvidia = False, - has_usable_nvidia = False, - has_rocm = True, - ) - defaults.update(overrides) - return HostInfo(**defaults) - - -# ── Upstream asset fixture ─────────────────────────────────────────────────── - -LLAMA_TAG = "b8508" - -UPSTREAM_ASSETS = { - f"llama-{LLAMA_TAG}-bin-ubuntu-x64.tar.gz": f"https://example.com/{LLAMA_TAG}-linux-cpu.tar.gz", - f"llama-{LLAMA_TAG}-bin-ubuntu-rocm-7.2-x64.tar.gz": f"https://example.com/{LLAMA_TAG}-linux-rocm.tar.gz", - f"llama-{LLAMA_TAG}-bin-win-cpu-x64.zip": f"https://example.com/{LLAMA_TAG}-win-cpu.zip", - f"llama-{LLAMA_TAG}-bin-win-cuda-12.4-x64.zip": f"https://example.com/{LLAMA_TAG}-win-cuda.zip", - f"llama-{LLAMA_TAG}-bin-win-hip-radeon-x64.zip": f"https://example.com/{LLAMA_TAG}-win-hip.zip", - f"llama-{LLAMA_TAG}-bin-macos-arm64.tar.gz": f"https://example.com/{LLAMA_TAG}-macos-arm64.tar.gz", - f"llama-{LLAMA_TAG}-bin-macos-x64.tar.gz": f"https://example.com/{LLAMA_TAG}-macos-x64.tar.gz", -} - - -# ============================================================================= -# TEST: install_llama_prebuilt.py -- resolve_upstream_asset_choice -# ============================================================================= - - -class TestResolveUpstreamAssetChoice: - """Verify that the asset selection logic picks the right binary for each platform.""" - - 
@patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_nvidia_linux_gets_cpu_asset(self, mock_assets): - """NVIDIA host should NOT hit the ROCm path -- gets CPU asset (CUDA handled elsewhere).""" - host = nvidia_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "linux-cpu" - assert "ubuntu-x64" in choice.name - assert "rocm" not in choice.name - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_rocm_linux_gets_rocm_prebuilt(self, mock_assets): - """AMD ROCm Linux host should get the ROCm prebuilt.""" - host = rocm_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "linux-rocm" - assert "rocm" in choice.name - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_cpu_linux_gets_cpu_asset(self, mock_assets): - """CPU-only Linux host should get CPU asset.""" - host = cpu_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "linux-cpu" - assert "ubuntu-x64" in choice.name - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_macos_arm64_gets_macos_asset(self, mock_assets): - """macOS arm64 host should get macOS asset.""" - host = macos_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "macos-arm64" - assert "macos-arm64" in choice.name - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_windows_cpu_gets_cpu_asset(self, mock_assets): - """Windows CPU-only host should get Windows CPU asset.""" - host = windows_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "windows-cpu" - assert "win-cpu" in choice.name - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def 
test_windows_rocm_gets_hip_asset(self, mock_assets): - """Windows ROCm host should get Windows HIP asset.""" - host = windows_rocm_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "windows-hip" - assert "hip" in choice.name - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_mixed_nvidia_rocm_prefers_nvidia(self, mock_assets): - """Host with both NVIDIA and ROCm should use NVIDIA (CPU path here, CUDA elsewhere).""" - host = nvidia_host(has_rocm = True) - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - # NVIDIA hosts go through the normal path (CUDA handled by resolve_linux_cuda_choice) - assert choice.install_kind == "linux-cpu" - assert "rocm" not in choice.name - - @patch.object(prebuilt_mod, "github_release_assets") - def test_rocm_linux_no_prebuilt_falls_back(self, mock_assets): - """AMD ROCm host should fall back to source build when no ROCm prebuilt exists.""" - # Remove the ROCm asset from available assets - assets_without_rocm = { - k: v for k, v in UPSTREAM_ASSETS.items() if "rocm" not in k - } - mock_assets.return_value = assets_without_rocm - host = rocm_host() - with pytest.raises(PrebuiltFallback, match = "ROCm detected"): - resolve_upstream_asset_choice(host, LLAMA_TAG) - - @patch.object(prebuilt_mod, "github_release_assets") - def test_windows_rocm_no_hip_falls_to_cpu(self, mock_assets): - """Windows+ROCm with HIP prebuilt missing should fall through to CPU.""" - assets_no_hip = {k: v for k, v in UPSTREAM_ASSETS.items() if "hip" not in k} - mock_assets.return_value = assets_no_hip - host = windows_rocm_host() - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "windows-cpu" - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_macos_rocm_impossible_has_rocm_false(self, mock_assets): - """macOS host should never have has_rocm=True in practice; verify it gets macOS 
asset.""" - host = macos_host(has_rocm = True) - choice = resolve_upstream_asset_choice(host, LLAMA_TAG) - assert choice.install_kind == "macos-arm64" - - @patch.object(prebuilt_mod, "github_release_assets", return_value = UPSTREAM_ASSETS) - def test_linux_aarch64_rocm_gets_prebuilt_fallback(self, mock_assets): - """Linux aarch64 with ROCm -- no x86_64 match, should raise PrebuiltFallback.""" - host = rocm_host(machine = "aarch64", is_x86_64 = False, is_arm64 = True) - with pytest.raises(PrebuiltFallback): - resolve_upstream_asset_choice(host, LLAMA_TAG) - - -# ============================================================================= -# TEST: install_llama_prebuilt.py -- runtime_patterns_for_choice -# ============================================================================= - - -class TestRuntimePatterns: - """Verify runtime file patterns for all install kinds.""" - - def test_linux_cpu_patterns(self): - choice = AssetChoice( - repo = "", tag = "", name = "", url = "", source_label = "", install_kind = "linux-cpu" - ) - patterns = runtime_patterns_for_choice(choice) - assert "llama-server" in patterns - assert "llama-quantize" in patterns - - def test_linux_cuda_patterns(self): - choice = AssetChoice( - repo = "", tag = "", name = "", url = "", source_label = "", install_kind = "linux-cuda" - ) - patterns = runtime_patterns_for_choice(choice) - assert "libggml-cuda.so*" in patterns - - def test_linux_rocm_patterns(self): - choice = AssetChoice( - repo = "", tag = "", name = "", url = "", source_label = "", install_kind = "linux-rocm" - ) - patterns = runtime_patterns_for_choice(choice) - assert "libggml-hip.so*" in patterns - assert "llama-server" in patterns - - def test_windows_hip_patterns(self): - choice = AssetChoice( - repo = "", - tag = "", - name = "", - url = "", - source_label = "", - install_kind = "windows-hip", - ) - patterns = runtime_patterns_for_choice(choice) - assert "*.exe" in patterns - assert "*.dll" in patterns - - def 
test_macos_patterns(self): - choice = AssetChoice( - repo = "", - tag = "", - name = "", - url = "", - source_label = "", - install_kind = "macos-arm64", - ) - patterns = runtime_patterns_for_choice(choice) - assert "lib*.dylib" in patterns - - -# ============================================================================= -# TEST: install_llama_prebuilt.py -- HostInfo.has_rocm field -# ============================================================================= - - -class TestHostInfoRocm: - """Verify has_rocm field does not affect other HostInfo behavior.""" - - def test_has_rocm_default_false(self): - host = HostInfo( - system = "Linux", - machine = "x86_64", - is_windows = False, - is_linux = True, - is_macos = False, - is_x86_64 = True, - is_arm64 = False, - nvidia_smi = None, - driver_cuda_version = None, - compute_caps = [], - visible_cuda_devices = None, - has_physical_nvidia = False, - has_usable_nvidia = False, - ) - assert host.has_rocm is False - - def test_has_rocm_explicit_true(self): - host = rocm_host() - assert host.has_rocm is True - - def test_nvidia_host_no_rocm(self): - host = nvidia_host() - assert host.has_rocm is False - assert host.has_usable_nvidia is True - - def test_detect_host_has_rocm_detection_logic(self): - """detect_host() should have ROCm GPU detection logic.""" - import inspect - - source = inspect.getsource(prebuilt_mod.detect_host) - # Must probe for actual GPU, not just tool presence - assert "rocminfo" in source or "amd-smi" in source - - def test_detect_host_windows_rocm_detection(self): - """detect_host() source should have Windows-specific ROCm GPU detection.""" - import inspect - - source = inspect.getsource(prebuilt_mod.detect_host) - assert "hipinfo" in source or "amd-smi" in source - - -# ============================================================================= -# TEST: install_python_stack.py -- _detect_rocm_version -# ============================================================================= - - -class 
TestDetectRocmVersion: - """Verify ROCm version detection from various sources.""" - - def test_no_rocm_returns_none(self, tmp_path): - """No ROCm installed should return None.""" - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): - with patch("shutil.which", return_value = None): - result = _detect_rocm_version() - assert result is None - - def test_version_from_file(self, tmp_path): - """Reads version from /opt/rocm/.info/version.""" - info_dir = tmp_path / ".info" - info_dir.mkdir() - (info_dir / "version").write_text("7.1.0-12345\n") - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): - result = _detect_rocm_version() - assert result == (7, 1) - - def test_version_62(self, tmp_path): - """Reads ROCm 6.2 version.""" - info_dir = tmp_path / ".info" - info_dir.mkdir() - (info_dir / "version").write_text("6.2.0\n") - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): - result = _detect_rocm_version() - assert result == (6, 2) - - def test_hipconfig_fallback(self, tmp_path): - """Falls back to hipconfig --version when file not found.""" - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): - mock_result = MagicMock() - mock_result.returncode = 0 - mock_result.stdout = b"6.3.21234.2\n" - with patch("shutil.which", return_value = "/usr/bin/hipconfig"): - with patch("subprocess.run", return_value = mock_result): - result = _detect_rocm_version() - assert result == (6, 3) - - def test_empty_version_file(self, tmp_path): - """Empty version file should return None.""" - info_dir = tmp_path / ".info" - info_dir.mkdir() - (info_dir / "version").write_text("") - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): - with patch("shutil.which", return_value = None): - result = _detect_rocm_version() - assert result is None - - def test_version_with_epoch_prefix(self, tmp_path): - """Debian epoch prefix (2:6.2.0) -- version file has no epoch, so should parse.""" - info_dir = tmp_path / ".info" - 
info_dir.mkdir() - # Version files don't typically have epoch prefix, but lib/rocm_version might - (info_dir / "version").write_text("6.2.0\n") - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): - result = _detect_rocm_version() - assert result == (6, 2) - - def test_multiple_version_sources_first_wins(self, tmp_path): - """When both .info/version and lib/rocm_version exist, first found wins.""" - info_dir = tmp_path / ".info" - info_dir.mkdir() - (info_dir / "version").write_text("7.1.0\n") - lib_dir = tmp_path / "lib" - lib_dir.mkdir() - (lib_dir / "rocm_version").write_text("6.3.0\n") - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path)}): - result = _detect_rocm_version() - assert result == (7, 1) # .info/version checked first - - def test_hipconfig_multiline_output(self, tmp_path): - """hipconfig with multi-line output -- should use first line.""" - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): - mock_result = MagicMock() - mock_result.returncode = 0 - mock_result.stdout = b"6.3.21234.2\nSome extra info\n" - with patch("shutil.which", return_value = "/usr/bin/hipconfig"): - with patch("subprocess.run", return_value = mock_result): - result = _detect_rocm_version() - assert result == (6, 3) - - def test_hipconfig_timeout(self, tmp_path): - """hipconfig that times out should return None.""" - with patch.dict(os.environ, {"ROCM_PATH": str(tmp_path / "nonexistent")}): - with patch("shutil.which", return_value = "/usr/bin/hipconfig"): - with patch( - "subprocess.run", - side_effect = subprocess.TimeoutExpired("hipconfig", 5), - ): - result = _detect_rocm_version() - assert result is None - - -# ============================================================================= -# TEST: install_python_stack.py -- _ensure_rocm_torch -# ============================================================================= - - -class TestEnsureRocmTorch: - """Verify ROCm torch reinstall logic.""" - - @patch.object(stack_mod, "pip_install") 
- @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - def test_no_rocm_skips(self, mock_nvidia, mock_pip): - """No ROCm toolchain should skip entirely.""" - with patch("os.path.isdir", return_value = False): - with patch("shutil.which", return_value = None): - _ensure_rocm_torch() - mock_pip.assert_not_called() - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_torch_already_has_cuda_skips( - self, mock_ver, mock_gpu, mock_nvidia, mock_pip - ): - """If torch already has CUDA, should skip ROCm reinstall.""" - mock_probe = MagicMock() - mock_probe.returncode = 0 - mock_probe.stdout = b"12.6\n" # CUDA version string - with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", return_value = mock_probe): - _ensure_rocm_torch() - mock_pip.assert_not_called() - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_torch_already_has_hip_skips( - self, mock_ver, mock_gpu, mock_nvidia, mock_pip - ): - """If torch already has HIP, should skip ROCm reinstall.""" - mock_probe = MagicMock() - mock_probe.returncode = 0 - mock_probe.stdout = b"7.1.12345\n" # HIP version string - with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", return_value = mock_probe): - _ensure_rocm_torch() - mock_pip.assert_not_called() - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def 
test_cpu_torch_gets_rocm_reinstall( - self, mock_ver, mock_gpu, mock_nvidia, mock_pip - ): - """CPU-only torch on ROCm host should trigger reinstall.""" - mock_probe = MagicMock() - mock_probe.returncode = 0 - mock_probe.stdout = b"\n" # empty = no GPU backend - with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", return_value = mock_probe): - _ensure_rocm_torch() - # Should call pip_install twice: once for torch, once for bitsandbytes - assert mock_pip.call_count == 2 - torch_call = mock_pip.call_args_list[0] - assert "rocm7.1" in str(torch_call) - bnb_call = mock_pip.call_args_list[1] - assert "bitsandbytes" in str(bnb_call) - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (6, 3)) - def test_rocm_63_selects_correct_tag( - self, mock_ver, mock_gpu, mock_nvidia, mock_pip - ): - """ROCm 6.3 should select rocm6.3 tag.""" - mock_probe = MagicMock() - mock_probe.returncode = 0 - mock_probe.stdout = b"\n" - with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", return_value = mock_probe): - _ensure_rocm_torch() - torch_call = mock_pip.call_args_list[0] - assert "rocm6.3" in str(torch_call) - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (5, 0)) - def test_old_rocm_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): - """ROCm version too old (below 6.0) should skip.""" - mock_probe = MagicMock() - mock_probe.returncode = 0 - mock_probe.stdout = b"\n" - with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", return_value = mock_probe): - _ensure_rocm_torch() - mock_pip.assert_not_called() - - 
@patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = None) - def test_version_unreadable_prints_warning( - self, mock_ver, mock_gpu, mock_nvidia, mock_pip, capsys - ): - """ROCm detected but version unreadable should print warning and skip.""" - with patch("os.path.isdir", return_value = True): - _ensure_rocm_torch() - mock_pip.assert_not_called() - captured = capsys.readouterr() - assert "unreadable" in captured.out - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 2)) - def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip): - """ROCm 7.2 should select rocm7.1 tag (capped, not in mapping).""" - mock_probe = MagicMock() - mock_probe.returncode = 0 - mock_probe.stdout = b"\n" - with patch("os.path.isdir", return_value = True): - with patch("subprocess.run", return_value = mock_probe): - _ensure_rocm_torch() - torch_call = mock_pip.call_args_list[0] - assert "rocm7.1" in str(torch_call) - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = True) - @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 1)) - def test_probe_timeout_triggers_reinstall( - self, mock_ver, mock_gpu, mock_nvidia, mock_pip - ): - """Probe subprocess timeout should not crash; should proceed to reinstall.""" - with patch("os.path.isdir", return_value = True): - with patch( - "subprocess.run", side_effect = subprocess.TimeoutExpired("python", 30) - ): - _ensure_rocm_torch() - # If probe times out, the function should treat torch as unusable and 
reinstall - assert mock_pip.call_count == 2 - assert "rocm7.1" in str(mock_pip.call_args_list[0]) - - @patch.object(stack_mod, "pip_install") - @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False) - @patch.object(stack_mod, "_has_rocm_gpu", return_value = False) - def test_no_gpu_with_rocm_tools_skips(self, mock_gpu, mock_nvidia, mock_pip): - """ROCm tools present but no actual AMD GPU should skip entirely.""" - with patch("os.path.isdir", return_value = True): - _ensure_rocm_torch() - mock_pip.assert_not_called() - - -# ============================================================================= -# TEST: install_python_stack.py -- _ROCM_TORCH_INDEX mapping -# ============================================================================= - - -class TestRocmTorchIndex: - """Verify the ROCm version -> torch index tag mapping.""" - - def test_mapping_is_sorted_descending(self): - """Keys should be in descending order for the next() iteration to work.""" - keys = list(_ROCM_TORCH_INDEX.keys()) - assert keys == sorted(keys, reverse = True) - - def test_rocm_72_not_in_mapping(self): - """ROCm 7.2 should NOT be in the active mapping (torch 2.11.0 exceeds bound).""" - assert (7, 2) not in _ROCM_TORCH_INDEX - - def test_rocm_71_maps_correctly(self): - assert _ROCM_TORCH_INDEX[(7, 1)] == "rocm7.1" - - def test_rocm_63_maps_correctly(self): - assert _ROCM_TORCH_INDEX[(6, 3)] == "rocm6.3" - - def test_rocm_60_maps_correctly(self): - assert _ROCM_TORCH_INDEX[(6, 0)] == "rocm6.0" - - def test_all_tags_use_download_pytorch(self): - """All tags should be for download.pytorch.org, not repo.radeon.com.""" - for tag in _ROCM_TORCH_INDEX.values(): - assert tag.startswith("rocm") - assert "radeon" not in tag - - def test_newer_rocm_selects_best_match(self): - """ROCm 7.2 (not in map) should select rocm7.1 via >= comparison.""" - ver = (7, 2) - tag = next( - ( - t - for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) - if ver >= (maj, mn) - ), - 
None, - ) - assert tag == "rocm7.1" - - def test_rocm_64_selects_64(self): - ver = (6, 4) - tag = next( - ( - t - for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) - if ver >= (maj, mn) - ), - None, - ) - assert tag == "rocm6.4" - - -# ============================================================================= -# TEST: hardware.py -- IS_ROCM flag and detect_hardware -# ============================================================================= - - -class TestHardwareRocmFlag: - """Verify IS_ROCM flag behavior without importing the full hardware module.""" - - def test_hardware_py_has_is_rocm(self): - """hardware.py should define IS_ROCM.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - assert "IS_ROCM: bool" in source and "False" in source - - def test_hardware_py_sets_is_rocm_on_hip(self): - """detect_hardware() should set IS_ROCM when torch.version.hip is set.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - assert 'torch.version, "hip"' in source or "torch.version.hip" in source - - def test_hardware_py_still_returns_cuda_for_rocm(self): - """DeviceType should remain CUDA even on ROCm -- no DeviceType.ROCM.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - # Ensure ROCM is NOT a DeviceType member - enum_section = source.split("class DeviceType")[1].split("\n\n")[0] - assert "ROCM" not in enum_section - - def test_hardware_py_has_rocm_in_package_versions(self): - """get_package_versions() should include 'rocm' key.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - assert '"rocm"' in source - - def test_hardware_py_device_type_cuda_references_intact(self): - """All existing DeviceType.CUDA references should still be 
present.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - # Key functions that must still reference DeviceType.CUDA - assert "DeviceType.CUDA" in source - assert "DEVICE = DeviceType.CUDA" in source - - def test_is_rocm_exported_from_init(self): - """IS_ROCM should be exported from hardware __init__.py.""" - init_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" - ) - source = init_path.read_text() - assert "IS_ROCM" in source - - def test_is_rocm_in_all_list(self): - """IS_ROCM should be in __all__ list in __init__.py.""" - init_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py" - ) - source = init_path.read_text() - # Extract __all__ section - assert '"IS_ROCM"' in source - - def test_get_package_versions_returns_rocm_key(self): - """get_package_versions() source should return both 'cuda' and 'rocm' keys.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - # Find the get_package_versions function body - func_start = source.find("def get_package_versions") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert '"cuda"' in func_body - assert '"rocm"' in func_body - - -# ============================================================================= -# TEST: tokenizer_utils.py -- error message -# ============================================================================= - - -class TestTokenizerErrorMessage: - """Verify the AMD error message is updated.""" - - def test_no_old_amd_message(self): - """Old 'We do not support AMD' message should be gone.""" - tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py" - source = tu_path.read_text() - assert "We do not support AMD" not in source - - def test_new_message_has_docs_link(self): - """New message should point to Unsloth AMD docs.""" - tu_path = PACKAGE_ROOT 
/ "unsloth" / "tokenizer_utils.py" - source = tu_path.read_text() - assert "docs.unsloth.ai" in source or "No GPU detected" in source - - -# ============================================================================= -# TEST: install.sh -- structural checks -# ============================================================================= - - -class TestInstallShStructure: - """Verify install.sh structural properties without running it.""" - - def test_no_here_strings(self): - """install.sh must not use <<< (not POSIX).""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - # <<< is bash-only; breaks dash - for i, line in enumerate(source.splitlines(), 1): - stripped = line.lstrip() - if stripped.startswith("#"): - continue - assert "<<<" not in line, f"install.sh:{i} uses non-POSIX <<< here-string" - - def test_rocm_detection_present(self): - """install.sh should have ROCm detection in get_torch_index_url.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - assert "amd-smi" in source - assert "rocm" in source.lower() - - def test_cuda_precedence(self): - """ROCm detection should only run when nvidia-smi is absent.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - # The ROCm block should be inside the "if [ -z "$_smi" ]" branch - smi_block_start = source.find('if [ -z "$_smi" ]') - rocm_block_start = source.find("amd-smi") - assert ( - smi_block_start < rocm_block_start - ), "ROCm detection should be inside the 'no nvidia-smi' branch" - - def test_bitsandbytes_amd_install(self): - """install.sh should install bitsandbytes for AMD when ROCm detected.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - assert "bitsandbytes" in source - assert "rocm*)" in source # case pattern for ROCm URLs - - def test_cpu_hint_mentions_amd(self): - """CPU-only hint should mention AMD ROCm.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - assert "ROCm" in source - 
- def test_rocm72_capped_to_71(self): - """ROCm 7.2+ should fall back to rocm7.1 index.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - assert 'echo "$_base/rocm7.1"' in source # fallback for unknown versions - # Allowlisted versions should pass through directly - assert "rocm6.*" in source - assert "rocm7.0" in source - assert "rocm7.1" in source - - def test_rocm_tag_validation_guard_exists(self): - """install.sh should validate _rocm_tag with a case guard.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - assert "rocm[1-9]*.[0-9]*)" in source - assert '_rocm_tag=""' in source # rejection path - - def test_dpkg_epoch_handling(self): - """install.sh should strip Debian epoch prefix from dpkg-query output.""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - assert "sed 's/^[0-9]*://' " in source or "sed 's/^[0-9]*://'" in source - - def test_no_double_bracket_in_rocm_block(self): - """ROCm detection block should not use [[ ]] (bash-only, not POSIX). - Note: [[:space:]], [[:digit:]] etc. 
are valid POSIX character classes, not bash [[ ]].""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - func_start = source.find("get_torch_index_url()") - func_end = source.find("\n}", func_start) - func_body = source[func_start:func_end] - import re - - for i, line in enumerate(func_body.splitlines(), 1): - stripped = line.lstrip() - if stripped.startswith("#"): - continue - # Remove POSIX character classes [[:foo:]] before checking for [[ ]] - cleaned = re.sub(r"\[\[:[a-z]+:\]\]", "", line) - assert ( - "[[" not in cleaned - ), f"get_torch_index_url line {i} uses non-POSIX [[" - - def test_no_arithmetic_expansion_in_rocm_block(self): - """ROCm detection block should not use (( )) (bash-only).""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - func_start = source.find("get_torch_index_url()") - func_end = source.find("\n}", func_start) - func_body = source[func_start:func_end] - for i, line in enumerate(func_body.splitlines(), 1): - stripped = line.lstrip() - if stripped.startswith("#"): - continue - assert ( - "((" not in line or "))" not in line or "$(()" in line - ), f"get_torch_index_url line {i} may use non-POSIX (( ))" - - def test_macos_returns_cpu_before_rocm_check(self): - """macOS should return CPU immediately (before any ROCm check).""" - sh_path = PACKAGE_ROOT / "install.sh" - source = sh_path.read_text() - func_start = source.find("get_torch_index_url()") - func_body = source[func_start:] - darwin_pos = func_body.find("Darwin") - rocm_pos = func_body.find("amd-smi") - assert darwin_pos < rocm_pos, "macOS check should come before ROCm detection" - - -# ============================================================================= -# TEST: Live regression on current host (NVIDIA B200 expected) -# ============================================================================= - - -class TestLiveRegression: - """Live checks that run on the actual host -- skip if no NVIDIA GPU.""" - - def 
test_get_torch_index_url_returns_cuda_on_nvidia(self): - """On an NVIDIA machine, get_torch_index_url should return a CUDA URL.""" - import shutil - - if not shutil.which("nvidia-smi"): - pytest.skip("No nvidia-smi available") - sh_path = PACKAGE_ROOT / "install.sh" - # Extract just the function (don't source the whole installer) - result = subprocess.run( - [ - "bash", - "-c", - f"eval \"$(sed -n '/^get_torch_index_url()/,/^}}/p' '{sh_path}')\"; " - "get_torch_index_url", - ], - capture_output = True, - text = True, - timeout = 30, - ) - if result.returncode != 0: - pytest.skip("Could not extract get_torch_index_url for live test") - url = result.stdout.strip() - assert "cu1" in url or "cuda" in url.lower(), f"Expected CUDA URL, got: {url}" - - -# ============================================================================= -# TEST: worker.py -- ROCm Mamba/SSM source build path -# ============================================================================= - -# Load worker.py module -_WORKER_PATH = PACKAGE_ROOT / "studio" / "backend" / "core" / "training" / "worker.py" - - -class TestWorkerRocmMambaSsm: - """Verify worker.py Mamba/SSM install logic on ROCm.""" - - def test_probe_returns_hip_version_field(self): - """_probe_causal_conv1d_env probe script should include hip_version.""" - source = _WORKER_PATH.read_text() - assert "hip_version" in source - - def test_probe_script_has_getattr_hip(self): - """Probe script should use getattr for torch.version.hip (safe on CUDA).""" - source = _WORKER_PATH.read_text() - assert "getattr(torch.version, 'hip', None)" in source - - def test_direct_wheel_url_returns_none_without_cuda_major(self): - """_direct_wheel_url should return None when cuda_major is empty (ROCm).""" - # Load module for function access - _worker_spec = importlib.util.spec_from_file_location( - "test_worker", _WORKER_PATH - ) - assert _worker_spec is not None and _worker_spec.loader is not None - worker_mod = 
importlib.util.module_from_spec(_worker_spec) - - # Mock all the imports worker.py needs - sys.modules["structlog"] = MagicMock() - sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) - sys.modules["utils"] = MagicMock() - sys.modules["utils.hardware"] = MagicMock() - - try: - _worker_spec.loader.exec_module(worker_mod) - except Exception: - pytest.skip("Could not load worker module in test environment") - - env_rocm = { - "python_tag": "cp312", - "torch_mm": "2.6", - "cuda_major": "", - "hip_version": "7.1.12345", - "cxx11abi": "TRUE", - } - result = worker_mod._direct_wheel_url( - filename_prefix = "causal_conv1d", - package_version = "1.6.1", - release_tag = "v1.6.1.post4", - release_base_url = "https://github.com/Dao-AILab/causal-conv1d/releases/download", - env = env_rocm, - ) - assert result is None - - def test_hipcc_check_exists_in_source(self): - """worker.py should check for hipcc before ROCm source builds.""" - source = _WORKER_PATH.read_text() - assert "hipcc" in source - - def test_rocm_source_build_status_message(self): - """worker.py should send a specific status for ROCm source compilation.""" - source = _WORKER_PATH.read_text() - assert "Compiling" in source and "from source for ROCm" in source - - def test_rocm_build_failure_message(self): - """worker.py should send a clear error on ROCm build failure.""" - source = _WORKER_PATH.read_text() - assert "Failed to compile" in source and "for ROCm" in source - - def test_timeout_on_install(self): - """worker.py should have a timeout on pip install subprocess.""" - source = _WORKER_PATH.read_text() - assert "TimeoutExpired" in source - assert "timeout" in source - - -# ============================================================================= -# TEST: amd.py -- AMD GPU monitoring -# ============================================================================= - - -class TestAmdGpuMonitoring: - """Verify amd.py module structure and mock 
behavior.""" - - def test_amd_py_exists(self): - """amd.py should exist in the hardware directory.""" - amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - assert amd_path.exists() - - def test_amd_py_has_required_functions(self): - """amd.py should export the same function signatures as nvidia.py.""" - amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - source = amd_path.read_text() - assert "def get_physical_gpu_count" in source - assert "def get_primary_gpu_utilization" in source - assert "def get_visible_gpu_utilization" in source - - def test_amd_smi_json_parsing(self): - """Verify _extract_gpu_metrics parses amd-smi JSON correctly.""" - amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - _amd_spec = importlib.util.spec_from_file_location("test_amd", amd_path) - assert _amd_spec is not None and _amd_spec.loader is not None - amd_mod = importlib.util.module_from_spec(_amd_spec) - - sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) - - try: - _amd_spec.loader.exec_module(amd_mod) - except Exception: - pytest.skip("Could not load amd module in test environment") - - # Simulate amd-smi metric JSON output - gpu_data = { - "usage": {"gfx_activity": "85"}, - "temperature": {"edge": "72"}, - "power": { - "current_socket_power": "200.5", - "power_cap": "300", - }, - "vram": { - "vram_used": 8192, # MB - "vram_total": 16384, # MB - }, - } - metrics = amd_mod._extract_gpu_metrics(gpu_data) - assert metrics["gpu_utilization_pct"] == 85.0 - assert metrics["temperature_c"] == 72.0 - assert metrics["power_draw_w"] == 200.5 - assert metrics["power_limit_w"] == 300.0 - assert metrics["vram_used_gb"] == round(8192 / 1024, 2) - assert metrics["vram_total_gb"] == round(16384 / 1024, 2) - assert metrics["vram_utilization_pct"] is not None - assert metrics["power_utilization_pct"] is not None - - def 
test_amd_primary_gpu_with_mock(self): - """get_primary_gpu_utilization returns correct dict with mocked amd-smi.""" - amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - _amd_spec = importlib.util.spec_from_file_location("test_amd2", amd_path) - assert _amd_spec is not None and _amd_spec.loader is not None - amd_mod = importlib.util.module_from_spec(_amd_spec) - - sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) - - try: - _amd_spec.loader.exec_module(amd_mod) - except Exception: - pytest.skip("Could not load amd module") - - mock_json = json.dumps( - [ - { - "usage": {"gfx_activity": "50"}, - "temperature": {"edge": "65"}, - "power": {"current_socket_power": "150", "power_cap": "250"}, - "vram": {"vram_used": 4096, "vram_total": 16384}, - } - ] - ) - mock_result = MagicMock() - mock_result.returncode = 0 - mock_result.stdout = mock_json - - with patch.object(subprocess, "run", return_value = mock_result): - result = amd_mod.get_primary_gpu_utilization() - assert result["available"] is True - assert result["gpu_utilization_pct"] == 50.0 - assert result["temperature_c"] == 65.0 - - def test_amd_smi_not_found_returns_unavailable(self): - """get_primary_gpu_utilization returns available=False when amd-smi is missing.""" - amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - _amd_spec = importlib.util.spec_from_file_location("test_amd3", amd_path) - assert _amd_spec is not None and _amd_spec.loader is not None - amd_mod = importlib.util.module_from_spec(_amd_spec) - - sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) - - try: - _amd_spec.loader.exec_module(amd_mod) - except Exception: - pytest.skip("Could not load amd module") - - with patch.object(subprocess, "run", side_effect = OSError("amd-smi not found")): - result = amd_mod.get_primary_gpu_utilization() - assert 
result["available"] is False - - def test_amd_timeout_returns_unavailable(self): - """get_primary_gpu_utilization handles timeout gracefully.""" - amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py" - _amd_spec = importlib.util.spec_from_file_location("test_amd4", amd_path) - assert _amd_spec is not None and _amd_spec.loader is not None - amd_mod = importlib.util.module_from_spec(_amd_spec) - - sys.modules["loggers"] = MagicMock() - sys.modules["loggers"].get_logger = MagicMock(return_value = MagicMock()) - - try: - _amd_spec.loader.exec_module(amd_mod) - except Exception: - pytest.skip("Could not load amd module") - - with patch.object( - subprocess, - "run", - side_effect = subprocess.TimeoutExpired("amd-smi", 5), - ): - result = amd_mod.get_primary_gpu_utilization() - assert result["available"] is False - - -# ============================================================================= -# TEST: hardware.py -- IS_ROCM branching to amd.py -# ============================================================================= - - -class TestHardwareAmdBranching: - """Verify hardware.py branches to amd.py when IS_ROCM is True.""" - - def test_hardware_imports_amd_module(self): - """hardware.py should import from amd module when IS_ROCM.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - assert "from . 
import amd" in source - - def test_hardware_branches_on_is_rocm_for_utilization(self): - """get_gpu_utilization should check IS_ROCM before choosing backend.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - # Find the get_gpu_utilization function - func_start = source.find("def get_gpu_utilization") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "IS_ROCM" in func_body - assert "amd.get_primary_gpu_utilization" in func_body - - def test_hardware_branches_on_is_rocm_for_visible(self): - """get_visible_gpu_utilization should check IS_ROCM.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - func_start = source.find("def get_visible_gpu_utilization") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "IS_ROCM" in func_body - assert "amd.get_visible_gpu_utilization" in func_body - - def test_hardware_branches_on_is_rocm_for_physical_count(self): - """get_physical_gpu_count should try amd.py when IS_ROCM.""" - hw_path = ( - PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py" - ) - source = hw_path.read_text() - func_start = source.find("def get_physical_gpu_count") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "IS_ROCM" in func_body - assert "amd.get_physical_gpu_count" in func_body - - -# ============================================================================= -# TEST: install_python_stack.py -- Windows AMD warning -# ============================================================================= - - -class TestWindowsRocmWarning: - """Verify Windows AMD GPU detection and warning message.""" - - def test_windows_amd_warning_in_source(self): - """install_python_stack.py should warn Windows AMD users.""" - source = _STACK_PATH.read_text() - assert "AMD GPU detected on Windows" in source - 
- def test_windows_amd_warning_checks_hipinfo_or_amdsmi(self): - """Warning should check for hipinfo or amd-smi.""" - source = _STACK_PATH.read_text() - assert "hipinfo" in source - assert "amd-smi" in source - - def test_windows_amd_warning_has_docs_link(self): - """Warning should include AMD docs link.""" - source = _STACK_PATH.read_text() - assert "docs.unsloth.ai/get-started/install-and-update/amd" in source - - -# ============================================================================= -# TEST: unsloth/kernels/utils.py -- is_rdna() expansion -# ============================================================================= - - -class TestIsRdnaExpansion: - """Verify is_rdna() covers RDNA2, RDNA3, RDNA3.5, RDNA4 architectures.""" - - def test_is_rdna_source_has_rdna2(self): - """is_rdna() should include RDNA2 architectures.""" - utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" - source = utils_path.read_text() - func_start = source.find("def is_rdna()") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "gfx1030" in func_body - assert "gfx1031" in func_body - assert "gfx1032" in func_body - assert "gfx1033" in func_body - assert "gfx1034" in func_body - assert "gfx1035" in func_body - assert "gfx1036" in func_body - - def test_is_rdna_source_has_rdna3(self): - """is_rdna() should include RDNA3 architectures.""" - utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" - source = utils_path.read_text() - func_start = source.find("def is_rdna()") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "gfx1100" in func_body - assert "gfx1101" in func_body - assert "gfx1102" in func_body - assert "gfx1103" in func_body - - def test_is_rdna_source_has_rdna35(self): - """is_rdna() should include RDNA3.5 architectures.""" - utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" - source = utils_path.read_text() - func_start = source.find("def is_rdna()") - func_body = 
source[func_start : source.find("\ndef ", func_start + 1)] - assert "gfx1150" in func_body - assert "gfx1151" in func_body - assert "gfx1152" in func_body - - def test_is_rdna_source_has_rdna4(self): - """is_rdna() should include RDNA4 architectures.""" - utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" - source = utils_path.read_text() - func_start = source.find("def is_rdna()") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "gfx1200" in func_body - assert "gfx1201" in func_body - - def test_is_cdna_not_changed(self): - """is_cdna() should remain unchanged (no RDNA architectures added).""" - utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py" - source = utils_path.read_text() - func_start = source.find("def is_cdna()") - func_body = source[func_start : source.find("\ndef ", func_start + 1)] - assert "gfx940" in func_body - assert "gfx941" in func_body - assert "gfx942" in func_body - assert "gfx950" in func_body - # RDNA architectures should NOT be in is_cdna - assert "gfx1030" not in func_body - assert "gfx1100" not in func_body - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) From 4148591fa776d7c904a3b5ff78192ca2674ee271 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 11:58:49 +0000 Subject: [PATCH 27/55] Fix installer and hardware detection issues for PR #4720 - Fix empty _tri_arg passed to uv pip install in Radeon path (causes "Empty field is not allowed for PEP508" error) - Fix Radeon fallback: use ROCm index instead of CPU-only when repo.radeon.com is unreachable (TORCH_INDEX_URL already has ROCm) - Use $TORCH_CONSTRAINT in fallback paths instead of hardcoded strings - Fix _pick_radeon_wheel: relax suffix to match manylinux_2_28_x86_64 wheels (AMD Radeon repo does not use bare linux_x86_64 platform tag) - Fix IS_ROCM export: use __getattr__ so callers always see the live value after detect_hardware() runs - Fix apply_gpu_ids: set HIP_VISIBLE_DEVICES and 
ROCR_VISIBLE_DEVICES on ROCm so _get_parent_visible_gpu_spec picks up narrowed GPU set - Fix _parse_memory_mb: distinguish GB (1000 MB) from GiB (1024 MiB) - Add amd-smi version as a fallback in _detect_rocm_version - Fix trailing whitespace and missing newline at EOF in install.sh --- install.sh | 28 ++++++++++++++--------- studio/backend/utils/hardware/__init__.py | 12 +++++++--- studio/backend/utils/hardware/amd.py | 14 ++++++++---- studio/backend/utils/hardware/hardware.py | 7 +++++- studio/install_python_stack.py | 19 +++++++++++++++ 5 files changed, 61 insertions(+), 19 deletions(-) diff --git a/install.sh b/install.sh index e295c70a2b..8a0326b49d 100755 --- a/install.sh +++ b/install.sh @@ -1132,9 +1132,11 @@ _pick_radeon_wheel() { base = $NF sub(/[?#].*/, "", base) # strip query / fragment prefix = pkg "-" - suffix = "-" tag "-" tag "-linux_x86_64.whl" + # Match cpXY-cpXY or cpXY-abi3 with any linux x86_64 platform tag + # (linux_x86_64, manylinux_2_28_x86_64, manylinux2014_x86_64, etc.) if (substr(base, 1, length(prefix)) == prefix && - substr(base, length(base) - length(suffix) + 1) == suffix) + index(base, "-" tag "-") > 0 && + match(base, /x86_64\.whl$/)) print $0 }' \ | sort -V \ @@ -1247,22 +1249,26 @@ elif [ -n "$TORCH_INDEX_URL" ]; then _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) && _tv_arg="$_tv_whl" _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) && _ta_arg="$_ta_whl" _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) && _tri_arg="$_tri_whl" + # Build install args; skip empty _tri_arg to avoid passing "" to uv + _radeon_pkgs="$_torch_arg $_tv_arg $_ta_arg" + [ -n "$_tri_arg" ] && _radeon_pkgs="$_tri_arg $_radeon_pkgs" run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ --find-links "$_RADEON_BASE_URL" \ - "$_tri_arg" "$_torch_arg" "$_tv_arg" "$_ta_arg" + $_radeon_pkgs substep "installing bitsandbytes for AMD Radeon..." 
run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" \ "bitsandbytes>=0.49.1" else - substep "[WARN] Radeon repo unavailable; falling back to CPU-only PyTorch" "$C_WARN" + substep "[WARN] Radeon repo unavailable; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ - "torch>=2.4,<2.11.0" "torchvision<0.26.0" "torchaudio<2.11.0" \ - --index-url "${TORCH_INDEX_URL%/*}/cpu" + "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" fi else - substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to CPU-only PyTorch" "$C_WARN" - run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "torch>=2.4,<2.11.0" "torchvision<0.26.0" "torchaudio<2.11.0" \ - --index-url "${TORCH_INDEX_URL%/*}/cpu" + substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to ROCm index" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" fi else substep "installing PyTorch ($TORCH_INDEX_URL)..." @@ -1277,7 +1283,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then esac fi # Fresh: Step 2 - install unsloth, preserving pre-installed torch - substep "installing unsloth (this may take a few minutes)..." + substep "installing unsloth (this may take a few minutes)..." if [ "$SKIP_TORCH" = true ]; then # No-torch: install unsloth + unsloth-zoo with --no-deps, then # runtime deps (typer, safetensors, transformers, etc.) with --no-deps. 
@@ -1448,4 +1454,4 @@ else substep "source ${VENV_DIR}/bin/activate" substep "unsloth studio -H 0.0.0.0 -p 8888" echo "" -fi \ No newline at end of file +fi diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index b9b61cdcfe..17d731d7b2 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -5,11 +5,9 @@ Hardware detection and GPU utilities """ +from . import hardware as _hardware from .hardware import ( DeviceType, - DEVICE, - CHAT_ONLY, - IS_ROCM, detect_hardware, get_device, is_apple_silicon, @@ -83,3 +81,11 @@ "extract_arch_config", "estimate_training_vram", ] + + +def __getattr__(name: str): + """Resolve mutable module-level flags (DEVICE, CHAT_ONLY, IS_ROCM) at access + time so callers always see the current value after detect_hardware() runs.""" + if name in {"DEVICE", "CHAT_ONLY", "IS_ROCM"}: + return getattr(_hardware, name) + raise AttributeError(name) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index d71f1cd494..c3a0536a56 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -82,13 +82,19 @@ def _parse_memory_mb(value: Any) -> Optional[float]: if num is None: return None - # Explicit unit conversion - if "gib" in unit or "gb" in unit: + # Explicit unit conversion -- distinguish binary (GiB) from SI (GB) + if "gib" in unit: return num * 1024 - if "mib" in unit or "mb" in unit: + if "gb" in unit: + return num * 1000 + if "mib" in unit: return num - if "kib" in unit or "kb" in unit: + if "mb" in unit: + return num + if "kib" in unit: return num / 1024 + if "kb" in unit: + return num / 1000 if unit and ( "b" in unit and "g" not in unit and "m" not in unit and "k" not in unit ): diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 40364f765e..edeada0190 100644 --- a/studio/backend/utils/hardware/hardware.py +++ 
b/studio/backend/utils/hardware/hardware.py @@ -1356,8 +1356,13 @@ def apply_gpu_ids(gpu_ids) -> None: value = str(gpu_ids) os.environ["CUDA_VISIBLE_DEVICES"] = value + # Keep ROCm visibility env vars in sync so _get_parent_visible_gpu_spec() + # picks up the narrowed set on AMD systems. + if IS_ROCM: + os.environ["HIP_VISIBLE_DEVICES"] = value + os.environ["ROCR_VISIBLE_DEVICES"] = value _visible_gpu_count = None - logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) + logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s' (rocm=%s)", value, IS_ROCM) def get_device_map( diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index d7f61e5580..ef3e511319 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -59,6 +59,25 @@ def _detect_rocm_version() -> tuple[int, int] | None: except Exception: pass + # Try amd-smi version (outputs "... | ROCm version: X.Y.Z") + amd_smi = shutil.which("amd-smi") + if amd_smi: + try: + result = subprocess.run( + [amd_smi, "version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + import re + m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) + if m: + return int(m.group(1)), int(m.group(2)) + except Exception: + pass + # Try hipconfig --version (outputs bare version like "6.3.21234.2") hipconfig = shutil.which("hipconfig") if hipconfig: From ae0f9afa24a485de840e754e558fc8c4a332d14e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Apr 2026 11:59:04 +0000 Subject: [PATCH 28/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/install_python_stack.py | 1 + 1 file changed, 1 insertion(+) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index ef3e511319..d5f89239e5 100644 --- a/studio/install_python_stack.py +++ 
b/studio/install_python_stack.py @@ -72,6 +72,7 @@ def _detect_rocm_version() -> tuple[int, int] | None: ) if result.returncode == 0: import re + m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) if m: return int(m.group(1)), int(m.group(2)) From ec12f9b1e6e0406b7f5427bac371376a4dca869c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 12:23:37 +0000 Subject: [PATCH 29/55] Fix GPU detection false positives and add missing health groups - Fix _has_rocm_gpu() false positive: require "GPU: " data rows from amd-smi list, not just header containing "gpu" - Apply same fix in detect_host() in install_llama_prebuilt.py - Add runtime_payload_health_groups for linux-rocm and windows-hip so partial/corrupt ROCm/HIP prebuilt installs are properly detected - Add bitsandbytes install to Radeon fallback paths (was only in the success path, skipped when repo.radeon.com was unreachable) - Keep DEVICE/CHAT_ONLY as direct imports in __init__.py (matching main) and only use __getattr__ for IS_ROCM --- install.sh | 4 +++ studio/backend/utils/hardware/__init__.py | 10 ++++--- studio/install_llama_prebuilt.py | 33 +++++++++++++++++------ studio/install_python_stack.py | 12 ++++++--- 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/install.sh b/install.sh index 8a0326b49d..f6c6c59ad0 100755 --- a/install.sh +++ b/install.sh @@ -1263,12 +1263,16 @@ elif [ -n "$TORCH_INDEX_URL" ]; then run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" + substep "installing bitsandbytes for AMD ROCm..." 
+ run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" fi else substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to ROCm index" "$C_WARN" run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" fi else substep "installing PyTorch ($TORCH_INDEX_URL)..." diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index 17d731d7b2..400b5dd066 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -8,6 +8,8 @@ from . import hardware as _hardware from .hardware import ( DeviceType, + DEVICE, + CHAT_ONLY, detect_hardware, get_device, is_apple_silicon, @@ -84,8 +86,8 @@ def __getattr__(name: str): - """Resolve mutable module-level flags (DEVICE, CHAT_ONLY, IS_ROCM) at access - time so callers always see the current value after detect_hardware() runs.""" - if name in {"DEVICE", "CHAT_ONLY", "IS_ROCM"}: - return getattr(_hardware, name) + """Resolve IS_ROCM at access time so callers always see the live value + after detect_hardware() runs (it flips the flag in hardware.py).""" + if name == "IS_ROCM": + return getattr(_hardware, "IS_ROCM") raise AttributeError(name) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 98e076962f..75c6ae8f54 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2549,11 +2549,17 @@ def detect_host() -> HostInfo: pass # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed + import re as _re + + def _amd_smi_has_gpu(stdout: str) -> bool: + """Check for 'GPU: ' data rows, not just a table header.""" + return 
bool(_re.search(r"(?im)^gpu\s*:\s*\d", stdout)) + has_rocm = False if is_linux: - for _cmd, _marker in ( - (["rocminfo"], "gfx"), - (["amd-smi", "list"], "gpu"), + for _cmd, _check in ( + (["rocminfo"], lambda out: "gfx" in out.lower()), + (["amd-smi", "list"], _amd_smi_has_gpu), ): _exe = shutil.which(_cmd[0]) if not _exe: @@ -2563,14 +2569,14 @@ def detect_host() -> HostInfo: except Exception: continue if _result.returncode == 0 and _result.stdout.strip(): - if _marker in _result.stdout.lower(): + if _check(_result.stdout): has_rocm = True break elif is_windows: # Windows: prefer active probes that validate GPU presence - for _cmd, _marker in ( - (["hipinfo"], "gcnarchname"), - (["amd-smi", "list"], "gpu"), + for _cmd, _check in ( + (["hipinfo"], lambda out: "gcnarchname" in out.lower()), + (["amd-smi", "list"], _amd_smi_has_gpu), ): _exe = shutil.which(_cmd[0]) if not _exe: @@ -2580,7 +2586,7 @@ def detect_host() -> HostInfo: except Exception: continue if _result.returncode == 0 and _result.stdout.strip(): - if _marker in _result.stdout.lower(): + if _check(_result.stdout): has_rocm = True break # Note: amdhip64.dll presence alone is NOT treated as GPU evidence @@ -4750,10 +4756,21 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: ["libggml*.dylib"], ["libmtmd*.dylib"], ] + if choice.install_kind == "linux-rocm": + return [ + ["libllama.so*"], + ["libggml.so*"], + ["libggml-base.so*"], + ["libggml-cpu-*.so*"], + ["libmtmd.so*"], + ["libggml-hip.so*"], + ] if choice.install_kind == "windows-cpu": return [["llama.dll"]] if choice.install_kind == "windows-cuda": return [["llama.dll"], ["ggml-cuda.dll"]] + if choice.install_kind == "windows-hip": + return [["llama.dll"], ["*hip*.dll"]] return [] diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index d5f89239e5..1b74c9737b 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -106,9 +106,13 @@ def _detect_rocm_version() -> 
tuple[int, int] | None: def _has_rocm_gpu() -> bool: """Return True only if an actual AMD GPU is visible (not just ROCm tools installed).""" - for cmd, marker in ( - (["rocminfo"], "gfx"), - (["amd-smi", "list"], "gpu"), + import re + + for cmd, check_fn in ( + # rocminfo: look for "Name: gfxNNNN" indicating an actual GPU agent + (["rocminfo"], lambda out: "gfx" in out.lower()), + # amd-smi list: require "GPU: " data rows, not just a header + (["amd-smi", "list"], lambda out: bool(re.search(r"(?im)^gpu\s*:\s*\d", out))), ): exe = shutil.which(cmd[0]) if not exe: @@ -124,7 +128,7 @@ def _has_rocm_gpu() -> bool: except Exception: continue if result.returncode == 0 and result.stdout.strip(): - if marker in result.stdout.lower(): + if check_fn(result.stdout): return True return False From 848c92a0380acd1c53aad438e27f747991fbd8f0 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 12:47:57 +0000 Subject: [PATCH 30/55] Fix _ensure_rocm_torch and Windows AMD warning false positives - _ensure_rocm_torch: only skip when HIP is already present, not for CUDA builds (which are unusable on AMD-only hosts). Fixes the case where a venv has a stale CUDA wheel and the repair step is skipped. - Windows AMD warning: use GPU data row check (same as Linux fix) to avoid false positives from amd-smi list header-only output. --- studio/install_python_stack.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 1b74c9737b..9657c65205 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -174,13 +174,15 @@ def _ensure_rocm_torch() -> None: print(" ROCm detected but version unreadable -- skipping torch reinstall") return - # Skip if torch is already GPU-enabled (HIP or CUDA) + # Skip if torch already links against HIP (ROCm is already working). 
+ # Do NOT skip for CUDA-only builds since they are unusable on AMD-only hosts + # (the NVIDIA check above already handled mixed AMD+NVIDIA setups). try: probe = subprocess.run( [ sys.executable, "-c", - "import torch; v=torch.version; print(getattr(v,'hip','') or getattr(v,'cuda','') or '')", + "import torch; print(getattr(torch.version,'hip','') or '')", ], stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, @@ -189,7 +191,7 @@ def _ensure_rocm_torch() -> None: except (OSError, subprocess.TimeoutExpired): probe = None if probe is not None and probe.returncode == 0 and probe.stdout.decode().strip(): - return # torch already GPU-enabled + return # torch already has HIP/ROCm backend # Select best matching wheel tag (newest ROCm version <= installed) tag = next( @@ -751,10 +753,15 @@ def install_python_stack() -> int: # Detect and warn so users know manual steps are needed for GPU training. if IS_WINDOWS and not NO_TORCH and not _has_usable_nvidia_gpu(): # Validate actual AMD GPU presence (not just tool existence) + import re as _re_win + + def _win_amd_smi_has_gpu(stdout: str) -> bool: + return bool(_re_win.search(r"(?im)^gpu\s*:\s*\d", stdout)) + _win_amd_gpu = False - for _wcmd, _wmarker in ( - (["hipinfo"], "gcnarchname"), - (["amd-smi", "list"], "gpu"), + for _wcmd, _check_fn in ( + (["hipinfo"], lambda out: "gcnarchname" in out.lower()), + (["amd-smi", "list"], _win_amd_smi_has_gpu), ): _wexe = shutil.which(_wcmd[0]) if not _wexe: @@ -769,7 +776,7 @@ def install_python_stack() -> int: ) except Exception: continue - if _wr.returncode == 0 and _wmarker in _wr.stdout.lower(): + if _wr.returncode == 0 and _check_fn(_wr.stdout): _win_amd_gpu = True break if _win_amd_gpu: From 86735ffad18461b5cc47e60afa76c8f04f84cbd1 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 12:51:22 +0000 Subject: [PATCH 31/55] Fix amd-smi GPU detection for GPU[N] output format Older amd-smi versions output "GPU[0] : Card series: ..." instead of "GPU: 0". 
The regex now matches both "GPU: " and "GPU[" formats to detect actual GPU data rows. --- studio/install_llama_prebuilt.py | 2 +- studio/install_python_stack.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 75c6ae8f54..9b76548c23 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2553,7 +2553,7 @@ def detect_host() -> HostInfo: def _amd_smi_has_gpu(stdout: str) -> bool: """Check for 'GPU: ' data rows, not just a table header.""" - return bool(_re.search(r"(?im)^gpu\s*:\s*\d", stdout)) + return bool(_re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) has_rocm = False if is_linux: diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 9657c65205..e4c4576fbf 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -112,7 +112,7 @@ def _has_rocm_gpu() -> bool: # rocminfo: look for "Name: gfxNNNN" indicating an actual GPU agent (["rocminfo"], lambda out: "gfx" in out.lower()), # amd-smi list: require "GPU: " data rows, not just a header - (["amd-smi", "list"], lambda out: bool(re.search(r"(?im)^gpu\s*:\s*\d", out))), + (["amd-smi", "list"], lambda out: bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", out))), ): exe = shutil.which(cmd[0]) if not exe: @@ -756,7 +756,7 @@ def install_python_stack() -> int: import re as _re_win def _win_amd_smi_has_gpu(stdout: str) -> bool: - return bool(_re_win.search(r"(?im)^gpu\s*:\s*\d", stdout)) + return bool(_re_win.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) _win_amd_gpu = False for _wcmd, _check_fn in ( From 96ba872a2203b319af456d07ba468a79230abe32 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Apr 2026 12:51:36 +0000 Subject: [PATCH 32/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/install_python_stack.py | 5 ++++- 1 
file changed, 4 insertions(+), 1 deletion(-) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index e4c4576fbf..bbf71b4496 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -112,7 +112,10 @@ def _has_rocm_gpu() -> bool: # rocminfo: look for "Name: gfxNNNN" indicating an actual GPU agent (["rocminfo"], lambda out: "gfx" in out.lower()), # amd-smi list: require "GPU: " data rows, not just a header - (["amd-smi", "list"], lambda out: bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", out))), + ( + ["amd-smi", "list"], + lambda out: bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", out)), + ), ): exe = shutil.which(cmd[0]) if not exe: From 3caaf30a90420eb035769230a7b5079130b94f95 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 19:50:58 +0000 Subject: [PATCH 33/55] Harden AMD GPU detection against false positives - install.sh: replace weak amd-smi list check (awk 'NR>1 && NF') with strict pattern matching GPU data rows (/^GPU[[:space:]]*[:\[]/) - All files: reject rocminfo gfx000 (CPU HSA agent) by requiring gfx[1-9] instead of gfx[0-9] in the rocminfo GPU probe - Fixes false positives on hosts with ROCm tools but no AMD GPU --- install.sh | 8 ++++---- studio/install_llama_prebuilt.py | 3 ++- studio/install_python_stack.py | 3 ++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/install.sh b/install.sh index f6c6c59ad0..bd3029b2e6 100755 --- a/install.sh +++ b/install.sh @@ -998,10 +998,10 @@ get_torch_index_url() { # First confirm an actual AMD GPU is present (not just ROCm tools installed) _has_rocm_gpu=false if command -v rocminfo >/dev/null 2>&1 && \ - rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[0-9]/{found=1} END{exit !found}'; then + rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then _has_rocm_gpu=true elif command -v amd-smi >/dev/null 2>&1 && \ - amd-smi list 2>/dev/null | awk 'NR>1 && NF{found=1} END{exit !found}'; then + amd-smi list 
2>/dev/null | awk '/^GPU[[:space:]]*[:\[]/{ found=1 } END{ exit !found }'; then _has_rocm_gpu=true fi if [ "$_has_rocm_gpu" != true ]; then @@ -1158,10 +1158,10 @@ case "$TORCH_INDEX_URL" in _amd_gpu_here=false _amd_gpu_radeon=false if command -v rocminfo >/dev/null 2>&1 && \ - rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[0-9]/{found=1} END{exit !found}'; then + rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then _amd_gpu_here=true elif command -v amd-smi >/dev/null 2>&1 && \ - amd-smi list 2>/dev/null | awk 'NR>1 && NF{found=1} END{exit !found}'; then + amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[]/{ found=1 } END{ exit !found }'; then _amd_gpu_here=true fi if [ "$_amd_gpu_here" = true ] && command -v rocminfo >/dev/null 2>&1 && \ diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 9b76548c23..6b0bb150d6 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2558,7 +2558,8 @@ def _amd_smi_has_gpu(stdout: str) -> bool: has_rocm = False if is_linux: for _cmd, _check in ( - (["rocminfo"], lambda out: "gfx" in out.lower()), + # rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent) + (["rocminfo"], lambda out: bool(_re.search(r"gfx[1-9]", out.lower()))), (["amd-smi", "list"], _amd_smi_has_gpu), ): _exe = shutil.which(_cmd[0]) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index bbf71b4496..4fd14e5023 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -110,7 +110,8 @@ def _has_rocm_gpu() -> bool: for cmd, check_fn in ( # rocminfo: look for "Name: gfxNNNN" indicating an actual GPU agent - (["rocminfo"], lambda out: "gfx" in out.lower()), + # rocminfo: look for "Name: gfxNNNN" with nonzero first digit (gfx000 is the CPU agent) + (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), # amd-smi list: require "GPU: " data rows, not just a header ( 
["amd-smi", "list"], From 263470edcac7ee1561ef30f51933fe3abe1d956e Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 3 Apr 2026 19:51:16 +0000 Subject: [PATCH 34/55] Remove duplicate comment from pre-commit merge --- studio/install_python_stack.py | 1 - 1 file changed, 1 deletion(-) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 4fd14e5023..387d8c1855 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -109,7 +109,6 @@ def _has_rocm_gpu() -> bool: import re for cmd, check_fn in ( - # rocminfo: look for "Name: gfxNNNN" indicating an actual GPU agent # rocminfo: look for "Name: gfxNNNN" with nonzero first digit (gfx000 is the CPU agent) (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), # amd-smi list: require "GPU: " data rows, not just a header From 1b98f6d705c48e7464aeb14626c5a52f29fb248d Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Apr 2026 02:28:47 +0000 Subject: [PATCH 35/55] Refactor: deduplicate AMD detection, consolidate bitsandbytes, clean up imports - Extract _has_amd_rocm_gpu() shell function to avoid duplicating the rocminfo/amd-smi GPU detection logic in get_torch_index_url and the Radeon auto-detect block - Consolidate bitsandbytes install into a single case block after torch install (was duplicated 4 times across Radeon success/fallback paths) - Move math and re imports to top of amd.py (were inline in functions) - Add _smi_query() helper in hardware.py to centralize IS_ROCM backend selection for get_gpu_utilization and get_visible_gpu_utilization Addresses Gemini code review suggestions. 
--- install.sh | 57 +++++------ studio/backend/utils/hardware/amd.py | 6 +- studio/backend/utils/hardware/hardware.py | 118 ++++++++++------------ 3 files changed, 78 insertions(+), 103 deletions(-) diff --git a/install.sh b/install.sh index bd3029b2e6..7f393165e8 100755 --- a/install.sh +++ b/install.sh @@ -978,6 +978,21 @@ _find_no_torch_runtime() { fi } +# ── AMD ROCm GPU detection helper ── +# Returns 0 (true) if an actual AMD GPU is present, 1 (false) otherwise. +# Checks rocminfo for gfx[1-9]* (excludes gfx000 CPU agent) and +# amd-smi list for GPU data rows (excludes header-only output). +_has_amd_rocm_gpu() { + if command -v rocminfo >/dev/null 2>&1 && \ + rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then + return 0 + elif command -v amd-smi >/dev/null 2>&1 && \ + amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[]/{ found=1 } END{ exit !found }'; then + return 0 + fi + return 1 +} + # ── Detect GPU and choose PyTorch index URL ── # Mirrors Get-TorchIndexUrl in install.ps1. # On CPU-only machines this returns the cpu index, avoiding the solver @@ -995,16 +1010,7 @@ get_torch_index_url() { fi if [ -z "$_smi" ]; then # No NVIDIA GPU -- check for AMD ROCm GPU - # First confirm an actual AMD GPU is present (not just ROCm tools installed) - _has_rocm_gpu=false - if command -v rocminfo >/dev/null 2>&1 && \ - rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then - _has_rocm_gpu=true - elif command -v amd-smi >/dev/null 2>&1 && \ - amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[]/{ found=1 } END{ exit !found }'; then - _has_rocm_gpu=true - fi - if [ "$_has_rocm_gpu" != true ]; then + if ! _has_amd_rocm_gpu; then echo "$_base/cpu"; return fi # AMD GPU confirmed -- detect ROCm version @@ -1155,16 +1161,8 @@ TORCH_INDEX_URL=$(get_torch_index_url) # (gfx in rocminfo or amd-smi list). Then require rocminfo "Marketing Name:.*Radeon". 
case "$TORCH_INDEX_URL" in */rocm*) - _amd_gpu_here=false _amd_gpu_radeon=false - if command -v rocminfo >/dev/null 2>&1 && \ - rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then - _amd_gpu_here=true - elif command -v amd-smi >/dev/null 2>&1 && \ - amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[]/{ found=1 } END{ exit !found }'; then - _amd_gpu_here=true - fi - if [ "$_amd_gpu_here" = true ] && command -v rocminfo >/dev/null 2>&1 && \ + if _has_amd_rocm_gpu && command -v rocminfo >/dev/null 2>&1 && \ rocminfo 2>/dev/null | grep -q 'Marketing Name:.*Radeon'; then _amd_gpu_radeon=true fi @@ -1255,37 +1253,30 @@ elif [ -n "$TORCH_INDEX_URL" ]; then run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ --find-links "$_RADEON_BASE_URL" \ $_radeon_pkgs - substep "installing bitsandbytes for AMD Radeon..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" \ - "bitsandbytes>=0.49.1" else substep "[WARN] Radeon repo unavailable; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" - substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" fi else substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to ROCm index" "$C_WARN" run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" - substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" fi else substep "installing PyTorch ($TORCH_INDEX_URL)..." 
run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" - # AMD ROCm: install bitsandbytes with AMD support - case "$TORCH_INDEX_URL" in - */rocm*) - substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" - ;; - esac fi + # AMD ROCm: install bitsandbytes (once, after torch, for all ROCm paths) + case "$TORCH_INDEX_URL" in + */rocm*) + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" + ;; + esac # Fresh: Step 2 - install unsloth, preserving pre-installed torch substep "installing unsloth (this may take a few minutes)..." if [ "$SKIP_TORCH" = true ]; then diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index c3a0536a56..5cfd6bde97 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -9,6 +9,8 @@ """ import json +import math +import re import subprocess from typing import Any, Optional @@ -47,14 +49,10 @@ def _parse_numeric(value: Any) -> Optional[float]: if isinstance(value, dict): return _parse_numeric(value.get("value")) if isinstance(value, (int, float)): - import math - f = float(value) return f if math.isfinite(f) else None if isinstance(value, str): # Strip units like "W", "C", "%", "MB", "MiB", "GB", "GiB" etc. 
- import re - cleaned = re.sub(r"\s*[A-Za-z/%]+$", "", value.strip()) if not cleaned or cleaned.lower() in ("n/a", "none", "unknown"): return None diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index edeada0190..6712513aa8 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -402,31 +402,44 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] # ========== Live GPU Utilization ========== +def _smi_query(func_name: str, *args, **kwargs) -> Optional[Dict[str, Any]]: + """Run a query against the appropriate SMI backend (amd-smi or nvidia-smi). + + Returns the result dict if available, or None on failure/unavailability. + """ + if IS_ROCM: + backend_name = "amd-smi" + try: + from . import amd as _backend + except Exception as e: + logger.warning("%s import failed: %s", backend_name, e) + return None + else: + backend_name = "nvidia-smi" + try: + from . import nvidia as _backend + except Exception as e: + logger.warning("%s import failed: %s", backend_name, e) + return None + try: + func = getattr(_backend, func_name) + result = func(*args, **kwargs) + if result.get("available"): + return result + except Exception as e: + logger.warning("%s %s query failed: %s", backend_name, func_name, e) + return None + + def get_gpu_utilization() -> Dict[str, Any]: """Return a live snapshot of device utilization information.""" device = get_device() if device == DeviceType.CUDA: - if IS_ROCM: - try: - from . import amd - - result = amd.get_primary_gpu_utilization() - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("amd-smi utilization query failed: %s", e) - else: - try: - from . 
import nvidia - - result = nvidia.get_primary_gpu_utilization() - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("nvidia-smi utilization query failed: %s", e) + result = _smi_query("get_primary_gpu_utilization") + if result is not None: + result["backend"] = device.value + return result mem = get_gpu_memory_info() if device != DeviceType.CPU and mem.get("available"): @@ -451,32 +464,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: if device == DeviceType.CUDA: parent_visible_spec = _get_parent_visible_gpu_spec() - if IS_ROCM: - try: - from . import amd - - result = amd.get_visible_gpu_utilization( - parent_visible_spec["numeric_ids"], - parent_cuda_visible_devices = parent_visible_spec["raw"], - ) - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("amd-smi visible GPU utilization query failed: %s", e) - else: - try: - from . import nvidia - - result = nvidia.get_visible_gpu_utilization( - parent_visible_spec["numeric_ids"], - parent_cuda_visible_devices = parent_visible_spec["raw"], - ) - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("nvidia-smi visible GPU utilization query failed: %s", e) + result = _smi_query( + "get_visible_gpu_utilization", + parent_visible_spec["numeric_ids"], + parent_cuda_visible_devices = parent_visible_spec["raw"], + ) + if result is not None: + result["backend"] = device.value + return result # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) if device in (DeviceType.CUDA, DeviceType.XPU): @@ -1162,26 +1157,17 @@ def get_physical_gpu_count() -> int: device = get_device() if device == DeviceType.CUDA: - if IS_ROCM: - try: - from . 
import amd - - count = amd.get_physical_gpu_count() - if count is not None: - _physical_gpu_count = count - return _physical_gpu_count - except Exception: - pass - else: - try: - from . import nvidia - - count = nvidia.get_physical_gpu_count() - if count is not None: - _physical_gpu_count = count - return _physical_gpu_count - except Exception: - pass + try: + if IS_ROCM: + from . import amd as _smi_mod + else: + from . import nvidia as _smi_mod + count = _smi_mod.get_physical_gpu_count() + if count is not None: + _physical_gpu_count = count + return _physical_gpu_count + except Exception: + pass # SMI tool unavailable or failed -- fall back to torch count = _torch_get_physical_gpu_count() _physical_gpu_count = count if count is not None else 1 From b37f7e6bd55dee48107c0ac4dbe93fb46374642f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Apr 2026 02:50:14 +0000 Subject: [PATCH 36/55] Fix VRAM parsing for string values and GB/GiB consistency - Extract unit from string-valued VRAM fields (e.g. 
"192 GiB") so _parse_memory_mb correctly applies the unit multiplier instead of treating the value as bare MB - Treat GB and GiB identically (both as binary x1024) since GPU tools including amd-smi use binary units even when labeling them "GB" - Fixes incorrect VRAM reporting on MI300-class cards (was showing ~0.19 GB instead of 192 GB for string-valued outputs) --- studio/backend/utils/hardware/amd.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 5cfd6bde97..1167d9edaf 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -75,24 +75,24 @@ def _parse_memory_mb(value: Any) -> Optional[float]: if isinstance(value, dict): unit = str(value.get("unit", "")).strip().lower() raw_value = value.get("value") + elif isinstance(value, str): + # Extract unit suffix from strings like "192 GiB" or "8192 MB" + m = re.match(r"^\s*([\d.]+)\s*([A-Za-z]+)\s*$", value.strip()) + if m: + unit = m.group(2).lower() num = _parse_numeric(raw_value if isinstance(value, dict) else value) if num is None: return None - # Explicit unit conversion -- distinguish binary (GiB) from SI (GB) - if "gib" in unit: + # Unit conversion -- GPU tools (including amd-smi) use binary units even + # when labeling them "GB" or "MB", so treat GB/GiB and MB/MiB the same. 
+ if "gib" in unit or "gb" in unit: return num * 1024 - if "gb" in unit: - return num * 1000 - if "mib" in unit: + if "mib" in unit or "mb" in unit: return num - if "mb" in unit: - return num - if "kib" in unit: + if "kib" in unit or "kb" in unit: return num / 1024 - if "kb" in unit: - return num / 1000 if unit and ( "b" in unit and "g" not in unit and "m" not in unit and "k" not in unit ): From 543e721121fca877951d94d1b31f8145c93ac812 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Apr 2026 02:51:17 +0000 Subject: [PATCH 37/55] Add --no-cache to uv for ROCm HIP source builds Avoid stale cache artifacts from partial HIP source builds when uv is used for causal-conv1d/mamba-ssm compilation on ROCm. The pip path already uses --no-cache-dir; this adds the uv equivalent (--no-cache) only when is_hip is True. --- studio/backend/core/training/worker.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 740dd4849f..3d2f9a14b9 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -270,8 +270,11 @@ def _install_package_wheel_first( sys.executable, "--no-build-isolation", "--no-deps", - f"{pypi_name}=={pypi_version}", ] + # Avoid stale cache artifacts from partial HIP source builds + if is_hip: + pypi_cmd.append("--no-cache") + pypi_cmd.append(f"{pypi_name}=={pypi_version}") else: pypi_cmd = [ sys.executable, From 84a9c55e1b86c06a8fd8430fda0763c3202eeb5f Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Apr 2026 03:28:16 +0000 Subject: [PATCH 38/55] Fix critical: initialize _amd_gpu_radeon before case block _amd_gpu_radeon was only set inside the */rocm*) case arm, so on NVIDIA/CPU/macOS paths where TORCH_INDEX_URL does not contain "rocm", the variable was unbound. With set -u (nounset) enabled, this crashes the installer for every non-AMD user. Move initialization to before the case block so it is always defined. 
--- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 624eb108ac..7a123f5246 100755 --- a/install.sh +++ b/install.sh @@ -1159,9 +1159,9 @@ TORCH_INDEX_URL=$(get_torch_index_url) # Auto-detect GPU for AMD ROCm based # get_torch_index_url must have chosen */rocm* # (gfx in rocminfo or amd-smi list). Then require rocminfo "Marketing Name:.*Radeon". +_amd_gpu_radeon=false case "$TORCH_INDEX_URL" in */rocm*) - _amd_gpu_radeon=false if _has_amd_rocm_gpu && command -v rocminfo >/dev/null 2>&1 && \ rocminfo 2>/dev/null | grep -q 'Marketing Name:.*Radeon'; then _amd_gpu_radeon=true From c6f5b3af32ecb08d54aaaaafaa5f32a131949235 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Sun, 5 Apr 2026 03:31:04 +0000 Subject: [PATCH 39/55] Fix Windows AMD: route has_rocm hosts to HIP prebuilt path resolve_release_asset_choice was selecting windows-cpu for all Windows x86_64 hosts including those with has_rocm=True. Windows AMD users should fall through to resolve_upstream_asset_choice which tries the HIP prebuilt first. Add "not host.has_rocm" guard to the published windows-cpu selection. 
--- studio/install_llama_prebuilt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 6b0bb150d6..b18f265bcc 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -3120,7 +3120,7 @@ def resolve_release_asset_choice( ) published_choice: AssetChoice | None = None - if host.is_windows and host.is_x86_64: + if host.is_windows and host.is_x86_64 and not host.has_rocm: published_choice = published_asset_choice_for_kind(release, "windows-cpu") elif host.is_macos and host.is_arm64: published_choice = published_asset_choice_for_kind(release, "macos-arm64") From 810b833b017278d8f56bb01a99bfdd3ef2e2d7e6 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 09:20:14 +0000 Subject: [PATCH 40/55] Harden ROCm detection, Radeon wheel fallback, and HIP visibility Addresses review findings from parallel reviewers on PR #4720: - install.sh: add _has_usable_nvidia_gpu() helper requiring nvidia-smi -L to actually list a GPU before treating the host as NVIDIA. Fixes the stale-nvidia-smi-on-PATH regression where AMD-only hosts fell into the CUDA branch. - install.sh: fix hipconfig awk blocks to propagate a non-zero exit code when the output is not a recognisable version string, so the ||-chain continues to dpkg-query / rpm instead of terminating early. - install.sh: fail-closed on Radeon wheel fallback. When torch, torchvision or torchaudio is missing from the Radeon repo for the active Python tag, fall back to the standard ROCm index instead of silently mixing Radeon wheels with PyPI defaults. Quote all wheel arguments individually so wheel filenames cannot be word-split or glob-expanded. - install_llama_prebuilt.py: detect_host() now requires nvidia-smi -L to list a GPU before setting has_physical_nvidia. Routes AMD ROCm hosts with a broken leftover nvidia-smi to the ROCm path instead of misclassifying them as NVIDIA. 
- install_llama_prebuilt.py: scan upstream assets for any rocm-<version> prebuilt instead of hard-coding rocm-7.2, so ROCm 6.x / 7.0 / 7.1 / 7.3+ users pick up a matching upstream prebuilt when one exists. - install_llama_prebuilt.py: validate_server() adds --n-gpu-layers 1 for linux-rocm and windows-hip hosts, so new HIP prebuilts are preflighted on the GPU path instead of passing validation on CPU only. - install_llama_prebuilt.py: restore the published windows-cpu fallback for AMD Windows hosts without a HIP prebuilt so hash-approved bundles are still preferred over the raw upstream CPU asset. - install_python_stack.py: drop the /opt/rocm / hipcc gate in _ensure_rocm_torch() and rely on _has_rocm_gpu(). Runtime-only ROCm installs (package-managed minimal installs, Radeon software) that ship amd-smi / rocminfo without hipcc can now repair a CPU-only venv via "unsloth studio update". Adds an explicit IS_WINDOWS / IS_MACOS guard. - studio/backend/utils/hardware/amd.py: honour HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES in get_primary_gpu_utilization(). A process restricted to GPU 2 now reports metrics for GPU 2 instead of physical GPU 0. Tighten the plain bytes unit detection to an explicit allowlist. - studio/backend/utils/hardware/hardware.py: route get_backend_visible_gpu_info()'s backend_cuda_visible_devices field through a helper that reads HIP_VISIBLE_DEVICES on ROCm. Drop the unconditional "(rocm=False)" suffix in apply_gpu_ids() logs. 
--- install.sh | 74 +++++++++++++++++------ studio/backend/utils/hardware/amd.py | 34 +++++++++-- studio/backend/utils/hardware/hardware.py | 22 ++++++- studio/install_llama_prebuilt.py | 58 +++++++++++++++--- studio/install_python_stack.py | 24 +++++--- 5 files changed, 168 insertions(+), 44 deletions(-) diff --git a/install.sh b/install.sh index 7a123f5246..be262da36e 100755 --- a/install.sh +++ b/install.sh @@ -987,12 +987,28 @@ _has_amd_rocm_gpu() { rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then return 0 elif command -v amd-smi >/dev/null 2>&1 && \ - amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[]/{ found=1 } END{ exit !found }'; then + amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[][[:space:]]*[0-9]/{ found=1 } END{ exit !found }'; then return 0 fi return 1 } +# ── NVIDIA usable-GPU helper ── +# Returns 0 (true) only if nvidia-smi is present AND actually lists a GPU. +# Prevents AMD-only hosts with a stale nvidia-smi on PATH from being routed +# into the CUDA branch. +_has_usable_nvidia_gpu() { + _nvsmi="" + if command -v nvidia-smi >/dev/null 2>&1; then + _nvsmi="nvidia-smi" + elif [ -x "/usr/bin/nvidia-smi" ]; then + _nvsmi="/usr/bin/nvidia-smi" + else + return 1 + fi + "$_nvsmi" -L 2>/dev/null | awk '/^GPU[[:space:]]+[0-9]+:/{found=1} END{exit !found}' +} + # ── Detect GPU and choose PyTorch index URL ── # Mirrors Get-TorchIndexUrl in install.ps1. # On CPU-only machines this returns the cpu index, avoiding the solver @@ -1001,12 +1017,17 @@ get_torch_index_url() { _base="https://download.pytorch.org/whl" # macOS: always CPU (no CUDA support) case "$(uname -s)" in Darwin) echo "$_base/cpu"; return ;; esac - # Try nvidia-smi + # Try nvidia-smi -- require the binary to actually list a usable GPU. + # Presence of the binary alone (container leftovers, stale driver + # packages) is not sufficient: otherwise an AMD-only host would + # silently install CUDA wheels. 
_smi="" - if command -v nvidia-smi >/dev/null 2>&1; then - _smi="nvidia-smi" - elif [ -x "/usr/bin/nvidia-smi" ]; then - _smi="/usr/bin/nvidia-smi" + if _has_usable_nvidia_gpu; then + if command -v nvidia-smi >/dev/null 2>&1; then + _smi="nvidia-smi" + elif [ -x "/usr/bin/nvidia-smi" ]; then + _smi="/usr/bin/nvidia-smi" + fi fi if [ -z "$_smi" ]; then # No NVIDIA GPU -- check for AMD ROCm GPU @@ -1021,7 +1042,7 @@ get_torch_index_url() { { [ -r /opt/rocm/.info/version ] && \ awk -F. '{print "rocm"$1"."$2; exit}' /opt/rocm/.info/version; } || \ { command -v hipconfig >/dev/null 2>&1 && \ - hipconfig --version 2>/dev/null | awk 'NR==1{split($1,a,"."); if(a[1]+0>0) print "rocm"a[1]"."a[2]}'; } || \ + hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]/{split($1,a,"."); if(a[1]+0>0){print "rocm"a[1]"."a[2]; found=1}} END{exit !found}'; } || \ { command -v dpkg-query >/dev/null 2>&1 && \ ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \ [ -n "$ver" ] && \ @@ -1087,7 +1108,7 @@ get_radeon_wheel_url() { { [ -r /opt/rocm/.info/version ] && \ awk -F'[.-]' 'NF>=3{print $1"."$2"."$3; exit}' /opt/rocm/.info/version; } || \ { command -v hipconfig >/dev/null 2>&1 && \ - hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]+\.[0-9]+\.[0-9]/{print $1}'; }) 2>/dev/null + hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]+\.[0-9]+\.[0-9]/{print $1; found=1} END{exit !found}'; }) 2>/dev/null # Validate: must be X.Y.Z with X >= 1 case "$_full_ver" in @@ -1241,18 +1262,31 @@ elif [ -n "$TORCH_INDEX_URL" ]; then fi if [ "$_radeon_listing_ok" = true ]; then - substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." 
- _torch_arg="torch"; _tv_arg="torchvision"; _ta_arg="torchaudio"; _tri_arg="" - _torch_whl=$(_pick_radeon_wheel "torch" 2>/dev/null) && _torch_arg="$_torch_whl" - _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) && _tv_arg="$_tv_whl" - _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) && _ta_arg="$_ta_whl" - _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) && _tri_arg="$_tri_whl" - # Build install args; skip empty _tri_arg to avoid passing "" to uv - _radeon_pkgs="$_torch_arg $_tv_arg $_ta_arg" - [ -n "$_tri_arg" ] && _radeon_pkgs="$_tri_arg $_radeon_pkgs" - run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ - --find-links "$_RADEON_BASE_URL" \ - $_radeon_pkgs + # Require torch, torchvision, torchaudio wheels to all resolve + # from the Radeon listing. If any is missing for this Python + # tag, fall through to the standard ROCm index instead of + # silently mixing Radeon wheels with PyPI defaults. + _torch_whl=$(_pick_radeon_wheel "torch" 2>/dev/null) || _torch_whl="" + _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) || _tv_whl="" + _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) || _ta_whl="" + _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) || _tri_whl="" + if [ -z "$_torch_whl" ] || [ -z "$_tv_whl" ] || [ -z "$_ta_whl" ]; then + substep "[WARN] Radeon repo lacks a complete wheel set for this Python; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" + else + substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." 
+ if [ -n "$_tri_whl" ]; then + run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ + --find-links "$_RADEON_BASE_URL" \ + "$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl" + else + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + --find-links "$_RADEON_BASE_URL" \ + "$_torch_whl" "$_tv_whl" "$_ta_whl" + fi + fi else substep "[WARN] Radeon repo unavailable; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 1167d9edaf..7b107f3bb9 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -10,6 +10,7 @@ import json import math +import os import re import subprocess from typing import Any, Optional @@ -93,9 +94,7 @@ def _parse_memory_mb(value: Any) -> Optional[float]: return num if "kib" in unit or "kb" in unit: return num / 1024 - if unit and ( - "b" in unit and "g" not in unit and "m" not in unit and "k" not in unit - ): + if unit in ("b", "byte", "bytes"): # Plain bytes return num / (1024 * 1024) @@ -203,9 +202,34 @@ def get_physical_gpu_count() -> Optional[int]: return None +def _first_visible_amd_gpu_id() -> Optional[str]: + """Return the physical AMD GPU id that should be treated as 'primary'. + + Honours HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES + in that order (HIP respects all three). Returns ``"0"`` when none are + set, and ``None`` when the env var explicitly narrows to zero GPUs + ("" or "-1"), so callers can short-circuit to "available: False". 
+ """ + for env_name in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"): + raw = os.environ.get(env_name) + if raw is None: + continue + raw = raw.strip() + if raw == "" or raw == "-1": + return None + first = raw.split(",", 1)[0].strip() + if first: + return first + break + return "0" + + def get_primary_gpu_utilization() -> dict[str, Any]: - """Return utilization metrics for the primary AMD GPU.""" - data = _run_amd_smi("metric", "-g", "0") + """Return utilization metrics for the primary visible AMD GPU.""" + gpu_idx = _first_visible_amd_gpu_id() + if gpu_idx is None: + return {"available": False} + data = _run_amd_smi("metric", "-g", gpu_idx) if data is None: return {"available": False} diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 6712513aa8..fa5270b833 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -1187,6 +1187,19 @@ def get_physical_gpu_count() -> int: return _physical_gpu_count +def _backend_visible_devices_env() -> Optional[str]: + """Return the raw visibility env string that applies to this backend. + + On ROCm, HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES take precedence + over CUDA_VISIBLE_DEVICES; the helper mirrors the resolution logic in + ``_get_parent_visible_gpu_spec`` so ``backend_cuda_visible_devices`` + reports the value that is actually narrowing the visible device set. 
+ """ + if IS_ROCM: + return _get_parent_visible_gpu_spec().get("raw") + return os.environ.get("CUDA_VISIBLE_DEVICES") + + def get_backend_visible_gpu_info() -> Dict[str, Any]: device = get_device() if device in (DeviceType.CUDA, DeviceType.XPU): @@ -1232,7 +1245,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": True, "backend": device.value, - "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), + "backend_cuda_visible_devices": _backend_visible_devices_env(), "parent_visible_gpu_ids": parent_visible_ids, "devices": devices, "index_kind": index_kind, @@ -1241,7 +1254,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": False, "backend": device.value, - "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), + "backend_cuda_visible_devices": _backend_visible_devices_env(), "parent_visible_gpu_ids": parent_visible_ids, "devices": [], "index_kind": "physical", @@ -1348,7 +1361,10 @@ def apply_gpu_ids(gpu_ids) -> None: os.environ["HIP_VISIBLE_DEVICES"] = value os.environ["ROCR_VISIBLE_DEVICES"] = value _visible_gpu_count = None - logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s' (rocm=%s)", value, IS_ROCM) + if IS_ROCM: + logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s' (rocm)", value) + else: + logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) def get_device_map( diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index b18f265bcc..154aa6e44c 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2494,12 +2494,27 @@ def detect_host() -> HostInfo: has_physical_nvidia = False has_usable_nvidia = False if nvidia_smi: + # Require `nvidia-smi -L` to actually list a GPU before treating the + # host as NVIDIA. The banner text "NVIDIA-SMI ..." is printed even + # when the command fails to communicate with the driver (e.g. 
stale + # container leftovers), which would otherwise misclassify an AMD + # ROCm host as NVIDIA and short-circuit the ROCm path. try: - result = run_capture([nvidia_smi], timeout = 20) - merged = "\n".join(part for part in (result.stdout, result.stderr) if part) - if "NVIDIA-SMI" in merged: + listing = run_capture([nvidia_smi, "-L"], timeout = 20) + gpu_lines = [ + line + for line in listing.stdout.splitlines() + if line.startswith("GPU ") + ] + if gpu_lines: has_physical_nvidia = True has_usable_nvidia = visible_device_tokens != [] + except Exception: + pass + + try: + result = run_capture([nvidia_smi], timeout = 20) + merged = "\n".join(part for part in (result.stdout, result.stderr) if part) for line in merged.splitlines(): if "CUDA Version:" in line: raw = line.split("CUDA Version:", 1)[1].strip().split()[0] @@ -2981,11 +2996,28 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice # the exact GPU target via rocminfo, which is more reliable for consumer # GPUs (e.g. gfx1151) that may not be in the prebuilt. if host.has_rocm and not host.has_usable_nvidia: - rocm_name = f"llama-{llama_tag}-bin-ubuntu-rocm-7.2-x64.tar.gz" - if rocm_name in upstream_assets: + # Scan upstream assets for any rocm- prebuilt and prefer + # the newest one. Hardcoding a single rocm-7.2 filename means + # ROCm 6.x / 7.0 / 7.1 / 7.3+ users always fall through to a + # source build even when a matching prebuilt exists upstream. 
+ import re as _re_rocm + + _rocm_pattern = _re_rocm.compile( + rf"llama-{_re_rocm.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz" + ) + rocm_candidates: list[tuple[tuple[int, ...], str]] = [] + for _name in upstream_assets: + _m = _rocm_pattern.match(_name) + if _m is None: + continue + _parts = tuple(int(p) for p in _m.group(1).split(".")) + rocm_candidates.append((_parts, _name)) + rocm_candidates.sort(reverse = True) + if rocm_candidates: + rocm_name = rocm_candidates[0][1] log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") log( - "Note: prebuilt is compiled for ROCm 7.2; if your ROCm version differs, " + "Note: if your ROCm runtime version differs significantly, " "this may fail preflight and fall back to a source build (safe)" ) return AssetChoice( @@ -3120,7 +3152,13 @@ def resolve_release_asset_choice( ) published_choice: AssetChoice | None = None - if host.is_windows and host.is_x86_64 and not host.has_rocm: + if host.is_windows and host.is_x86_64: + # Always try the published Windows CPU bundle, even on AMD ROCm + # hosts. If a windows-hip bundle is added to published releases + # in the future, the upstream resolver below would pick it first + # via resolve_asset_choice; falling back to the hash-approved + # windows-cpu bundle is still better than the upstream CPU + # asset for AMD Windows hosts without a HIP prebuilt. 
published_choice = published_asset_choice_for_kind(release, "windows-cpu") elif host.is_macos and host.is_arm64: published_choice = published_asset_choice_for_kind(release, "macos-arm64") @@ -4233,7 +4271,11 @@ def validate_server( "--batch-size", "32", ] - if host.has_usable_nvidia or (host.is_macos and host.is_arm64): + if ( + host.has_usable_nvidia + or host.has_rocm + or (host.is_macos and host.is_arm64) + ): command.extend(["--n-gpu-layers", "1"]) log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log") diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 387d8c1855..b53a3f2fb9 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -157,20 +157,28 @@ def _has_usable_nvidia_gpu() -> bool: def _ensure_rocm_torch() -> None: """Reinstall torch with ROCm wheels when the venv received CPU-only torch. - Runs only on Linux hosts where ROCm is installed and an AMD GPU is - present. No-op when torch already links against HIP (ROCm) or CUDA - (NVIDIA). Skips on Windows/macOS and on mixed AMD+NVIDIA hosts - (NVIDIA takes precedence). + Runs only on Linux hosts where an AMD GPU is present and the ROCm + runtime is detectable (rocminfo / amd-smi / hipconfig / rocm-core + package). No-op when torch already links against HIP (ROCm) or on + Windows/macOS or on mixed AMD+NVIDIA hosts (NVIDIA takes precedence). Uses pip_install() to respect uv, constraints, and --python targeting. """ + # Explicit OS guard so the helper is safe to call from any context -- + # ROCm wheels are only published for Linux x86_64. 
+ if IS_WINDOWS or IS_MACOS: + return # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable if _has_usable_nvidia_gpu(): return - rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" - if not os.path.isdir(rocm_root) and not shutil.which("hipcc"): - return # no ROCm toolchain + # Rely on _has_rocm_gpu() (rocminfo / amd-smi GPU data rows) as the + # authoritative "is this actually an AMD ROCm host?" signal. The old + # gate required /opt/rocm or hipcc to exist, which breaks on + # runtime-only ROCm installs (package-managed minimal installs, + # Radeon software) that ship amd-smi/rocminfo without /opt/rocm or + # hipcc, and leaves `unsloth studio update` unable to repair a + # CPU-only venv on those systems. if not _has_rocm_gpu(): - return # ROCm tools present but no AMD GPU + return # no AMD GPU visible ver = _detect_rocm_version() if ver is None: From 8636fa63fc7c77f4bf34d78d5ae7ebf25f87a92c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 09:46:41 +0000 Subject: [PATCH 41/55] Fix round 2 regressions: ROCm validate_server and Windows HIP routing Follow-up to 810b833b addressing review findings on the first round of hardening commits: - install_llama_prebuilt.py validate_server: gate --n-gpu-layers on the resolved install_kind instead of host.has_rocm. AMD Windows hosts without a HIP prebuilt fall back to windows-cpu and must not be validated with GPU layers; thread install_kind through from the caller. - install_llama_prebuilt.py resolve_release_asset_choice: reinstate the "not has_rocm" guard on the published windows-cpu bundle so AMD Windows hosts reach resolve_upstream_asset_choice() where the new HIP prebuilt path lives. Prefer a published windows-hip bundle first when one exists, fall through to upstream HIP + upstream CPU otherwise. 
- install_llama_prebuilt.py detect_host: also set has_physical_nvidia when the secondary --query-gpu block confirms a working NVIDIA GPU, so older nvidia-smi versions without -L support do not silently skip the Linux diagnostics that key off has_physical_nvidia. - install_llama_prebuilt.py: drop redundant "import re as _re" / "import re as _re_rocm" local aliases in favour of the existing top-level "import re". - install_python_stack.py _ensure_rocm_torch: run the AMD bitsandbytes install unconditionally after the HIP-torch probe so "unsloth studio update" on venvs that already have ROCm torch still gains the AMD bitsandbytes build. - install.sh: add a non-x86_64 early-exit to get_torch_index_url() so aarch64 / arm64 Linux hosts do not hit the ROCm wheel index (PyTorch only publishes ROCm wheels for linux_x86_64). - install.sh: add bitsandbytes install to the migrated-environment branch so upgrades pick it up for ROCm hosts instead of only the fresh-install path. - install.sh: in the Radeon wheel path, pass version constraints + --no-index --find-links to uv instead of explicit wheel URLs so a version-compatible torch / torchvision / torchaudio triple is resolved, rather than picking the highest-version wheel for each package independently. - studio/backend/utils/hardware/amd.py _first_visible_amd_gpu_id: fall through to lower-priority visibility env vars when the first entry is malformed (leading comma, all-whitespace first token) instead of silently returning GPU 0. 
--- install.sh | 39 ++++++++++++++-- studio/backend/utils/hardware/amd.py | 5 +- studio/install_llama_prebuilt.py | 60 ++++++++++++++++-------- studio/install_python_stack.py | 69 +++++++++++++++------------- 4 files changed, 119 insertions(+), 54 deletions(-) diff --git a/install.sh b/install.sh index be262da36e..33076beb30 100755 --- a/install.sh +++ b/install.sh @@ -1030,7 +1030,14 @@ get_torch_index_url() { fi fi if [ -z "$_smi" ]; then - # No NVIDIA GPU -- check for AMD ROCm GPU + # No NVIDIA GPU -- check for AMD ROCm GPU. + # PyTorch only publishes ROCm wheels for linux-x86_64; skip the + # ROCm branch entirely on aarch64 / arm64 / other architectures + # so non-x86_64 Linux hosts fall back cleanly to CPU wheels. + case "$(uname -m)" in + x86_64|amd64) : ;; + *) echo "$_base/cpu"; return ;; + esac if ! _has_amd_rocm_gpu; then echo "$_base/cpu"; return fi @@ -1241,6 +1248,17 @@ if [ "$_MIGRATED" = true ]; then substep "overlaying local repo (editable)..." run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps fi + # AMD ROCm: install bitsandbytes even in migrated environments so + # existing ROCm installs gain the AMD bitsandbytes build without a + # fresh reinstall. + if [ "$SKIP_TORCH" = false ]; then + case "$TORCH_INDEX_URL" in + */rocm*) + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" + ;; + esac + fi elif [ -n "$TORCH_INDEX_URL" ]; then # Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac) if [ "$SKIP_TORCH" = true ]; then @@ -1277,14 +1295,29 @@ elif [ -n "$TORCH_INDEX_URL" ]; then --index-url "$TORCH_INDEX_URL" else substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." 
+ # Use version constraints + --find-links + --no-index so + # uv resolves a compatible torch / torchvision / torchaudio + # set from the Radeon listing (instead of picking the + # highest-version wheel for each package independently, + # which can assemble a version-mismatched stack). + # The wheel presence check above guarantees the listing + # has at least one wheel per package; uv will pick the + # newest version-compatible triple. if [ -n "$_tri_whl" ]; then run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ + --no-index \ --find-links "$_RADEON_BASE_URL" \ - "$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl" + "$TORCH_CONSTRAINT" \ + "torchvision<0.26.0" \ + "torchaudio<2.11.0" \ + "triton<3.7" else run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + --no-index \ --find-links "$_RADEON_BASE_URL" \ - "$_torch_whl" "$_tv_whl" "$_ta_whl" + "$TORCH_CONSTRAINT" \ + "torchvision<0.26.0" \ + "torchaudio<2.11.0" fi fi else diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 7b107f3bb9..941c9cef18 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -220,7 +220,10 @@ def _first_visible_amd_gpu_id() -> Optional[str]: first = raw.split(",", 1)[0].strip() if first: return first - break + # Leading comma or all-whitespace first token -- fall through to + # the next env var in priority order rather than silently + # returning GPU 0. + continue return "0" diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 154aa6e44c..3f58ab1512 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2554,6 +2554,12 @@ def detect_host() -> HostInfo: if visible_gpu_rows: has_usable_nvidia = True + # Older nvidia-smi versions (pre -L support) hit the + # except in the first try block but still succeed here, + # leaving has_physical_nvidia unset. 
Mirror the -L path + # so downstream diagnostics on line ~4390 still run. + if not has_physical_nvidia: + has_physical_nvidia = True elif visible_device_tokens == []: has_usable_nvidia = False elif supports_explicit_visible_device_matching(visible_device_tokens): @@ -2564,17 +2570,16 @@ def detect_host() -> HostInfo: pass # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed - import re as _re def _amd_smi_has_gpu(stdout: str) -> bool: """Check for 'GPU: ' data rows, not just a table header.""" - return bool(_re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) + return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) has_rocm = False if is_linux: for _cmd, _check in ( # rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent) - (["rocminfo"], lambda out: bool(_re.search(r"gfx[1-9]", out.lower()))), + (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), (["amd-smi", "list"], _amd_smi_has_gpu), ): _exe = shutil.which(_cmd[0]) @@ -3000,10 +3005,8 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice # the newest one. Hardcoding a single rocm-7.2 filename means # ROCm 6.x / 7.0 / 7.1 / 7.3+ users always fall through to a # source build even when a matching prebuilt exists upstream. - import re as _re_rocm - - _rocm_pattern = _re_rocm.compile( - rf"llama-{_re_rocm.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz" + _rocm_pattern = re.compile( + rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz" ) rocm_candidates: list[tuple[tuple[int, ...], str]] = [] for _name in upstream_assets: @@ -3153,13 +3156,20 @@ def resolve_release_asset_choice( published_choice: AssetChoice | None = None if host.is_windows and host.is_x86_64: - # Always try the published Windows CPU bundle, even on AMD ROCm - # hosts. 
If a windows-hip bundle is added to published releases - # in the future, the upstream resolver below would pick it first - # via resolve_asset_choice; falling back to the hash-approved - # windows-cpu bundle is still better than the upstream CPU - # asset for AMD Windows hosts without a HIP prebuilt. - published_choice = published_asset_choice_for_kind(release, "windows-cpu") + # AMD Windows hosts should prefer a hash-approved published + # Windows HIP bundle when one exists, but otherwise fall through + # to resolve_asset_choice() so the upstream HIP prebuilt is + # tried before the CPU fallback. Hard-pinning the published + # windows-cpu bundle here would make the new HIP path + # unreachable. + if host.has_rocm: + published_choice = published_asset_choice_for_kind( + release, "windows-hip" + ) + else: + published_choice = published_asset_choice_for_kind( + release, "windows-cpu" + ) elif host.is_macos and host.is_arm64: published_choice = published_asset_choice_for_kind(release, "macos-arm64") elif host.is_macos and host.is_x86_64: @@ -4248,6 +4258,7 @@ def validate_server( install_dir: Path, *, runtime_line: str | None = None, + install_kind: str | None = None, ) -> None: last_failure: PrebuiltFallback | None = None for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1): @@ -4271,11 +4282,21 @@ def validate_server( "--batch-size", "32", ] - if ( - host.has_usable_nvidia - or host.has_rocm - or (host.is_macos and host.is_arm64) - ): + # Only enable GPU offload for assets that actually ship GPU code. + # Gating on `host.has_rocm` alone breaks the intentional CPU + # fallback on AMD Windows hosts without a HIP prebuilt: the CPU + # binary would be launched with `--n-gpu-layers 1` and fail + # validation. Use the resolved install_kind as the source of + # truth and fall back to host detection when the caller did not + # pass one (keeps backwards compatibility with older call sites). 
+ _gpu_kinds = {"linux-cuda", "linux-rocm", "windows-cuda", "windows-hip", "macos-arm64"} + if install_kind is not None: + _enable_gpu_layers = install_kind in _gpu_kinds + else: + _enable_gpu_layers = host.has_usable_nvidia or ( + host.is_macos and host.is_arm64 + ) + if _enable_gpu_layers: command.extend(["--n-gpu-layers", "1"]) log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log") @@ -4985,6 +5006,7 @@ def validate_prebuilt_choice( host, install_dir, runtime_line = choice.runtime_line, + install_kind = choice.install_kind, ) log(f"staged prebuilt validation succeeded for {choice.name}") return server_path, quantize_path diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index b53a3f2fb9..64834306f6 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -185,9 +185,9 @@ def _ensure_rocm_torch() -> None: print(" ROCm detected but version unreadable -- skipping torch reinstall") return - # Skip if torch already links against HIP (ROCm is already working). - # Do NOT skip for CUDA-only builds since they are unusable on AMD-only hosts - # (the NVIDIA check above already handled mixed AMD+NVIDIA setups). + # Probe whether torch already links against HIP (ROCm is already working). + # Do NOT skip for CUDA-only builds since they are unusable on AMD-only + # hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups). 
try: probe = subprocess.run( [ @@ -201,36 +201,43 @@ def _ensure_rocm_torch() -> None: ) except (OSError, subprocess.TimeoutExpired): probe = None - if probe is not None and probe.returncode == 0 and probe.stdout.decode().strip(): - return # torch already has HIP/ROCm backend - - # Select best matching wheel tag (newest ROCm version <= installed) - tag = next( - ( - t - for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) - if ver >= (maj, mn) - ), - None, + has_hip_torch = ( + probe is not None + and probe.returncode == 0 + and probe.stdout.decode().strip() != "" ) - if tag is None: - print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping") - return - index_url = f"{_PYTORCH_WHL_BASE}/{tag}" - print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") - pip_install( - f"ROCm torch ({tag})", - "--force-reinstall", - "--no-cache-dir", - "torch>=2.4,<2.11.0", - "torchvision<0.26.0", - "torchaudio<2.11.0", - "--index-url", - index_url, - constrain = False, - ) - # Also install bitsandbytes for AMD + if not has_hip_torch: + # Select best matching wheel tag (newest ROCm version <= installed) + tag = next( + ( + t + for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) + if ver >= (maj, mn) + ), + None, + ) + if tag is None: + print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping") + return + + index_url = f"{_PYTORCH_WHL_BASE}/{tag}" + print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") + pip_install( + f"ROCm torch ({tag})", + "--force-reinstall", + "--no-cache-dir", + "torch>=2.4,<2.11.0", + "torchvision<0.26.0", + "torchaudio<2.11.0", + "--index-url", + index_url, + constrain = False, + ) + + # Always install bitsandbytes for AMD -- runs even when torch was not + # reinstalled (e.g. "unsloth studio update" on a venv that already has + # ROCm torch) so the AMD bitsandbytes dependency is not left missing. 
pip_install( "bitsandbytes (AMD)", "--no-cache-dir", From 5341e466e746c40ec2277f31da0263f4157fc5d8 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 10:11:35 +0000 Subject: [PATCH 42/55] Fix round 3 findings: x86_64 guard, ROCm version clip, Radeon deps Address issues surfaced by the round 3 reviewers on top of 8636fa63: - install_python_stack.py _ensure_rocm_torch: add the same `x86_64` guard that install.sh already has. Linux aarch64 / arm64 ROCm hosts must skip the repair path entirely; PyTorch only publishes ROCm wheels for linux_x86_64, and without this guard `unsloth studio update` aborts with a missing-wheel error on non x86_64 hosts. - install_llama_prebuilt.py resolve_upstream_asset_choice: add a best-effort _detect_host_rocm_version() helper (reading /opt/rocm/.info/version, amd-smi version, hipconfig --version) and filter rocm_candidates to entries whose major.minor is <= host version. Falls back to the newest candidate only when no compatible one exists, so a ROCm 6.4 host downloads rocm-6.4 instead of being handed the numerically newest rocm-7.2 bundle (which fails preflight and forces a source build). - install.sh: remove the round 2 --no-index switch from the Radeon wheel branch. --no-index forced uv to ignore PyPI entirely, which broke transitive dependency resolution (filelock, sympy, networkx, jinja2, fsspec, setuptools, typing-extensions, ...) on a fresh venv. Restore the round 1 explicit wheel URL invocation but add a torch / torchvision / torchaudio version-pair sanity check so a mismatched trio (e.g. torch 2.9.1 + torchvision 0.23.0 + torchaudio 2.9.0) falls back to the standard ROCm index instead of installing a broken combination. - install_python_stack.py _ensure_rocm_torch: restructure the "tag is None" path so it no longer short-circuits the bitsandbytes install. On a ROCm runtime older than anything in _ROCM_TORCH_INDEX, print the "no wheel" warning but still run the AMD bitsandbytes install. 
- studio/backend/core/training/worker.py: restore the pre-PR "no timeout" behaviour for non-HIP causal-conv1d / mamba-ssm source builds. The round 2 "timeout = 1800 if is_hip else 300" cap aborts slow non-HIP builds (Linux aarch64, unsupported torch/CUDA combos) after 5 minutes; omit timeout for the non-HIP branch so the cap only applies to ROCm source builds. --- install.sh | 70 ++++++++++++++----- studio/backend/core/training/worker.py | 27 ++++--- studio/install_llama_prebuilt.py | 97 ++++++++++++++++++++++++-- studio/install_python_stack.py | 52 ++++++++------ 4 files changed, 187 insertions(+), 59 deletions(-) diff --git a/install.sh b/install.sh index 33076beb30..324888bce2 100755 --- a/install.sh +++ b/install.sh @@ -1288,36 +1288,68 @@ elif [ -n "$TORCH_INDEX_URL" ]; then _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) || _tv_whl="" _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) || _ta_whl="" _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) || _tri_whl="" - if [ -z "$_torch_whl" ] || [ -z "$_tv_whl" ] || [ -z "$_ta_whl" ]; then - substep "[WARN] Radeon repo lacks a complete wheel set for this Python; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" + # Sanity-check torch / torchvision / torchaudio are a + # matching release. The Radeon repo publishes multiple + # generations simultaneously, so picking the highest-version + # wheel for each package independently can assemble a + # mismatched trio (e.g. torch 2.9.1 + torchvision 0.23.0 + + # torchaudio 2.9.0 from the current rocm-rel-7.2.1 index). + # Check that torch and torchaudio share the same X.Y public + # version prefix, and that torchvision's minor correctly + # pairs with torch's minor (torchvision minor = torch minor + 15, + # e.g. torch 2.4 -> torchvision 0.19 and torch 2.9 -> + # torchvision 0.24). 
+ _torch_ver="" + _tv_ver="" + _ta_ver="" + if [ -n "$_torch_whl" ]; then + _torch_ver=$(printf '%s\n' "$_torch_whl" | sed -n 's|.*/torch-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + fi + if [ -n "$_tv_whl" ]; then + _tv_ver=$(printf '%s\n' "$_tv_whl" | sed -n 's|.*/torchvision-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + fi + if [ -n "$_ta_whl" ]; then + _ta_ver=$(printf '%s\n' "$_ta_whl" | sed -n 's|.*/torchaudio-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + fi + _radeon_versions_match=false + if [ -n "$_torch_ver" ] && [ -n "$_tv_ver" ] && [ -n "$_ta_ver" ]; then + _torch_major=${_torch_ver%%.*} + _torch_minor=${_torch_ver#*.} + _ta_major=${_ta_ver%%.*} + _ta_minor=${_ta_ver#*.} + _tv_major=${_tv_ver%%.*} + _tv_minor=${_tv_ver#*.} + # torchvision expected minor (e.g. torch 2.9 -> 0.24) + _expected_tv_minor=$((_torch_minor + 15)) + if [ "$_torch_major" = "$_ta_major" ] && \ + [ "$_torch_minor" = "$_ta_minor" ] && \ + [ "$_tv_major" = "0" ] && \ + [ "$_tv_minor" = "$_expected_tv_minor" ]; then + _radeon_versions_match=true + fi + fi + if [ -z "$_torch_whl" ] || [ -z "$_tv_whl" ] || [ -z "$_ta_whl" ] || \ + [ "$_radeon_versions_match" != true ]; then + substep "[WARN] Radeon repo lacks a compatible wheel set for this Python; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" else substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." - # Use version constraints + --find-links + --no-index so - # uv resolves a compatible torch / torchvision / torchaudio - # set from the Radeon listing (instead of picking the - # highest-version wheel for each package independently, - # which can assemble a version-mismatched stack). 
- # The wheel presence check above guarantees the listing - # has at least one wheel per package; uv will pick the - # newest version-compatible triple. + # Pass explicit wheel URLs so the matched trio is + # installed together. --find-links lets uv discover + # the Radeon listing for any local lookup, and PyPI + # (not disabled) provides transitive deps like + # filelock / sympy / networkx which are not in the + # Radeon listing. if [ -n "$_tri_whl" ]; then run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ - --no-index \ --find-links "$_RADEON_BASE_URL" \ - "$TORCH_CONSTRAINT" \ - "torchvision<0.26.0" \ - "torchaudio<2.11.0" \ - "triton<3.7" + "$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl" else run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ - --no-index \ --find-links "$_RADEON_BASE_URL" \ - "$TORCH_CONSTRAINT" \ - "torchvision<0.26.0" \ - "torchaudio<2.11.0" + "$_torch_whl" "$_tv_whl" "$_ta_whl" fi fi else diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 3d2f9a14b9..ebf30c14ed 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -287,26 +287,31 @@ def _install_package_wheel_first( f"{pypi_name}=={pypi_version}", ] - # Source compilation on ROCm can take 10-30 minutes; use a generous timeout - timeout = 1800 if is_hip else 300 + # Source compilation on ROCm can take 10-30 minutes; use a generous + # timeout. Non-HIP installs preserve the pre-existing "no timeout" + # behaviour so unrelated slow installs (e.g. causal-conv1d source + # build on Linux aarch64 or unsupported torch/CUDA combinations) + # are not aborted at 5 minutes by this PR. 
+ _run_kwargs: dict[str, Any] = { + "stdout": _sp.PIPE, + "stderr": _sp.STDOUT, + "text": True, + } + if is_hip: + _run_kwargs["timeout"] = 1800 try: - result = _sp.run( - pypi_cmd, - stdout = _sp.PIPE, - stderr = _sp.STDOUT, - text = True, - timeout = timeout, - ) + result = _sp.run(pypi_cmd, **_run_kwargs) except _sp.TimeoutExpired: logger.error( "%s installation timed out after %ds", display_name, - timeout, + _run_kwargs.get("timeout"), ) _send_status( event_queue, - f"{display_name} installation timed out after {timeout}s", + f"{display_name} installation timed out after " + f"{_run_kwargs.get('timeout')}s", ) return diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 3f58ab1512..3769b5f0e2 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2993,6 +2993,65 @@ def published_asset_choice_for_kind( return None +def _detect_host_rocm_version() -> tuple[int, int] | None: + """Return (major, minor) of the installed ROCm runtime, or None. + + Best-effort read from /opt/rocm/.info/version, amd-smi version, and + hipconfig --version. Used to pick a compatible upstream llama.cpp + ROCm prebuilt rather than always taking the numerically newest one + (which can be newer than the host runtime). 
+ """ + rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" + for path in ( + os.path.join(rocm_root, ".info", "version"), + os.path.join(rocm_root, "lib", "rocm_version"), + ): + try: + with open(path) as fh: + parts = fh.read().strip().split("-")[0].split(".") + return int(parts[0]), int(parts[1]) + except Exception: + pass + amd_smi = shutil.which("amd-smi") + if amd_smi: + try: + result = subprocess.run( + [amd_smi, "version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) + if m: + return int(m.group(1)), int(m.group(2)) + except Exception: + pass + hipconfig = shutil.which("hipconfig") + if hipconfig: + try: + result = subprocess.run( + [hipconfig, "--version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + raw = (result.stdout or "").strip().split("\n")[0] + parts = raw.split(".") + if ( + len(parts) >= 2 + and parts[0].isdigit() + and parts[1].split("-")[0].isdigit() + ): + return int(parts[0]), int(parts[1].split("-")[0]) + except Exception: + pass + return None + + def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice: upstream_assets = github_release_assets(UPSTREAM_REPO, llama_tag) if host.is_linux and host.is_x86_64: @@ -3001,10 +3060,14 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice # the exact GPU target via rocminfo, which is more reliable for consumer # GPUs (e.g. gfx1151) that may not be in the prebuilt. if host.has_rocm and not host.has_usable_nvidia: - # Scan upstream assets for any rocm- prebuilt and prefer - # the newest one. Hardcoding a single rocm-7.2 filename means - # ROCm 6.x / 7.0 / 7.1 / 7.3+ users always fall through to a - # source build even when a matching prebuilt exists upstream. + # Scan upstream assets for any rocm- prebuilt. 
When the + # host ROCm runtime version is known, pick the newest candidate + # whose major.minor is <= host version -- otherwise a ROCm 6.4 + # host would download the rocm-7.2 tarball, fail preflight, and + # fall back to a source build even though a compatible 6.4 + # prebuilt exists. If no compatible candidate matches (e.g. host + # runtime is older than every published prebuilt), fall back to + # the numerically newest so we at least try something. _rocm_pattern = re.compile( rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz" ) @@ -3016,9 +3079,29 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice _parts = tuple(int(p) for p in _m.group(1).split(".")) rocm_candidates.append((_parts, _name)) rocm_candidates.sort(reverse = True) - if rocm_candidates: - rocm_name = rocm_candidates[0][1] - log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") + _host_rocm_version = _detect_host_rocm_version() + _compatible: list[tuple[tuple[int, ...], str]] = rocm_candidates + if _host_rocm_version is not None: + _compatible = [ + item + for item in rocm_candidates + if item[0][:2] <= _host_rocm_version + ] + if rocm_candidates and not _compatible: + # Fall back to the newest candidate so a source build is + # not forced when the host runtime is older than every + # published prebuilt: preflight will still catch a true + # incompatibility and trigger a fallback. 
+ _compatible = rocm_candidates[:1] + if _compatible: + rocm_name = _compatible[0][1] + if _host_rocm_version is not None: + log( + f"AMD ROCm {_host_rocm_version[0]}.{_host_rocm_version[1]} " + f"detected -- trying upstream prebuilt {rocm_name}" + ) + else: + log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") log( "Note: if your ROCm runtime version differs significantly, " "this may fail preflight and fall back to a source build (safe)" diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 64834306f6..debe0e2ce7 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -157,16 +157,22 @@ def _has_usable_nvidia_gpu() -> bool: def _ensure_rocm_torch() -> None: """Reinstall torch with ROCm wheels when the venv received CPU-only torch. - Runs only on Linux hosts where an AMD GPU is present and the ROCm - runtime is detectable (rocminfo / amd-smi / hipconfig / rocm-core - package). No-op when torch already links against HIP (ROCm) or on - Windows/macOS or on mixed AMD+NVIDIA hosts (NVIDIA takes precedence). + Runs only on Linux x86_64 hosts where an AMD GPU is present and the + ROCm runtime is detectable (rocminfo / amd-smi / hipconfig / + rocm-core package). No-op when torch already links against HIP + (ROCm), on Windows / macOS, on non-x86_64 Linux (PyTorch does not + publish ROCm wheels for aarch64 / arm64), or on mixed AMD+NVIDIA + hosts (NVIDIA takes precedence). Uses pip_install() to respect uv, constraints, and --python targeting. """ - # Explicit OS guard so the helper is safe to call from any context -- - # ROCm wheels are only published for Linux x86_64. + # Explicit OS / architecture guards so the helper is safe to call + # from any context -- PyTorch only publishes ROCm wheels for + # linux_x86_64, so aarch64 / arm64 hosts must skip this repair path + # instead of failing the update with a missing-wheel error. 
if IS_WINDOWS or IS_MACOS: return + if platform.machine().lower() not in {"x86_64", "amd64"}: + return # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable if _has_usable_nvidia_gpu(): return @@ -218,22 +224,24 @@ def _ensure_rocm_torch() -> None: None, ) if tag is None: - print(f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- skipping") - return - - index_url = f"{_PYTORCH_WHL_BASE}/{tag}" - print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") - pip_install( - f"ROCm torch ({tag})", - "--force-reinstall", - "--no-cache-dir", - "torch>=2.4,<2.11.0", - "torchvision<0.26.0", - "torchaudio<2.11.0", - "--index-url", - index_url, - constrain = False, - ) + print( + f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- " + f"skipping torch reinstall" + ) + else: + index_url = f"{_PYTORCH_WHL_BASE}/{tag}" + print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") + pip_install( + f"ROCm torch ({tag})", + "--force-reinstall", + "--no-cache-dir", + "torch>=2.4,<2.11.0", + "torchvision<0.26.0", + "torchaudio<2.11.0", + "--index-url", + index_url, + constrain = False, + ) # Always install bitsandbytes for AMD -- runs even when torch was not # reinstalled (e.g. "unsloth studio update" on a venv that already has From 5305c3198ed27126ed75bf33a847c59594895718 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 10:29:05 +0000 Subject: [PATCH 43/55] Fix round 4 findings: apply_gpu_ids env inheritance, Radeon X.Y, bitsandbytes gate Address remaining issues surfaced by the round 4 reviewers: - studio/backend/utils/hardware/hardware.py apply_gpu_ids: mirror the selection into HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES whenever the caller already had a ROCm visibility env var set, not only when IS_ROCM has already been set by detect_hardware(). 
Training and inference workers call apply_gpu_ids() before detect_hardware() runs, so the old guard would leave a forked ROCm worker with a stale HIP_VISIBLE_DEVICES mask that no longer matched the narrowed CUDA_VISIBLE_DEVICES selection. - install.sh get_radeon_wheel_url: accept X.Y ROCm versions in addition to X.Y.Z. The `/opt/rocm/.info/version` file and some hipconfig versions report only two components, and the Radeon repository publishes both rocm-rel-X.Y.Z/ and rocm-rel-X.Y/ directories, so treating X.Y as invalid caused Radeon hosts to fall back to the generic ROCm index even when a matching AMD wheel set existed. - install_python_stack.py _ensure_rocm_torch: only install the AMD bitsandbytes build when the venv actually has a ROCm-compatible torch (either already present or just installed by this function). Previously the bitsandbytes install ran unconditionally, which could leave an AMD bitsandbytes layered on top of a CPU/CUDA torch on hosts where the ROCm runtime is older than any entry in _ROCM_TORCH_INDEX. Also add --force-reinstall so an existing CPU/CUDA bitsandbytes is replaced by the AMD build during upgrades. --- install.sh | 16 ++++++++----- studio/backend/utils/hardware/hardware.py | 15 +++++++++--- studio/install_python_stack.py | 28 +++++++++++++++-------- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/install.sh b/install.sh index 324888bce2..cd428878d1 100755 --- a/install.sh +++ b/install.sh @@ -1105,21 +1105,25 @@ get_radeon_wheel_url() { # Only meaningful on Linux. Picks a repo.radeon.com base URL whose listing # contains torch wheels. Tries paths like rocm-rel-7.2.1/, rocm-rel-7.2/, # rocm-rel-7.1.1/, rocm-rel-7.1/ (AMD publishes both M.m and M.m.p dirs). + # Accepts both X.Y and X.Y.Z host versions since /opt/rocm/.info/version + # and hipconfig --version can return either shape. 
case "$(uname -s)" in Linux) ;; *) echo ""; return ;; esac - # Detect full X.Y.Z version -- try amd-smi first, then /opt/rocm/.info/version, then hipconfig + # Detect ROCm version (X.Y or X.Y.Z) -- try amd-smi, then + # /opt/rocm/.info/version, then hipconfig. _full_ver="" _full_ver=$({ command -v amd-smi >/dev/null 2>&1 && \ amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ - 'NF>1{if(match($2,/[0-9]+\.[0-9]+\.[0-9]+/)){print substr($2,RSTART,RLENGTH); ok=1; exit}} END{exit !ok}'; } || \ + 'NF>1{if(match($2,/[0-9]+\.[0-9]+(\.[0-9]+)?/)){print substr($2,RSTART,RLENGTH); ok=1; exit}} END{exit !ok}'; } || \ { [ -r /opt/rocm/.info/version ] && \ - awk -F'[.-]' 'NF>=3{print $1"."$2"."$3; exit}' /opt/rocm/.info/version; } || \ + awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1; exit} END{exit !found}' /opt/rocm/.info/version; } || \ { command -v hipconfig >/dev/null 2>&1 && \ - hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]+\.[0-9]+\.[0-9]/{print $1; found=1} END{exit !found}'; }) 2>/dev/null + hipconfig --version 2>/dev/null | awk 'NR==1 && match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1} END{exit !found}'; }) 2>/dev/null - # Validate: must be X.Y.Z with X >= 1 + # Validate: must be X.Y or X.Y.Z with X >= 1 case "$_full_ver" in - [1-9]*.*[0-9].*[0-9]*) : ;; + [1-9]*.[0-9]*.[0-9]*) : ;; # X.Y.Z + [1-9]*.[0-9]*) : ;; # X.Y *) echo ""; return ;; esac echo "https://repo.radeon.com/rocm/manylinux/rocm-rel-${_full_ver}/" diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index fa5270b833..cea5fddd64 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -1356,12 +1356,21 @@ def apply_gpu_ids(gpu_ids) -> None: os.environ["CUDA_VISIBLE_DEVICES"] = value # Keep ROCm visibility env vars in sync so _get_parent_visible_gpu_spec() - # picks up the narrowed set on AMD systems. 
- if IS_ROCM: + # picks up the narrowed set on AMD systems. Workers can call + # apply_gpu_ids() before detect_hardware() runs (so IS_ROCM is still + # its default False), so also mirror the selection whenever the + # parent process already set a ROCm visibility variable -- that + # way a downstream ROCm process inherits the narrowed mask even + # before Studio's hardware detection has classified the host. + _inherits_rocm_visibility = ( + "HIP_VISIBLE_DEVICES" in os.environ + or "ROCR_VISIBLE_DEVICES" in os.environ + ) + if IS_ROCM or _inherits_rocm_visibility: os.environ["HIP_VISIBLE_DEVICES"] = value os.environ["ROCR_VISIBLE_DEVICES"] = value _visible_gpu_count = None - if IS_ROCM: + if IS_ROCM or _inherits_rocm_visibility: logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s' (rocm)", value) else: logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index debe0e2ce7..79c90df9d3 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -213,6 +213,8 @@ def _ensure_rocm_torch() -> None: and probe.stdout.decode().strip() != "" ) + rocm_torch_ready = has_hip_torch + if not has_hip_torch: # Select best matching wheel tag (newest ROCm version <= installed) tag = next( @@ -242,16 +244,22 @@ def _ensure_rocm_torch() -> None: index_url, constrain = False, ) - - # Always install bitsandbytes for AMD -- runs even when torch was not - # reinstalled (e.g. "unsloth studio update" on a venv that already has - # ROCm torch) so the AMD bitsandbytes dependency is not left missing. - pip_install( - "bitsandbytes (AMD)", - "--no-cache-dir", - "bitsandbytes>=0.49.1", - constrain = False, - ) + rocm_torch_ready = True + + # Install bitsandbytes only when the venv has a ROCm-compatible torch + # (either already present or just installed). 
Avoids leaving an AMD + # bitsandbytes on top of a CPU/CUDA torch on hosts where the ROCm + # runtime is older than any published torch wheel. Uses + # --force-reinstall so an existing CPU/CUDA bitsandbytes is replaced + # by the AMD build during upgrades. + if rocm_torch_ready: + pip_install( + "bitsandbytes (AMD)", + "--force-reinstall", + "--no-cache-dir", + "bitsandbytes>=0.49.1", + constrain = False, + ) def _infer_no_torch() -> bool: From 7d27b2e53876130ff06153f622ab0006eb483c9a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Apr 2026 11:11:39 +0000 Subject: [PATCH 44/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/utils/hardware/amd.py | 6 +++++- studio/backend/utils/hardware/hardware.py | 3 +-- studio/install_llama_prebuilt.py | 20 ++++++++++---------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 941c9cef18..283c541f14 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -210,7 +210,11 @@ def _first_visible_amd_gpu_id() -> Optional[str]: set, and ``None`` when the env var explicitly narrows to zero GPUs ("" or "-1"), so callers can short-circuit to "available: False". 
""" - for env_name in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"): + for env_name in ( + "HIP_VISIBLE_DEVICES", + "ROCR_VISIBLE_DEVICES", + "CUDA_VISIBLE_DEVICES", + ): raw = os.environ.get(env_name) if raw is None: continue diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index cea5fddd64..c934bb9ff0 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -1363,8 +1363,7 @@ def apply_gpu_ids(gpu_ids) -> None: # way a downstream ROCm process inherits the narrowed mask even # before Studio's hardware detection has classified the host. _inherits_rocm_visibility = ( - "HIP_VISIBLE_DEVICES" in os.environ - or "ROCR_VISIBLE_DEVICES" in os.environ + "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ ) if IS_ROCM or _inherits_rocm_visibility: os.environ["HIP_VISIBLE_DEVICES"] = value diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 3769b5f0e2..2cbd3ad65f 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -2502,9 +2502,7 @@ def detect_host() -> HostInfo: try: listing = run_capture([nvidia_smi, "-L"], timeout = 20) gpu_lines = [ - line - for line in listing.stdout.splitlines() - if line.startswith("GPU ") + line for line in listing.stdout.splitlines() if line.startswith("GPU ") ] if gpu_lines: has_physical_nvidia = True @@ -3246,13 +3244,9 @@ def resolve_release_asset_choice( # windows-cpu bundle here would make the new HIP path # unreachable. 
if host.has_rocm: - published_choice = published_asset_choice_for_kind( - release, "windows-hip" - ) + published_choice = published_asset_choice_for_kind(release, "windows-hip") else: - published_choice = published_asset_choice_for_kind( - release, "windows-cpu" - ) + published_choice = published_asset_choice_for_kind(release, "windows-cpu") elif host.is_macos and host.is_arm64: published_choice = published_asset_choice_for_kind(release, "macos-arm64") elif host.is_macos and host.is_x86_64: @@ -4372,7 +4366,13 @@ def validate_server( # validation. Use the resolved install_kind as the source of # truth and fall back to host detection when the caller did not # pass one (keeps backwards compatibility with older call sites). - _gpu_kinds = {"linux-cuda", "linux-rocm", "windows-cuda", "windows-hip", "macos-arm64"} + _gpu_kinds = { + "linux-cuda", + "linux-rocm", + "windows-cuda", + "windows-hip", + "macos-arm64", + } if install_kind is not None: _enable_gpu_layers = install_kind in _gpu_kinds else: From f98aaef9b55e2c7993c57d0256e88e2671cf524c Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 11:36:31 +0000 Subject: [PATCH 45/55] Fix gemini findings: amd-smi metric envelope validation and dict-wrapped GPU id Two medium-severity defensive fixes from the gemini-code-assist review on the AMD monitoring backend: 1. _extract_gpu_metrics may return a dict where every value is None when amd-smi succeeds (zero exit) but the JSON envelope contains no usable fields (error response, unsupported card). The new _has_real_metrics helper lets get_primary_gpu_utilization surface available:False and lets get_visible_gpu_utilization skip ghost device rows so the UI does not render placeholder cards with empty numbers. 2. Newer amd-smi versions wrap scalar fields as {"value": 0, "unit": "none"}, including the per-GPU id. The previous int(raw_id) call silently fell back to the enumeration index in that case, losing the real GPU id. 
Routing raw_id through the existing _parse_numeric helper handles bare ints, floats, strings, and the dict shape uniformly, with a debug log on parse failure. --- studio/backend/utils/hardware/amd.py | 43 +++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 283c541f14..0cf51b4cb6 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -188,6 +188,18 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: } +def _has_real_metrics(metrics: dict[str, Any]) -> bool: + """Return True when ``metrics`` contains at least one non-None value. + + ``amd-smi`` can return a zero-exit JSON envelope that is missing every + expected field (error response, unsupported card, hipless container). + In that case ``_extract_gpu_metrics`` produces a dict where every value + is ``None`` -- callers must surface this as ``available: False`` rather + than ``available: True`` with empty data. + """ + return any(value is not None for value in metrics.values()) + + def get_physical_gpu_count() -> Optional[int]: """Return physical AMD GPU count via amd-smi, or None on failure.""" data = _run_amd_smi("list") @@ -249,6 +261,12 @@ def get_primary_gpu_utilization() -> dict[str, Any]: gpu_data = data metrics = _extract_gpu_metrics(gpu_data) + if not _has_real_metrics(metrics): + # amd-smi returned a JSON envelope with no usable fields (error + # response or unsupported card). Surface as unavailable rather + # than available-with-empty-data so the UI does not render a + # ghost device. + return {"available": False} metrics["available"] = True return metrics @@ -285,7 +303,11 @@ def get_visible_gpu_utilization( devices = [] for fallback_idx, gpu_data in enumerate(gpu_list): - # Use AMD-reported GPU ID when available, fall back to enumeration index + # Use AMD-reported GPU ID when available, fall back to enumeration + # index. 
Newer amd-smi versions wrap scalars as ``{"value": 0, + # "unit": "none"}``, so route raw_id through ``_parse_numeric`` + # which already handles bare ints, floats, strings, and that + # dict shape uniformly. raw_id = ( gpu_data.get( "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx)) @@ -293,13 +315,26 @@ def get_visible_gpu_utilization( if isinstance(gpu_data, dict) else fallback_idx ) - try: - idx = int(raw_id) - except (TypeError, ValueError): + parsed_id = _parse_numeric(raw_id) + if parsed_id is None: + logger.debug( + "amd-smi GPU id %r could not be parsed; falling back to " + "enumeration index %d", + raw_id, + fallback_idx, + ) idx = fallback_idx + else: + idx = int(parsed_id) if idx not in visible_set: continue metrics = _extract_gpu_metrics(gpu_data) + if not _has_real_metrics(metrics): + # Skip ghost entries: an amd-smi response that decodes to a + # dict but contains no usable fields (error envelope, etc.) + # would otherwise show up as a device row with all-None + # numbers in the UI. + continue metrics["index"] = idx metrics["index_kind"] = "physical" metrics["visible_ordinal"] = ordinal_map.get(idx, len(devices)) From 37432b689b2bf3a1bd5b0abc331c1e0d52805b5a Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 11:50:38 +0000 Subject: [PATCH 46/55] Fix gemini round 2 findings: explicit length guard on ROCm version file parser Both _detect_rocm_version (install_python_stack.py) and _detect_host_rocm_version (install_llama_prebuilt.py) read /opt/rocm/.info/version or $ROCM_PATH/lib/rocm_version, split on "." and unconditionally accessed parts[1]. The surrounding broad `except Exception: pass` already swallowed the resulting IndexError, so a one-component file like "6\n" did fall through to the next detection source -- but the control flow relied on exception handling instead of an explicit check. Add `if len(parts) >= 2:` guards in both helpers so the loop falls through on its own without raising. 
Behaviour is unchanged for the common multi-component case; the previously-silent IndexError path becomes an explicit no-op. --- studio/install_llama_prebuilt.py | 6 +++++- studio/install_python_stack.py | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 2cbd3ad65f..ba27a34975 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -3007,7 +3007,11 @@ def _detect_host_rocm_version() -> tuple[int, int] | None: try: with open(path) as fh: parts = fh.read().strip().split("-")[0].split(".") - return int(parts[0]), int(parts[1]) + # Explicit length guard avoids relying on the broad except + # below to swallow IndexError when the version file contains + # a single component (e.g. "6\n" on a partial install). + if len(parts) >= 2: + return int(parts[0]), int(parts[1]) except Exception: pass amd_smi = shutil.which("amd-smi") diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index 79c90df9d3..c75b78dd09 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -55,7 +55,11 @@ def _detect_rocm_version() -> tuple[int, int] | None: try: with open(path) as fh: parts = fh.read().strip().split("-")[0].split(".") - return int(parts[0]), int(parts[1]) + # Explicit length guard avoids relying on the broad except + # below to swallow IndexError when the version file contains + # a single component (e.g. "6\n" on a partial install).
+ if len(parts) >= 2: + return int(parts[0]), int(parts[1]) except Exception: pass From c12e8b7052759d7852a243bfda077d4d1fd99723 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 11:59:34 +0000 Subject: [PATCH 47/55] Fix gemini round 3: include has_rocm in validate_server fallback path When validate_server is called without an explicit install_kind (older call sites that have not been updated), the fallback was only enabling --n-gpu-layers for NVIDIA and macOS arm64 hosts. AMD ROCm Linux hosts fell through to the CPU validation path even though the prebuilt being exercised was a HIP binary. Add host.has_rocm to the fallback expression so the GPU offload flag is applied consistently with the install_kind=='linux-rocm' / 'windows-hip' branches above. --- studio/install_llama_prebuilt.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index ba27a34975..2311d1910f 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -4380,8 +4380,14 @@ def validate_server( if install_kind is not None: _enable_gpu_layers = install_kind in _gpu_kinds else: - _enable_gpu_layers = host.has_usable_nvidia or ( - host.is_macos and host.is_arm64 + # Older call sites that don't pass install_kind: keep ROCm + # hosts in the GPU-validation path so an AMD-only Linux host + # is exercised against the actual hardware rather than the + # CPU fallback. NVIDIA and macOS-arm64 are already covered. 
+ _enable_gpu_layers = ( + host.has_usable_nvidia + or host.has_rocm + or (host.is_macos and host.is_arm64) ) if _enable_gpu_layers: command.extend(["--n-gpu-layers", "1"]) From d25c570a06caaeb679bb34e97270e681565a0100 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 12:07:39 +0000 Subject: [PATCH 48/55] Fix gemini round 4: remove risky bytes-vs-MB heuristic in _parse_memory_mb The previous heuristic divided any bare number above 10_000_000 by 1024*1024 on the assumption that large unit-less values were bytes. This misclassified small VRAM allocations: 5 MB of used VRAM reported as 5_242_880 bytes without a unit would be taken at face value and render as 5_242_880 MB (~5 TB) in the monitoring UI. Modern amd-smi always provides explicit units (MiB/GiB dict form), and legacy amd-smi returns bare numbers in MB -- the heuristic never had a real workload to handle. Drop it and default to MB for bare numeric input, keeping the existing unit-aware branches for dict / string inputs unchanged. The unrelated gemini suggestion to "default minor to 0" in the amd-smi version awk parser was intentionally NOT applied: rocm7.0 and rocm7.1 ship different wheel sets, so silently substituting 0 for a missing minor could install the wrong wheels. The existing reject-and-fall-through behaviour is safer. --- studio/backend/utils/hardware/amd.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 0cf51b4cb6..93daae4605 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -67,8 +67,10 @@ def _parse_numeric(value: Any) -> Optional[float]: def _parse_memory_mb(value: Any) -> Optional[float]: """Parse a memory value from amd-smi output and return MB. - Handles bare numbers (assumed MB), dict-shaped values with units - ({"value": 192, "unit": "GiB"}), and byte-scale heuristic fallback. 
+ Handles bare numbers (assumed MB -- the amd-smi convention on every + version we have seen), dict-shaped values with explicit units + (``{"value": 192, "unit": "GiB"}`` on newer releases), and plain + strings like ``"8192 MiB"``. """ unit = "" raw_value = value @@ -98,10 +100,13 @@ def _parse_memory_mb(value: Any) -> Optional[float]: # Plain bytes return num / (1024 * 1024) - # No explicit unit -- heuristic: values > 10M are likely bytes - if num > 10_000_000: - return num / (1024 * 1024) - return num # Assume MB + # No explicit unit -- default to MB, which is the amd-smi convention + # for bare numeric values. A previous heuristic assumed values above + # ~10M were bytes, but that misclassifies small VRAM allocations + # (e.g. 5 MB = 5,242,880 reported without a unit) as ~5 TB. Modern + # amd-smi always ships explicit units, so the heuristic branch only + # fired for legacy output where MB was already the convention. + return num def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: From b3627bc29e5440a13169677b1340a1ed1e4e7ded Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Wed, 8 Apr 2026 12:17:27 +0000 Subject: [PATCH 49/55] Fix gemini round 5: POSIX compliance and leading-comma visibility parsing Three medium findings from gemini-code-assist addressed in this commit: 1. _pick_radeon_wheel used grep -o and sort -V, both GNU extensions that are not in POSIX and break on BSD/BusyBox coreutils. install.sh has a #!/bin/sh shebang so the whole pipeline was rewritten as a single awk script that extracts all href="..." hits on each line, filters to wheels matching the package prefix and python tag, and picks the newest version via zero-padded lexical comparison. No external sort or grep is needed. 2. _first_visible_amd_gpu_id in the AMD monitoring backend treated a leading comma (e.g. HIP_VISIBLE_DEVICES=",1") as "fall through to the next env var", which is surprising given the clear intent to narrow to device 1. 
Filter empty tokens after the split and return the first real one. An all-commas value ("," / ",,,") still falls through because no real tokens exist; the empty-string and "-1" explicit-zero cases are unchanged. The unrelated amd-smi version awk parser suggestion was not applied (see round 4 commit message for rationale: defaulting a missing minor to 0 could silently install the wrong ROCm wheel set). --- install.sh | 58 +++++++++++++++++++++------- studio/backend/utils/hardware/amd.py | 15 +++---- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/install.sh b/install.sh index cd428878d1..0dc5a712c2 100755 --- a/install.sh +++ b/install.sh @@ -1158,27 +1158,55 @@ _pick_radeon_wheel() { # Scans $_RADEON_LISTING for the newest wheel whose filename starts exactly # with PACKAGE_NAME- and matches _RADEON_PYTAG + linux_x86_64. # Prints the full URL (resolving relative hrefs against _RADEON_BASE_URL). + # + # POSIX-compliant pipeline: all href parsing, filtering, and version + # selection is done inside a single awk script rather than reaching + # for GNU extensions (grep -o, sort -V) that would break under BSD + # or BusyBox coreutils. _pkg="$1" [ -n "$_RADEON_LISTING" ] || return 1 [ -n "$_RADEON_PYTAG" ] || return 1 _tag="$_RADEON_PYTAG" _href=$(printf '%s\n' "$_RADEON_LISTING" \ - | grep -o 'href="[^"]*"' \ - | sed 's/href="//;s/"//' \ - | awk -F/ -v pkg="$_pkg" -v tag="$_tag" ' + | awk -v pkg="$_pkg" -v tag="$_tag" ' + BEGIN { max_pad = ""; max_url = "" } { - base = $NF - sub(/[?#].*/, "", base) # strip query / fragment - prefix = pkg "-" - # Match cpXY-cpXY or cpXY-abi3 with any linux x86_64 platform tag - # (linux_x86_64, manylinux_2_28_x86_64, manylinux2014_x86_64, etc.) 
- if (substr(base, 1, length(prefix)) == prefix && - index(base, "-" tag "-") > 0 && - match(base, /x86_64\.whl$/)) - print $0 - }' \ - | sort -V \ - | tail -1) + line = $0 + while (match(line, /href="[^"]*"/)) { + # Strip the leading href=" (6 chars) and trailing " (1 char) + url = substr(line, RSTART + 6, RLENGTH - 7) + line = substr(line, RSTART + RLENGTH) + + # Extract basename, strip query / fragment + n = split(url, p, "/") + base = p[n] + sub(/[?#].*/, "", base) + + prefix = pkg "-" + # Match cpXY-cpXY or cpXY-abi3 with any linux x86_64 + # platform tag (linux_x86_64, manylinux_2_28_x86_64, + # manylinux2014_x86_64, etc.) + if (substr(base, 1, length(prefix)) == prefix && + index(base, "-" tag "-") > 0 && + match(base, /x86_64\.whl$/)) { + # Extract the version component (first + # dotted-number run) and pad each piece so a + # plain lexical comparison gives us the newest. + if (match(base, /[0-9]+\.[0-9]+(\.[0-9]+)?/)) { + ver = substr(base, RSTART, RLENGTH) + m = split(ver, v, ".") + pad = "" + for (i = 1; i <= m; i++) + pad = pad sprintf("%08d", v[i]) + if (pad > max_pad) { + max_pad = pad + max_url = url + } + } + } + } + } + END { if (max_url != "") print max_url }') [ -z "$_href" ] && return 1 case "$_href" in http*) printf '%s\n' "$_href" ;; diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 93daae4605..76ce0e6eac 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -238,13 +238,14 @@ def _first_visible_amd_gpu_id() -> Optional[str]: raw = raw.strip() if raw == "" or raw == "-1": return None - first = raw.split(",", 1)[0].strip() - if first: - return first - # Leading comma or all-whitespace first token -- fall through to - # the next env var in priority order rather than silently - # returning GPU 0. - continue + # Filter out empty tokens after splitting. 
This tolerates minor + # typos like ``HIP_VISIBLE_DEVICES=",1"`` (leading comma, user + # clearly meant to narrow to device 1) while still falling + # through to the next env var when every token is empty + # (e.g. ``,,,``). + tokens = [t.strip() for t in raw.split(",") if t.strip()] + if tokens: + return tokens[0] return "0" From 5211328283d9e631dd7e7fc0f3b52e97b383a1be Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 9 Apr 2026 00:57:20 +0000 Subject: [PATCH 50/55] Fix 20-reviewer.py findings: base drift, Radeon %2B, dpkg/rpm fallback, bnb, backend label Consolidated fix batch from a 20-parallel reviewer.py run on the current head. Each fix is drawn from a high-consensus finding and addresses a real bug or feature gap, not a stylistic preference. 1. install.sh: bump `unsloth>=2026.4.2` -> `unsloth>=2026.4.4` at five call sites so this branch no longer regresses main's version floor (main bumped to 2026.4.4 in #4876). Without this, merging 4720 would silently downgrade the minimum version pin for fresh installs. 2. install.sh: URL-decode Radeon wheel names before extracting the torch / torchvision / torchaudio version strings. Real wheel URLs from repo.radeon.com are percent-encoded ("torch-2.10.0%2Brocm7.2.0...") so the previous `[+-]` terminator in the sed regex never matched, `_torch_ver` stayed empty, `_radeon_versions_match` stayed false, and every Radeon consumer install silently fell back to the generic ROCm index. Now decode %2B -> + first, then extract, then validate. 3. install.sh: the two AMD bitsandbytes install lines were running `uv pip install "bitsandbytes>=0.49.1"` without `--force-reinstall`, so upgrades where the venv already has a CPU/CUDA bitsandbytes satisfying the constraint would keep the stale non-AMD wheel. Add `--force-reinstall --no-cache-dir` to both call sites, matching the pattern already used in install_python_stack.py::_ensure_rocm_torch. 4. 
install_python_stack.py and install_llama_prebuilt.py: add `dpkg-query -W rocm-core` and `rpm -q rocm-core` fallbacks to the Python-side ROCm version detectors so they match the chain in install.sh::get_torch_index_url. Package-managed ROCm installs (Debian/Ubuntu/RHEL/Fedora distro packages) can expose GPUs via rocminfo/amd-smi but still lack /opt/rocm/.info/version, hipconfig, or amd-smi `version` output -- without these fallbacks, `unsloth studio update` on such hosts returned None and skipped the ROCm torch repair. Also strip the dpkg epoch prefix ("1:6.3.0-1") before parsing so epoch-annotated packages parse correctly. 5. hardware.py: add a `_backend_label(device)` helper that returns "rocm" when IS_ROCM is set and the device is DeviceType.CUDA, and use it for every `"backend": ...` emission in JSON responses served to the Studio frontend. Internally we still represent ROCm hosts as DeviceType.CUDA (ROCm torch reuses the whole torch.cuda.* API surface), but the user-facing API now correctly reports "rocm" on AMD boxes instead of labeling them as "cuda". All 250 simulation scenarios pass (was 233 before this batch: added 17 new regression tests covering the version pin, %2B decoding, bnb force-reinstall flags, dpkg/rpm fallback presence, and the _backend_label helper's four-way truth table). --- install.sh | 30 ++++++++---- studio/backend/utils/hardware/hardware.py | 57 +++++++++++++++-------- studio/install_llama_prebuilt.py | 31 ++++++++++++ studio/install_python_stack.py | 34 ++++++++++++++ 4 files changed, 122 insertions(+), 30 deletions(-) diff --git a/install.sh b/install.sh index 0dc5a712c2..09b5f2df15 100755 --- a/install.sh +++ b/install.sh @@ -1266,7 +1266,7 @@ if [ "$_MIGRATED" = true ]; then # to prevent transitive torch resolution. 
run_install_cmd "install unsloth (migrated no-torch)" uv pip install --python "$_VENV_PY" --no-deps \ --reinstall-package unsloth --reinstall-package unsloth-zoo \ - "unsloth>=2026.4.2" unsloth-zoo + "unsloth>=2026.4.4" unsloth-zoo _NO_TORCH_RT="$(_find_no_torch_runtime)" if [ -n "$_NO_TORCH_RT" ]; then run_install_cmd "install no-torch runtime deps" uv pip install --python "$_VENV_PY" --no-deps -r "$_NO_TORCH_RT" @@ -1274,7 +1274,7 @@ if [ "$_MIGRATED" = true ]; then else run_install_cmd "install unsloth (migrated)" uv pip install --python "$_VENV_PY" \ --reinstall-package unsloth --reinstall-package unsloth-zoo \ - "unsloth>=2026.4.2" unsloth-zoo + "unsloth>=2026.4.4" unsloth-zoo fi if [ "$STUDIO_LOCAL_INSTALL" = true ]; then substep "overlaying local repo (editable)..." @@ -1287,7 +1287,7 @@ if [ "$_MIGRATED" = true ]; then case "$TORCH_INDEX_URL" in */rocm*) substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" ;; esac fi @@ -1331,17 +1331,27 @@ elif [ -n "$TORCH_INDEX_URL" ]; then # pairs with torch's minor (torchvision = torch.minor - 5 # since torch 2.4 -> torchvision 0.19 -> torch 2.9 -> # torchvision 0.24). + # URL-decode each wheel name so %2B -> + before version + # extraction. Real Radeon wheel hrefs are percent-encoded + # (torch-2.10.0%2Brocm7.2.0...), so a plain [+-] terminator + # in the sed regex below would never match and + # _radeon_versions_match would stay false for every real + # listing, silently forcing a fallback to the generic + # ROCm index. 
_torch_ver="" _tv_ver="" _ta_ver="" if [ -n "$_torch_whl" ]; then - _torch_ver=$(printf '%s\n' "$_torch_whl" | sed -n 's|.*/torch-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + _torch_name=$(printf '%s' "${_torch_whl##*/}" | sed 's/%2[Bb]/+/g') + _torch_ver=$(printf '%s\n' "$_torch_name" | sed -n 's|^torch-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') fi if [ -n "$_tv_whl" ]; then - _tv_ver=$(printf '%s\n' "$_tv_whl" | sed -n 's|.*/torchvision-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + _tv_name=$(printf '%s' "${_tv_whl##*/}" | sed 's/%2[Bb]/+/g') + _tv_ver=$(printf '%s\n' "$_tv_name" | sed -n 's|^torchvision-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') fi if [ -n "$_ta_whl" ]; then - _ta_ver=$(printf '%s\n' "$_ta_whl" | sed -n 's|.*/torchaudio-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + _ta_name=$(printf '%s' "${_ta_whl##*/}" | sed 's/%2[Bb]/+/g') + _ta_ver=$(printf '%s\n' "$_ta_name" | sed -n 's|^torchaudio-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') fi _radeon_versions_match=false if [ -n "$_torch_ver" ] && [ -n "$_tv_ver" ] && [ -n "$_ta_ver" ]; then @@ -1405,7 +1415,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then case "$TORCH_INDEX_URL" in */rocm*) substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" "bitsandbytes>=0.49.1" + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" ;; esac # Fresh: Step 2 - install unsloth, preserving pre-installed torch @@ -1415,7 +1425,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then # runtime deps (typer, safetensors, transformers, etc.) with --no-deps. 
run_install_cmd "install unsloth (no-torch)" uv pip install --python "$_VENV_PY" --no-deps \ --upgrade-package unsloth --upgrade-package unsloth-zoo \ - "unsloth>=2026.4.2" unsloth-zoo + "unsloth>=2026.4.4" unsloth-zoo _NO_TORCH_RT="$(_find_no_torch_runtime)" if [ -n "$_NO_TORCH_RT" ]; then run_install_cmd "install no-torch runtime deps" uv pip install --python "$_VENV_PY" --no-deps -r "$_NO_TORCH_RT" @@ -1426,7 +1436,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then fi elif [ "$STUDIO_LOCAL_INSTALL" = true ]; then run_install_cmd "install unsloth (local)" uv pip install --python "$_VENV_PY" \ - --upgrade-package unsloth "unsloth>=2026.4.2" unsloth-zoo + --upgrade-package unsloth "unsloth>=2026.4.4" unsloth-zoo substep "overlaying local repo (editable)..." run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps else @@ -1437,7 +1447,7 @@ else # Fallback: GPU detection failed to produce a URL -- let uv resolve torch substep "installing unsloth (this may take a few minutes)..." if [ "$STUDIO_LOCAL_INSTALL" = true ]; then - run_install_cmd "install unsloth (auto torch backend)" uv pip install --python "$_VENV_PY" unsloth-zoo "unsloth>=2026.4.2" --torch-backend=auto + run_install_cmd "install unsloth (auto torch backend)" uv pip install --python "$_VENV_PY" unsloth-zoo "unsloth>=2026.4.4" --torch-backend=auto substep "overlaying local repo (editable)..." run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps else diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index c934bb9ff0..0a81a8e9e5 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -48,6 +48,23 @@ class DeviceType(str, Enum): ) +def _backend_label(device: DeviceType) -> str: + """Return the user-facing backend name for API responses. 
+ + Internally we still represent ROCm hosts as ``DeviceType.CUDA`` because + ROCm torch sets ``torch.cuda.is_available() = True`` and reuses the whole + ``torch.cuda.*`` API surface, so branching on ``DeviceType`` stays + consistent with the rest of the codebase. For the JSON responses served + to the Studio frontend and other clients, however, "cuda" is misleading + on an AMD machine. This helper swaps the label to ``"rocm"`` when the + module-level ``IS_ROCM`` flag is set so the UI can render the correct + backend name without every caller having to duplicate the check. + """ + if IS_ROCM and device == DeviceType.CUDA: + return "rocm" + return device.value + + # ========== Detection ========== @@ -199,7 +216,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "device": idx, "device_name": props.name, "total_gb": total / (1024**3), @@ -210,7 +227,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting CUDA GPU info: {e}") - return {"available": False, "backend": device.value, "error": str(e)} + return {"available": False, "backend": _backend_label(device), "error": str(e)} # ---- XPU path (Intel GPU) ---- if device == DeviceType.XPU: @@ -226,7 +243,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "device": idx, "device_name": props.name, "total_gb": total / (1024**3), @@ -237,7 +254,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error("Error getting XPU GPU info: %s", e) - return {"available": False, "backend": device.value, "error": str(e)} + return {"available": False, "backend": _backend_label(device), "error": str(e)} # ---- MLX path (Apple Silicon) ---- if device == DeviceType.MLX: @@ -252,7 +269,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": 
device.value, + "backend": _backend_label(device), "device": 0, "device_name": f"Apple Silicon ({platform.processor() or platform.machine()})", "total_gb": total / (1024**3), @@ -263,7 +280,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting MLX GPU info: {e}") - return {"available": False, "backend": device.value, "error": str(e)} + return {"available": False, "backend": _backend_label(device), "error": str(e)} # ---- CPU-only ---- return {"available": False, "backend": "cpu"} @@ -438,14 +455,14 @@ def get_gpu_utilization() -> Dict[str, Any]: if device == DeviceType.CUDA: result = _smi_query("get_primary_gpu_utilization") if result is not None: - result["backend"] = device.value + result["backend"] = _backend_label(device) return result mem = get_gpu_memory_info() if device != DeviceType.CPU and mem.get("available"): return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "gpu_utilization_pct": None, "temperature_c": None, "vram_used_gb": round(mem.get("allocated_gb", 0), 2), @@ -456,7 +473,7 @@ def get_gpu_utilization() -> Dict[str, Any]: "power_utilization_pct": None, } - return {"available": False, "backend": device.value} + return {"available": False, "backend": _backend_label(device)} def get_visible_gpu_utilization() -> Dict[str, Any]: @@ -470,7 +487,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: parent_cuda_visible_devices = parent_visible_spec["raw"], ) if result is not None: - result["backend"] = device.value + result["backend"] = _backend_label(device) return result # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) @@ -510,7 +527,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: ) return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": parent_ids, "devices": devices, "index_kind": index_kind, @@ -521,14 +538,14 @@ def get_visible_gpu_utilization() -> 
Dict[str, Any]: if not mem.get("available"): return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": [], "devices": [], "index_kind": "relative", } return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": [0], "devices": [ { @@ -550,7 +567,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": [], "devices": [], "index_kind": "relative", @@ -1215,7 +1232,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: parent_visible_spec["raw"], ) if result.get("available"): - result["backend"] = device.value + result["backend"] = _backend_label(device) return result except Exception as e: logger.warning("Backend GPU visibility query failed: %s", e) @@ -1244,7 +1261,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: ] return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "backend_cuda_visible_devices": _backend_visible_devices_env(), "parent_visible_gpu_ids": parent_visible_ids, "devices": devices, @@ -1253,7 +1270,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "backend_cuda_visible_devices": _backend_visible_devices_env(), "parent_visible_gpu_ids": parent_visible_ids, "devices": [], @@ -1265,7 +1282,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: if not mem.get("available"): return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [], "devices": [], @@ -1273,7 +1290,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: } return { "available": True, - "backend": device.value, + "backend": _backend_label(device), 
"backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [0], "devices": [ @@ -1290,7 +1307,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [], "devices": [], diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 2311d1910f..e43a78baea 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -3051,6 +3051,37 @@ def _detect_host_rocm_version() -> tuple[int, int] | None: return int(parts[0]), int(parts[1].split("-")[0]) except Exception: pass + + # Distro package-manager fallbacks. Mirrors install.sh::get_torch_index_url + # and _detect_rocm_version() in install_python_stack.py so package-managed + # ROCm hosts without /opt/rocm/.info/version still report a usable version + # and the <= host version filter in resolve_upstream_asset_choice picks + # the correct upstream prebuilt instead of the newest-regardless fallback. + for _cmd in ( + ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"], + ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"], + ): + _exe = shutil.which(_cmd[0]) + if not _exe: + continue + try: + _result = subprocess.run( + [_exe, *_cmd[1:]], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + except Exception: + continue + if _result.returncode != 0 or not _result.stdout.strip(): + continue + _raw = _result.stdout.strip() + # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing. 
+ _raw = re.sub(r"^\d+:", "", _raw) + _m = re.match(r"(\d+)[.-](\d+)", _raw) + if _m: + return int(_m.group(1)), int(_m.group(2)) return None diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index c75b78dd09..e20969331e 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -105,6 +105,40 @@ def _detect_rocm_version() -> tuple[int, int] | None: except Exception: pass + # Distro package-manager fallbacks. Package-managed ROCm installs can + # expose GPUs via rocminfo / amd-smi but still lack /opt/rocm/.info/version + # and hipconfig, so probe dpkg (Debian/Ubuntu) and rpm (RHEL/Fedora/SUSE) + # for the rocm-core package version. Matches the chain in + # install.sh::get_torch_index_url so `unsloth studio update` behaves + # the same as a fresh `curl | sh` install. + import re as _re_pkg + + for cmd in ( + ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"], + ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"], + ): + exe = shutil.which(cmd[0]) + if not exe: + continue + try: + result = subprocess.run( + [exe, *cmd[1:]], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + except Exception: + continue + if result.returncode != 0 or not result.stdout.strip(): + continue + raw = result.stdout.strip() + # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing. 
+ raw = _re_pkg.sub(r"^\d+:", "", raw) + m = _re_pkg.match(r"(\d+)[.-](\d+)", raw) + if m: + return int(m.group(1)), int(m.group(2)) + return None From 1d387d6746b4958468b3c9a0e2be56d8c6a83624 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 00:58:13 +0000 Subject: [PATCH 51/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/utils/hardware/hardware.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index 0a81a8e9e5..be31c00a78 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -227,7 +227,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting CUDA GPU info: {e}") - return {"available": False, "backend": _backend_label(device), "error": str(e)} + return { + "available": False, + "backend": _backend_label(device), + "error": str(e), + } # ---- XPU path (Intel GPU) ---- if device == DeviceType.XPU: @@ -254,7 +258,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error("Error getting XPU GPU info: %s", e) - return {"available": False, "backend": _backend_label(device), "error": str(e)} + return { + "available": False, + "backend": _backend_label(device), + "error": str(e), + } # ---- MLX path (Apple Silicon) ---- if device == DeviceType.MLX: @@ -280,7 +288,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting MLX GPU info: {e}") - return {"available": False, "backend": _backend_label(device), "error": str(e)} + return { + "available": False, + "backend": _backend_label(device), + "error": str(e), + } # ---- CPU-only ---- return {"available": False, "backend": "cpu"} From 7effb3aee854667e1cb50d0fdc2991ca742086e9 
Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 9 Apr 2026 01:50:30 +0000 Subject: [PATCH 52/55] Fix gemini round 6 + URL audit: amd.py defensive checks, rocm6.5+ clip to 6.4 Two rounds of fixes in one commit, plus a full URL audit of every PyPI / download.pytorch.org / repo.radeon.com reference the PR introduces. amd.py (4 medium gemini findings on commit b3627bc2): 1. _extract_gpu_metrics used `and vram_total_mb` as part of the vram_util gate. The follow-up `vram_total_mb > 0` already handles the division guard, but the truthiness check was redundant and slightly surprising for a 0.0 valid value. Replace with explicit `is not None and > 0` for both vram_util and power_util. 2. get_physical_gpu_count called `data.get("gpu", ...)` without guarding for non-dict envelopes. A scalar / string JSON response from amd-smi would raise AttributeError. Add an isinstance(data, dict) check and return None for unexpected shapes. 3. get_visible_gpu_utilization had the same .get() exposure on the outer envelope. Rewrite the gpu_list extraction as an explicit list/dict/else cascade so a malformed scalar envelope produces gpu_list=[data] and continues without raising. 4. The same function's per-entry loop also called gpu_data.get() on whatever was inside gpu_list. If a scalar ever leaks into the list (directly or via the previous fix's fallback), _extract_gpu_metrics would raise on the first .get() inside the helper. Skip non-dict entries in the loop before extracting metrics. install.sh (URL audit finding, previously flagged by 20-reviewer as #13): 5. get_torch_index_url used `rocm6.*` in the rocm tag case statement, which matched rocm6.5 and rocm6.6 and emitted download.pytorch.org/whl/rocm6.5 -- which returns HTTP 403 because PyTorch only publishes rocm 5.7, 6.0-6.4, 7.0-7.2. Enumerate the supported 6.x minors explicitly and add a rocm6.* fallback branch that clips to rocm6.4 (the last supported 6.x wheel set). 
URL audit results (all URLs PR 4720 references): - 14/14 download.pytorch.org/whl/{cpu,cu118,cu124,cu126,cu128,cu130, rocm6.0..6.4,rocm7.0..7.2} return HTTP 200. - 9/9 repo.radeon.com/rocm/manylinux/rocm-rel-{5.7,6.0,6.1,6.2,6.3, 6.4,7.0,7.1,7.2}/ return HTTP 200. - X.Y.Z patch directories exist for 7.0.2, 7.1.1, 7.2.1 but NOT for 6.3.0, 6.4.0, 6.2.1 -- install.sh already handles this via the X.Y.Z -> X.Y fallback sed in the Radeon wheel install block. - Docs links (rocm.docs.amd.com, docs.unsloth.ai AMD guide) and the llama.cpp GitHub releases API endpoint all return 200. Test suite: 255 -> 258. New regression coverage: - U17: get_physical_gpu_count tolerates scalar amd-smi envelope - U18: get_visible_gpu_utilization tolerates scalar envelope - U19a-c: vram_util / power_util return None on zero total, but vram_total_gb still echoes 0.0 (not None) - A_rocm{6.5,6.6,6.9}_clips_to_rocm64: install.sh clips unsupported 6.x minors to rocm6.4 instead of producing a 403 index URL --- install.sh | 20 +++++++++---- studio/backend/utils/hardware/amd.py | 42 ++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/install.sh b/install.sh index 09b5f2df15..afe24d5a19 100755 --- a/install.sh +++ b/install.sh @@ -1068,13 +1068,23 @@ get_torch_index_url() { case "$_rocm_tag" in rocm[1-5].*) echo "$_base/cpu"; return ;; esac - # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds (<2.11.0). - # Fall back to rocm7.1 index which has torch 2.10.0. - # TODO: uncomment the next line when torch upper bound is bumped to >=2.11.0 - # echo "$_base/$_rocm_tag"; return + # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds + # (<2.11.0). Fall back to rocm7.1 index which has torch 2.10.0. 
+ # Enumerate explicit versions rather than matching rocm6.* so + # a host on ROCm 6.5 or 6.6 (no PyTorch wheels published) is + # clipped down to the last supported 6.x (rocm6.4) instead of + # constructing https://download.pytorch.org/whl/rocm6.5 which + # returns HTTP 403. PyTorch only ships: rocm5.7, 6.0, 6.1, 6.2, + # 6.3, 6.4, 7.0, 7.1, 7.2 (and 5.7 is below our minimum). + # TODO: uncomment rocm7.2 when the torch upper bound is bumped + # to >=2.11.0. case "$_rocm_tag" in - rocm6.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*) + rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*) echo "$_base/$_rocm_tag" ;; + rocm6.*) + # ROCm 6.5+ (no published PyTorch wheels): clip down + # to the last supported 6.x wheel set. + echo "$_base/rocm6.4" ;; *) # ROCm 7.2+ (including future 10.x+): cap to rocm7.1 echo "$_base/rocm7.1" ;; diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 76ce0e6eac..563c226721 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -172,12 +172,16 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: ) vram_util = ( round((vram_used_mb / vram_total_mb) * 100, 1) - if vram_used_mb is not None and vram_total_mb and vram_total_mb > 0 + if vram_used_mb is not None + and vram_total_mb is not None + and vram_total_mb > 0 else None ) power_util = ( round((power_draw / power_limit) * 100, 1) - if power_draw is not None and power_limit and power_limit > 0 + if power_draw is not None + and power_limit is not None + and power_limit > 0 else None ) @@ -212,7 +216,11 @@ def get_physical_gpu_count() -> Optional[int]: return None if isinstance(data, list): return len(data) - # Some versions return a dict with a "gpu" key + # Some versions return a dict with a "gpu" / "gpus" key. 
Guard the + # .get() access with an isinstance check so a malformed scalar / + # string response from amd-smi cannot raise AttributeError. + if not isinstance(data, dict): + return None gpus = data.get("gpu", data.get("gpus", [])) if isinstance(gpus, list): return len(gpus) @@ -301,25 +309,35 @@ def get_visible_gpu_utilization( "index_kind": "physical", } - gpu_list = ( - data if isinstance(data, list) else data.get("gpus", data.get("gpu", [data])) - ) + # Extract a device list from amd-smi's envelope. Newer versions return + # a JSON array directly, older versions return a dict with a "gpus" / + # "gpu" key wrapping the list. Guard non-dict / non-list envelopes + # (scalar / string fallbacks from malformed output) so the .get() + # access cannot raise AttributeError on an unexpected shape. + if isinstance(data, list): + gpu_list = data + elif isinstance(data, dict): + gpu_list = data.get("gpus", data.get("gpu", [data])) + else: + gpu_list = [data] visible_set = set(parent_visible_ids) ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)} devices = [] for fallback_idx, gpu_data in enumerate(gpu_list): + # Skip non-dict entries defensively: if amd-smi ever ships a + # scalar inside its "gpus" array (observed on some malformed + # output), _extract_gpu_metrics would raise AttributeError on + # the first .get() call. + if not isinstance(gpu_data, dict): + continue # Use AMD-reported GPU ID when available, fall back to enumeration # index. Newer amd-smi versions wrap scalars as ``{"value": 0, # "unit": "none"}``, so route raw_id through ``_parse_numeric`` # which already handles bare ints, floats, strings, and that # dict shape uniformly. 
- raw_id = ( - gpu_data.get( - "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx)) - ) - if isinstance(gpu_data, dict) - else fallback_idx + raw_id = gpu_data.get( + "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx)) ) parsed_id = _parse_numeric(raw_id) if parsed_id is None: From bae24218b027aaa5db6ae46225ef9fe9f9de588a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Apr 2026 01:51:11 +0000 Subject: [PATCH 53/55] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/utils/hardware/amd.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py index 563c226721..3ab90ae2c1 100644 --- a/studio/backend/utils/hardware/amd.py +++ b/studio/backend/utils/hardware/amd.py @@ -172,16 +172,12 @@ def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: ) vram_util = ( round((vram_used_mb / vram_total_mb) * 100, 1) - if vram_used_mb is not None - and vram_total_mb is not None - and vram_total_mb > 0 + if vram_used_mb is not None and vram_total_mb is not None and vram_total_mb > 0 else None ) power_util = ( round((power_draw / power_limit) * 100, 1) - if power_draw is not None - and power_limit is not None - and power_limit > 0 + if power_draw is not None and power_limit is not None and power_limit > 0 else None ) From a24b27e4bf21ce2e73005d9f28114b95243e5791 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 9 Apr 2026 02:46:21 +0000 Subject: [PATCH 54/55] Fix reviewer.py round 2: tokenizer AMD multi-GPU, --no-torch bnb, main.py backend label Three high-confidence findings from a second 20-parallel reviewer.py run on commit 7effb3ae. Triaged 15 total findings and applied the three that were confirmed as real bugs; the rest were either false positives (e.g. 
"migrated AMD venv not repaired" -- _ensure_rocm_torch runs downstream via setup.sh regardless), design decisions (e.g. visibility mask env vars not consulted in installer detection), or edge cases the existing fallback logic already handles. 1. unsloth/tokenizer_utils.py [6/20]: the multi-GPU guard's shell probe runs `nvidia-smi --query-gpu=memory.used`, catches the failure, then only raises if `torch.cuda.is_available()` is False. On ROCm torch, torch.cuda.is_available() returns True (ROCm reuses the torch.cuda.* API), so the guard becomes dead code on AMD hosts and multi-GPU AMD setups slip through even though unsloth does not support them yet. Add a torch.cuda.device_count() > 1 fallback inside the except so AMD multi-visible-device setups are flagged consistently with the original CUDA memory check. 2. install.sh [1/20]: the fresh-install bitsandbytes block for AMD ROCm ran unconditionally when TORCH_INDEX_URL matched `*/rocm*`, even when SKIP_TORCH=true (from --no-torch or Intel Mac auto-detect). A user running `install.sh --no-torch` on an AMD host would still pull in bitsandbytes despite explicitly asking for GGUF-only mode. Wrap the case block in an outer `[ "$SKIP_TORCH" = false ]` guard. 3. studio/backend/main.py [3/20]: the /api/system endpoint returned `"device_backend": get_device().value`, which is "cuda" on ROCm hosts (because ROCm torch piggybacks on torch.cuda). Other endpoints (hardware.py) already use the _backend_label helper which swaps "cuda" -> "rocm" when IS_ROCM. Route /api/system through the same helper so the Studio UI reports the backend consistently across all endpoints. 4. studio/backend/tests/test_utils.py: update test_backend_matches_device to call _backend_label(get_device()) instead of raw get_device().value so the test matches the new contract and still passes on CUDA hosts. Tests: 258 -> 261. 
New regression coverage: - X08 main.py /api/system uses _backend_label - X09 tokenizer multi-GPU guard has device_count() fallback - X10 fresh-install bnb case block gated on SKIP_TORCH=false --- install.sh | 19 ++++++++++++------- studio/backend/main.py | 6 +++++- studio/backend/tests/test_utils.py | 8 +++++++- unsloth/tokenizer_utils.py | 9 +++++++++ 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/install.sh b/install.sh index afe24d5a19..4c6f641f09 100755 --- a/install.sh +++ b/install.sh @@ -1421,13 +1421,18 @@ elif [ -n "$TORCH_INDEX_URL" ]; then run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" fi - # AMD ROCm: install bitsandbytes (once, after torch, for all ROCm paths) - case "$TORCH_INDEX_URL" in - */rocm*) - substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" - ;; - esac + # AMD ROCm: install bitsandbytes (once, after torch, for all ROCm paths). + # Gate on SKIP_TORCH=false so a user running with --no-torch on a ROCm + # host stays in GGUF-only mode rather than pulling in bitsandbytes, + # which is only useful once torch is present for training. + if [ "$SKIP_TORCH" = false ]; then + case "$TORCH_INDEX_URL" in + */rocm*) + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" + ;; + esac + fi # Fresh: Step 2 - install unsloth, preserving pre-installed torch substep "installing unsloth (this may take a few minutes)..." 
if [ "$SKIP_TORCH" = true ]; then diff --git a/studio/backend/main.py b/studio/backend/main.py index ad19ee9679..c2c0a0b6e4 100644 --- a/studio/backend/main.py +++ b/studio/backend/main.py @@ -237,6 +237,7 @@ async def get_system_info(): import platform import psutil from utils.hardware import get_device + from utils.hardware.hardware import _backend_label visibility_info = get_backend_visible_gpu_info() gpu_info = { @@ -250,7 +251,10 @@ async def get_system_info(): return { "platform": platform.platform(), "python_version": platform.python_version(), - "device_backend": get_device().value, + # Use the centralized _backend_label helper so the /api/system + # endpoint reports "rocm" on AMD hosts instead of "cuda", matching + # the /api/hardware and /api/gpu-visibility endpoints. + "device_backend": _backend_label(get_device()), "cpu_count": psutil.cpu_count(), "memory": { "total_gb": round(memory.total / 1e9, 2), diff --git a/studio/backend/tests/test_utils.py b/studio/backend/tests/test_utils.py index 50557c6718..64c9907119 100644 --- a/studio/backend/tests/test_utils.py +++ b/studio/backend/tests/test_utils.py @@ -191,8 +191,14 @@ def test_has_backend_key(self): assert "backend" in get_gpu_memory_info() def test_backend_matches_device(self): + # The backend field uses _backend_label, which swaps "cuda" for + # "rocm" when running on an AMD host (IS_ROCM=True) so the UI + # can render the correct label. On CUDA / XPU / MLX / CPU hosts + # it is equivalent to `get_device().value`. 
+ from utils.hardware.hardware import _backend_label + result = get_gpu_memory_info() - assert result["backend"] == get_device().value + assert result["backend"] == _backend_label(get_device()) # --- When a GPU IS available --- diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 07949cd32e..0d9ef896e6 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1104,6 +1104,15 @@ def patch_sft_trainer_tokenizer(): "except:\n" " if not torch.cuda.is_available():\n" " raise RuntimeError('Unsloth: No GPU detected. AMD ROCm users: install ROCm-enabled PyTorch -- see https://docs.unsloth.ai/get-started/install-and-update/amd')\n" + " # nvidia-smi unavailable but torch.cuda IS available -- we are on\n" + " # a ROCm host (ROCm reuses the torch.cuda.* API surface, so\n" + " # device_count() is authoritative) or on a CUDA host without\n" + " # the CLI installed. Use the device count directly as a\n" + " # conservative multi-GPU signal: any configuration with more\n" + " # than one visible device is flagged as unsupported, matching\n" + " # the spirit of the per-device memory check used on CUDA.\n" + " if torch.cuda.device_count() > 1:\n" + " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" "if ((a - PRE_CHECK) >= 1).sum() > 1:\n" " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" "for _ in range(3):\n" From ed97b1ff1a1ea238b22624f60029ebab32d35ece Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Thu, 9 Apr 2026 10:23:45 +0000 Subject: [PATCH 55/55] Split: keep only 1 file(s) --- install.sh | 384 +--------------------- studio/backend/core/training/worker.py | 173 ++-------- studio/backend/main.py | 6 +- studio/backend/utils/hardware/__init__.py | 10 - studio/backend/utils/hardware/amd.py | 369 --------------------- studio/backend/utils/hardware/hardware.py | 234 ++++--------- studio/install_llama_prebuilt.py | 306 
+---------------- studio/install_python_stack.py | 324 ------------------ unsloth/kernels/utils.py | 18 +- unsloth/tokenizer_utils.py | 11 +- 10 files changed, 110 insertions(+), 1725 deletions(-) delete mode 100644 studio/backend/utils/hardware/amd.py diff --git a/install.sh b/install.sh index 4c6f641f09..ea53ecc6d6 100755 --- a/install.sh +++ b/install.sh @@ -978,37 +978,6 @@ _find_no_torch_runtime() { fi } -# ── AMD ROCm GPU detection helper ── -# Returns 0 (true) if an actual AMD GPU is present, 1 (false) otherwise. -# Checks rocminfo for gfx[1-9]* (excludes gfx000 CPU agent) and -# amd-smi list for GPU data rows (excludes header-only output). -_has_amd_rocm_gpu() { - if command -v rocminfo >/dev/null 2>&1 && \ - rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then - return 0 - elif command -v amd-smi >/dev/null 2>&1 && \ - amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[][[:space:]]*[0-9]/{ found=1 } END{ exit !found }'; then - return 0 - fi - return 1 -} - -# ── NVIDIA usable-GPU helper ── -# Returns 0 (true) only if nvidia-smi is present AND actually lists a GPU. -# Prevents AMD-only hosts with a stale nvidia-smi on PATH from being routed -# into the CUDA branch. -_has_usable_nvidia_gpu() { - _nvsmi="" - if command -v nvidia-smi >/dev/null 2>&1; then - _nvsmi="nvidia-smi" - elif [ -x "/usr/bin/nvidia-smi" ]; then - _nvsmi="/usr/bin/nvidia-smi" - else - return 1 - fi - "$_nvsmi" -L 2>/dev/null | awk '/^GPU[[:space:]]+[0-9]+:/{found=1} END{exit !found}' -} - # ── Detect GPU and choose PyTorch index URL ── # Mirrors Get-TorchIndexUrl in install.ps1. # On CPU-only machines this returns the cpu index, avoiding the solver @@ -1017,82 +986,14 @@ get_torch_index_url() { _base="https://download.pytorch.org/whl" # macOS: always CPU (no CUDA support) case "$(uname -s)" in Darwin) echo "$_base/cpu"; return ;; esac - # Try nvidia-smi -- require the binary to actually list a usable GPU. 
- # Presence of the binary alone (container leftovers, stale driver - # packages) is not sufficient: otherwise an AMD-only host would - # silently install CUDA wheels. + # Try nvidia-smi _smi="" - if _has_usable_nvidia_gpu; then - if command -v nvidia-smi >/dev/null 2>&1; then - _smi="nvidia-smi" - elif [ -x "/usr/bin/nvidia-smi" ]; then - _smi="/usr/bin/nvidia-smi" - fi - fi - if [ -z "$_smi" ]; then - # No NVIDIA GPU -- check for AMD ROCm GPU. - # PyTorch only publishes ROCm wheels for linux-x86_64; skip the - # ROCm branch entirely on aarch64 / arm64 / other architectures - # so non-x86_64 Linux hosts fall back cleanly to CPU wheels. - case "$(uname -m)" in - x86_64|amd64) : ;; - *) echo "$_base/cpu"; return ;; - esac - if ! _has_amd_rocm_gpu; then - echo "$_base/cpu"; return - fi - # AMD GPU confirmed -- detect ROCm version - _rocm_tag="" - _rocm_tag=$({ command -v amd-smi >/dev/null 2>&1 && \ - amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ - 'NF>1{gsub(/[^0-9.]/, "", $2); split($2,a,"."); print "rocm"a[1]"."a[2]; ok=1; exit} END{exit !ok}'; } || \ - { [ -r /opt/rocm/.info/version ] && \ - awk -F. 
'{print "rocm"$1"."$2; exit}' /opt/rocm/.info/version; } || \ - { command -v hipconfig >/dev/null 2>&1 && \ - hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]/{split($1,a,"."); if(a[1]+0>0){print "rocm"a[1]"."a[2]; found=1}} END{exit !found}'; } || \ - { command -v dpkg-query >/dev/null 2>&1 && \ - ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \ - [ -n "$ver" ] && \ - printf '%s\n' "$ver" | sed 's/^[0-9]*://' | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; } || \ - { command -v rpm >/dev/null 2>&1 && \ - ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \ - [ -n "$ver" ] && \ - printf '%s\n' "$ver" | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; }) 2>/dev/null - # Validate _rocm_tag: must match "rocmX.Y" with major >= 1 - case "$_rocm_tag" in - rocm[1-9]*.[0-9]*) : ;; # valid (major >= 1) - *) _rocm_tag="" ;; # reject malformed (empty, garbled, or major=0) - esac - if [ -n "$_rocm_tag" ]; then - # Minimum supported: ROCm 6.0 (no PyTorch wheels exist for older) - case "$_rocm_tag" in - rocm[1-5].*) echo "$_base/cpu"; return ;; - esac - # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds - # (<2.11.0). Fall back to rocm7.1 index which has torch 2.10.0. - # Enumerate explicit versions rather than matching rocm6.* so - # a host on ROCm 6.5 or 6.6 (no PyTorch wheels published) is - # clipped down to the last supported 6.x (rocm6.4) instead of - # constructing https://download.pytorch.org/whl/rocm6.5 which - # returns HTTP 403. PyTorch only ships: rocm5.7, 6.0, 6.1, 6.2, - # 6.3, 6.4, 7.0, 7.1, 7.2 (and 5.7 is below our minimum). - # TODO: uncomment rocm7.2 when the torch upper bound is bumped - # to >=2.11.0. - case "$_rocm_tag" in - rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*) - echo "$_base/$_rocm_tag" ;; - rocm6.*) - # ROCm 6.5+ (no published PyTorch wheels): clip down - # to the last supported 6.x wheel set. 
- echo "$_base/rocm6.4" ;; - *) - # ROCm 7.2+ (including future 10.x+): cap to rocm7.1 - echo "$_base/rocm7.1" ;; - esac - return - fi - echo "$_base/cpu"; return + if command -v nvidia-smi >/dev/null 2>&1; then + _smi="nvidia-smi" + elif [ -x "/usr/bin/nvidia-smi" ]; then + _smi="/usr/bin/nvidia-smi" fi + if [ -z "$_smi" ]; then echo "$_base/cpu"; return; fi # Parse CUDA version from nvidia-smi output (POSIX-safe, no grep -P) _cuda_ver=$(LC_ALL=C $_smi 2>/dev/null \ | sed -n 's/.*CUDA Version:[[:space:]]*\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' \ @@ -1110,157 +1011,20 @@ get_torch_index_url() { elif [ "$_major" -ge 11 ]; then echo "$_base/cu118" else echo "$_base/cpu"; fi } - -get_radeon_wheel_url() { - # Only meaningful on Linux. Picks a repo.radeon.com base URL whose listing - # contains torch wheels. Tries paths like rocm-rel-7.2.1/, rocm-rel-7.2/, - # rocm-rel-7.1.1/, rocm-rel-7.1/ (AMD publishes both M.m and M.m.p dirs). - # Accepts both X.Y and X.Y.Z host versions since /opt/rocm/.info/version - # and hipconfig --version can return either shape. - case "$(uname -s)" in Linux) ;; *) echo ""; return ;; esac - - # Detect ROCm version (X.Y or X.Y.Z) -- try amd-smi, then - # /opt/rocm/.info/version, then hipconfig. 
- _full_ver="" - _full_ver=$({ command -v amd-smi >/dev/null 2>&1 && \ - amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ - 'NF>1{if(match($2,/[0-9]+\.[0-9]+(\.[0-9]+)?/)){print substr($2,RSTART,RLENGTH); ok=1; exit}} END{exit !ok}'; } || \ - { [ -r /opt/rocm/.info/version ] && \ - awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1; exit} END{exit !found}' /opt/rocm/.info/version; } || \ - { command -v hipconfig >/dev/null 2>&1 && \ - hipconfig --version 2>/dev/null | awk 'NR==1 && match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1} END{exit !found}'; }) 2>/dev/null - - # Validate: must be X.Y or X.Y.Z with X >= 1 - case "$_full_ver" in - [1-9]*.[0-9]*.[0-9]*) : ;; # X.Y.Z - [1-9]*.[0-9]*) : ;; # X.Y - *) echo ""; return ;; - esac - echo "https://repo.radeon.com/rocm/manylinux/rocm-rel-${_full_ver}/" -} - -# ── Radeon repo wheel selection helpers ────────────────────────────────────── -# Fetches the Radeon repo directory listing once into _RADEON_LISTING (global). -# _RADEON_PYTAG holds the CPython tag for the running interpreter (e.g. cp312). -# _RADEON_BASE_URL holds the base URL for relative-href resolution. -_RADEON_LISTING="" -_RADEON_PYTAG="" -_RADEON_BASE_URL="" - -_radeon_fetch_listing() { - # Usage: _radeon_fetch_listing BASE_URL - # Populates _RADEON_LISTING, _RADEON_PYTAG, _RADEON_BASE_URL. 
- _RADEON_BASE_URL="$1" - _RADEON_PYTAG=$("$_VENV_PY" -c " -import sys -print('cp{}{}'.format(sys.version_info.major, sys.version_info.minor)) -" 2>/dev/null) || return 1 - if command -v curl >/dev/null 2>&1; then - _RADEON_LISTING=$(curl -fsSL --max-time 20 "$_RADEON_BASE_URL" 2>/dev/null) - elif command -v wget >/dev/null 2>&1; then - _RADEON_LISTING=$(wget -qO- --timeout=20 "$_RADEON_BASE_URL" 2>/dev/null) - fi - [ -n "$_RADEON_LISTING" ] || return 1 -} - -_pick_radeon_wheel() { - # Usage: _pick_radeon_wheel PACKAGE_NAME - # Scans $_RADEON_LISTING for the newest wheel whose filename starts exactly - # with PACKAGE_NAME- and matches _RADEON_PYTAG + linux_x86_64. - # Prints the full URL (resolving relative hrefs against _RADEON_BASE_URL). - # - # POSIX-compliant pipeline: all href parsing, filtering, and version - # selection is done inside a single awk script rather than reaching - # for GNU extensions (grep -o, sort -V) that would break under BSD - # or BusyBox coreutils. - _pkg="$1" - [ -n "$_RADEON_LISTING" ] || return 1 - [ -n "$_RADEON_PYTAG" ] || return 1 - _tag="$_RADEON_PYTAG" - _href=$(printf '%s\n' "$_RADEON_LISTING" \ - | awk -v pkg="$_pkg" -v tag="$_tag" ' - BEGIN { max_pad = ""; max_url = "" } - { - line = $0 - while (match(line, /href="[^"]*"/)) { - # Strip the leading href=" (6 chars) and trailing " (1 char) - url = substr(line, RSTART + 6, RLENGTH - 7) - line = substr(line, RSTART + RLENGTH) - - # Extract basename, strip query / fragment - n = split(url, p, "/") - base = p[n] - sub(/[?#].*/, "", base) - - prefix = pkg "-" - # Match cpXY-cpXY or cpXY-abi3 with any linux x86_64 - # platform tag (linux_x86_64, manylinux_2_28_x86_64, - # manylinux2014_x86_64, etc.) - if (substr(base, 1, length(prefix)) == prefix && - index(base, "-" tag "-") > 0 && - match(base, /x86_64\.whl$/)) { - # Extract the version component (first - # dotted-number run) and pad each piece so a - # plain lexical comparison gives us the newest. 
- if (match(base, /[0-9]+\.[0-9]+(\.[0-9]+)?/)) { - ver = substr(base, RSTART, RLENGTH) - m = split(ver, v, ".") - pad = "" - for (i = 1; i <= m; i++) - pad = pad sprintf("%08d", v[i]) - if (pad > max_pad) { - max_pad = pad - max_url = url - } - } - } - } - } - END { if (max_url != "") print max_url }') - [ -z "$_href" ] && return 1 - case "$_href" in - http*) printf '%s\n' "$_href" ;; - *) printf '%s\n' "${_RADEON_BASE_URL%/}/${_href#/}" ;; - esac -} - TORCH_INDEX_URL=$(get_torch_index_url) -# Auto-detect GPU for AMD ROCm based -# get_torch_index_url must have chosen */rocm* -# (gfx in rocminfo or amd-smi list). Then require rocminfo "Marketing Name:.*Radeon". -_amd_gpu_radeon=false -case "$TORCH_INDEX_URL" in - */rocm*) - if _has_amd_rocm_gpu && command -v rocminfo >/dev/null 2>&1 && \ - rocminfo 2>/dev/null | grep -q 'Marketing Name:.*Radeon'; then - _amd_gpu_radeon=true - fi - ;; -esac - # ── Print CPU-only hint when no GPU detected ── case "$TORCH_INDEX_URL" in */cpu) if [ "$SKIP_TORCH" = false ] && [ "$OS" != "macos" ]; then echo "" - echo " NOTE: No GPU detected (nvidia-smi and ROCm not found)." + echo " NOTE: No NVIDIA GPU detected (nvidia-smi not found)." echo " Installing CPU-only PyTorch. If you only need GGUF chat/inference," echo " re-run with --no-torch for a faster, lighter install:" echo " curl -fsSL https://unsloth.ai/install.sh | sh -s -- --no-torch" - echo " AMD ROCm users: see https://docs.unsloth.ai/get-started/install-and-update/amd" echo "" fi ;; - */rocm*) - echo "" - if [ "$_amd_gpu_radeon" = true ]; then - echo " AMD Radeon + ROCm detected -- installing PyTorch wheels from repo.radeon.com" - else - echo " AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)" - fi - echo "" - ;; esac # ── Install unsloth directly into the venv (no activation needed) ── @@ -1290,149 +1054,15 @@ if [ "$_MIGRATED" = true ]; then substep "overlaying local repo (editable)..." 
run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps fi - # AMD ROCm: install bitsandbytes even in migrated environments so - # existing ROCm installs gain the AMD bitsandbytes build without a - # fresh reinstall. - if [ "$SKIP_TORCH" = false ]; then - case "$TORCH_INDEX_URL" in - */rocm*) - substep "installing bitsandbytes for AMD ROCm..." - run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" - ;; - esac - fi elif [ -n "$TORCH_INDEX_URL" ]; then # Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac) if [ "$SKIP_TORCH" = true ]; then substep "skipping PyTorch (--no-torch or Intel Mac x86_64)." "$C_WARN" - elif [ "$_amd_gpu_radeon" = true ]; then - _radeon_url=$(get_radeon_wheel_url) - if [ -n "$_radeon_url" ]; then - _radeon_listing_ok=false - if _radeon_fetch_listing "$_radeon_url" 2>/dev/null; then - _radeon_listing_ok=true - else - # Try shorter X.Y path (AMD publishes both X.Y.Z and X.Y dirs) - _radeon_url_short=$(printf '%s\n' "$_radeon_url" \ - | sed 's|rocm-rel-\([0-9]*\)\.\([0-9]*\)\.[0-9]*/|rocm-rel-\1.\2/|') - if [ "$_radeon_url_short" != "$_radeon_url" ] && \ - _radeon_fetch_listing "$_radeon_url_short" 2>/dev/null; then - _radeon_listing_ok=true - fi - fi - - if [ "$_radeon_listing_ok" = true ]; then - # Require torch, torchvision, torchaudio wheels to all resolve - # from the Radeon listing. If any is missing for this Python - # tag, fall through to the standard ROCm index instead of - # silently mixing Radeon wheels with PyPI defaults. 
- _torch_whl=$(_pick_radeon_wheel "torch" 2>/dev/null) || _torch_whl="" - _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) || _tv_whl="" - _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) || _ta_whl="" - _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) || _tri_whl="" - # Sanity-check torch / torchvision / torchaudio are a - # matching release. The Radeon repo publishes multiple - # generations simultaneously, so picking the highest-version - # wheel for each package independently can assemble a - # mismatched trio (e.g. torch 2.9.1 + torchvision 0.23.0 + - # torchaudio 2.9.0 from the current rocm-rel-7.2.1 index). - # Check that torch and torchaudio share the same X.Y public - # version prefix, and that torchvision's minor correctly - # pairs with torch's minor (torchvision = torch.minor - 5 - # since torch 2.4 -> torchvision 0.19 -> torch 2.9 -> - # torchvision 0.24). - # URL-decode each wheel name so %2B -> + before version - # extraction. Real Radeon wheel hrefs are percent-encoded - # (torch-2.10.0%2Brocm7.2.0...), so a plain [+-] terminator - # in the sed regex below would never match and - # _radeon_versions_match would stay false for every real - # listing, silently forcing a fallback to the generic - # ROCm index. 
- _torch_ver="" - _tv_ver="" - _ta_ver="" - if [ -n "$_torch_whl" ]; then - _torch_name=$(printf '%s' "${_torch_whl##*/}" | sed 's/%2[Bb]/+/g') - _torch_ver=$(printf '%s\n' "$_torch_name" | sed -n 's|^torch-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') - fi - if [ -n "$_tv_whl" ]; then - _tv_name=$(printf '%s' "${_tv_whl##*/}" | sed 's/%2[Bb]/+/g') - _tv_ver=$(printf '%s\n' "$_tv_name" | sed -n 's|^torchvision-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') - fi - if [ -n "$_ta_whl" ]; then - _ta_name=$(printf '%s' "${_ta_whl##*/}" | sed 's/%2[Bb]/+/g') - _ta_ver=$(printf '%s\n' "$_ta_name" | sed -n 's|^torchaudio-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') - fi - _radeon_versions_match=false - if [ -n "$_torch_ver" ] && [ -n "$_tv_ver" ] && [ -n "$_ta_ver" ]; then - _torch_major=${_torch_ver%%.*} - _torch_minor=${_torch_ver#*.} - _ta_major=${_ta_ver%%.*} - _ta_minor=${_ta_ver#*.} - _tv_major=${_tv_ver%%.*} - _tv_minor=${_tv_ver#*.} - # torchvision expected minor (e.g. torch 2.9 -> 0.24) - _expected_tv_minor=$((_torch_minor + 15)) - if [ "$_torch_major" = "$_ta_major" ] && \ - [ "$_torch_minor" = "$_ta_minor" ] && \ - [ "$_tv_major" = "0" ] && \ - [ "$_tv_minor" = "$_expected_tv_minor" ]; then - _radeon_versions_match=true - fi - fi - if [ -z "$_torch_whl" ] || [ -z "$_tv_whl" ] || [ -z "$_ta_whl" ] || \ - [ "$_radeon_versions_match" != true ]; then - substep "[WARN] Radeon repo lacks a compatible wheel set for this Python; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" - run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ - "$TORCH_CONSTRAINT" torchvision torchaudio \ - --index-url "$TORCH_INDEX_URL" - else - substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." - # Pass explicit wheel URLs so the matched trio is - # installed together. 
--find-links lets uv discover - # the Radeon listing for any local lookup, and PyPI - # (not disabled) provides transitive deps like - # filelock / sympy / networkx which are not in the - # Radeon listing. - if [ -n "$_tri_whl" ]; then - run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ - --find-links "$_RADEON_BASE_URL" \ - "$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl" - else - run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ - --find-links "$_RADEON_BASE_URL" \ - "$_torch_whl" "$_tv_whl" "$_ta_whl" - fi - fi - else - substep "[WARN] Radeon repo unavailable; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" - run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ - "$TORCH_CONSTRAINT" torchvision torchaudio \ - --index-url "$TORCH_INDEX_URL" - fi - else - substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to ROCm index" "$C_WARN" - run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ - "$TORCH_CONSTRAINT" torchvision torchaudio \ - --index-url "$TORCH_INDEX_URL" - fi else substep "installing PyTorch ($TORCH_INDEX_URL)..." run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" fi - # AMD ROCm: install bitsandbytes (once, after torch, for all ROCm paths). - # Gate on SKIP_TORCH=false so a user running with --no-torch on a ROCm - # host stays in GGUF-only mode rather than pulling in bitsandbytes, - # which is only useful once torch is present for training. - if [ "$SKIP_TORCH" = false ]; then - case "$TORCH_INDEX_URL" in - */rocm*) - substep "installing bitsandbytes for AMD ROCm..." 
- run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" - ;; - esac - fi # Fresh: Step 2 - install unsloth, preserving pre-installed torch substep "installing unsloth (this may take a few minutes)..." if [ "$SKIP_TORCH" = true ]; then diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index ebf30c14ed..a461972eca 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -86,7 +86,6 @@ def _probe_causal_conv1d_env() -> dict[str, str] | None: "'python_tag': f'cp{sys.version_info.major}{sys.version_info.minor}', " "'torch_mm': torch_mm, " "'cuda_major': str(int(str(torch.version.cuda).split('.', 1)[0])) if torch.version.cuda else '', " - "'hip_version': str(torch.version.hip) if getattr(torch.version, 'hip', None) else '', " "'cxx11abi': str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()" "}))" ), @@ -238,111 +237,28 @@ def _install_package_wheel_first( else: logger.info("No published %s wheel found: %s", display_name, wheel_url) - is_hip = env and env.get("hip_version") - if is_hip and not shutil.which("hipcc"): - logger.error( - "%s requires hipcc for source compilation on ROCm. 
" - "Install the ROCm HIP SDK: https://rocm.docs.amd.com", - display_name, - ) - _send_status( - event_queue, - f"{display_name}: hipcc not found (ROCm HIP SDK required)", - ) - return - - if is_hip: - _send_status( - event_queue, - f"Compiling {display_name} from source for ROCm " - "(this may take several minutes)...", - ) - else: - _send_status(event_queue, f"Installing {display_name} from PyPI...") - - # Prefer uv for faster dependency resolution when available - if shutil.which("uv"): - pypi_cmd = [ - "uv", - "pip", - "install", - "--python", - sys.executable, - "--no-build-isolation", - "--no-deps", - ] - # Avoid stale cache artifacts from partial HIP source builds - if is_hip: - pypi_cmd.append("--no-cache") - pypi_cmd.append(f"{pypi_name}=={pypi_version}") - else: - pypi_cmd = [ - sys.executable, - "-m", - "pip", - "install", - "--no-build-isolation", - "--no-deps", - "--no-cache-dir", - f"{pypi_name}=={pypi_version}", - ] - - # Source compilation on ROCm can take 10-30 minutes; use a generous - # timeout. Non-HIP installs preserve the pre-existing "no timeout" - # behaviour so unrelated slow installs (e.g. causal-conv1d source - # build on Linux aarch64 or unsupported torch/CUDA combinations) - # are not aborted at 5 minutes by this PR. 
- _run_kwargs: dict[str, Any] = { - "stdout": _sp.PIPE, - "stderr": _sp.STDOUT, - "text": True, - } - if is_hip: - _run_kwargs["timeout"] = 1800 - - try: - result = _sp.run(pypi_cmd, **_run_kwargs) - except _sp.TimeoutExpired: - logger.error( - "%s installation timed out after %ds", - display_name, - _run_kwargs.get("timeout"), - ) - _send_status( - event_queue, - f"{display_name} installation timed out after " - f"{_run_kwargs.get('timeout')}s", - ) - return - + _send_status(event_queue, f"Installing {display_name} from PyPI...") + pypi_cmd = [ + sys.executable, + "-m", + "pip", + "install", + "--no-build-isolation", + "--no-deps", + "--no-cache-dir", + f"{pypi_name}=={pypi_version}", + ] + result = _sp.run( + pypi_cmd, + stdout = _sp.PIPE, + stderr = _sp.STDOUT, + text = True, + ) if result.returncode != 0: - if is_hip: - # Surface a clear error for ROCm source build failures - error_lines = (result.stdout or "").strip().splitlines() - snippet = "\n".join(error_lines[-5:]) if error_lines else "(no output)" - logger.error( - "Failed to compile %s for ROCm:\n%s", - display_name, - result.stdout, - ) - _send_status( - event_queue, - f"Failed to compile {display_name} for ROCm. " - "Check that hipcc and ROCm development headers are installed.\n" - f"{snippet}", - ) - else: - logger.error( - "Failed to install %s from PyPI:\n%s", - display_name, - result.stdout, - ) + logger.error("Failed to install %s from PyPI:\n%s", display_name, result.stdout) return - if is_hip: - logger.info("Compiled and installed %s from source for ROCm", display_name) - else: - logger.info("Installed %s from PyPI", display_name) + logger.info("Installed %s from PyPI", display_name) def _ensure_causal_conv1d_fast_path(event_queue: Any, model_name: str) -> None: @@ -390,37 +306,15 @@ def _ensure_mamba_ssm(event_queue: Any, model_name: str) -> None: def _activate_transformers_version(model_name: str) -> None: - """Activate the correct transformers version BEFORE any ML imports. 
- - If the model needs transformers 5.x, prepend the pre-installed .venv_t5/ - directory to sys.path. Otherwise do nothing (default 4.57.x in .venv/). - """ + """Activate the correct transformers version BEFORE any ML imports.""" # Ensure backend is on path for utils imports backend_path = str(Path(__file__).resolve().parent.parent.parent) if backend_path not in sys.path: sys.path.insert(0, backend_path) - from utils.transformers_version import ( - needs_transformers_5, - _resolve_base_model, - _ensure_venv_t5_exists, - _VENV_T5_DIR, - ) + from utils.transformers_version import activate_transformers_for_subprocess - resolved = _resolve_base_model(model_name) - if needs_transformers_5(resolved): - if not _ensure_venv_t5_exists(): - raise RuntimeError( - f"Cannot activate transformers 5.x: .venv_t5 missing at {_VENV_T5_DIR}" - ) - if _VENV_T5_DIR not in sys.path: - sys.path.insert(0, _VENV_T5_DIR) - logger.info("Activated transformers 5.x from %s", _VENV_T5_DIR) - # Propagate to child subprocesses (e.g. GGUF converter) - _pp = os.environ.get("PYTHONPATH", "") - os.environ["PYTHONPATH"] = _VENV_T5_DIR + (os.pathsep + _pp if _pp else "") - else: - logger.info("Using default transformers (4.57.x) for %s", model_name) + activate_transformers_for_subprocess(model_name) def run_training_process( @@ -470,25 +364,22 @@ def run_training_process( ) return - # ── 1a. Auto-enable trust_remote_code for unsloth/* transformers 5.x models ── - # Some newer architectures (e.g. NemotronH) have config parsing bugs in - # transformers that require trust_remote_code=True as a workaround. - # Only auto-enable for unsloth/* prefixed models (trusted source). - # Exclude Gemma 4 since it is a native transformers 5.5 model and - # trust_remote_code=True would bypass the compiler (disabling fused CE). - from utils.transformers_version import needs_transformers_5 - + # ── 1a. 
Auto-enable trust_remote_code for NemotronH/Nano models ── + # NemotronH has config parsing bugs in transformers that require + # trust_remote_code=True as a workaround. Other transformers 5.x models + # (Qwen3.5, Gemma 4, etc.) are native and do NOT need it — enabling it + # bypasses the compiler (disabling fused CE). + # NOTE: Must NOT match Llama-Nemotron (standard Llama architecture). + _NEMOTRON_TRUST_SUBSTRINGS = ("nemotron_h", "nemotron-h", "nemotron-3-nano") _lowered = model_name.lower() - _is_native_t5 = any(x in _lowered for x in ("gemma-4", "gemma4")) if ( - needs_transformers_5(model_name) - and _lowered.startswith("unsloth/") - and not _is_native_t5 + any(sub in _lowered for sub in _NEMOTRON_TRUST_SUBSTRINGS) + and (_lowered.startswith("unsloth/") or _lowered.startswith("nvidia/")) and not config.get("trust_remote_code", False) ): config["trust_remote_code"] = True logger.info( - "Auto-enabled trust_remote_code for unsloth/* transformers 5.x model: %s", + "Auto-enabled trust_remote_code for Nemotron model: %s", model_name, ) diff --git a/studio/backend/main.py b/studio/backend/main.py index c2c0a0b6e4..ad19ee9679 100644 --- a/studio/backend/main.py +++ b/studio/backend/main.py @@ -237,7 +237,6 @@ async def get_system_info(): import platform import psutil from utils.hardware import get_device - from utils.hardware.hardware import _backend_label visibility_info = get_backend_visible_gpu_info() gpu_info = { @@ -251,10 +250,7 @@ async def get_system_info(): return { "platform": platform.platform(), "python_version": platform.python_version(), - # Use the centralized _backend_label helper so the /api/system - # endpoint reports "rocm" on AMD hosts instead of "cuda", matching - # the /api/hardware and /api/gpu-visibility endpoints. 
- "device_backend": _backend_label(get_device()), + "device_backend": get_device().value, "cpu_count": psutil.cpu_count(), "memory": { "total_gb": round(memory.total / 1e9, 2), diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index 400b5dd066..aaa0452406 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -5,7 +5,6 @@ Hardware detection and GPU utilities """ -from . import hardware as _hardware from .hardware import ( DeviceType, DEVICE, @@ -50,7 +49,6 @@ "DeviceType", "DEVICE", "CHAT_ONLY", - "IS_ROCM", "detect_hardware", "get_device", "is_apple_silicon", @@ -83,11 +81,3 @@ "extract_arch_config", "estimate_training_vram", ] - - -def __getattr__(name: str): - """Resolve IS_ROCM at access time so callers always see the live value - after detect_hardware() runs (it flips the flag in hardware.py).""" - if name == "IS_ROCM": - return getattr(_hardware, "IS_ROCM") - raise AttributeError(name) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py deleted file mode 100644 index 3ab90ae2c1..0000000000 --- a/studio/backend/utils/hardware/amd.py +++ /dev/null @@ -1,369 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-only -# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 - -"""AMD GPU monitoring via amd-smi. - -Mirrors the nvidia.py module structure so hardware.py can swap backends -based on IS_ROCM. All functions return the same dict shapes as their -nvidia.py counterparts. 
-""" - -import json -import math -import os -import re -import subprocess -from typing import Any, Optional - -from loggers import get_logger - -logger = get_logger(__name__) - - -def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[Any]: - """Run amd-smi with the given arguments and return parsed JSON, or None.""" - try: - result = subprocess.run( - ["amd-smi", *args, "--json"], - capture_output = True, - text = True, - timeout = timeout, - ) - except (OSError, subprocess.TimeoutExpired) as e: - logger.warning("amd-smi query failed: %s", e) - return None - if result.returncode != 0 or not result.stdout.strip(): - logger.warning("amd-smi returned code %d", result.returncode) - return None - try: - return json.loads(result.stdout) - except json.JSONDecodeError: - logger.warning("Failed to parse amd-smi JSON output") - return None - - -def _parse_numeric(value: Any) -> Optional[float]: - """Extract a numeric value from amd-smi output (may be str, int, float, or dict).""" - if value is None: - return None - # Newer amd-smi versions emit {"value": 10, "unit": "W"} - if isinstance(value, dict): - return _parse_numeric(value.get("value")) - if isinstance(value, (int, float)): - f = float(value) - return f if math.isfinite(f) else None - if isinstance(value, str): - # Strip units like "W", "C", "%", "MB", "MiB", "GB", "GiB" etc. - cleaned = re.sub(r"\s*[A-Za-z/%]+$", "", value.strip()) - if not cleaned or cleaned.lower() in ("n/a", "none", "unknown"): - return None - try: - return float(cleaned) - except (ValueError, TypeError): - return None - return None - - -def _parse_memory_mb(value: Any) -> Optional[float]: - """Parse a memory value from amd-smi output and return MB. - - Handles bare numbers (assumed MB -- the amd-smi convention on every - version we have seen), dict-shaped values with explicit units - (``{"value": 192, "unit": "GiB"}`` on newer releases), and plain - strings like ``"8192 MiB"``. 
- """ - unit = "" - raw_value = value - - if isinstance(value, dict): - unit = str(value.get("unit", "")).strip().lower() - raw_value = value.get("value") - elif isinstance(value, str): - # Extract unit suffix from strings like "192 GiB" or "8192 MB" - m = re.match(r"^\s*([\d.]+)\s*([A-Za-z]+)\s*$", value.strip()) - if m: - unit = m.group(2).lower() - - num = _parse_numeric(raw_value if isinstance(value, dict) else value) - if num is None: - return None - - # Unit conversion -- GPU tools (including amd-smi) use binary units even - # when labeling them "GB" or "MB", so treat GB/GiB and MB/MiB the same. - if "gib" in unit or "gb" in unit: - return num * 1024 - if "mib" in unit or "mb" in unit: - return num - if "kib" in unit or "kb" in unit: - return num / 1024 - if unit in ("b", "byte", "bytes"): - # Plain bytes - return num / (1024 * 1024) - - # No explicit unit -- default to MB, which is the amd-smi convention - # for bare numeric values. A previous heuristic assumed values above - # ~10M were bytes, but that misclassifies small VRAM allocations - # (e.g. 5 MB = 5,242,880 reported without a unit) as ~5 TB. Modern - # amd-smi always ships explicit units, so the heuristic branch only - # fired for legacy output where MB was already the convention. 
- return num - - -def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]: - """Extract standardized metrics from a single GPU's amd-smi data.""" - # amd-smi metric output structure varies by version; try common paths - usage = gpu_data.get("usage", gpu_data.get("gpu_activity", {})) - if isinstance(usage, dict): - gpu_util = _parse_numeric( - usage.get("gfx_activity", usage.get("gpu_use_percent")) - ) - else: - gpu_util = _parse_numeric(usage) - - # Temperature - temp_data = gpu_data.get("temperature", {}) - if isinstance(temp_data, dict): - temp = _parse_numeric( - temp_data.get( - "edge", - temp_data.get( - "temperature_edge", - temp_data.get("hotspot", temp_data.get("temperature_hotspot")), - ), - ) - ) - else: - temp = _parse_numeric(temp_data) - - # Power - power_data = gpu_data.get("power", {}) - if isinstance(power_data, dict): - power_draw = _parse_numeric( - power_data.get( - "current_socket_power", - power_data.get("average_socket_power", power_data.get("socket_power")), - ) - ) - power_limit = _parse_numeric( - power_data.get("power_cap", power_data.get("max_power_limit")) - ) - else: - power_draw = None - power_limit = None - - # VRAM -- unit-aware parsing to handle varying amd-smi output formats. - # Newer amd-smi versions may return {"value": 192, "unit": "GiB"}. 
- vram_data = gpu_data.get("vram", gpu_data.get("fb_memory_usage", {})) - if isinstance(vram_data, dict): - vram_used_mb = _parse_memory_mb( - vram_data.get("vram_used", vram_data.get("used")) - ) - vram_total_mb = _parse_memory_mb( - vram_data.get("vram_total", vram_data.get("total")) - ) - else: - vram_used_mb = None - vram_total_mb = None - - # Build the standardized dict (same shape as nvidia._build_gpu_metrics) - vram_used_gb = round(vram_used_mb / 1024, 2) if vram_used_mb is not None else None - vram_total_gb = ( - round(vram_total_mb / 1024, 2) if vram_total_mb is not None else None - ) - vram_util = ( - round((vram_used_mb / vram_total_mb) * 100, 1) - if vram_used_mb is not None and vram_total_mb is not None and vram_total_mb > 0 - else None - ) - power_util = ( - round((power_draw / power_limit) * 100, 1) - if power_draw is not None and power_limit is not None and power_limit > 0 - else None - ) - - return { - "gpu_utilization_pct": gpu_util, - "temperature_c": temp, - "vram_used_gb": vram_used_gb, - "vram_total_gb": vram_total_gb, - "vram_utilization_pct": vram_util, - "power_draw_w": power_draw, - "power_limit_w": power_limit, - "power_utilization_pct": power_util, - } - - -def _has_real_metrics(metrics: dict[str, Any]) -> bool: - """Return True when ``metrics`` contains at least one non-None value. - - ``amd-smi`` can return a zero-exit JSON envelope that is missing every - expected field (error response, unsupported card, hipless container). - In that case ``_extract_gpu_metrics`` produces a dict where every value - is ``None`` -- callers must surface this as ``available: False`` rather - than ``available: True`` with empty data. 
- """ - return any(value is not None for value in metrics.values()) - - -def get_physical_gpu_count() -> Optional[int]: - """Return physical AMD GPU count via amd-smi, or None on failure.""" - data = _run_amd_smi("list") - if data is None: - return None - if isinstance(data, list): - return len(data) - # Some versions return a dict with a "gpu" / "gpus" key. Guard the - # .get() access with an isinstance check so a malformed scalar / - # string response from amd-smi cannot raise AttributeError. - if not isinstance(data, dict): - return None - gpus = data.get("gpu", data.get("gpus", [])) - if isinstance(gpus, list): - return len(gpus) - return None - - -def _first_visible_amd_gpu_id() -> Optional[str]: - """Return the physical AMD GPU id that should be treated as 'primary'. - - Honours HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES - in that order (HIP respects all three). Returns ``"0"`` when none are - set, and ``None`` when the env var explicitly narrows to zero GPUs - ("" or "-1"), so callers can short-circuit to "available: False". - """ - for env_name in ( - "HIP_VISIBLE_DEVICES", - "ROCR_VISIBLE_DEVICES", - "CUDA_VISIBLE_DEVICES", - ): - raw = os.environ.get(env_name) - if raw is None: - continue - raw = raw.strip() - if raw == "" or raw == "-1": - return None - # Filter out empty tokens after splitting. This tolerates minor - # typos like ``HIP_VISIBLE_DEVICES=",1"`` (leading comma, user - # clearly meant to narrow to device 1) while still falling - # through to the next env var when every token is empty - # (e.g. ``,,,``). 
- tokens = [t.strip() for t in raw.split(",") if t.strip()] - if tokens: - return tokens[0] - return "0" - - -def get_primary_gpu_utilization() -> dict[str, Any]: - """Return utilization metrics for the primary visible AMD GPU.""" - gpu_idx = _first_visible_amd_gpu_id() - if gpu_idx is None: - return {"available": False} - data = _run_amd_smi("metric", "-g", gpu_idx) - if data is None: - return {"available": False} - - # amd-smi may return a list with one entry or a dict - if isinstance(data, list): - if len(data) == 0: - return {"available": False} - gpu_data = data[0] - else: - gpu_data = data - - metrics = _extract_gpu_metrics(gpu_data) - if not _has_real_metrics(metrics): - # amd-smi returned a JSON envelope with no usable fields (error - # response or unsupported card). Surface as unavailable rather - # than available-with-empty-data so the UI does not render a - # ghost device. - return {"available": False} - metrics["available"] = True - return metrics - - -def get_visible_gpu_utilization( - parent_visible_ids: Optional[list[int]], - parent_cuda_visible_devices: Optional[str] = None, -) -> dict[str, Any]: - """Return utilization metrics for visible AMD GPUs.""" - if parent_visible_ids is None: - return { - "available": False, - "backend_cuda_visible_devices": parent_cuda_visible_devices, - "parent_visible_gpu_ids": [], - "devices": [], - "index_kind": "unresolved", - } - - data = _run_amd_smi("metric") - if data is None: - return { - "available": False, - "backend_cuda_visible_devices": parent_cuda_visible_devices, - "parent_visible_gpu_ids": parent_visible_ids or [], - "devices": [], - "index_kind": "physical", - } - - # Extract a device list from amd-smi's envelope. Newer versions return - # a JSON array directly, older versions return a dict with a "gpus" / - # "gpu" key wrapping the list. Guard non-dict / non-list envelopes - # (scalar / string fallbacks from malformed output) so the .get() - # access cannot raise AttributeError on an unexpected shape. 
- if isinstance(data, list): - gpu_list = data - elif isinstance(data, dict): - gpu_list = data.get("gpus", data.get("gpu", [data])) - else: - gpu_list = [data] - visible_set = set(parent_visible_ids) - ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)} - - devices = [] - for fallback_idx, gpu_data in enumerate(gpu_list): - # Skip non-dict entries defensively: if amd-smi ever ships a - # scalar inside its "gpus" array (observed on some malformed - # output), _extract_gpu_metrics would raise AttributeError on - # the first .get() call. - if not isinstance(gpu_data, dict): - continue - # Use AMD-reported GPU ID when available, fall back to enumeration - # index. Newer amd-smi versions wrap scalars as ``{"value": 0, - # "unit": "none"}``, so route raw_id through ``_parse_numeric`` - # which already handles bare ints, floats, strings, and that - # dict shape uniformly. - raw_id = gpu_data.get( - "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx)) - ) - parsed_id = _parse_numeric(raw_id) - if parsed_id is None: - logger.debug( - "amd-smi GPU id %r could not be parsed; falling back to " - "enumeration index %d", - raw_id, - fallback_idx, - ) - idx = fallback_idx - else: - idx = int(parsed_id) - if idx not in visible_set: - continue - metrics = _extract_gpu_metrics(gpu_data) - if not _has_real_metrics(metrics): - # Skip ghost entries: an amd-smi response that decodes to a - # dict but contains no usable fields (error envelope, etc.) - # would otherwise show up as a device row with all-None - # numbers in the UI. 
- continue - metrics["index"] = idx - metrics["index_kind"] = "physical" - metrics["visible_ordinal"] = ordinal_map.get(idx, len(devices)) - devices.append(metrics) - - return { - "available": len(devices) > 0, - "backend_cuda_visible_devices": parent_cuda_visible_devices, - "parent_visible_gpu_ids": parent_visible_ids or [], - "devices": devices, - "index_kind": "physical", - } diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py index be31c00a78..b6d3faf6d7 100644 --- a/studio/backend/utils/hardware/hardware.py +++ b/studio/backend/utils/hardware/hardware.py @@ -43,26 +43,6 @@ class DeviceType(str, Enum): DEVICE: Optional[DeviceType] = None CHAT_ONLY: bool = True # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.) -IS_ROCM: bool = ( - False # True when running on AMD ROCm (HIP) -- routes GPU monitoring to amd.py -) - - -def _backend_label(device: DeviceType) -> str: - """Return the user-facing backend name for API responses. - - Internally we still represent ROCm hosts as ``DeviceType.CUDA`` because - ROCm torch sets ``torch.cuda.is_available() = True`` and reuses the whole - ``torch.cuda.*`` API surface, so branching on ``DeviceType`` stays - consistent with the rest of the codebase. For the JSON responses served - to the Studio frontend and other clients, however, "cuda" is misleading - on an AMD machine. This helper swaps the label to ``"rocm"`` when the - module-level ``IS_ROCM`` flag is set so the UI can render the correct - backend name without every caller having to duplicate the check. - """ - if IS_ROCM and device == DeviceType.CUDA: - return "rocm" - return device.value # ========== Detection ========== @@ -105,11 +85,10 @@ def detect_hardware() -> DeviceType: 2. MLX (Apple Silicon via MLX framework) 3. 
CPU (fallback) """ - global DEVICE, CHAT_ONLY, IS_ROCM - CHAT_ONLY = True # reset -- only CUDA/ROCm sets it to False - IS_ROCM = False + global DEVICE, CHAT_ONLY + CHAT_ONLY = True # reset -- only CUDA sets it to False - # --- CUDA / ROCm: try PyTorch --- + # --- CUDA: try PyTorch --- if _has_torch(): import torch @@ -117,16 +96,7 @@ def detect_hardware() -> DeviceType: DEVICE = DeviceType.CUDA CHAT_ONLY = False device_name = torch.cuda.get_device_properties(0).name - - # Distinguish AMD ROCm (HIP) from NVIDIA CUDA for display purposes. - # DeviceType stays CUDA since torch.cuda.* works on ROCm via HIP. - if getattr(torch.version, "hip", None) is not None: - IS_ROCM = True - print( - f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}" - ) - else: - print(f"Hardware detected: CUDA -- {device_name}") + print(f"Hardware detected: CUDA — {device_name}") return DEVICE # --- XPU: Intel GPU --- @@ -216,7 +186,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "device": idx, "device_name": props.name, "total_gb": total / (1024**3), @@ -227,11 +197,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting CUDA GPU info: {e}") - return { - "available": False, - "backend": _backend_label(device), - "error": str(e), - } + return {"available": False, "backend": device.value, "error": str(e)} # ---- XPU path (Intel GPU) ---- if device == DeviceType.XPU: @@ -247,7 +213,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "device": idx, "device_name": props.name, "total_gb": total / (1024**3), @@ -258,11 +224,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error("Error getting XPU GPU info: %s", e) - return { - "available": False, - "backend": _backend_label(device), - "error": str(e), - } + return 
{"available": False, "backend": device.value, "error": str(e)} # ---- MLX path (Apple Silicon) ---- if device == DeviceType.MLX: @@ -277,7 +239,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "device": 0, "device_name": f"Apple Silicon ({platform.processor() or platform.machine()})", "total_gb": total / (1024**3), @@ -288,11 +250,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting MLX GPU info: {e}") - return { - "available": False, - "backend": _backend_label(device), - "error": str(e), - } + return {"available": False, "backend": device.value, "error": str(e)} # ---- CPU-only ---- return {"available": False, "backend": "cpu"} @@ -357,15 +315,13 @@ def get_package_versions() -> Dict[str, Optional[str]]: except PackageNotFoundError: versions[name] = None - # GPU runtime version bundled with torch + # CUDA toolkit version bundled with torch try: import torch versions["cuda"] = getattr(torch.version, "cuda", None) - versions["rocm"] = getattr(torch.version, "hip", None) except Exception: versions["cuda"] = None - versions["rocm"] = None return versions @@ -431,50 +387,26 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] # ========== Live GPU Utilization ========== -def _smi_query(func_name: str, *args, **kwargs) -> Optional[Dict[str, Any]]: - """Run a query against the appropriate SMI backend (amd-smi or nvidia-smi). - - Returns the result dict if available, or None on failure/unavailability. - """ - if IS_ROCM: - backend_name = "amd-smi" - try: - from . import amd as _backend - except Exception as e: - logger.warning("%s import failed: %s", backend_name, e) - return None - else: - backend_name = "nvidia-smi" - try: - from . 
import nvidia as _backend - except Exception as e: - logger.warning("%s import failed: %s", backend_name, e) - return None - try: - func = getattr(_backend, func_name) - result = func(*args, **kwargs) - if result.get("available"): - return result - except Exception as e: - logger.warning("%s %s query failed: %s", backend_name, func_name, e) - return None - - def get_gpu_utilization() -> Dict[str, Any]: """Return a live snapshot of device utilization information.""" device = get_device() if device == DeviceType.CUDA: - result = _smi_query("get_primary_gpu_utilization") - if result is not None: - result["backend"] = _backend_label(device) - return result + try: + from . import nvidia + + result = nvidia.get_primary_gpu_utilization() + if result.get("available"): + result["backend"] = device.value + return result + except Exception as e: + logger.warning("nvidia-smi utilization query failed: %s", e) mem = get_gpu_memory_info() if device != DeviceType.CPU and mem.get("available"): return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "gpu_utilization_pct": None, "temperature_c": None, "vram_used_gb": round(mem.get("allocated_gb", 0), 2), @@ -485,7 +417,7 @@ def get_gpu_utilization() -> Dict[str, Any]: "power_utilization_pct": None, } - return {"available": False, "backend": _backend_label(device)} + return {"available": False, "backend": device.value} def get_visible_gpu_utilization() -> Dict[str, Any]: @@ -493,14 +425,18 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: if device == DeviceType.CUDA: parent_visible_spec = _get_parent_visible_gpu_spec() - result = _smi_query( - "get_visible_gpu_utilization", - parent_visible_spec["numeric_ids"], - parent_cuda_visible_devices = parent_visible_spec["raw"], - ) - if result is not None: - result["backend"] = _backend_label(device) - return result + try: + from . 
import nvidia + + result = nvidia.get_visible_gpu_utilization( + parent_visible_spec["numeric_ids"], + parent_cuda_visible_devices = parent_visible_spec["raw"], + ) + if result.get("available"): + result["backend"] = device.value + return result + except Exception as e: + logger.warning("nvidia-smi visible GPU utilization query failed: %s", e) # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) if device in (DeviceType.CUDA, DeviceType.XPU): @@ -539,7 +475,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: ) return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "parent_visible_gpu_ids": parent_ids, "devices": devices, "index_kind": index_kind, @@ -550,14 +486,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: if not mem.get("available"): return { "available": False, - "backend": _backend_label(device), + "backend": device.value, "parent_visible_gpu_ids": [], "devices": [], "index_kind": "relative", } return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "parent_visible_gpu_ids": [0], "devices": [ { @@ -579,7 +515,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: return { "available": False, - "backend": _backend_label(device), + "backend": device.value, "parent_visible_gpu_ids": [], "devices": [], "index_kind": "relative", @@ -593,21 +529,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: def _get_parent_visible_gpu_spec() -> Dict[str, Any]: - # ROCm uses HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES in addition to - # CUDA_VISIBLE_DEVICES (which HIP also respects). Check ROCm-specific - # env vars first so multi-GPU AMD setups are handled correctly. - # Use explicit None checks (not `or`) so empty string "" is honoured - # as "no visible GPUs" rather than falling through to CUDA_VISIBLE_DEVICES. 
- cuda_visible = None - if IS_ROCM: - hip_vis = os.environ.get("HIP_VISIBLE_DEVICES") - rocr_vis = os.environ.get("ROCR_VISIBLE_DEVICES") - if hip_vis is not None: - cuda_visible = hip_vis - elif rocr_vis is not None: - cuda_visible = rocr_vis - if cuda_visible is None: - cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") + cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") if cuda_visible is None: return { @@ -1187,17 +1109,15 @@ def get_physical_gpu_count() -> int: if device == DeviceType.CUDA: try: - if IS_ROCM: - from . import amd as _smi_mod - else: - from . import nvidia as _smi_mod - count = _smi_mod.get_physical_gpu_count() + from . import nvidia + + count = nvidia.get_physical_gpu_count() if count is not None: _physical_gpu_count = count return _physical_gpu_count except Exception: pass - # SMI tool unavailable or failed -- fall back to torch + # nvidia-smi unavailable or failed — fall back to torch count = _torch_get_physical_gpu_count() _physical_gpu_count = count if count is not None else 1 return _physical_gpu_count @@ -1216,25 +1136,12 @@ def get_physical_gpu_count() -> int: return _physical_gpu_count -def _backend_visible_devices_env() -> Optional[str]: - """Return the raw visibility env string that applies to this backend. - - On ROCm, HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES take precedence - over CUDA_VISIBLE_DEVICES; the helper mirrors the resolution logic in - ``_get_parent_visible_gpu_spec`` so ``backend_cuda_visible_devices`` - reports the value that is actually narrowing the visible device set. 
- """ - if IS_ROCM: - return _get_parent_visible_gpu_spec().get("raw") - return os.environ.get("CUDA_VISIBLE_DEVICES") - - def get_backend_visible_gpu_info() -> Dict[str, Any]: device = get_device() if device in (DeviceType.CUDA, DeviceType.XPU): parent_visible_ids = get_parent_visible_gpu_ids() - # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm) - if device == DeviceType.CUDA and not IS_ROCM: + # Try nvidia-smi first (NVIDIA only) + if device == DeviceType.CUDA: try: from . import nvidia @@ -1244,7 +1151,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: parent_visible_spec["raw"], ) if result.get("available"): - result["backend"] = _backend_label(device) + result["backend"] = device.value return result except Exception as e: logger.warning("Backend GPU visibility query failed: %s", e) @@ -1273,8 +1180,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: ] return { "available": True, - "backend": _backend_label(device), - "backend_cuda_visible_devices": _backend_visible_devices_env(), + "backend": device.value, + "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": parent_visible_ids, "devices": devices, "index_kind": index_kind, @@ -1282,8 +1189,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": False, - "backend": _backend_label(device), - "backend_cuda_visible_devices": _backend_visible_devices_env(), + "backend": device.value, + "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": parent_visible_ids, "devices": [], "index_kind": "physical", @@ -1294,7 +1201,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: if not mem.get("available"): return { "available": False, - "backend": _backend_label(device), + "backend": device.value, "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [], "devices": [], @@ -1302,7 +1209,7 @@ def get_backend_visible_gpu_info() -> 
Dict[str, Any]: } return { "available": True, - "backend": _backend_label(device), + "backend": device.value, "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [0], "devices": [ @@ -1319,7 +1226,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": False, - "backend": _backend_label(device), + "backend": device.value, "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [], "devices": [], @@ -1339,20 +1246,17 @@ def get_visible_gpu_count() -> int: if _visible_gpu_count is not None: return _visible_gpu_count - # Use _get_parent_visible_gpu_spec() which already handles - # HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES on ROCm. - visible_spec = _get_parent_visible_gpu_spec() - if visible_spec["raw"] is not None: - raw = visible_spec["raw"].strip() - if raw == "" or raw == "-1": + cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") + if cuda_visible is not None: + # "" means zero GPUs, "0" means 1, "0,1,2" means 3 + cuda_visible = cuda_visible.strip() + if cuda_visible == "" or cuda_visible == "-1": _visible_gpu_count = 0 - elif visible_spec["numeric_ids"] is not None: - _visible_gpu_count = len(visible_spec["numeric_ids"]) else: - _visible_gpu_count = len([x for x in raw.split(",") if x.strip()]) + _visible_gpu_count = len([x for x in cuda_visible.split(",") if x.strip()]) return _visible_gpu_count - # No visibility env var set -- try torch, fall back to physical count + # CUDA_VISIBLE_DEVICES not set -- try torch, fall back to physical count try: import torch @@ -1384,24 +1288,8 @@ def apply_gpu_ids(gpu_ids) -> None: value = str(gpu_ids) os.environ["CUDA_VISIBLE_DEVICES"] = value - # Keep ROCm visibility env vars in sync so _get_parent_visible_gpu_spec() - # picks up the narrowed set on AMD systems. 
Workers can call - # apply_gpu_ids() before detect_hardware() runs (so IS_ROCM is still - # its default False), so also mirror the selection whenever the - # parent process already set a ROCm visibility variable -- that - # way a downstream ROCm process inherits the narrowed mask even - # before Studio's hardware detection has classified the host. - _inherits_rocm_visibility = ( - "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ - ) - if IS_ROCM or _inherits_rocm_visibility: - os.environ["HIP_VISIBLE_DEVICES"] = value - os.environ["ROCR_VISIBLE_DEVICES"] = value _visible_gpu_count = None - if IS_ROCM or _inherits_rocm_visibility: - logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s' (rocm)", value) - else: - logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) + logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) def get_device_map( diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index e43a78baea..8d06c7d0e1 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -173,7 +173,6 @@ class HostInfo: visible_cuda_devices: str | None has_physical_nvidia: bool has_usable_nvidia: bool - has_rocm: bool = False @dataclass @@ -2494,25 +2493,12 @@ def detect_host() -> HostInfo: has_physical_nvidia = False has_usable_nvidia = False if nvidia_smi: - # Require `nvidia-smi -L` to actually list a GPU before treating the - # host as NVIDIA. The banner text "NVIDIA-SMI ..." is printed even - # when the command fails to communicate with the driver (e.g. stale - # container leftovers), which would otherwise misclassify an AMD - # ROCm host as NVIDIA and short-circuit the ROCm path. 
- try: - listing = run_capture([nvidia_smi, "-L"], timeout = 20) - gpu_lines = [ - line for line in listing.stdout.splitlines() if line.startswith("GPU ") - ] - if gpu_lines: - has_physical_nvidia = True - has_usable_nvidia = visible_device_tokens != [] - except Exception: - pass - try: result = run_capture([nvidia_smi], timeout = 20) merged = "\n".join(part for part in (result.stdout, result.stderr) if part) + if "NVIDIA-SMI" in merged: + has_physical_nvidia = True + has_usable_nvidia = visible_device_tokens != [] for line in merged.splitlines(): if "CUDA Version:" in line: raw = line.split("CUDA Version:", 1)[1].strip().split()[0] @@ -2552,12 +2538,6 @@ def detect_host() -> HostInfo: if visible_gpu_rows: has_usable_nvidia = True - # Older nvidia-smi versions (pre -L support) hit the - # except in the first try block but still succeed here, - # leaving has_physical_nvidia unset. Mirror the -L path - # so downstream diagnostics on line ~4390 still run. - if not has_physical_nvidia: - has_physical_nvidia = True elif visible_device_tokens == []: has_usable_nvidia = False elif supports_explicit_visible_device_matching(visible_device_tokens): @@ -2567,50 +2547,6 @@ def detect_host() -> HostInfo: except Exception: pass - # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed - - def _amd_smi_has_gpu(stdout: str) -> bool: - """Check for 'GPU: ' data rows, not just a table header.""" - return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) - - has_rocm = False - if is_linux: - for _cmd, _check in ( - # rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent) - (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), - (["amd-smi", "list"], _amd_smi_has_gpu), - ): - _exe = shutil.which(_cmd[0]) - if not _exe: - continue - try: - _result = run_capture([_exe, *_cmd[1:]], timeout = 10) - except Exception: - continue - if _result.returncode == 0 and _result.stdout.strip(): - if _check(_result.stdout): - has_rocm = True - 
break - elif is_windows: - # Windows: prefer active probes that validate GPU presence - for _cmd, _check in ( - (["hipinfo"], lambda out: "gcnarchname" in out.lower()), - (["amd-smi", "list"], _amd_smi_has_gpu), - ): - _exe = shutil.which(_cmd[0]) - if not _exe: - continue - try: - _result = run_capture([_exe, *_cmd[1:]], timeout = 10) - except Exception: - continue - if _result.returncode == 0 and _result.stdout.strip(): - if _check(_result.stdout): - has_rocm = True - break - # Note: amdhip64.dll presence alone is NOT treated as GPU evidence - # since the HIP SDK can be installed without an AMD GPU. - return HostInfo( system = system, machine = machine, @@ -2625,7 +2561,6 @@ def _amd_smi_has_gpu(stdout: str) -> bool: visible_cuda_devices = visible_cuda_devices, has_physical_nvidia = has_physical_nvidia, has_usable_nvidia = has_usable_nvidia, - has_rocm = has_rocm, ) @@ -2991,168 +2926,9 @@ def published_asset_choice_for_kind( return None -def _detect_host_rocm_version() -> tuple[int, int] | None: - """Return (major, minor) of the installed ROCm runtime, or None. - - Best-effort read from /opt/rocm/.info/version, amd-smi version, and - hipconfig --version. Used to pick a compatible upstream llama.cpp - ROCm prebuilt rather than always taking the numerically newest one - (which can be newer than the host runtime). - """ - rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" - for path in ( - os.path.join(rocm_root, ".info", "version"), - os.path.join(rocm_root, "lib", "rocm_version"), - ): - try: - with open(path) as fh: - parts = fh.read().strip().split("-")[0].split(".") - # Explicit length guard avoids relying on the broad except - # below to swallow IndexError when the version file contains - # a single component (e.g. "6\n" on a partial install). 
- if len(parts) >= 2: - return int(parts[0]), int(parts[1]) - except Exception: - pass - amd_smi = shutil.which("amd-smi") - if amd_smi: - try: - result = subprocess.run( - [amd_smi, "version"], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 5, - ) - if result.returncode == 0: - m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) - if m: - return int(m.group(1)), int(m.group(2)) - except Exception: - pass - hipconfig = shutil.which("hipconfig") - if hipconfig: - try: - result = subprocess.run( - [hipconfig, "--version"], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 5, - ) - if result.returncode == 0: - raw = (result.stdout or "").strip().split("\n")[0] - parts = raw.split(".") - if ( - len(parts) >= 2 - and parts[0].isdigit() - and parts[1].split("-")[0].isdigit() - ): - return int(parts[0]), int(parts[1].split("-")[0]) - except Exception: - pass - - # Distro package-manager fallbacks. Mirrors install.sh::get_torch_index_url - # and _detect_rocm_version() in install_python_stack.py so package-managed - # ROCm hosts without /opt/rocm/.info/version still report a usable version - # and the <= host version filter in resolve_upstream_asset_choice picks - # the correct upstream prebuilt instead of the newest-regardless fallback. - for _cmd in ( - ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"], - ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"], - ): - _exe = shutil.which(_cmd[0]) - if not _exe: - continue - try: - _result = subprocess.run( - [_exe, *_cmd[1:]], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 5, - ) - except Exception: - continue - if _result.returncode != 0 or not _result.stdout.strip(): - continue - _raw = _result.stdout.strip() - # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing. 
- _raw = re.sub(r"^\d+:", "", _raw) - _m = re.match(r"(\d+)[.-](\d+)", _raw) - if _m: - return int(_m.group(1)), int(_m.group(2)) - return None - - def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice: upstream_assets = github_release_assets(UPSTREAM_REPO, llama_tag) if host.is_linux and host.is_x86_64: - # AMD ROCm: try upstream ROCm prebuilt first, then fall back to source build. - # Source build (via setup.sh) compiles with -DGGML_HIP=ON and auto-detects - # the exact GPU target via rocminfo, which is more reliable for consumer - # GPUs (e.g. gfx1151) that may not be in the prebuilt. - if host.has_rocm and not host.has_usable_nvidia: - # Scan upstream assets for any rocm- prebuilt. When the - # host ROCm runtime version is known, pick the newest candidate - # whose major.minor is <= host version -- otherwise a ROCm 6.4 - # host would download the rocm-7.2 tarball, fail preflight, and - # fall back to a source build even though a compatible 6.4 - # prebuilt exists. If no compatible candidate matches (e.g. host - # runtime is older than every published prebuilt), fall back to - # the numerically newest so we at least try something. 
- _rocm_pattern = re.compile( - rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz" - ) - rocm_candidates: list[tuple[tuple[int, ...], str]] = [] - for _name in upstream_assets: - _m = _rocm_pattern.match(_name) - if _m is None: - continue - _parts = tuple(int(p) for p in _m.group(1).split(".")) - rocm_candidates.append((_parts, _name)) - rocm_candidates.sort(reverse = True) - _host_rocm_version = _detect_host_rocm_version() - _compatible: list[tuple[tuple[int, ...], str]] = rocm_candidates - if _host_rocm_version is not None: - _compatible = [ - item - for item in rocm_candidates - if item[0][:2] <= _host_rocm_version - ] - if rocm_candidates and not _compatible: - # Fall back to the newest candidate so a source build is - # not forced when the host runtime is older than every - # published prebuilt: preflight will still catch a true - # incompatibility and trigger a fallback. - _compatible = rocm_candidates[:1] - if _compatible: - rocm_name = _compatible[0][1] - if _host_rocm_version is not None: - log( - f"AMD ROCm {_host_rocm_version[0]}.{_host_rocm_version[1]} " - f"detected -- trying upstream prebuilt {rocm_name}" - ) - else: - log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") - log( - "Note: if your ROCm runtime version differs significantly, " - "this may fail preflight and fall back to a source build (safe)" - ) - return AssetChoice( - repo = UPSTREAM_REPO, - tag = llama_tag, - name = rocm_name, - url = upstream_assets[rocm_name], - source_label = "upstream", - install_kind = "linux-rocm", - ) - # No ROCm prebuilt available -- fall back to source build - raise PrebuiltFallback( - "AMD ROCm detected but no upstream ROCm prebuilt found; " - "falling back to source build with HIP support" - ) - upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Linux CPU asset was not found") @@ -3172,25 +2948,6 @@ def 
resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice return attempts[0] raise PrebuiltFallback("no compatible Windows CUDA asset was found") - # AMD ROCm on Windows: try HIP prebuilt - if host.has_rocm: - hip_name = f"llama-{llama_tag}-bin-win-hip-radeon-x64.zip" - if hip_name in upstream_assets: - log( - f"AMD ROCm detected on Windows -- trying upstream HIP prebuilt {hip_name}" - ) - return AssetChoice( - repo = UPSTREAM_REPO, - tag = llama_tag, - name = hip_name, - url = upstream_assets[hip_name], - source_label = "upstream", - install_kind = "windows-hip", - ) - log( - "AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU" - ) - upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Windows CPU asset was not found") @@ -3272,16 +3029,7 @@ def resolve_release_asset_choice( published_choice: AssetChoice | None = None if host.is_windows and host.is_x86_64: - # AMD Windows hosts should prefer a hash-approved published - # Windows HIP bundle when one exists, but otherwise fall through - # to resolve_asset_choice() so the upstream HIP prebuilt is - # tried before the CPU fallback. Hard-pinning the published - # windows-cpu bundle here would make the new HIP path - # unreachable. 
- if host.has_rocm: - published_choice = published_asset_choice_for_kind(release, "windows-hip") - else: - published_choice = published_asset_choice_for_kind(release, "windows-cpu") + published_choice = published_asset_choice_for_kind(release, "windows-cpu") elif host.is_macos and host.is_arm64: published_choice = published_asset_choice_for_kind(release, "macos-arm64") elif host.is_macos and host.is_x86_64: @@ -3630,7 +3378,7 @@ def overlay_directory_for_choice( def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: - if choice.install_kind in {"linux-cpu", "linux-cuda", "linux-rocm"}: + if choice.install_kind in {"linux-cpu", "linux-cuda"}: return [ "llama-server", "llama-quantize", @@ -3640,12 +3388,11 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: "libmtmd.so*", "libggml-cpu-*.so*", "libggml-cuda.so*", - "libggml-hip.so*", "libggml-rpc.so*", ] if choice.install_kind in {"macos-arm64", "macos-x64"}: return ["llama-server", "llama-quantize", "lib*.dylib"] - if choice.install_kind in {"windows-cpu", "windows-cuda", "windows-hip"}: + if choice.install_kind in {"windows-cpu", "windows-cuda"}: return ["*.exe", "*.dll"] raise PrebuiltFallback( f"unsupported install kind for runtime overlay: {choice.install_kind}" @@ -4370,7 +4117,6 @@ def validate_server( install_dir: Path, *, runtime_line: str | None = None, - install_kind: str | None = None, ) -> None: last_failure: PrebuiltFallback | None = None for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1): @@ -4394,33 +4140,7 @@ def validate_server( "--batch-size", "32", ] - # Only enable GPU offload for assets that actually ship GPU code. - # Gating on `host.has_rocm` alone breaks the intentional CPU - # fallback on AMD Windows hosts without a HIP prebuilt: the CPU - # binary would be launched with `--n-gpu-layers 1` and fail - # validation. 
Use the resolved install_kind as the source of - # truth and fall back to host detection when the caller did not - # pass one (keeps backwards compatibility with older call sites). - _gpu_kinds = { - "linux-cuda", - "linux-rocm", - "windows-cuda", - "windows-hip", - "macos-arm64", - } - if install_kind is not None: - _enable_gpu_layers = install_kind in _gpu_kinds - else: - # Older call sites that don't pass install_kind: keep ROCm - # hosts in the GPU-validation path so an AMD-only Linux host - # is exercised against the actual hardware rather than the - # CPU fallback. NVIDIA and macOS-arm64 are already covered. - _enable_gpu_layers = ( - host.has_usable_nvidia - or host.has_rocm - or (host.is_macos and host.is_arm64) - ) - if _enable_gpu_layers: + if host.has_usable_nvidia or (host.is_macos and host.is_arm64): command.extend(["--n-gpu-layers", "1"]) log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log") @@ -4944,21 +4664,10 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: ["libggml*.dylib"], ["libmtmd*.dylib"], ] - if choice.install_kind == "linux-rocm": - return [ - ["libllama.so*"], - ["libggml.so*"], - ["libggml-base.so*"], - ["libggml-cpu-*.so*"], - ["libmtmd.so*"], - ["libggml-hip.so*"], - ] if choice.install_kind == "windows-cpu": return [["llama.dll"]] if choice.install_kind == "windows-cuda": return [["llama.dll"], ["ggml-cuda.dll"]] - if choice.install_kind == "windows-hip": - return [["llama.dll"], ["*hip*.dll"]] return [] @@ -5130,7 +4839,6 @@ def validate_prebuilt_choice( host, install_dir, runtime_line = choice.runtime_line, - install_kind = choice.install_kind, ) log(f"staged prebuilt validation succeeded for {choice.name}") return server_path, quantize_path diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index e20969331e..f2981ea665 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -25,280 +25,6 @@ IS_MACOS = sys.platform == "darwin" 
IS_MAC_INTEL = IS_MACOS and platform.machine() == "x86_64" -# ── ROCm / AMD GPU support ───────────────────────────────────────────────────── -# Mapping from detected ROCm (major, minor) to the best PyTorch wheel tag on -# download.pytorch.org. Entries are checked newest-first (>=). -# ROCm 7.2 only has torch 2.11.0 on download.pytorch.org, which exceeds the -# current torch upper bound (<2.11.0). Fall back to rocm7.1 (torch 2.10.0). -# TODO: uncomment rocm7.2 when torch upper bound is bumped to >=2.11.0 -_ROCM_TORCH_INDEX: dict[tuple[int, int], str] = { - # (7, 2): "rocm7.2", # torch 2.11.0 -- requires torch>=2.11 - (7, 1): "rocm7.1", - (7, 0): "rocm7.0", - (6, 4): "rocm6.4", - (6, 3): "rocm6.3", - (6, 2): "rocm6.2", - (6, 1): "rocm6.1", - (6, 0): "rocm6.0", -} -_PYTORCH_WHL_BASE = "https://download.pytorch.org/whl" - - -def _detect_rocm_version() -> tuple[int, int] | None: - """Return (major, minor) of the installed ROCm stack, or None.""" - # Check /opt/rocm/.info/version or ROCM_PATH equivalent - rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" - for path in ( - os.path.join(rocm_root, ".info", "version"), - os.path.join(rocm_root, "lib", "rocm_version"), - ): - try: - with open(path) as fh: - parts = fh.read().strip().split("-")[0].split(".") - # Explicit length guard avoids relying on the broad except - # below to swallow IndexError when the version file contains - # a single component (e.g. "6\n" on a partial install). - if len(parts) >= 2: - return int(parts[0]), int(parts[1]) - except Exception: - pass - - # Try amd-smi version (outputs "... 
| ROCm version: X.Y.Z") - amd_smi = shutil.which("amd-smi") - if amd_smi: - try: - result = subprocess.run( - [amd_smi, "version"], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 5, - ) - if result.returncode == 0: - import re - - m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) - if m: - return int(m.group(1)), int(m.group(2)) - except Exception: - pass - - # Try hipconfig --version (outputs bare version like "6.3.21234.2") - hipconfig = shutil.which("hipconfig") - if hipconfig: - try: - result = subprocess.run( - [hipconfig, "--version"], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - timeout = 5, - ) - if result.returncode == 0: - raw = result.stdout.decode().strip().split("\n")[0] - parts = raw.split(".") - if ( - len(parts) >= 2 - and parts[0].isdigit() - and parts[1].split("-")[0].isdigit() - ): - return int(parts[0]), int(parts[1].split("-")[0]) - except Exception: - pass - - # Distro package-manager fallbacks. Package-managed ROCm installs can - # expose GPUs via rocminfo / amd-smi but still lack /opt/rocm/.info/version - # and hipconfig, so probe dpkg (Debian/Ubuntu) and rpm (RHEL/Fedora/SUSE) - # for the rocm-core package version. Matches the chain in - # install.sh::get_torch_index_url so `unsloth studio update` behaves - # the same as a fresh `curl | sh` install. - import re as _re_pkg - - for cmd in ( - ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"], - ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"], - ): - exe = shutil.which(cmd[0]) - if not exe: - continue - try: - result = subprocess.run( - [exe, *cmd[1:]], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 5, - ) - except Exception: - continue - if result.returncode != 0 or not result.stdout.strip(): - continue - raw = result.stdout.strip() - # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing. 
- raw = _re_pkg.sub(r"^\d+:", "", raw) - m = _re_pkg.match(r"(\d+)[.-](\d+)", raw) - if m: - return int(m.group(1)), int(m.group(2)) - - return None - - -def _has_rocm_gpu() -> bool: - """Return True only if an actual AMD GPU is visible (not just ROCm tools installed).""" - import re - - for cmd, check_fn in ( - # rocminfo: look for "Name: gfxNNNN" with nonzero first digit (gfx000 is the CPU agent) - (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), - # amd-smi list: require "GPU: " data rows, not just a header - ( - ["amd-smi", "list"], - lambda out: bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", out)), - ), - ): - exe = shutil.which(cmd[0]) - if not exe: - continue - try: - result = subprocess.run( - [exe, *cmd[1:]], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 10, - ) - except Exception: - continue - if result.returncode == 0 and result.stdout.strip(): - if check_fn(result.stdout): - return True - return False - - -def _has_usable_nvidia_gpu() -> bool: - """Return True only when nvidia-smi exists AND reports at least one GPU.""" - exe = shutil.which("nvidia-smi") - if not exe: - return False - try: - result = subprocess.run( - [exe, "-L"], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 10, - ) - except Exception: - return False - return result.returncode == 0 and "GPU " in result.stdout - - -def _ensure_rocm_torch() -> None: - """Reinstall torch with ROCm wheels when the venv received CPU-only torch. - - Runs only on Linux x86_64 hosts where an AMD GPU is present and the - ROCm runtime is detectable (rocminfo / amd-smi / hipconfig / - rocm-core package). No-op when torch already links against HIP - (ROCm), on Windows / macOS, on non-x86_64 Linux (PyTorch does not - publish ROCm wheels for aarch64 / arm64), or on mixed AMD+NVIDIA - hosts (NVIDIA takes precedence). - Uses pip_install() to respect uv, constraints, and --python targeting. 
- """ - # Explicit OS / architecture guards so the helper is safe to call - # from any context -- PyTorch only publishes ROCm wheels for - # linux_x86_64, so aarch64 / arm64 hosts must skip this repair path - # instead of failing the update with a missing-wheel error. - if IS_WINDOWS or IS_MACOS: - return - if platform.machine().lower() not in {"x86_64", "amd64"}: - return - # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable - if _has_usable_nvidia_gpu(): - return - # Rely on _has_rocm_gpu() (rocminfo / amd-smi GPU data rows) as the - # authoritative "is this actually an AMD ROCm host?" signal. The old - # gate required /opt/rocm or hipcc to exist, which breaks on - # runtime-only ROCm installs (package-managed minimal installs, - # Radeon software) that ship amd-smi/rocminfo without /opt/rocm or - # hipcc, and leaves `unsloth studio update` unable to repair a - # CPU-only venv on those systems. - if not _has_rocm_gpu(): - return # no AMD GPU visible - - ver = _detect_rocm_version() - if ver is None: - print(" ROCm detected but version unreadable -- skipping torch reinstall") - return - - # Probe whether torch already links against HIP (ROCm is already working). - # Do NOT skip for CUDA-only builds since they are unusable on AMD-only - # hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups). 
- try: - probe = subprocess.run( - [ - sys.executable, - "-c", - "import torch; print(getattr(torch.version,'hip','') or '')", - ], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - timeout = 30, - ) - except (OSError, subprocess.TimeoutExpired): - probe = None - has_hip_torch = ( - probe is not None - and probe.returncode == 0 - and probe.stdout.decode().strip() != "" - ) - - rocm_torch_ready = has_hip_torch - - if not has_hip_torch: - # Select best matching wheel tag (newest ROCm version <= installed) - tag = next( - ( - t - for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) - if ver >= (maj, mn) - ), - None, - ) - if tag is None: - print( - f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- " - f"skipping torch reinstall" - ) - else: - index_url = f"{_PYTORCH_WHL_BASE}/{tag}" - print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") - pip_install( - f"ROCm torch ({tag})", - "--force-reinstall", - "--no-cache-dir", - "torch>=2.4,<2.11.0", - "torchvision<0.26.0", - "torchaudio<2.11.0", - "--index-url", - index_url, - constrain = False, - ) - rocm_torch_ready = True - - # Install bitsandbytes only when the venv has a ROCm-compatible torch - # (either already present or just installed). Avoids leaving an AMD - # bitsandbytes on top of a CPU/CUDA torch on hosts where the ROCm - # runtime is older than any published torch wheel. Uses - # --force-reinstall so an existing CPU/CUDA bitsandbytes is replaced - # by the AMD build during upgrades. - if rocm_torch_ready: - pip_install( - "bitsandbytes (AMD)", - "--force-reinstall", - "--no-cache-dir", - "bitsandbytes>=0.49.1", - constrain = False, - ) - def _infer_no_torch() -> bool: """Determine whether to run in no-torch (GGUF-only) mode. 
@@ -688,9 +414,6 @@ def install_python_stack() -> int: base_total = 10 if IS_WINDOWS else 11 if IS_MACOS: base_total -= 1 # triton step is skipped on macOS - # ROCm torch check step (Linux only, non-macOS, non-no-torch) - if not IS_WINDOWS and not IS_MACOS and not NO_TORCH: - base_total += 1 _TOTAL = (base_total - 1) if skip_base else base_total # 1. Try to use uv for faster installs (must happen before pip upgrade @@ -814,53 +537,6 @@ def install_python_stack() -> int: req = REQ_ROOT / "base.txt", ) - # 2b. AMD ROCm: reinstall torch with HIP wheels if the host has ROCm but the - # venv received CPU-only torch (common when pip resolves torch from PyPI). - # Must come immediately after base packages so torch is present for inspection. - if not IS_WINDOWS and not IS_MACOS and not NO_TORCH: - _progress("ROCm torch check") - _ensure_rocm_torch() - - # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows. - # Detect and warn so users know manual steps are needed for GPU training. - if IS_WINDOWS and not NO_TORCH and not _has_usable_nvidia_gpu(): - # Validate actual AMD GPU presence (not just tool existence) - import re as _re_win - - def _win_amd_smi_has_gpu(stdout: str) -> bool: - return bool(_re_win.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) - - _win_amd_gpu = False - for _wcmd, _check_fn in ( - (["hipinfo"], lambda out: "gcnarchname" in out.lower()), - (["amd-smi", "list"], _win_amd_smi_has_gpu), - ): - _wexe = shutil.which(_wcmd[0]) - if not _wexe: - continue - try: - _wr = subprocess.run( - [_wexe, *_wcmd[1:]], - stdout = subprocess.PIPE, - stderr = subprocess.DEVNULL, - text = True, - timeout = 10, - ) - except Exception: - continue - if _wr.returncode == 0 and _check_fn(_wr.stdout): - _win_amd_gpu = True - break - if _win_amd_gpu: - _safe_print( - _dim(" Note:"), - "AMD GPU detected on Windows. ROCm-enabled PyTorch must be", - ) - _safe_print( - " " * 8, - "installed manually. 
See: https://docs.unsloth.ai/get-started/install-and-update/amd", - ) - # 3. Extra dependencies _progress("unsloth extras") pip_install( diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py index 09b03a597b..90f2d5d238 100644 --- a/unsloth/kernels/utils.py +++ b/unsloth/kernels/utils.py @@ -88,26 +88,10 @@ def is_cdna(): @functools.lru_cache(1) def is_rdna(): - """Detect ROCm-supported RDNA consumer/workstation GPUs (RDNA2, RDNA3, RDNA3.5, RDNA4).""" + """Detect ROCm-supported RDNA consumer/workstation GPUs (RDNA3, RDNA4).""" return is_hip() and triton.runtime.driver.active.get_current_target().arch in ( - # RDNA2 (Navi 21-24) - "gfx1030", - "gfx1031", - "gfx1032", - "gfx1033", - "gfx1034", - "gfx1035", - "gfx1036", - # RDNA3 (Navi 31-33) "gfx1100", "gfx1101", - "gfx1102", - "gfx1103", - # RDNA3.5 (Strix Point / Strix Halo) - "gfx1150", - "gfx1151", - "gfx1152", - # RDNA4 (Navi 48-44) "gfx1200", "gfx1201", ) diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py index 0d9ef896e6..8be6bb5a5a 100644 --- a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1103,16 +1103,7 @@ def patch_sft_trainer_tokenizer(): " a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n" "except:\n" " if not torch.cuda.is_available():\n" - " raise RuntimeError('Unsloth: No GPU detected. AMD ROCm users: install ROCm-enabled PyTorch -- see https://docs.unsloth.ai/get-started/install-and-update/amd')\n" - " # nvidia-smi unavailable but torch.cuda IS available -- we are on\n" - " # a ROCm host (ROCm reuses the torch.cuda.* API surface, so\n" - " # device_count() is authoritative) or on a CUDA host without\n" - " # the CLI installed. 
Use the device count directly as a\n" - " # conservative multi-GPU signal: any configuration with more\n" - " # than one visible device is flagged as unsupported, matching\n" - " # the spirit of the per-device memory check used on CUDA.\n" - " if torch.cuda.device_count() > 1:\n" - " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" + " raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n" "if ((a - PRE_CHECK) >= 1).sum() > 1:\n" " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" "for _ in range(3):\n"