diff --git a/install.sh b/install.sh index 053f334d2b..5035bdd213 100755 --- a/install.sh +++ b/install.sh @@ -978,6 +978,64 @@ _find_no_torch_runtime() { fi } +# ── ROCm visibility-mask check ── +# Returns 0 (true) when no env var explicitly hides all AMD GPUs, +# 1 (false) when ANY of HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES / +# CUDA_VISIBLE_DEVICES is set to "" or "-1". +# On ROCm, ROCR narrows the physical set, then CUDA/HIP further +# restricts within that. If any is empty, all GPUs are hidden. +_rocm_devices_enabled() { + if [ "${HIP_VISIBLE_DEVICES+x}" = x ]; then + case "$HIP_VISIBLE_DEVICES" in ""|-1) return 1 ;; esac + fi + if [ "${ROCR_VISIBLE_DEVICES+x}" = x ]; then + case "$ROCR_VISIBLE_DEVICES" in ""|-1) return 1 ;; esac + fi + if [ "${CUDA_VISIBLE_DEVICES+x}" = x ]; then + case "$CUDA_VISIBLE_DEVICES" in ""|-1) return 1 ;; esac + fi + return 0 +} + +# ── AMD ROCm GPU detection helper ── +# Returns 0 (true) if an actual AMD GPU is present, 1 (false) otherwise. +# Checks rocminfo for gfx[1-9]* (excludes gfx000 CPU agent) and +# amd-smi list for GPU data rows (excludes header-only output). +# Respects HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES / +# CUDA_VISIBLE_DEVICES so hidden GPUs are not detected. +_has_amd_rocm_gpu() { + _rocm_devices_enabled || return 1 + if command -v rocminfo >/dev/null 2>&1 && \ + rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}'; then + return 0 + elif command -v amd-smi >/dev/null 2>&1 && \ + amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[][[:space:]]*[0-9]/{ found=1 } END{ exit !found }'; then + return 0 + fi + return 1 +} + +# ── NVIDIA usable-GPU helper ── +# Returns 0 (true) only if nvidia-smi is present AND actually lists a GPU +# AND visibility masks do not hide all GPUs. +# Prevents AMD-only hosts with a stale nvidia-smi on PATH from being routed +# into the CUDA branch. +_has_usable_nvidia_gpu() { + # Respect explicit "hide all NVIDIA GPUs" masks. 
+ if [ "${CUDA_VISIBLE_DEVICES+x}" = x ]; then + case "${CUDA_VISIBLE_DEVICES}" in ""|-1) return 1 ;; esac + fi + _nvsmi="" + if command -v nvidia-smi >/dev/null 2>&1; then + _nvsmi="nvidia-smi" + elif [ -x "/usr/bin/nvidia-smi" ]; then + _nvsmi="/usr/bin/nvidia-smi" + else + return 1 + fi + "$_nvsmi" -L 2>/dev/null | awk '/^GPU[[:space:]]+[0-9]+:/{found=1} END{exit !found}' +} + # ── Detect GPU and choose PyTorch index URL ── # Mirrors Get-TorchIndexUrl in install.ps1. # On CPU-only machines this returns the cpu index, avoiding the solver @@ -986,14 +1044,83 @@ get_torch_index_url() { _base="https://download.pytorch.org/whl" # macOS: always CPU (no CUDA support) case "$(uname -s)" in Darwin) echo "$_base/cpu"; return ;; esac - # Try nvidia-smi + # Try nvidia-smi -- require the binary to actually list a usable GPU. + # Presence of the binary alone (container leftovers, stale driver + # packages) is not sufficient: otherwise an AMD-only host would + # silently install CUDA wheels. _smi="" - if command -v nvidia-smi >/dev/null 2>&1; then - _smi="nvidia-smi" - elif [ -x "/usr/bin/nvidia-smi" ]; then - _smi="/usr/bin/nvidia-smi" + if _has_usable_nvidia_gpu; then + if command -v nvidia-smi >/dev/null 2>&1; then + _smi="nvidia-smi" + elif [ -x "/usr/bin/nvidia-smi" ]; then + _smi="/usr/bin/nvidia-smi" + fi + fi + if [ -z "$_smi" ]; then + # No NVIDIA GPU -- check for AMD ROCm GPU. + # PyTorch only publishes ROCm wheels for linux-x86_64; skip the + # ROCm branch entirely on aarch64 / arm64 / other architectures + # so non-x86_64 Linux hosts fall back cleanly to CPU wheels. + case "$(uname -m)" in + x86_64|amd64) : ;; + *) echo "$_base/cpu"; return ;; + esac + if ! 
_has_amd_rocm_gpu; then + echo "$_base/cpu"; return + fi + # AMD GPU confirmed -- detect ROCm version + _rocm_tag="" + _rocm_tag=$({ command -v amd-smi >/dev/null 2>&1 && \ + amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ + 'NF>1{gsub(/[^0-9.]/, "", $2); split($2,a,"."); print "rocm"a[1]"."a[2]; ok=1; exit} END{exit !ok}'; } || \ + { _rocm_info_file="${ROCM_PATH:-/opt/rocm}/.info/version"; \ + [ -r "$_rocm_info_file" ] && \ + awk -F. '{print "rocm"$1"."$2; exit}' "$_rocm_info_file"; } || \ + { command -v hipconfig >/dev/null 2>&1 && \ + hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]/{split($1,a,"."); if(a[1]+0>0){print "rocm"a[1]"."a[2]; found=1}} END{exit !found}'; } || \ + { command -v dpkg-query >/dev/null 2>&1 && \ + ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \ + [ -n "$ver" ] && \ + printf '%s\n' "$ver" | sed 's/^[0-9]*://' | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; } || \ + { command -v rpm >/dev/null 2>&1 && \ + ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \ + [ -n "$ver" ] && \ + printf '%s\n' "$ver" | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; }) 2>/dev/null + # Validate _rocm_tag: must match "rocmX.Y" with major >= 1 + case "$_rocm_tag" in + rocm[1-9]*.[0-9]*) : ;; # valid (major >= 1) + *) _rocm_tag="" ;; # reject malformed (empty, garbled, or major=0) + esac + if [ -n "$_rocm_tag" ]; then + # Minimum supported: ROCm 6.0 (no PyTorch wheels exist for older) + case "$_rocm_tag" in + rocm[1-5].*) echo "$_base/cpu"; return ;; + esac + # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds + # (<2.11.0). Fall back to rocm7.1 index which has torch 2.10.0. + # Enumerate explicit versions rather than matching rocm6.* so + # a host on ROCm 6.5 or 6.6 (no PyTorch wheels published) is + # clipped down to the last supported 6.x (rocm6.4) instead of + # constructing https://download.pytorch.org/whl/rocm6.5 which + # returns HTTP 403. 
PyTorch only ships: rocm5.7, 6.0, 6.1, 6.2, + # 6.3, 6.4, 7.0, 7.1, 7.2 (and 5.7 is below our minimum). + # TODO: uncomment rocm7.2 when the torch upper bound is bumped + # to >=2.11.0. + case "$_rocm_tag" in + rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*) + echo "$_base/$_rocm_tag" ;; + rocm6.*) + # ROCm 6.5+ (no published PyTorch wheels): clip down + # to the last supported 6.x wheel set. + echo "$_base/rocm6.4" ;; + *) + # ROCm 7.2+ (including future 10.x+): cap to rocm7.1 + echo "$_base/rocm7.1" ;; + esac + return + fi + echo "$_base/cpu"; return fi - if [ -z "$_smi" ]; then echo "$_base/cpu"; return; fi # Parse CUDA version from nvidia-smi output (POSIX-safe, no grep -P) _cuda_ver=$(LC_ALL=C $_smi 2>/dev/null \ | sed -n 's/.*CUDA Version:[[:space:]]*\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' \ @@ -1011,20 +1138,166 @@ get_torch_index_url() { elif [ "$_major" -ge 11 ]; then echo "$_base/cu118" else echo "$_base/cpu"; fi } + +get_radeon_wheel_url() { + # Only meaningful on Linux. Picks a repo.radeon.com base URL whose listing + # contains torch wheels. Tries paths like rocm-rel-7.2.1/, rocm-rel-7.2/, + # rocm-rel-7.1.1/, rocm-rel-7.1/ (AMD publishes both M.m and M.m.p dirs). + # Accepts both X.Y and X.Y.Z host versions since /opt/rocm/.info/version + # and hipconfig --version can return either shape. + case "$(uname -s)" in Linux) ;; *) echo ""; return ;; esac + + # Detect ROCm version (X.Y or X.Y.Z) -- try amd-smi, then + # /opt/rocm/.info/version, then hipconfig. 
+ _full_ver="" + _full_ver=$({ command -v amd-smi >/dev/null 2>&1 && \ + amd-smi version 2>/dev/null | awk -F'ROCm version: ' \ + 'NF>1{if(match($2,/[0-9]+\.[0-9]+(\.[0-9]+)?/)){print substr($2,RSTART,RLENGTH); ok=1; exit}} END{exit !ok}'; } || \ + { _rocm_info_file="${ROCM_PATH:-/opt/rocm}/.info/version"; \ + [ -r "$_rocm_info_file" ] && \ + awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1; exit} END{exit !found}' "$_rocm_info_file"; } || \ + { command -v hipconfig >/dev/null 2>&1 && \ + hipconfig --version 2>/dev/null | awk 'NR==1 && match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1} END{exit !found}'; } || \ + { command -v dpkg-query >/dev/null 2>&1 && \ + ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \ + [ -n "$ver" ] && \ + printf '%s\n' "$ver" | sed 's/^[0-9]*://' | awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); exit}'; } || \ + { command -v rpm >/dev/null 2>&1 && \ + ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \ + [ -n "$ver" ] && \ + printf '%s\n' "$ver" | awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); exit}'; }) 2>/dev/null + + # Validate: must be X.Y or X.Y.Z with X >= 1 + case "$_full_ver" in + [1-9]*.[0-9]*.[0-9]*) : ;; # X.Y.Z + [1-9]*.[0-9]*) : ;; # X.Y + *) echo ""; return ;; + esac + echo "https://repo.radeon.com/rocm/manylinux/rocm-rel-${_full_ver}/" +} + +# ── Radeon repo wheel selection helpers ────────────────────────────────────── +# Fetches the Radeon repo directory listing once into _RADEON_LISTING (global). +# _RADEON_PYTAG holds the CPython tag for the running interpreter (e.g. cp312). +# _RADEON_BASE_URL holds the base URL for relative-href resolution. +_RADEON_LISTING="" +_RADEON_PYTAG="" +_RADEON_BASE_URL="" + +_radeon_fetch_listing() { + # Usage: _radeon_fetch_listing BASE_URL + # Populates _RADEON_LISTING, _RADEON_PYTAG, _RADEON_BASE_URL. 
+ _RADEON_BASE_URL="$1" + _RADEON_PYTAG=$("$_VENV_PY" -c " +import sys +print('cp{}{}'.format(sys.version_info.major, sys.version_info.minor)) +" 2>/dev/null) || return 1 + if command -v curl >/dev/null 2>&1; then + _RADEON_LISTING=$(curl -fsSL --max-time 20 "$_RADEON_BASE_URL" 2>/dev/null) + elif command -v wget >/dev/null 2>&1; then + _RADEON_LISTING=$(wget -qO- --timeout=20 "$_RADEON_BASE_URL" 2>/dev/null) + fi + [ -n "$_RADEON_LISTING" ] || return 1 +} + +_pick_radeon_wheel() { + # Usage: _pick_radeon_wheel PACKAGE_NAME + # Scans $_RADEON_LISTING for the newest wheel whose filename starts exactly + # with PACKAGE_NAME- and matches _RADEON_PYTAG + linux_x86_64. + # Prints the full URL (resolving relative hrefs against _RADEON_BASE_URL). + # + # POSIX-compliant pipeline: all href parsing, filtering, and version + # selection is done inside a single awk script rather than reaching + # for GNU extensions (grep -o, sort -V) that would break under BSD + # or BusyBox coreutils. + _pkg="$1" + [ -n "$_RADEON_LISTING" ] || return 1 + [ -n "$_RADEON_PYTAG" ] || return 1 + _tag="$_RADEON_PYTAG" + _href=$(printf '%s\n' "$_RADEON_LISTING" \ + | awk -v pkg="$_pkg" -v tag="$_tag" ' + BEGIN { max_pad = ""; max_url = "" } + { + line = $0 + while (match(line, /href="[^"]*"/)) { + # Strip the leading href=" (6 chars) and trailing " (1 char) + url = substr(line, RSTART + 6, RLENGTH - 7) + line = substr(line, RSTART + RLENGTH) + + # Extract basename, strip query / fragment + n = split(url, p, "/") + base = p[n] + sub(/[?#].*/, "", base) + + prefix = pkg "-" + # Match cpXY-cpXY with any linux x86_64 + # platform tag (linux_x86_64, manylinux_2_28_x86_64, + # manylinux2014_x86_64, etc.) + if (substr(base, 1, length(prefix)) == prefix && + index(base, "-" tag "-") > 0 && + match(base, /x86_64\.whl$/)) { + # Extract the version component (first + # dotted-number run) and pad each piece so a + # plain lexical comparison gives us the newest. 
+ if (match(base, /[0-9]+\.[0-9]+(\.[0-9]+)?/)) { + ver = substr(base, RSTART, RLENGTH) + m = split(ver, v, ".") + pad = "" + for (i = 1; i <= m; i++) + pad = pad sprintf("%08d", v[i]) + if (pad > max_pad) { + max_pad = pad + max_url = url + } + } + } + } + } + END { if (max_url != "") print max_url }') + [ -z "$_href" ] && return 1 + case "$_href" in + http*) printf '%s\n' "$_href" ;; + *) printf '%s\n' "${_RADEON_BASE_URL%/}/${_href#/}" ;; + esac +} + TORCH_INDEX_URL=$(get_torch_index_url) +# Auto-detect GPU for AMD ROCm based +# get_torch_index_url must have chosen */rocm* +# (gfx in rocminfo or amd-smi list). Then require rocminfo "Marketing Name:.*Radeon". +_amd_gpu_radeon=false +case "$TORCH_INDEX_URL" in + */rocm*) + if _has_amd_rocm_gpu && command -v rocminfo >/dev/null 2>&1 && \ + rocminfo 2>/dev/null | grep -q 'Marketing Name:.*Radeon'; then + _amd_gpu_radeon=true + fi + ;; +esac + # ── Print CPU-only hint when no GPU detected ── case "$TORCH_INDEX_URL" in */cpu) if [ "$SKIP_TORCH" = false ] && [ "$OS" != "macos" ]; then echo "" - echo " NOTE: No NVIDIA GPU detected (nvidia-smi not found)." + echo " NOTE: No GPU detected (nvidia-smi and ROCm not found)." echo " Installing CPU-only PyTorch. If you only need GGUF chat/inference," echo " re-run with --no-torch for a faster, lighter install:" echo " curl -fsSL https://unsloth.ai/install.sh | sh -s -- --no-torch" + echo " AMD ROCm users: see https://docs.unsloth.ai/get-started/install-and-update/amd" echo "" fi ;; + */rocm*) + echo "" + if [ "$_amd_gpu_radeon" = true ]; then + echo " AMD Radeon + ROCm detected -- installing PyTorch wheels from repo.radeon.com" + else + echo " AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)" + fi + echo "" + ;; esac # ── Install unsloth directly into the venv (no activation needed) ── @@ -1040,7 +1313,7 @@ if [ "$_MIGRATED" = true ]; then # to prevent transitive torch resolution. 
run_install_cmd "install unsloth (migrated no-torch)" uv pip install --python "$_VENV_PY" --no-deps \ --reinstall-package unsloth --reinstall-package unsloth-zoo \ - "unsloth>=2026.4.2" unsloth-zoo + "unsloth>=2026.4.4" unsloth-zoo _NO_TORCH_RT="$(_find_no_torch_runtime)" if [ -n "$_NO_TORCH_RT" ]; then run_install_cmd "install no-torch runtime deps" uv pip install --python "$_VENV_PY" --no-deps -r "$_NO_TORCH_RT" @@ -1048,21 +1321,169 @@ if [ "$_MIGRATED" = true ]; then else run_install_cmd "install unsloth (migrated)" uv pip install --python "$_VENV_PY" \ --reinstall-package unsloth --reinstall-package unsloth-zoo \ - "unsloth>=2026.4.2" unsloth-zoo + "unsloth>=2026.4.4" unsloth-zoo fi if [ "$STUDIO_LOCAL_INSTALL" = true ]; then substep "overlaying local repo (editable)..." run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps fi + # AMD ROCm: ensure torch has HIP support and install bitsandbytes + # in migrated environments. Existing venvs created before ROCm support + # may have CPU-only torch that needs replacing. + if [ "$SKIP_TORCH" = false ]; then + case "$TORCH_INDEX_URL" in + */rocm*) + if ! "$_VENV_PY" - <<'PY' >/dev/null 2>&1 +import sys, torch +sys.exit(0 if getattr(torch.version, "hip", None) else 1) +PY + then + substep "reinstalling ROCm PyTorch ($TORCH_INDEX_URL)..." + run_install_cmd "install PyTorch (ROCm)" uv pip install --python "$_VENV_PY" \ + --force-reinstall "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" + fi + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" + ;; + esac + fi elif [ -n "$TORCH_INDEX_URL" ]; then # Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac) if [ "$SKIP_TORCH" = true ]; then substep "skipping PyTorch (--no-torch or Intel Mac x86_64)." 
"$C_WARN" + elif [ "$_amd_gpu_radeon" = true ]; then + _radeon_url=$(get_radeon_wheel_url) + if [ -n "$_radeon_url" ]; then + _radeon_listing_ok=false + if _radeon_fetch_listing "$_radeon_url" 2>/dev/null; then + _radeon_listing_ok=true + else + # Try shorter X.Y path (AMD publishes both X.Y.Z and X.Y dirs) + _radeon_url_short=$(printf '%s\n' "$_radeon_url" \ + | sed 's|rocm-rel-\([0-9]*\)\.\([0-9]*\)\.[0-9]*/|rocm-rel-\1.\2/|') + if [ "$_radeon_url_short" != "$_radeon_url" ] && \ + _radeon_fetch_listing "$_radeon_url_short" 2>/dev/null; then + _radeon_listing_ok=true + fi + fi + + if [ "$_radeon_listing_ok" = true ]; then + # Require torch, torchvision, torchaudio wheels to all resolve + # from the Radeon listing. If any is missing for this Python + # tag, fall through to the standard ROCm index instead of + # silently mixing Radeon wheels with PyPI defaults. + _torch_whl=$(_pick_radeon_wheel "torch" 2>/dev/null) || _torch_whl="" + _tv_whl=$(_pick_radeon_wheel "torchvision" 2>/dev/null) || _tv_whl="" + _ta_whl=$(_pick_radeon_wheel "torchaudio" 2>/dev/null) || _ta_whl="" + _tri_whl=$(_pick_radeon_wheel "triton" 2>/dev/null) || _tri_whl="" + # Some ROCm versions publish triton as pytorch_triton_rocm + if [ -z "$_tri_whl" ]; then + _tri_whl=$(_pick_radeon_wheel "pytorch_triton_rocm" 2>/dev/null) || _tri_whl="" + fi + # Sanity-check torch / torchvision / torchaudio are a + # matching release. The Radeon repo publishes multiple + # generations simultaneously, so picking the highest-version + # wheel for each package independently can assemble a + # mismatched trio (e.g. torch 2.9.1 + torchvision 0.23.0 + + # torchaudio 2.9.0 from the current rocm-rel-7.2.1 index). + # Check that torch and torchaudio share the same X.Y public + # version prefix, and that torchvision's minor correctly + # pairs with torch's minor (torchvision = torch.minor + 15 + # since torch 2.4 -> torchvision 0.19 -> torch 2.9 -> + # torchvision 0.24). 
+ # URL-decode each wheel name so %2B -> + before version + # extraction. Real Radeon wheel hrefs are percent-encoded + # (torch-2.10.0%2Brocm7.2.0...), so a plain [+-] terminator + # in the sed regex below would never match and + # _radeon_versions_match would stay false for every real + # listing, silently forcing a fallback to the generic + # ROCm index. + _torch_ver="" + _tv_ver="" + _ta_ver="" + if [ -n "$_torch_whl" ]; then + _torch_name=$(printf '%s' "${_torch_whl##*/}" | sed 's/%2[Bb]/+/g') + _torch_ver=$(printf '%s\n' "$_torch_name" | sed -n 's|^torch-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + fi + if [ -n "$_tv_whl" ]; then + _tv_name=$(printf '%s' "${_tv_whl##*/}" | sed 's/%2[Bb]/+/g') + _tv_ver=$(printf '%s\n' "$_tv_name" | sed -n 's|^torchvision-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + fi + if [ -n "$_ta_whl" ]; then + _ta_name=$(printf '%s' "${_ta_whl##*/}" | sed 's/%2[Bb]/+/g') + _ta_ver=$(printf '%s\n' "$_ta_name" | sed -n 's|^torchaudio-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p') + fi + _radeon_versions_match=false + if [ -n "$_torch_ver" ] && [ -n "$_tv_ver" ] && [ -n "$_ta_ver" ]; then + _torch_major=${_torch_ver%%.*} + _torch_minor=${_torch_ver#*.} + _ta_major=${_ta_ver%%.*} + _ta_minor=${_ta_ver#*.} + _tv_major=${_tv_ver%%.*} + _tv_minor=${_tv_ver#*.} + # torchvision expected minor (e.g. 
torch 2.9 -> 0.24) + _expected_tv_minor=$((_torch_minor + 15)) + if [ "$_torch_major" = "$_ta_major" ] && \ + [ "$_torch_minor" = "$_ta_minor" ] && \ + [ "$_tv_major" = "0" ] && \ + [ "$_tv_minor" = "$_expected_tv_minor" ]; then + _radeon_versions_match=true + fi + fi + if [ -z "$_torch_whl" ] || [ -z "$_tv_whl" ] || [ -z "$_ta_whl" ] || \ + [ "$_radeon_versions_match" != true ]; then + substep "[WARN] Radeon repo lacks a compatible wheel set for this Python; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" + else + substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..." + # Pass explicit wheel URLs so the matched trio is + # installed together. --find-links lets uv discover + # the Radeon listing for any local lookup, and PyPI + # (not disabled) provides transitive deps like + # filelock / sympy / networkx which are not in the + # Radeon listing. 
+ if [ -n "$_tri_whl" ]; then + run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \ + --find-links "$_RADEON_BASE_URL" \ + "$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl" + else + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + --find-links "$_RADEON_BASE_URL" \ + "$_torch_whl" "$_tv_whl" "$_ta_whl" + fi + fi + else + substep "[WARN] Radeon repo unavailable; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" + fi + else + substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to ROCm index" "$C_WARN" + run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \ + "$TORCH_CONSTRAINT" torchvision torchaudio \ + --index-url "$TORCH_INDEX_URL" + fi else substep "installing PyTorch ($TORCH_INDEX_URL)..." run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "$TORCH_CONSTRAINT" torchvision torchaudio \ --index-url "$TORCH_INDEX_URL" fi + # AMD ROCm: install bitsandbytes (once, after torch, for all ROCm paths). + # Gate on SKIP_TORCH=false so a user running with --no-torch on a ROCm + # host stays in GGUF-only mode rather than pulling in bitsandbytes, + # which is only useful once torch is present for training. + if [ "$SKIP_TORCH" = false ]; then + case "$TORCH_INDEX_URL" in + */rocm*) + substep "installing bitsandbytes for AMD ROCm..." + run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1" + ;; + esac + fi # Fresh: Step 2 - install unsloth, preserving pre-installed torch substep "installing unsloth (this may take a few minutes)..." if [ "$SKIP_TORCH" = true ]; then @@ -1070,7 +1491,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then # runtime deps (typer, safetensors, transformers, etc.) with --no-deps. 
run_install_cmd "install unsloth (no-torch)" uv pip install --python "$_VENV_PY" --no-deps \ --upgrade-package unsloth --upgrade-package unsloth-zoo \ - "unsloth>=2026.4.2" unsloth-zoo + "unsloth>=2026.4.4" unsloth-zoo _NO_TORCH_RT="$(_find_no_torch_runtime)" if [ -n "$_NO_TORCH_RT" ]; then run_install_cmd "install no-torch runtime deps" uv pip install --python "$_VENV_PY" --no-deps -r "$_NO_TORCH_RT" @@ -1081,7 +1502,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then fi elif [ "$STUDIO_LOCAL_INSTALL" = true ]; then run_install_cmd "install unsloth (local)" uv pip install --python "$_VENV_PY" \ - --upgrade-package unsloth "unsloth>=2026.4.2" unsloth-zoo + --upgrade-package unsloth "unsloth>=2026.4.4" unsloth-zoo substep "overlaying local repo (editable)..." run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps else @@ -1092,7 +1513,7 @@ else # Fallback: GPU detection failed to produce a URL -- let uv resolve torch substep "installing unsloth (this may take a few minutes)..." if [ "$STUDIO_LOCAL_INSTALL" = true ]; then - run_install_cmd "install unsloth (auto torch backend)" uv pip install --python "$_VENV_PY" unsloth-zoo "unsloth>=2026.4.2" --torch-backend=auto + run_install_cmd "install unsloth (auto torch backend)" uv pip install --python "$_VENV_PY" unsloth-zoo "unsloth>=2026.4.4" --torch-backend=auto substep "overlaying local repo (editable)..." 
run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps else diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py index 0454eada89..ebf30c14ed 100644 --- a/studio/backend/core/training/worker.py +++ b/studio/backend/core/training/worker.py @@ -86,6 +86,7 @@ def _probe_causal_conv1d_env() -> dict[str, str] | None: "'python_tag': f'cp{sys.version_info.major}{sys.version_info.minor}', " "'torch_mm': torch_mm, " "'cuda_major': str(int(str(torch.version.cuda).split('.', 1)[0])) if torch.version.cuda else '', " + "'hip_version': str(torch.version.hip) if getattr(torch.version, 'hip', None) else '', " "'cxx11abi': str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()" "}))" ), @@ -237,28 +238,111 @@ def _install_package_wheel_first( else: logger.info("No published %s wheel found: %s", display_name, wheel_url) - _send_status(event_queue, f"Installing {display_name} from PyPI...") - pypi_cmd = [ - sys.executable, - "-m", - "pip", - "install", - "--no-build-isolation", - "--no-deps", - "--no-cache-dir", - f"{pypi_name}=={pypi_version}", - ] - result = _sp.run( - pypi_cmd, - stdout = _sp.PIPE, - stderr = _sp.STDOUT, - text = True, - ) + is_hip = env and env.get("hip_version") + if is_hip and not shutil.which("hipcc"): + logger.error( + "%s requires hipcc for source compilation on ROCm. 
" + "Install the ROCm HIP SDK: https://rocm.docs.amd.com", + display_name, + ) + _send_status( + event_queue, + f"{display_name}: hipcc not found (ROCm HIP SDK required)", + ) + return + + if is_hip: + _send_status( + event_queue, + f"Compiling {display_name} from source for ROCm " + "(this may take several minutes)...", + ) + else: + _send_status(event_queue, f"Installing {display_name} from PyPI...") + + # Prefer uv for faster dependency resolution when available + if shutil.which("uv"): + pypi_cmd = [ + "uv", + "pip", + "install", + "--python", + sys.executable, + "--no-build-isolation", + "--no-deps", + ] + # Avoid stale cache artifacts from partial HIP source builds + if is_hip: + pypi_cmd.append("--no-cache") + pypi_cmd.append(f"{pypi_name}=={pypi_version}") + else: + pypi_cmd = [ + sys.executable, + "-m", + "pip", + "install", + "--no-build-isolation", + "--no-deps", + "--no-cache-dir", + f"{pypi_name}=={pypi_version}", + ] + + # Source compilation on ROCm can take 10-30 minutes; use a generous + # timeout. Non-HIP installs preserve the pre-existing "no timeout" + # behaviour so unrelated slow installs (e.g. causal-conv1d source + # build on Linux aarch64 or unsupported torch/CUDA combinations) + # are not aborted at 5 minutes by this PR. 
+ _run_kwargs: dict[str, Any] = { + "stdout": _sp.PIPE, + "stderr": _sp.STDOUT, + "text": True, + } + if is_hip: + _run_kwargs["timeout"] = 1800 + + try: + result = _sp.run(pypi_cmd, **_run_kwargs) + except _sp.TimeoutExpired: + logger.error( + "%s installation timed out after %ds", + display_name, + _run_kwargs.get("timeout"), + ) + _send_status( + event_queue, + f"{display_name} installation timed out after " + f"{_run_kwargs.get('timeout')}s", + ) + return + if result.returncode != 0: - logger.error("Failed to install %s from PyPI:\n%s", display_name, result.stdout) + if is_hip: + # Surface a clear error for ROCm source build failures + error_lines = (result.stdout or "").strip().splitlines() + snippet = "\n".join(error_lines[-5:]) if error_lines else "(no output)" + logger.error( + "Failed to compile %s for ROCm:\n%s", + display_name, + result.stdout, + ) + _send_status( + event_queue, + f"Failed to compile {display_name} for ROCm. " + "Check that hipcc and ROCm development headers are installed.\n" + f"{snippet}", + ) + else: + logger.error( + "Failed to install %s from PyPI:\n%s", + display_name, + result.stdout, + ) return - logger.info("Installed %s from PyPI", display_name) + if is_hip: + logger.info("Compiled and installed %s from source for ROCm", display_name) + else: + logger.info("Installed %s from PyPI", display_name) def _ensure_causal_conv1d_fast_path(event_queue: Any, model_name: str) -> None: diff --git a/studio/backend/main.py b/studio/backend/main.py index ad19ee9679..c2c0a0b6e4 100644 --- a/studio/backend/main.py +++ b/studio/backend/main.py @@ -237,6 +237,7 @@ async def get_system_info(): import platform import psutil from utils.hardware import get_device + from utils.hardware.hardware import _backend_label visibility_info = get_backend_visible_gpu_info() gpu_info = { @@ -250,7 +251,10 @@ async def get_system_info(): return { "platform": platform.platform(), "python_version": platform.python_version(), - "device_backend": get_device().value, 
+ # Use the centralized _backend_label helper so the /api/system + # endpoint reports "rocm" on AMD hosts instead of "cuda", matching + # the /api/hardware and /api/gpu-visibility endpoints. + "device_backend": _backend_label(get_device()), "cpu_count": psutil.cpu_count(), "memory": { "total_gb": round(memory.total / 1e9, 2), diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py index aaa0452406..400b5dd066 100644 --- a/studio/backend/utils/hardware/__init__.py +++ b/studio/backend/utils/hardware/__init__.py @@ -5,6 +5,7 @@ Hardware detection and GPU utilities """ +from . import hardware as _hardware from .hardware import ( DeviceType, DEVICE, @@ -49,6 +50,7 @@ "DeviceType", "DEVICE", "CHAT_ONLY", + "IS_ROCM", "detect_hardware", "get_device", "is_apple_silicon", @@ -81,3 +83,11 @@ "extract_arch_config", "estimate_training_vram", ] + + +def __getattr__(name: str): + """Resolve IS_ROCM at access time so callers always see the live value + after detect_hardware() runs (it flips the flag in hardware.py).""" + if name == "IS_ROCM": + return getattr(_hardware, "IS_ROCM") + raise AttributeError(name) diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py new file mode 100644 index 0000000000..a2522c5ca1 --- /dev/null +++ b/studio/backend/utils/hardware/amd.py @@ -0,0 +1,373 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +"""AMD GPU monitoring via amd-smi. + +Mirrors the nvidia.py module structure so hardware.py can swap backends +based on IS_ROCM. All functions return the same dict shapes as their +nvidia.py counterparts. 
def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[Any]:
    """Run ``amd-smi <args> --json`` and return the decoded JSON payload.

    Args:
        *args: Sub-command and flags inserted between ``amd-smi`` and
            ``--json``.
        timeout: Seconds to wait for the process before giving up.

    Returns:
        The parsed JSON value, or ``None`` when the binary cannot be
        started, times out, exits non-zero, prints nothing, or prints
        something that is not valid JSON. Each failure is logged as a
        warning.
    """
    try:
        result = subprocess.run(
            ["amd-smi", *args, "--json"],
            capture_output = True,
            text = True,
            timeout = timeout,
        )
    except (OSError, subprocess.TimeoutExpired) as e:
        logger.warning("amd-smi query failed: %s", e)
        return None
    if result.returncode != 0 or not result.stdout.strip():
        logger.warning("amd-smi returned code %d", result.returncode)
        return None
    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError:
        logger.warning("Failed to parse amd-smi JSON output")
        return None


def _parse_numeric(value: Any) -> Optional[float]:
    """Extract a finite float from an amd-smi field.

    Accepts ``None``, bare ints/floats, strings with an optional trailing
    unit (``"100 W"``, ``"37 %"``), and the ``{"value": ..., "unit": ...}``
    dict shape. Returns ``None`` for anything that is missing, is a
    placeholder such as ``"N/A"``, cannot be parsed, or is not finite.
    """
    if value is None:
        return None
    # Newer amd-smi versions emit {"value": 10, "unit": "W"}
    if isinstance(value, dict):
        return _parse_numeric(value.get("value"))
    if isinstance(value, (int, float)):
        try:
            number = float(value)
        except OverflowError:
            # An int too large for a float is not a usable metric.
            return None
        return number if math.isfinite(number) else None
    if isinstance(value, str):
        # Strip units like "W", "C", "%", "MB", "MiB", "GB", "GiB" etc.
        cleaned = re.sub(r"\s*[A-Za-z/%]+$", "", value.strip())
        if not cleaned or cleaned.lower() in ("n/a", "none", "unknown"):
            return None
        try:
            number = float(cleaned)
        except (ValueError, TypeError):
            return None
        # Apply the same finiteness rule as the numeric branch: a string
        # such as "1e999" parses to inf and must not leak into ratios.
        return number if math.isfinite(number) else None
    return None


def _parse_memory_mb(value: Any) -> Optional[float]:
    """Parse a memory value from amd-smi output and return it in MB.

    Handles bare numbers (treated as MB), dict-shaped values with an
    explicit unit (``{"value": 192, "unit": "GiB"}``), and plain strings
    like ``"8192 MiB"``. Returns ``None`` when no number can be extracted.
    """
    unit = ""
    raw_value = value

    if isinstance(value, dict):
        unit = str(value.get("unit", "")).strip().lower()
        raw_value = value.get("value")
    elif isinstance(value, str):
        # Extract the unit suffix from strings like "192 GiB" or "8192 MB".
        m = re.match(r"^\s*([\d.]+)\s*([A-Za-z]+)\s*$", value.strip())
        if m:
            unit = m.group(2).lower()

    num = _parse_numeric(raw_value)
    if num is None:
        return None

    # GB/GiB and MB/MiB are deliberately treated the same (binary
    # multiples), matching how the values are labelled by GPU tools.
    if "gib" in unit or "gb" in unit:
        return num * 1024
    if "mib" in unit or "mb" in unit:
        return num
    if "kib" in unit or "kb" in unit:
        return num / 1024
    if unit in ("b", "byte", "bytes"):
        return num / (1024 * 1024)

    # No explicit unit: default to MB. No magnitude-based "this must be
    # bytes" guess is made, because that misclassifies legitimate values.
    return num


def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]:
    """Map one GPU's amd-smi metric dict onto the standardized metric keys.

    Several alternative key names are tried for each field because the
    amd-smi layout varies by version. Any field that cannot be found or
    parsed is reported as ``None``; derived percentages are ``None`` unless
    both operands are present and the denominator is positive.
    """
    usage = gpu_data.get("usage", gpu_data.get("gpu_activity", {}))
    if isinstance(usage, dict):
        gpu_util = _parse_numeric(
            usage.get("gfx_activity", usage.get("gpu_use_percent"))
        )
    else:
        gpu_util = _parse_numeric(usage)

    # Temperature: prefer edge, then hotspot.
    temp_data = gpu_data.get("temperature", {})
    if isinstance(temp_data, dict):
        temp = _parse_numeric(
            temp_data.get(
                "edge",
                temp_data.get(
                    "temperature_edge",
                    temp_data.get("hotspot", temp_data.get("temperature_hotspot")),
                ),
            )
        )
    else:
        temp = _parse_numeric(temp_data)

    # Power
    power_data = gpu_data.get("power", {})
    if isinstance(power_data, dict):
        power_draw = _parse_numeric(
            power_data.get(
                "current_socket_power",
                power_data.get("average_socket_power", power_data.get("socket_power")),
            )
        )
        power_limit = _parse_numeric(
            power_data.get("power_cap", power_data.get("max_power_limit"))
        )
    else:
        power_draw = None
        power_limit = None

    # VRAM -- unit-aware parsing, values normalised to MB first.
    vram_data = gpu_data.get("vram", gpu_data.get("fb_memory_usage", {}))
    if isinstance(vram_data, dict):
        vram_used_mb = _parse_memory_mb(
            vram_data.get("vram_used", vram_data.get("used"))
        )
        vram_total_mb = _parse_memory_mb(
            vram_data.get("vram_total", vram_data.get("total"))
        )
    else:
        vram_used_mb = None
        vram_total_mb = None

    vram_used_gb = round(vram_used_mb / 1024, 2) if vram_used_mb is not None else None
    vram_total_gb = (
        round(vram_total_mb / 1024, 2) if vram_total_mb is not None else None
    )
    vram_util = (
        round((vram_used_mb / vram_total_mb) * 100, 1)
        if vram_used_mb is not None and vram_total_mb is not None and vram_total_mb > 0
        else None
    )
    power_util = (
        round((power_draw / power_limit) * 100, 1)
        if power_draw is not None and power_limit is not None and power_limit > 0
        else None
    )

    return {
        "gpu_utilization_pct": gpu_util,
        "temperature_c": temp,
        "vram_used_gb": vram_used_gb,
        "vram_total_gb": vram_total_gb,
        "vram_utilization_pct": vram_util,
        "power_draw_w": power_draw,
        "power_limit_w": power_limit,
        "power_utilization_pct": power_util,
    }


def _has_real_metrics(metrics: dict[str, Any]) -> bool:
    """Return True when ``metrics`` holds at least one non-None value.

    An all-``None`` dict is what ``_extract_gpu_metrics`` produces for a
    JSON object that has none of the expected fields; callers use this to
    report such a response as unavailable instead of as an empty device.
    """
    return any(value is not None for value in metrics.values())


def _gpu_entries(data: Any) -> list:
    """Normalise an amd-smi JSON envelope to a list of per-GPU entries.

    Shapes handled:
      * a JSON array                      -> returned as is
      * ``{"gpus": [...]}`` / ``{"gpu": [...]}`` -> the wrapped list
      * ``{"gpus": {...}}`` / ``{"gpu": {...}}`` -> the wrapped dict, in a list
      * a bare per-GPU dict               -> ``[data]``
      * anything else                     -> ``[]``

    A per-GPU dict stores its own id under ``"gpu"`` either as a scalar or
    as ``{"value": 0, "unit": "none"}``. Neither of those is an envelope,
    so the dict itself is returned as the single entry in that case.
    """
    if isinstance(data, list):
        return data
    if not isinstance(data, dict):
        return []
    for key in ("gpus", "gpu"):
        wrapped = data.get(key)
        if isinstance(wrapped, list):
            return wrapped
        if isinstance(wrapped, dict) and "value" not in wrapped:
            return [wrapped]
    return [data]


def get_physical_gpu_count() -> Optional[int]:
    """Return the number of GPUs listed by ``amd-smi list``, or None.

    ``None`` is returned when the query fails or the payload is neither a
    list nor a dict carrying a list under ``"gpu"`` / ``"gpus"``.
    """
    data = _run_amd_smi("list")
    if data is None:
        return None
    if isinstance(data, list):
        return len(data)
    if not isinstance(data, dict):
        return None
    gpus = data.get("gpu", data.get("gpus", []))
    if isinstance(gpus, list):
        return len(gpus)
    return None


def _first_visible_amd_gpu_id() -> Optional[str]:
    """Return the id of the first visible AMD GPU as a string.

    The visible set comes from ``hardware._get_parent_visible_gpu_spec()``.
    Returns ``"0"`` when that helper cannot be imported or reports
    non-numeric ids, and ``None`` when the visible set is empty so callers
    can short-circuit to ``available: False``.
    """
    try:
        from .hardware import _get_parent_visible_gpu_spec
    except ImportError:
        return "0"

    spec = _get_parent_visible_gpu_spec()
    numeric_ids = spec.get("numeric_ids")
    if numeric_ids is None:
        # Non-numeric IDs (UUIDs etc.) -- fall back to device 0.
        return "0"
    if len(numeric_ids) == 0:
        return None
    return str(numeric_ids[0])


def get_primary_gpu_utilization() -> dict[str, Any]:
    """Return utilization metrics for the primary visible AMD GPU.

    The result is ``{"available": False}`` when no GPU is visible, the
    amd-smi query fails, or the response contains no usable metric.
    Otherwise it is the standardized metric dict plus ``available: True``.
    """
    gpu_idx = _first_visible_amd_gpu_id()
    if gpu_idx is None:
        return {"available": False}
    data = _run_amd_smi("metric", "-g", gpu_idx)
    if data is None:
        return {"available": False}

    gpu_list = _gpu_entries(data)
    if not gpu_list:
        return {"available": False}

    gpu_data = gpu_list[0]
    if not isinstance(gpu_data, dict):
        return {"available": False}

    metrics = _extract_gpu_metrics(gpu_data)
    if not _has_real_metrics(metrics):
        # A JSON object with no usable fields is reported as unavailable
        # rather than as a device with empty data.
        return {"available": False}
    metrics["available"] = True
    return metrics


def get_visible_gpu_utilization(
    parent_visible_ids: Optional[list[int]],
    parent_cuda_visible_devices: Optional[str] = None,
) -> dict[str, Any]:
    """Return utilization metrics for every visible AMD GPU.

    Args:
        parent_visible_ids: GPU ids to report, in visible-ordinal order.
            ``None`` means the visible set could not be resolved.
        parent_cuda_visible_devices: Raw visibility string, echoed back
            under ``backend_cuda_visible_devices``.

    Returns:
        A dict with ``available``, ``backend_cuda_visible_devices``,
        ``parent_visible_gpu_ids``, ``devices`` and ``index_kind``. Each
        device carries the standardized metrics plus ``index``,
        ``index_kind`` and ``visible_ordinal``. Entries that are not dicts,
        are not in the visible set, or carry no usable metric are skipped.
    """
    if parent_visible_ids is None:
        return {
            "available": False,
            "backend_cuda_visible_devices": parent_cuda_visible_devices,
            "parent_visible_gpu_ids": [],
            "devices": [],
            "index_kind": "unresolved",
        }

    data = _run_amd_smi("metric")
    if data is None:
        return {
            "available": False,
            "backend_cuda_visible_devices": parent_cuda_visible_devices,
            "parent_visible_gpu_ids": parent_visible_ids or [],
            "devices": [],
            "index_kind": "physical",
        }

    # Same envelope handling as get_primary_gpu_utilization, so that a
    # dict-wrapped or bare single-GPU response is not iterated key by key
    # and a scalar "gpu" id is never handed to enumerate().
    gpu_list = _gpu_entries(data)
    visible_set = set(parent_visible_ids)
    ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)}

    devices = []
    for fallback_idx, gpu_data in enumerate(gpu_list):
        if not isinstance(gpu_data, dict):
            continue
        # Prefer the id reported by amd-smi; _parse_numeric copes with bare
        # numbers, strings and the {"value": 0, "unit": "none"} shape.
        raw_id = gpu_data.get(
            "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx))
        )
        parsed_id = _parse_numeric(raw_id)
        if parsed_id is None:
            logger.debug(
                "amd-smi GPU id %r could not be parsed; falling back to "
                "enumeration index %d",
                raw_id,
                fallback_idx,
            )
            idx = fallback_idx
        else:
            idx = int(parsed_id)
        if idx not in visible_set:
            continue
        metrics = _extract_gpu_metrics(gpu_data)
        if not _has_real_metrics(metrics):
            # Skip entries with no usable fields so they do not show up as
            # a device row full of None values.
            continue
        metrics["index"] = idx
        metrics["index_kind"] = "physical"
        metrics["visible_ordinal"] = ordinal_map.get(idx, len(devices))
        devices.append(metrics)

    return {
        "available": len(devices) > 0,
        "backend_cuda_visible_devices": parent_cuda_visible_devices,
        "parent_visible_gpu_ids": parent_visible_ids or [],
        "devices": devices,
        "index_kind": "physical",
    }
CPU (fallback) """ - global DEVICE, CHAT_ONLY - CHAT_ONLY = True # reset -- only CUDA sets it to False + global DEVICE, CHAT_ONLY, IS_ROCM + CHAT_ONLY = True # reset -- only CUDA/ROCm sets it to False + IS_ROCM = False - # --- CUDA: try PyTorch --- + # --- CUDA / ROCm: try PyTorch --- if _has_torch(): import torch @@ -96,7 +117,16 @@ def detect_hardware() -> DeviceType: DEVICE = DeviceType.CUDA CHAT_ONLY = False device_name = torch.cuda.get_device_properties(0).name - print(f"Hardware detected: CUDA — {device_name}") + + # Distinguish AMD ROCm (HIP) from NVIDIA CUDA for display purposes. + # DeviceType stays CUDA since torch.cuda.* works on ROCm via HIP. + if getattr(torch.version, "hip", None) is not None: + IS_ROCM = True + print( + f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}" + ) + else: + print(f"Hardware detected: CUDA -- {device_name}") return DEVICE # --- XPU: Intel GPU --- @@ -186,7 +216,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "device": idx, "device_name": props.name, "total_gb": total / (1024**3), @@ -197,7 +227,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting CUDA GPU info: {e}") - return {"available": False, "backend": device.value, "error": str(e)} + return { + "available": False, + "backend": _backend_label(device), + "error": str(e), + } # ---- XPU path (Intel GPU) ---- if device == DeviceType.XPU: @@ -213,7 +247,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "device": idx, "device_name": props.name, "total_gb": total / (1024**3), @@ -224,7 +258,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error("Error getting XPU GPU info: %s", e) - return {"available": False, "backend": device.value, "error": str(e)} + return { + "available": False, + 
"backend": _backend_label(device), + "error": str(e), + } # ---- MLX path (Apple Silicon) ---- if device == DeviceType.MLX: @@ -239,7 +277,7 @@ def get_gpu_memory_info() -> Dict[str, Any]: return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "device": 0, "device_name": f"Apple Silicon ({platform.processor() or platform.machine()})", "total_gb": total / (1024**3), @@ -250,7 +288,11 @@ def get_gpu_memory_info() -> Dict[str, Any]: } except Exception as e: logger.error(f"Error getting MLX GPU info: {e}") - return {"available": False, "backend": device.value, "error": str(e)} + return { + "available": False, + "backend": _backend_label(device), + "error": str(e), + } # ---- CPU-only ---- return {"available": False, "backend": "cpu"} @@ -315,13 +357,15 @@ def get_package_versions() -> Dict[str, Optional[str]]: except PackageNotFoundError: versions[name] = None - # CUDA toolkit version bundled with torch + # GPU runtime version bundled with torch try: import torch versions["cuda"] = getattr(torch.version, "cuda", None) + versions["rocm"] = getattr(torch.version, "hip", None) except Exception: versions["cuda"] = None + versions["rocm"] = None return versions @@ -387,26 +431,50 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any] # ========== Live GPU Utilization ========== +def _smi_query(func_name: str, *args, **kwargs) -> Optional[Dict[str, Any]]: + """Run a query against the appropriate SMI backend (amd-smi or nvidia-smi). + + Returns the result dict if available, or None on failure/unavailability. + """ + if IS_ROCM: + backend_name = "amd-smi" + try: + from . import amd as _backend + except Exception as e: + logger.warning("%s import failed: %s", backend_name, e) + return None + else: + backend_name = "nvidia-smi" + try: + from . 
import nvidia as _backend + except Exception as e: + logger.warning("%s import failed: %s", backend_name, e) + return None + try: + func = getattr(_backend, func_name) + result = func(*args, **kwargs) + if result.get("available"): + return result + except Exception as e: + logger.warning("%s %s query failed: %s", backend_name, func_name, e) + return None + + def get_gpu_utilization() -> Dict[str, Any]: """Return a live snapshot of device utilization information.""" device = get_device() if device == DeviceType.CUDA: - try: - from . import nvidia - - result = nvidia.get_primary_gpu_utilization() - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("nvidia-smi utilization query failed: %s", e) + result = _smi_query("get_primary_gpu_utilization") + if result is not None: + result["backend"] = _backend_label(device) + return result mem = get_gpu_memory_info() if device != DeviceType.CPU and mem.get("available"): return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "gpu_utilization_pct": None, "temperature_c": None, "vram_used_gb": round(mem.get("allocated_gb", 0), 2), @@ -417,7 +485,7 @@ def get_gpu_utilization() -> Dict[str, Any]: "power_utilization_pct": None, } - return {"available": False, "backend": device.value} + return {"available": False, "backend": _backend_label(device)} def get_visible_gpu_utilization() -> Dict[str, Any]: @@ -425,18 +493,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: if device == DeviceType.CUDA: parent_visible_spec = _get_parent_visible_gpu_spec() - try: - from . 
import nvidia - - result = nvidia.get_visible_gpu_utilization( - parent_visible_spec["numeric_ids"], - parent_cuda_visible_devices = parent_visible_spec["raw"], - ) - if result.get("available"): - result["backend"] = device.value - return result - except Exception as e: - logger.warning("nvidia-smi visible GPU utilization query failed: %s", e) + result = _smi_query( + "get_visible_gpu_utilization", + parent_visible_spec["numeric_ids"], + parent_cuda_visible_devices = parent_visible_spec["raw"], + ) + if result is not None: + result["backend"] = _backend_label(device) + return result # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel) if device in (DeviceType.CUDA, DeviceType.XPU): @@ -475,7 +539,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: ) return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": parent_ids, "devices": devices, "index_kind": index_kind, @@ -486,14 +550,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: if not mem.get("available"): return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": [], "devices": [], "index_kind": "relative", } return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": [0], "devices": [ { @@ -515,7 +579,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "parent_visible_gpu_ids": [], "devices": [], "index_kind": "relative", @@ -528,7 +592,84 @@ def get_visible_gpu_utilization() -> Dict[str, Any]: _visible_gpu_count: Optional[int] = None +def _parse_visible_ids(raw: str) -> tuple: + """Parse a CUDA/HIP/ROCR visibility string into (numeric_ids, ok). + + Returns ([int, ...], True) on success, (None, False) when tokens are + non-numeric (UUIDs, BDF addresses). 
+ """ + raw = raw.strip() + if raw in ("", "-1"): + return [], True + tokens = [v.strip() for v in raw.split(",") if v.strip()] + try: + return [int(v) for v in tokens], True + except ValueError: + return None, False + + def _get_parent_visible_gpu_spec() -> Dict[str, Any]: + # ── ROCm layered visibility ── + # On ROCm, ROCR_VISIBLE_DEVICES narrows the *physical* GPU set first. + # HIP_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES then select *ordinals* + # within that narrowed set (not physical IDs). Example: + # ROCR_VISIBLE_DEVICES=2,3 HIP_VISIBLE_DEVICES=1 + # -> physical set is [2,3], HIP ordinal 1 = physical GPU 3 + # We must compose through both layers to report the correct physical IDs. + if IS_ROCM: + rocr_raw = os.environ.get("ROCR_VISIBLE_DEVICES") + hip_raw = os.environ.get("HIP_VISIBLE_DEVICES") + cuda_raw = os.environ.get("CUDA_VISIBLE_DEVICES") + + # If any mask is explicitly empty / -1, all GPUs are hidden. + for val in (rocr_raw, hip_raw, cuda_raw): + if val is not None and val.strip() in ("", "-1"): + return { + "raw": val.strip(), + "numeric_ids": [], + "supports_explicit_gpu_ids": True, + } + + # Layer 1: ROCR narrows the physical set. + if rocr_raw is not None: + physical_ids, ok = _parse_visible_ids(rocr_raw) + if not ok: + return { + "raw": rocr_raw, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + else: + physical_ids = list(range(get_physical_gpu_count())) + + # Layer 2: HIP or CUDA selects ordinals within the ROCR set. + child_raw = hip_raw if hip_raw is not None else cuda_raw + if child_raw is not None: + ordinals, ok = _parse_visible_ids(child_raw) + if not ok: + return { + "raw": child_raw, + "numeric_ids": None, + "supports_explicit_gpu_ids": False, + } + # Map ordinals back to physical IDs. 
def _backend_visible_devices_env() -> Optional[str]:
    """Return the raw visibility string reported for the active backend.

    When ``IS_ROCM`` is false this is simply ``CUDA_VISIBLE_DEVICES``
    (``None`` if unset). On ROCm it is the ``"raw"`` entry produced by
    ``_get_parent_visible_gpu_spec()``, so the value shown under
    ``backend_cuda_visible_devices`` is whichever mask that resolver
    selected rather than always the CUDA variable.
    """
    if not IS_ROCM:
        return os.environ.get("CUDA_VISIBLE_DEVICES")
    return _get_parent_visible_gpu_spec().get("raw")
Dict[str, Any]: } return { "available": True, - "backend": device.value, + "backend": _backend_label(device), "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [0], "devices": [ @@ -1226,7 +1382,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]: return { "available": False, - "backend": device.value, + "backend": _backend_label(device), "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"), "parent_visible_gpu_ids": [], "devices": [], @@ -1246,17 +1402,20 @@ def get_visible_gpu_count() -> int: if _visible_gpu_count is not None: return _visible_gpu_count - cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES") - if cuda_visible is not None: - # "" means zero GPUs, "0" means 1, "0,1,2" means 3 - cuda_visible = cuda_visible.strip() - if cuda_visible == "" or cuda_visible == "-1": + # Use _get_parent_visible_gpu_spec() which already handles + # HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES on ROCm. + visible_spec = _get_parent_visible_gpu_spec() + if visible_spec["raw"] is not None: + raw = visible_spec["raw"].strip() + if raw == "" or raw == "-1": _visible_gpu_count = 0 + elif visible_spec["numeric_ids"] is not None: + _visible_gpu_count = len(visible_spec["numeric_ids"]) else: - _visible_gpu_count = len([x for x in cuda_visible.split(",") if x.strip()]) + _visible_gpu_count = len([x for x in raw.split(",") if x.strip()]) return _visible_gpu_count - # CUDA_VISIBLE_DEVICES not set -- try torch, fall back to physical count + # No visibility env var set -- try torch, fall back to physical count try: import torch @@ -1288,8 +1447,39 @@ def apply_gpu_ids(gpu_ids) -> None: value = str(gpu_ids) os.environ["CUDA_VISIBLE_DEVICES"] = value + # Keep ROCm visibility env vars in sync so _get_parent_visible_gpu_spec() + # picks up the narrowed set on AMD systems. 
Workers can call + # apply_gpu_ids() before detect_hardware() runs (so IS_ROCM is still + # its default False), so also mirror the selection whenever the + # parent process already set a ROCm visibility variable -- that + # way a downstream ROCm process inherits the narrowed mask even + # before Studio's hardware detection has classified the host. + # + # ROCm layered visibility: ROCR_VISIBLE_DEVICES holds *physical* IDs, + # while HIP_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES hold *ordinals* + # within the ROCR set. When narrowing to specific physical GPUs we + # set ROCR to the physical IDs and reset HIP/CUDA to a zero-based + # sequence so ordinals map 1:1 to the new ROCR set. + _inherits_rocm_visibility = ( + "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ + ) + if IS_ROCM or _inherits_rocm_visibility: + os.environ["ROCR_VISIBLE_DEVICES"] = value + # HIP/CUDA ordinals are relative to the ROCR set above. + n_gpus = len(value.split(",")) if value.strip() else 0 + relative = ",".join(str(i) for i in range(n_gpus)) + os.environ["HIP_VISIBLE_DEVICES"] = relative + os.environ["CUDA_VISIBLE_DEVICES"] = relative _visible_gpu_count = None - logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) + if IS_ROCM or _inherits_rocm_visibility: + logger.info( + "Applied gpu_ids: ROCR_VISIBLE_DEVICES='%s', " + "HIP_VISIBLE_DEVICES='%s' (rocm)", + value, + os.environ.get("HIP_VISIBLE_DEVICES", ""), + ) + else: + logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value) def get_device_map( diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 8d06c7d0e1..2ab5eadc45 100755 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -173,6 +173,7 @@ class HostInfo: visible_cuda_devices: str | None has_physical_nvidia: bool has_usable_nvidia: bool + has_rocm: bool = False @dataclass @@ -2493,12 +2494,25 @@ def detect_host() -> HostInfo: has_physical_nvidia = False has_usable_nvidia = False 
if nvidia_smi: + # Require `nvidia-smi -L` to actually list a GPU before treating the + # host as NVIDIA. The banner text "NVIDIA-SMI ..." is printed even + # when the command fails to communicate with the driver (e.g. stale + # container leftovers), which would otherwise misclassify an AMD + # ROCm host as NVIDIA and short-circuit the ROCm path. try: - result = run_capture([nvidia_smi], timeout = 20) - merged = "\n".join(part for part in (result.stdout, result.stderr) if part) - if "NVIDIA-SMI" in merged: + listing = run_capture([nvidia_smi, "-L"], timeout = 20) + gpu_lines = [ + line for line in listing.stdout.splitlines() if line.startswith("GPU ") + ] + if gpu_lines: has_physical_nvidia = True has_usable_nvidia = visible_device_tokens != [] + except Exception: + pass + + try: + result = run_capture([nvidia_smi], timeout = 20) + merged = "\n".join(part for part in (result.stdout, result.stderr) if part) for line in merged.splitlines(): if "CUDA Version:" in line: raw = line.split("CUDA Version:", 1)[1].strip().split()[0] @@ -2538,6 +2552,12 @@ def detect_host() -> HostInfo: if visible_gpu_rows: has_usable_nvidia = True + # Older nvidia-smi versions (pre -L support) hit the + # except in the first try block but still succeed here, + # leaving has_physical_nvidia unset. Mirror the -L path + # so downstream diagnostics on line ~4390 still run. + if not has_physical_nvidia: + has_physical_nvidia = True elif visible_device_tokens == []: has_usable_nvidia = False elif supports_explicit_visible_device_matching(visible_device_tokens): @@ -2547,6 +2567,61 @@ def detect_host() -> HostInfo: except Exception: pass + # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed + + def _amd_smi_has_gpu(stdout: str) -> bool: + """Check for 'GPU: ' data rows, not just a table header.""" + return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout)) + + # Honour GPU visibility masks so hidden GPUs are not detected. 
+ # On ROCm, ROCR_VISIBLE_DEVICES narrows the physical set, then + # CUDA/HIP_VISIBLE_DEVICES further restricts within that. If ANY + # is empty or "-1", all GPUs are hidden. + _rocm_vis_enabled = True + for _env_name in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"): + _env_raw = os.environ.get(_env_name) + if _env_raw is not None and _env_raw.strip() in {"", "-1"}: + _rocm_vis_enabled = False + break + + has_rocm = False + if _rocm_vis_enabled and is_linux: + for _cmd, _check in ( + # rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent) + (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), + (["amd-smi", "list"], _amd_smi_has_gpu), + ): + _exe = shutil.which(_cmd[0]) + if not _exe: + continue + try: + _result = run_capture([_exe, *_cmd[1:]], timeout = 10) + except Exception: + continue + if _result.returncode == 0 and _result.stdout.strip(): + if _check(_result.stdout): + has_rocm = True + break + elif _rocm_vis_enabled and is_windows: + # Windows: prefer active probes that validate GPU presence + for _cmd, _check in ( + (["hipinfo"], lambda out: "gcnarchname" in out.lower()), + (["amd-smi", "list"], _amd_smi_has_gpu), + ): + _exe = shutil.which(_cmd[0]) + if not _exe: + continue + try: + _result = run_capture([_exe, *_cmd[1:]], timeout = 10) + except Exception: + continue + if _result.returncode == 0 and _result.stdout.strip(): + if _check(_result.stdout): + has_rocm = True + break + # Note: amdhip64.dll presence alone is NOT treated as GPU evidence + # since the HIP SDK can be installed without an AMD GPU. 
+ return HostInfo( system = system, machine = machine, @@ -2561,6 +2636,7 @@ def detect_host() -> HostInfo: visible_cuda_devices = visible_cuda_devices, has_physical_nvidia = has_physical_nvidia, has_usable_nvidia = has_usable_nvidia, + has_rocm = has_rocm, ) @@ -2926,9 +3002,168 @@ def published_asset_choice_for_kind( return None +def _detect_host_rocm_version() -> tuple[int, int] | None: + """Return (major, minor) of the installed ROCm runtime, or None. + + Best-effort read from /opt/rocm/.info/version, amd-smi version, and + hipconfig --version. Used to pick a compatible upstream llama.cpp + ROCm prebuilt rather than always taking the numerically newest one + (which can be newer than the host runtime). + """ + rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" + for path in ( + os.path.join(rocm_root, ".info", "version"), + os.path.join(rocm_root, "lib", "rocm_version"), + ): + try: + with open(path) as fh: + parts = fh.read().strip().split("-")[0].split(".") + # Explicit length guard avoids relying on the broad except + # below to swallow IndexError when the version file contains + # a single component (e.g. "6\n" on a partial install). 
+ if len(parts) >= 2: + return int(parts[0]), int(parts[1]) + except Exception: + pass + amd_smi = shutil.which("amd-smi") + if amd_smi: + try: + result = subprocess.run( + [amd_smi, "version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) + if m: + return int(m.group(1)), int(m.group(2)) + except Exception: + pass + hipconfig = shutil.which("hipconfig") + if hipconfig: + try: + result = subprocess.run( + [hipconfig, "--version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + raw = (result.stdout or "").strip().split("\n")[0] + parts = raw.split(".") + if ( + len(parts) >= 2 + and parts[0].isdigit() + and parts[1].split("-")[0].isdigit() + ): + return int(parts[0]), int(parts[1].split("-")[0]) + except Exception: + pass + + # Distro package-manager fallbacks. Mirrors install.sh::get_torch_index_url + # and _detect_rocm_version() in install_python_stack.py so package-managed + # ROCm hosts without /opt/rocm/.info/version still report a usable version + # and the <= host version filter in resolve_upstream_asset_choice picks + # the correct upstream prebuilt instead of the newest-regardless fallback. + for _cmd in ( + ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"], + ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"], + ): + _exe = shutil.which(_cmd[0]) + if not _exe: + continue + try: + _result = subprocess.run( + [_exe, *_cmd[1:]], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + except Exception: + continue + if _result.returncode != 0 or not _result.stdout.strip(): + continue + _raw = _result.stdout.strip() + # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing. 
+ _raw = re.sub(r"^\d+:", "", _raw) + _m = re.match(r"(\d+)[.-](\d+)", _raw) + if _m: + return int(_m.group(1)), int(_m.group(2)) + return None + + def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice: upstream_assets = github_release_assets(UPSTREAM_REPO, llama_tag) if host.is_linux and host.is_x86_64: + # AMD ROCm: try upstream ROCm prebuilt first, then fall back to source build. + # Source build (via setup.sh) compiles with -DGGML_HIP=ON and auto-detects + # the exact GPU target via rocminfo, which is more reliable for consumer + # GPUs (e.g. gfx1151) that may not be in the prebuilt. + if host.has_rocm and not host.has_usable_nvidia: + # Scan upstream assets for any rocm- prebuilt. When the + # host ROCm runtime version is known, pick the newest candidate + # whose major.minor is <= host version -- otherwise a ROCm 6.4 + # host would download the rocm-7.2 tarball, fail preflight, and + # fall back to a source build even though a compatible 6.4 + # prebuilt exists. If no compatible candidate matches (e.g. host + # runtime is older than every published prebuilt), fall back to + # the numerically newest so we at least try something. 
+ _rocm_pattern = re.compile( + rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz" + ) + rocm_candidates: list[tuple[tuple[int, ...], str]] = [] + for _name in upstream_assets: + _m = _rocm_pattern.match(_name) + if _m is None: + continue + _parts = tuple(int(p) for p in _m.group(1).split(".")) + rocm_candidates.append((_parts, _name)) + rocm_candidates.sort(reverse = True) + _host_rocm_version = _detect_host_rocm_version() + _compatible: list[tuple[tuple[int, ...], str]] = rocm_candidates + if _host_rocm_version is not None: + _compatible = [ + item + for item in rocm_candidates + if item[0][:2] <= _host_rocm_version + ] + if rocm_candidates and not _compatible: + # Fall back to the newest candidate so a source build is + # not forced when the host runtime is older than every + # published prebuilt: preflight will still catch a true + # incompatibility and trigger a fallback. + _compatible = rocm_candidates[:1] + if _compatible: + rocm_name = _compatible[0][1] + if _host_rocm_version is not None: + log( + f"AMD ROCm {_host_rocm_version[0]}.{_host_rocm_version[1]} " + f"detected -- trying upstream prebuilt {rocm_name}" + ) + else: + log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}") + log( + "Note: if your ROCm runtime version differs significantly, " + "this may fail preflight and fall back to a source build (safe)" + ) + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = rocm_name, + url = upstream_assets[rocm_name], + source_label = "upstream", + install_kind = "linux-rocm", + ) + # No ROCm prebuilt available -- fall back to source build + raise PrebuiltFallback( + "AMD ROCm detected but no upstream ROCm prebuilt found; " + "falling back to source build with HIP support" + ) + upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Linux CPU asset was not found") @@ -2948,6 +3183,25 @@ def 
resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice return attempts[0] raise PrebuiltFallback("no compatible Windows CUDA asset was found") + # AMD ROCm on Windows: try HIP prebuilt + if host.has_rocm: + hip_name = f"llama-{llama_tag}-bin-win-hip-radeon-x64.zip" + if hip_name in upstream_assets: + log( + f"AMD ROCm detected on Windows -- trying upstream HIP prebuilt {hip_name}" + ) + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = hip_name, + url = upstream_assets[hip_name], + source_label = "upstream", + install_kind = "windows-hip", + ) + log( + "AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU" + ) + upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Windows CPU asset was not found") @@ -3028,8 +3282,19 @@ def resolve_release_asset_choice( ) published_choice: AssetChoice | None = None - if host.is_windows and host.is_x86_64: - published_choice = published_asset_choice_for_kind(release, "windows-cpu") + if host.is_linux and host.is_x86_64 and host.has_rocm and not host.has_usable_nvidia: + published_choice = published_asset_choice_for_kind(release, "linux-rocm") + elif host.is_windows and host.is_x86_64: + # AMD Windows hosts should prefer a hash-approved published + # Windows HIP bundle when one exists, but otherwise fall through + # to resolve_asset_choice() so the upstream HIP prebuilt is + # tried before the CPU fallback. Hard-pinning the published + # windows-cpu bundle here would make the new HIP path + # unreachable. 
+ if host.has_rocm: + published_choice = published_asset_choice_for_kind(release, "windows-hip") + else: + published_choice = published_asset_choice_for_kind(release, "windows-cpu") elif host.is_macos and host.is_arm64: published_choice = published_asset_choice_for_kind(release, "macos-arm64") elif host.is_macos and host.is_x86_64: @@ -3378,7 +3643,7 @@ def overlay_directory_for_choice( def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: - if choice.install_kind in {"linux-cpu", "linux-cuda"}: + if choice.install_kind in {"linux-cpu", "linux-cuda", "linux-rocm"}: return [ "llama-server", "llama-quantize", @@ -3388,11 +3653,12 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: "libmtmd.so*", "libggml-cpu-*.so*", "libggml-cuda.so*", + "libggml-hip.so*", "libggml-rpc.so*", ] if choice.install_kind in {"macos-arm64", "macos-x64"}: return ["llama-server", "llama-quantize", "lib*.dylib"] - if choice.install_kind in {"windows-cpu", "windows-cuda"}: + if choice.install_kind in {"windows-cpu", "windows-cuda", "windows-hip"}: return ["*.exe", "*.dll"] raise PrebuiltFallback( f"unsupported install kind for runtime overlay: {choice.install_kind}" @@ -4117,6 +4383,7 @@ def validate_server( install_dir: Path, *, runtime_line: str | None = None, + install_kind: str | None = None, ) -> None: last_failure: PrebuiltFallback | None = None for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1): @@ -4140,7 +4407,33 @@ def validate_server( "--batch-size", "32", ] - if host.has_usable_nvidia or (host.is_macos and host.is_arm64): + # Only enable GPU offload for assets that actually ship GPU code. + # Gating on `host.has_rocm` alone breaks the intentional CPU + # fallback on AMD Windows hosts without a HIP prebuilt: the CPU + # binary would be launched with `--n-gpu-layers 1` and fail + # validation. 
Use the resolved install_kind as the source of + # truth and fall back to host detection when the caller did not + # pass one (keeps backwards compatibility with older call sites). + _gpu_kinds = { + "linux-cuda", + "linux-rocm", + "windows-cuda", + "windows-hip", + "macos-arm64", + } + if install_kind is not None: + _enable_gpu_layers = install_kind in _gpu_kinds + else: + # Older call sites that don't pass install_kind: keep ROCm + # hosts in the GPU-validation path so an AMD-only Linux host + # is exercised against the actual hardware rather than the + # CPU fallback. NVIDIA and macOS-arm64 are already covered. + _enable_gpu_layers = ( + host.has_usable_nvidia + or host.has_rocm + or (host.is_macos and host.is_arm64) + ) + if _enable_gpu_layers: command.extend(["--n-gpu-layers", "1"]) log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log") @@ -4664,10 +4957,21 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: ["libggml*.dylib"], ["libmtmd*.dylib"], ] + if choice.install_kind == "linux-rocm": + return [ + ["libllama.so*"], + ["libggml.so*"], + ["libggml-base.so*"], + ["libggml-cpu-*.so*"], + ["libmtmd.so*"], + ["libggml-hip.so*"], + ] if choice.install_kind == "windows-cpu": return [["llama.dll"]] if choice.install_kind == "windows-cuda": return [["llama.dll"], ["ggml-cuda.dll"]] + if choice.install_kind == "windows-hip": + return [["llama.dll"], ["*hip*.dll"]] return [] @@ -4839,6 +5143,7 @@ def validate_prebuilt_choice( host, install_dir, runtime_line = choice.runtime_line, + install_kind = choice.install_kind, ) log(f"staged prebuilt validation succeeded for {choice.name}") return server_path, quantize_path diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py index f2981ea665..cbbb81f913 100644 --- a/studio/install_python_stack.py +++ b/studio/install_python_stack.py @@ -25,6 +25,319 @@ IS_MACOS = sys.platform == "darwin" IS_MAC_INTEL = IS_MACOS and platform.machine() == "x86_64" +# ── 
ROCm / AMD GPU support ───────────────────────────────────────────────────── +# Mapping from detected ROCm (major, minor) to the best PyTorch wheel tag on +# download.pytorch.org. Entries are checked newest-first (>=). +# ROCm 7.2 only has torch 2.11.0 on download.pytorch.org, which exceeds the +# current torch upper bound (<2.11.0). Fall back to rocm7.1 (torch 2.10.0). +# TODO: uncomment rocm7.2 when torch upper bound is bumped to >=2.11.0 +_ROCM_TORCH_INDEX: dict[tuple[int, int], str] = { + # (7, 2): "rocm7.2", # torch 2.11.0 -- requires torch>=2.11 + (7, 1): "rocm7.1", + (7, 0): "rocm7.0", + (6, 4): "rocm6.4", + (6, 3): "rocm6.3", + (6, 2): "rocm6.2", + (6, 1): "rocm6.1", + (6, 0): "rocm6.0", +} +_PYTORCH_WHL_BASE = "https://download.pytorch.org/whl" + + +def _detect_rocm_version() -> tuple[int, int] | None: + """Return (major, minor) of the installed ROCm stack, or None.""" + # Check /opt/rocm/.info/version or ROCM_PATH equivalent + rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm" + for path in ( + os.path.join(rocm_root, ".info", "version"), + os.path.join(rocm_root, "lib", "rocm_version"), + ): + try: + with open(path) as fh: + parts = fh.read().strip().split("-")[0].split(".") + # Explicit length guard avoids relying on the broad except + # below to swallow IndexError when the version file contains + # a single component (e.g. "6\n" on a partial install). + if len(parts) >= 2: + return int(parts[0]), int(parts[1]) + except Exception: + pass + + # Try amd-smi version (outputs "... 
| ROCm version: X.Y.Z") + amd_smi = shutil.which("amd-smi") + if amd_smi: + try: + result = subprocess.run( + [amd_smi, "version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + import re + + m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout) + if m: + return int(m.group(1)), int(m.group(2)) + except Exception: + pass + + # Try hipconfig --version (outputs bare version like "6.3.21234.2") + hipconfig = shutil.which("hipconfig") + if hipconfig: + try: + result = subprocess.run( + [hipconfig, "--version"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + if result.returncode == 0: + raw = result.stdout.strip().split("\n")[0] + parts = raw.split(".") + if ( + len(parts) >= 2 + and parts[0].isdigit() + and parts[1].split("-")[0].isdigit() + ): + return int(parts[0]), int(parts[1].split("-")[0]) + except Exception: + pass + + # Distro package-manager fallbacks. Package-managed ROCm installs can + # expose GPUs via rocminfo / amd-smi but still lack /opt/rocm/.info/version + # and hipconfig, so probe dpkg (Debian/Ubuntu) and rpm (RHEL/Fedora/SUSE) + # for the rocm-core package version. Matches the chain in + # install.sh::get_torch_index_url so `unsloth studio update` behaves + # the same as a fresh `curl | sh` install. + import re as _re_pkg + + for cmd in ( + ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"], + ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"], + ): + exe = shutil.which(cmd[0]) + if not exe: + continue + try: + result = subprocess.run( + [exe, *cmd[1:]], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 5, + ) + except Exception: + continue + if result.returncode != 0 or not result.stdout.strip(): + continue + raw = result.stdout.strip() + # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing. 
+ raw = _re_pkg.sub(r"^\d+:", "", raw) + m = _re_pkg.match(r"(\d+)[.-](\d+)", raw) + if m: + return int(m.group(1)), int(m.group(2)) + + return None + + +def _rocm_devices_enabled() -> bool: + """Return True when no env var explicitly hides all AMD GPUs. + + On ROCm, ROCR_VISIBLE_DEVICES narrows the physical set, then + CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES further restricts within + that set. If ANY of the defined vars is "" or "-1", all GPUs are hidden. + """ + for name in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"): + raw = os.environ.get(name) + if raw is not None and raw.strip() in {"", "-1"}: + return False + return True + + +def _has_rocm_gpu() -> bool: + """Return True only if an actual AMD GPU is visible (not just ROCm tools installed).""" + if not _rocm_devices_enabled(): + return False + import re + + for cmd, check_fn in ( + # rocminfo: look for "Name: gfxNNNN" with nonzero first digit (gfx000 is the CPU agent) + (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))), + # amd-smi list: require "GPU: " data rows, not just a header + ( + ["amd-smi", "list"], + lambda out: bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", out)), + ), + ): + exe = shutil.which(cmd[0]) + if not exe: + continue + try: + result = subprocess.run( + [exe, *cmd[1:]], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 10, + ) + except Exception: + continue + if result.returncode == 0 and result.stdout.strip(): + if check_fn(result.stdout): + return True + return False + + +def _has_usable_nvidia_gpu() -> bool: + """Return True only when nvidia-smi exists AND reports at least one GPU. + + Respects CUDA_VISIBLE_DEVICES="" or "-1" so mixed NVIDIA+AMD hosts + where NVIDIA is intentionally hidden are correctly routed to ROCm. 
+ """ + raw = os.environ.get("CUDA_VISIBLE_DEVICES") + if raw is not None and raw.strip() in {"", "-1"}: + return False + exe = shutil.which("nvidia-smi") + if not exe: + return False + try: + result = subprocess.run( + [exe, "-L"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + text = True, + timeout = 10, + ) + except Exception: + return False + return result.returncode == 0 and "GPU " in result.stdout + + +def _ensure_rocm_torch() -> None: + """Reinstall torch with ROCm wheels when the venv received CPU-only torch. + + Runs only on Linux x86_64 hosts where an AMD GPU is present and the + ROCm runtime is detectable (rocminfo / amd-smi / hipconfig / + rocm-core package). No-op when torch already links against HIP + (ROCm), on Windows / macOS, on non-x86_64 Linux (PyTorch does not + publish ROCm wheels for aarch64 / arm64), or on mixed AMD+NVIDIA + hosts (NVIDIA takes precedence). + Uses pip_install() to respect uv, constraints, and --python targeting. + """ + # Explicit OS / architecture guards so the helper is safe to call + # from any context -- PyTorch only publishes ROCm wheels for + # linux_x86_64, so aarch64 / arm64 hosts must skip this repair path + # instead of failing the update with a missing-wheel error. + if IS_WINDOWS or IS_MACOS: + return + if platform.machine().lower() not in {"x86_64", "amd64"}: + return + # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable + if _has_usable_nvidia_gpu(): + return + # Rely on _has_rocm_gpu() (rocminfo / amd-smi GPU data rows) as the + # authoritative "is this actually an AMD ROCm host?" signal. The old + # gate required /opt/rocm or hipcc to exist, which breaks on + # runtime-only ROCm installs (package-managed minimal installs, + # Radeon software) that ship amd-smi/rocminfo without /opt/rocm or + # hipcc, and leaves `unsloth studio update` unable to repair a + # CPU-only venv on those systems. 
+ if not _has_rocm_gpu(): + return # no AMD GPU visible + + ver = _detect_rocm_version() + if ver is None: + print(" ROCm detected but version unreadable -- skipping torch reinstall") + return + + # Probe whether torch already links against HIP (ROCm is already working). + # Do NOT skip for CUDA-only builds since they are unusable on AMD-only + # hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups). + try: + probe = subprocess.run( + [ + sys.executable, + "-c", + "import torch; print(getattr(torch.version,'hip','') or '')", + ], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + timeout = 30, + ) + except (OSError, subprocess.TimeoutExpired): + probe = None + has_hip_torch = ( + probe is not None + and probe.returncode == 0 + and probe.stdout.decode().strip() != "" + ) + + rocm_torch_ready = has_hip_torch + + if not has_hip_torch: + # Select best matching wheel tag (newest ROCm version <= installed) + tag = next( + ( + t + for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True) + if ver >= (maj, mn) + ), + None, + ) + if tag is None: + print( + f" No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- " + f"skipping torch reinstall" + ) + else: + index_url = f"{_PYTORCH_WHL_BASE}/{tag}" + print(f" ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}") + pip_install( + f"ROCm torch ({tag})", + "--force-reinstall", + "--no-cache-dir", + "torch>=2.4,<2.11.0", + "torchvision<0.26.0", # TODO: bump to <0.27.0 when rocm7.2 is uncommented + "torchaudio<2.11.0", + "--index-url", + index_url, + constrain = False, + ) + # Re-probe: only mark ready if HIP torch is now actually present. + # pip_install() may have failed silently. 
+ try: + probe2 = subprocess.run( + [sys.executable, "-c", + "import torch; print(getattr(torch.version,'hip','') or '')"], + stdout = subprocess.PIPE, + stderr = subprocess.DEVNULL, + timeout = 30, + ) + rocm_torch_ready = ( + probe2.returncode == 0 + and probe2.stdout.decode().strip() != "" + ) + except Exception: + rocm_torch_ready = False + + # Install bitsandbytes only when the venv has a ROCm-compatible torch + # (either already present or just installed). Avoids leaving an AMD + # bitsandbytes on top of a CPU/CUDA torch on hosts where the ROCm + # runtime is older than any published torch wheel. Uses + # --force-reinstall so an existing CPU/CUDA bitsandbytes is replaced + # by the AMD build during upgrades. + if rocm_torch_ready: + pip_install( + "bitsandbytes (AMD)", + "--force-reinstall", + "--no-cache-dir", + "bitsandbytes>=0.49.1", + constrain = False, + ) + def _infer_no_torch() -> bool: """Determine whether to run in no-torch (GGUF-only) mode. @@ -414,6 +727,9 @@ def install_python_stack() -> int: base_total = 10 if IS_WINDOWS else 11 if IS_MACOS: base_total -= 1 # triton step is skipped on macOS + # ROCm torch check step (Linux only, non-macOS, non-no-torch) + if not IS_WINDOWS and not IS_MACOS and not NO_TORCH: + base_total += 1 _TOTAL = (base_total - 1) if skip_base else base_total # 1. Try to use uv for faster installs (must happen before pip upgrade @@ -537,6 +853,53 @@ def install_python_stack() -> int: req = REQ_ROOT / "base.txt", ) + # 2b. AMD ROCm: reinstall torch with HIP wheels if the host has ROCm but the + # venv received CPU-only torch (common when pip resolves torch from PyPI). + # Must come immediately after base packages so torch is present for inspection. + if not IS_WINDOWS and not IS_MACOS and not NO_TORCH: + _progress("ROCm torch check") + _ensure_rocm_torch() + + # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows. + # Detect and warn so users know manual steps are needed for GPU training. 
@functools.lru_cache(1)
def is_rdna():
    """Detect ROCm-supported RDNA consumer/workstation GPUs (RDNA2, RDNA3, RDNA3.5, RDNA4).

    Returns the (falsy) result of ``is_hip()`` when not running on HIP;
    otherwise returns whether the active Triton target's ``arch`` is one of
    the listed gfx identifiers. The result is memoised by ``lru_cache``.
    """
    on_hip = is_hip()
    if not on_hip:
        # Short-circuit exactly like `is_hip() and ...`: hand back is_hip()'s
        # own value and never touch the Triton driver.
        return on_hip

    rdna2 = (  # Navi 21-24
        "gfx1030",
        "gfx1031",
        "gfx1032",
        "gfx1033",
        "gfx1034",
        "gfx1035",
        "gfx1036",
    )
    rdna3 = (  # Navi 31-33
        "gfx1100",
        "gfx1101",
        "gfx1102",
        "gfx1103",
    )
    rdna3_5 = (  # Strix Point / Strix Halo / Krackan Point
        "gfx1150",
        "gfx1151",
        "gfx1152",
        "gfx1153",
    )
    rdna4 = (  # Navi 48-44
        "gfx1200",
        "gfx1201",
    )

    current_arch = triton.runtime.driver.active.get_current_target().arch
    return current_arch in (*rdna2, *rdna3, *rdna3_5, *rdna4)
a/unsloth/tokenizer_utils.py +++ b/unsloth/tokenizer_utils.py @@ -1103,7 +1103,16 @@ def patch_sft_trainer_tokenizer(): " a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n" "except:\n" " if not torch.cuda.is_available():\n" - " raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n" + " raise RuntimeError('Unsloth: No GPU detected. AMD ROCm users: install ROCm-enabled PyTorch -- see https://docs.unsloth.ai/get-started/install-and-update/amd')\n" + " # nvidia-smi unavailable but torch.cuda IS available -- we are on\n" + " # a ROCm host (ROCm reuses the torch.cuda.* API surface, so\n" + " # device_count() is authoritative) or on a CUDA host without\n" + " # the CLI installed. Use the device count directly as a\n" + " # conservative multi-GPU signal: any configuration with more\n" + " # than one visible device is flagged as unsupported, matching\n" + " # the spirit of the per-device memory check used on CUDA.\n" + " if torch.cuda.device_count() > 1:\n" + " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" "if ((a - PRE_CHECK) >= 1).sum() > 1:\n" " raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n" "for _ in range(3):\n"