diff --git a/install.sh b/install.sh
index 053f334d2b..5035bdd213 100755
--- a/install.sh
+++ b/install.sh
@@ -978,6 +978,64 @@ _find_no_torch_runtime() {
     fi
 }
 
+# ── ROCm visibility-mask check ──
+# Returns 0 (true) when no env var explicitly hides all AMD GPUs,
+# 1 (false) when ANY of HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES /
+# CUDA_VISIBLE_DEVICES is set to "" or "-1".
+# On ROCm, ROCR narrows the physical set, then CUDA/HIP further
+# restricts within that. If any is empty, all GPUs are hidden.
+_rocm_devices_enabled() {
+    # "${VAR-unset}" yields the word "unset" only when VAR is undefined, so
+    # a defined-but-empty mask still reaches the "" pattern below. The
+    # first mask equal to "" or "-1" hides every GPU, so report "disabled".
+    for _rde_mask in \
+        "${HIP_VISIBLE_DEVICES-unset}" \
+        "${ROCR_VISIBLE_DEVICES-unset}" \
+        "${CUDA_VISIBLE_DEVICES-unset}"; do
+        case "$_rde_mask" in ""|-1) return 1 ;; esac
+    done
+    return 0
+}
+
+# ── AMD ROCm GPU detection helper ──
+# Returns 0 (true) if an actual AMD GPU is present, 1 (false) otherwise.
+# Checks rocminfo for gfx[1-9]* (excludes gfx000 CPU agent) and
+# amd-smi list for GPU data rows (excludes header-only output).
+# Respects HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES /
+# CUDA_VISIBLE_DEVICES so hidden GPUs are not detected.
+_has_amd_rocm_gpu() {
+    _rocm_devices_enabled || return 1
+    # Probe rocminfo first (gfx1..gfx9* agents only), then amd-smi GPU rows.
+    if command -v rocminfo >/dev/null 2>&1; then
+        rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9]/{found=1} END{exit !found}' && return 0
+    fi
+    if command -v amd-smi >/dev/null 2>&1; then
+        amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[][[:space:]]*[0-9]/{ found=1 } END{ exit !found }' && return 0
+    fi
+    return 1
+}
+
+# ── NVIDIA usable-GPU helper ──
+# Returns 0 (true) only if nvidia-smi is present AND actually lists a GPU
+# AND visibility masks do not hide all GPUs.
+# Prevents AMD-only hosts with a stale nvidia-smi on PATH from being routed
+# into the CUDA branch.
+_has_usable_nvidia_gpu() {
+    # An explicit "hide everything" mask (set to "" or "-1") wins before
+    # any probing; "${VAR-unset}" keeps unset distinct from set-but-empty.
+    case "${CUDA_VISIBLE_DEVICES-unset}" in ""|-1) return 1 ;; esac
+    # Prefer nvidia-smi from PATH, then the conventional /usr/bin location.
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        _nvsmi="nvidia-smi"
+    elif [ -x "/usr/bin/nvidia-smi" ]; then
+        _nvsmi="/usr/bin/nvidia-smi"
+    else
+        _nvsmi=""
+        return 1
+    fi
+    "$_nvsmi" -L 2>/dev/null | awk '/^GPU[[:space:]]+[0-9]+:/{found=1} END{exit !found}'
+}
+
 # ── Detect GPU and choose PyTorch index URL ──
 # Mirrors Get-TorchIndexUrl in install.ps1.
 # On CPU-only machines this returns the cpu index, avoiding the solver
@@ -986,14 +1044,83 @@ get_torch_index_url() {
     _base="https://download.pytorch.org/whl"
     # macOS: always CPU (no CUDA support)
     case "$(uname -s)" in Darwin) echo "$_base/cpu"; return ;; esac
-    # Try nvidia-smi
+    # Try nvidia-smi -- require the binary to actually list a usable GPU.
+    # Presence of the binary alone (container leftovers, stale driver
+    # packages) is not sufficient: otherwise an AMD-only host would
+    # silently install CUDA wheels.
     _smi=""
-    if command -v nvidia-smi >/dev/null 2>&1; then
-        _smi="nvidia-smi"
-    elif [ -x "/usr/bin/nvidia-smi" ]; then
-        _smi="/usr/bin/nvidia-smi"
+    if _has_usable_nvidia_gpu; then
+        if command -v nvidia-smi >/dev/null 2>&1; then
+            _smi="nvidia-smi"
+        elif [ -x "/usr/bin/nvidia-smi" ]; then
+            _smi="/usr/bin/nvidia-smi"
+        fi
+    fi
+    if [ -z "$_smi" ]; then
+        # No NVIDIA GPU -- check for AMD ROCm GPU.
+        # PyTorch only publishes ROCm wheels for linux-x86_64; skip the
+        # ROCm branch entirely on aarch64 / arm64 / other architectures
+        # so non-x86_64 Linux hosts fall back cleanly to CPU wheels.
+        case "$(uname -m)" in
+            x86_64|amd64) : ;;
+            *) echo "$_base/cpu"; return ;;
+        esac
+        if ! _has_amd_rocm_gpu; then
+            echo "$_base/cpu"; return
+        fi
+        # AMD GPU confirmed -- detect ROCm version
+        _rocm_tag=""
+        _rocm_tag=$({ command -v amd-smi >/dev/null 2>&1 && \
+            amd-smi version 2>/dev/null | awk -F'ROCm version: ' \
+                'NF>1{gsub(/[^0-9.]/, "", $2); split($2,a,"."); print "rocm"a[1]"."a[2]; ok=1; exit} END{exit !ok}'; } || \
+            { _rocm_info_file="${ROCM_PATH:-/opt/rocm}/.info/version"; \
+              [ -r "$_rocm_info_file" ] && \
+                awk -F. '{print "rocm"$1"."$2; exit}' "$_rocm_info_file"; } || \
+            { command -v hipconfig >/dev/null 2>&1 && \
+                hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]/{split($1,a,"."); if(a[1]+0>0){print "rocm"a[1]"."a[2]; found=1}} END{exit !found}'; } || \
+            { command -v dpkg-query >/dev/null 2>&1 && \
+                ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \
+                [ -n "$ver" ] && \
+                printf '%s\n' "$ver" | sed 's/^[0-9]*://' | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; } || \
+            { command -v rpm >/dev/null 2>&1 && \
+                ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \
+                [ -n "$ver" ] && \
+                printf '%s\n' "$ver" | awk -F'[.-]' '{print "rocm"$1"."$2; exit}'; }) 2>/dev/null
+        # Validate _rocm_tag: must match "rocmX.Y" with major >= 1
+        case "$_rocm_tag" in
+            rocm[1-9]*.[0-9]*) : ;;  # valid (major >= 1)
+            *) _rocm_tag="" ;;        # reject malformed (empty, garbled, or major=0)
+        esac
+        if [ -n "$_rocm_tag" ]; then
+            # Minimum supported: ROCm 6.0 (no PyTorch wheels exist for older)
+            case "$_rocm_tag" in
+                rocm[1-5].*) echo "$_base/cpu"; return ;;
+            esac
+            # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds
+            # (<2.11.0).  Fall back to rocm7.1 index which has torch 2.10.0.
+            # Enumerate explicit versions rather than matching rocm6.* so
+            # a host on ROCm 6.5 or 6.6 (no PyTorch wheels published) is
+            # clipped down to the last supported 6.x (rocm6.4) instead of
+            # constructing https://download.pytorch.org/whl/rocm6.5 which
+            # returns HTTP 403. PyTorch only ships: rocm5.7, 6.0, 6.1, 6.2,
+            # 6.3, 6.4, 7.0, 7.1, 7.2 (and 5.7 is below our minimum).
+            # TODO: uncomment rocm7.2 when the torch upper bound is bumped
+            # to >=2.11.0.
+            case "$_rocm_tag" in
+                rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*)
+                    echo "$_base/$_rocm_tag" ;;
+                rocm6.*)
+                    # ROCm 6.5+ (no published PyTorch wheels): clip down
+                    # to the last supported 6.x wheel set.
+                    echo "$_base/rocm6.4" ;;
+                *)
+                    # ROCm 7.2+ (including future 10.x+): cap to rocm7.1
+                    echo "$_base/rocm7.1" ;;
+            esac
+            return
+        fi
+        echo "$_base/cpu"; return
     fi
-    if [ -z "$_smi" ]; then echo "$_base/cpu"; return; fi
     # Parse CUDA version from nvidia-smi output (POSIX-safe, no grep -P)
     _cuda_ver=$(LC_ALL=C $_smi 2>/dev/null \
         | sed -n 's/.*CUDA Version:[[:space:]]*\([0-9][0-9]*\.[0-9][0-9]*\).*/\1/p' \
@@ -1011,20 +1138,166 @@ get_torch_index_url() {
     elif [ "$_major" -ge 11 ]; then echo "$_base/cu118"
     else echo "$_base/cpu"; fi
 }
+
+get_radeon_wheel_url() {
+    # Prints the repo.radeon.com manylinux base URL for the detected host ROCm
+    # version, e.g. .../rocm-rel-7.2.1/ or .../rocm-rel-7.2/, or an empty line
+    # when not on Linux or no version can be determined. Emits one URL only;
+    # it is not fetched here. Both X.Y and X.Y.Z are accepted because the
+    # probes below can report either shape.
+    case "$(uname -s)" in Linux) ;; *) echo ""; return ;; esac
+
+    # Detect ROCm version (X.Y or X.Y.Z). Probe order: amd-smi, .info/version,
+    # hipconfig, rocm-core via dpkg / rpm. The trailing "|| _full_ver=" keeps
+    # a failed chain from tripping errexit before the empty fallback prints.
+    _full_ver=$({ command -v amd-smi >/dev/null 2>&1 && \
+        amd-smi version 2>/dev/null | awk -F'ROCm version: ' \
+            'NF>1{if(match($2,/[0-9]+\.[0-9]+(\.[0-9]+)?/)){print substr($2,RSTART,RLENGTH); ok=1; exit}} END{exit !ok}'; } || \
+        { _rocm_info_file="${ROCM_PATH:-/opt/rocm}/.info/version"; \
+          [ -r "$_rocm_info_file" ] && \
+            awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1; exit} END{exit !found}' "$_rocm_info_file"; } || \
+        { command -v hipconfig >/dev/null 2>&1 && \
+            hipconfig --version 2>/dev/null | awk 'NR==1 && match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); found=1} END{exit !found}'; } || \
+        { command -v dpkg-query >/dev/null 2>&1 && \
+            ver="$(dpkg-query -W -f='${Version}\n' rocm-core 2>/dev/null)" && \
+            [ -n "$ver" ] && \
+            printf '%s\n' "$ver" | sed 's/^[0-9]*://' | awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); exit}'; } || \
+        { command -v rpm >/dev/null 2>&1 && \
+            ver="$(rpm -q --qf '%{VERSION}\n' rocm-core 2>/dev/null)" && \
+            [ -n "$ver" ] && \
+            printf '%s\n' "$ver" | awk 'match($0,/[0-9]+\.[0-9]+(\.[0-9]+)?/){print substr($0,RSTART,RLENGTH); exit}'; }) 2>/dev/null || _full_ver=""
+
+    # Validate: must be X.Y or X.Y.Z with X >= 1
+    case "$_full_ver" in
+        [1-9]*.[0-9]*.[0-9]*) : ;;  # X.Y.Z
+        [1-9]*.[0-9]*) : ;;          # X.Y
+        *) echo ""; return ;;
+    esac
+    echo "https://repo.radeon.com/rocm/manylinux/rocm-rel-${_full_ver}/"
+}
+
+# ── Radeon repo wheel selection helpers ──────────────────────────────────────
+# Fetches the Radeon repo directory listing once into _RADEON_LISTING (global).
+# _RADEON_PYTAG holds the CPython tag for the running interpreter (e.g. cp312).
+# _RADEON_BASE_URL holds the base URL for relative-href resolution.
+_RADEON_LISTING=""
+_RADEON_PYTAG=""
+_RADEON_BASE_URL=""
+
+_radeon_fetch_listing() {
+    # Usage: _radeon_fetch_listing BASE_URL -- sets _RADEON_BASE_URL, _RADEON_PYTAG
+    # and _RADEON_LISTING; returns 1 on a missing tag or failed/empty/partial fetch.
+    _RADEON_BASE_URL="$1"; _RADEON_LISTING=""
+    _RADEON_PYTAG=$("$_VENV_PY" -c "
+import sys
+print('cp{}{}'.format(sys.version_info.major, sys.version_info.minor))
+" 2>/dev/null) || return 1
+    if command -v curl >/dev/null 2>&1; then
+        _RADEON_LISTING=$(curl -fsSL --max-time 20 "$_RADEON_BASE_URL" 2>/dev/null) || _RADEON_LISTING=""
+    elif command -v wget >/dev/null 2>&1; then
+        _RADEON_LISTING=$(wget -qO- --timeout=20 "$_RADEON_BASE_URL" 2>/dev/null) || _RADEON_LISTING=""
+    fi
+    [ -n "$_RADEON_LISTING" ] || return 1
+}
+
+_pick_radeon_wheel() {
+    # Usage: _pick_radeon_wheel PACKAGE_NAME
+    # Scans $_RADEON_LISTING for wheels whose filename starts exactly with
+    # PACKAGE_NAME-, contains -$_RADEON_PYTAG- and ends in x86_64.whl, and
+    # prints the URL of the highest-versioned one (relative hrefs are
+    # resolved against _RADEON_BASE_URL). Returns 1 when nothing matches or
+    # the listing / tag globals are empty. On a version tie the first href
+    # in the listing wins. Everything is done in one POSIX awk script so no
+    # GNU-only tools (grep -o, sort -V) are needed under BSD or BusyBox
+    # coreutils.
+    _pkg="$1"
+    [ -n "$_RADEON_LISTING" ] || return 1
+    [ -n "$_RADEON_PYTAG"   ] || return 1
+    _tag="$_RADEON_PYTAG"
+    _href=$(printf '%s\n' "$_RADEON_LISTING" \
+        | awk -v pkg="$_pkg" -v tag="$_tag" '
+            BEGIN { max_pad = ""; max_url = "" }
+            {
+                line = $0
+                while (match(line, /href="[^"]*"/)) {
+                    # Strip the leading href=" (6 chars) and trailing " (1 char)
+                    url = substr(line, RSTART + 6, RLENGTH - 7)
+                    line = substr(line, RSTART + RLENGTH)
+
+                    # Extract basename, strip query / fragment
+                    n = split(url, p, "/")
+                    base = p[n]
+                    sub(/[?#].*/, "", base)
+
+                    prefix = pkg "-"
+                    # Match cpXY-cpXY with any linux x86_64
+                    # platform tag (linux_x86_64, manylinux_2_28_x86_64,
+                    # manylinux2014_x86_64, etc.)
+                    if (substr(base, 1, length(prefix)) == prefix &&
+                            index(base, "-" tag "-") > 0 &&
+                            match(base, /x86_64\.whl$/)) {
+                        # Extract the version component (first
+                        # dotted-number run) and pad each piece so a
+                        # plain lexical comparison gives us the newest.
+                        if (match(base, /[0-9]+\.[0-9]+(\.[0-9]+)?/)) {
+                            ver = substr(base, RSTART, RLENGTH)
+                            m = split(ver, v, ".")
+                            pad = ""
+                            for (i = 1; i <= m; i++)
+                                pad = pad sprintf("%08d", v[i])
+                            if (pad > max_pad) {
+                                max_pad = pad
+                                max_url = url
+                            }
+                        }
+                    }
+                }
+            }
+            END { if (max_url != "") print max_url }')
+    [ -z "$_href" ] && return 1
+    case "$_href" in
+        http*) printf '%s\n' "$_href" ;;
+        *)     printf '%s\n' "${_RADEON_BASE_URL%/}/${_href#/}" ;;
+    esac
+}
+
 TORCH_INDEX_URL=$(get_torch_index_url)
 
+# Detect Radeon-branded AMD GPUs. Only checked when get_torch_index_url chose
+# a */rocm* index; re-confirms a visible AMD GPU via _has_amd_rocm_gpu and
+# additionally requires a rocminfo line matching "Marketing Name:.*Radeon".
+_amd_gpu_radeon=false
+case "$TORCH_INDEX_URL" in
+    */rocm*)
+        if _has_amd_rocm_gpu && command -v rocminfo >/dev/null 2>&1 && \
+           rocminfo 2>/dev/null | grep -q 'Marketing Name:.*Radeon'; then
+            _amd_gpu_radeon=true
+        fi
+        ;;
+esac
+
 # ── Print CPU-only hint when no GPU detected ──
 case "$TORCH_INDEX_URL" in
     */cpu)
         if [ "$SKIP_TORCH" = false ] && [ "$OS" != "macos" ]; then
             echo ""
-            echo "  NOTE: No NVIDIA GPU detected (nvidia-smi not found)."
+            echo "  NOTE: No GPU detected (nvidia-smi and ROCm not found)."
             echo "  Installing CPU-only PyTorch. If you only need GGUF chat/inference,"
             echo "  re-run with --no-torch for a faster, lighter install:"
             echo "    curl -fsSL https://unsloth.ai/install.sh | sh -s -- --no-torch"
+            echo "  AMD ROCm users: see https://docs.unsloth.ai/get-started/install-and-update/amd"
             echo ""
         fi
         ;;
+    */rocm*)
+        echo ""
+        if [ "$_amd_gpu_radeon" = true ]; then
+            echo "  AMD Radeon + ROCm detected -- installing PyTorch wheels from repo.radeon.com"
+        else
+            echo "  AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)"
+        fi
+        echo ""
+        ;;
 esac
 
 # ── Install unsloth directly into the venv (no activation needed) ──
@@ -1040,7 +1313,7 @@ if [ "$_MIGRATED" = true ]; then
         # to prevent transitive torch resolution.
         run_install_cmd "install unsloth (migrated no-torch)" uv pip install --python "$_VENV_PY" --no-deps \
             --reinstall-package unsloth --reinstall-package unsloth-zoo \
-            "unsloth>=2026.4.2" unsloth-zoo
+            "unsloth>=2026.4.4" unsloth-zoo
         _NO_TORCH_RT="$(_find_no_torch_runtime)"
         if [ -n "$_NO_TORCH_RT" ]; then
             run_install_cmd "install no-torch runtime deps" uv pip install --python "$_VENV_PY" --no-deps -r "$_NO_TORCH_RT"
@@ -1048,21 +1321,169 @@ if [ "$_MIGRATED" = true ]; then
     else
         run_install_cmd "install unsloth (migrated)" uv pip install --python "$_VENV_PY" \
             --reinstall-package unsloth --reinstall-package unsloth-zoo \
-            "unsloth>=2026.4.2" unsloth-zoo
+            "unsloth>=2026.4.4" unsloth-zoo
     fi
     if [ "$STUDIO_LOCAL_INSTALL" = true ]; then
         substep "overlaying local repo (editable)..."
         run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps
     fi
+    # AMD ROCm: ensure torch has HIP support and install bitsandbytes
+    # in migrated environments. Existing venvs created before ROCm support
+    # may have CPU-only torch that needs replacing.
+    if [ "$SKIP_TORCH" = false ]; then
+        case "$TORCH_INDEX_URL" in
+            */rocm*)
+                if ! "$_VENV_PY" - <<'PY' >/dev/null 2>&1
+import sys, torch
+sys.exit(0 if getattr(torch.version, "hip", None) else 1)
+PY
+                then
+                    substep "reinstalling ROCm PyTorch ($TORCH_INDEX_URL)..."
+                    run_install_cmd "install PyTorch (ROCm)" uv pip install --python "$_VENV_PY" \
+                        --force-reinstall "$TORCH_CONSTRAINT" torchvision torchaudio \
+                        --index-url "$TORCH_INDEX_URL"
+                fi
+                substep "installing bitsandbytes for AMD ROCm..."
+                run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1"
+                ;;
+        esac
+    fi
 elif [ -n "$TORCH_INDEX_URL" ]; then
     # Fresh: Step 1 - install torch from explicit index (skip when --no-torch or Intel Mac)
     if [ "$SKIP_TORCH" = true ]; then
         substep "skipping PyTorch (--no-torch or Intel Mac x86_64)." "$C_WARN"
+    elif [ "$_amd_gpu_radeon" = true ]; then
+        _radeon_url=$(get_radeon_wheel_url)
+        if [ -n "$_radeon_url" ]; then
+            _radeon_listing_ok=false
+            if _radeon_fetch_listing "$_radeon_url" 2>/dev/null; then
+                _radeon_listing_ok=true
+            else
+                # Try shorter X.Y path (AMD publishes both X.Y.Z and X.Y dirs)
+                _radeon_url_short=$(printf '%s\n' "$_radeon_url" \
+                    | sed 's|rocm-rel-\([0-9]*\)\.\([0-9]*\)\.[0-9]*/|rocm-rel-\1.\2/|')
+                if [ "$_radeon_url_short" != "$_radeon_url" ] && \
+                   _radeon_fetch_listing "$_radeon_url_short" 2>/dev/null; then
+                    _radeon_listing_ok=true
+                fi
+            fi
+
+            if [ "$_radeon_listing_ok" = true ]; then
+                # Require torch, torchvision, torchaudio wheels to all resolve
+                # from the Radeon listing. If any is missing for this Python
+                # tag, fall through to the standard ROCm index instead of
+                # silently mixing Radeon wheels with PyPI defaults.
+                _torch_whl=$(_pick_radeon_wheel "torch"       2>/dev/null) || _torch_whl=""
+                _tv_whl=$(_pick_radeon_wheel    "torchvision" 2>/dev/null) || _tv_whl=""
+                _ta_whl=$(_pick_radeon_wheel    "torchaudio"  2>/dev/null) || _ta_whl=""
+                _tri_whl=$(_pick_radeon_wheel   "triton"      2>/dev/null) || _tri_whl=""
+                # Some ROCm versions publish triton as pytorch_triton_rocm
+                if [ -z "$_tri_whl" ]; then
+                    _tri_whl=$(_pick_radeon_wheel "pytorch_triton_rocm" 2>/dev/null) || _tri_whl=""
+                fi
+                # Sanity-check torch / torchvision / torchaudio are a
+                # matching release. The Radeon repo publishes multiple
+                # generations simultaneously, so picking the highest-version
+                # wheel for each package independently can assemble a
+                # mismatched trio (e.g. torch 2.9.1 + torchvision 0.23.0 +
+                # torchaudio 2.9.0 from the current rocm-rel-7.2.1 index).
+                # Check that torch and torchaudio share the same X.Y public
+                # version prefix, and that torchvision's minor correctly
+                # pairs with torch's minor (torchvision = torch.minor + 15
+                # since torch 2.4 -> torchvision 0.19 -> torch 2.9 ->
+                # torchvision 0.24).
+                # URL-decode each wheel name so %2B -> + before version
+                # extraction. Real Radeon wheel hrefs are percent-encoded
+                # (torch-2.10.0%2Brocm7.2.0...), so a plain [+-] terminator
+                # in the sed regex below would never match and
+                # _radeon_versions_match would stay false for every real
+                # listing, silently forcing a fallback to the generic
+                # ROCm index.
+                _torch_ver=""
+                _tv_ver=""
+                _ta_ver=""
+                if [ -n "$_torch_whl" ]; then
+                    _torch_name=$(printf '%s' "${_torch_whl##*/}" | sed 's/%2[Bb]/+/g')
+                    _torch_ver=$(printf '%s\n' "$_torch_name" | sed -n 's|^torch-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p')
+                fi
+                if [ -n "$_tv_whl" ]; then
+                    _tv_name=$(printf '%s' "${_tv_whl##*/}" | sed 's/%2[Bb]/+/g')
+                    _tv_ver=$(printf '%s\n' "$_tv_name" | sed -n 's|^torchvision-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p')
+                fi
+                if [ -n "$_ta_whl" ]; then
+                    _ta_name=$(printf '%s' "${_ta_whl##*/}" | sed 's/%2[Bb]/+/g')
+                    _ta_ver=$(printf '%s\n' "$_ta_name" | sed -n 's|^torchaudio-\([0-9][0-9]*\.[0-9][0-9]*\)\(\.[0-9][0-9]*\)\{0,1\}[+-].*|\1|p')
+                fi
+                _radeon_versions_match=false
+                if [ -n "$_torch_ver" ] && [ -n "$_tv_ver" ] && [ -n "$_ta_ver" ]; then
+                    _torch_major=${_torch_ver%%.*}
+                    _torch_minor=${_torch_ver#*.}
+                    _ta_major=${_ta_ver%%.*}
+                    _ta_minor=${_ta_ver#*.}
+                    _tv_major=${_tv_ver%%.*}
+                    _tv_minor=${_tv_ver#*.}
+                    # torchvision expected minor (e.g. torch 2.9 -> 0.24)
+                    _expected_tv_minor=$((_torch_minor + 15))
+                    if [ "$_torch_major" = "$_ta_major" ] && \
+                       [ "$_torch_minor" = "$_ta_minor" ] && \
+                       [ "$_tv_major" = "0" ] && \
+                       [ "$_tv_minor" = "$_expected_tv_minor" ]; then
+                        _radeon_versions_match=true
+                    fi
+                fi
+                if [ -z "$_torch_whl" ] || [ -z "$_tv_whl" ] || [ -z "$_ta_whl" ] || \
+                   [ "$_radeon_versions_match" != true ]; then
+                    substep "[WARN] Radeon repo lacks a compatible wheel set for this Python; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN"
+                    run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
+                        "$TORCH_CONSTRAINT" torchvision torchaudio \
+                        --index-url "$TORCH_INDEX_URL"
+                else
+                    substep "installing PyTorch from Radeon repo (${_RADEON_BASE_URL})..."
+                    # Pass explicit wheel URLs so the matched trio is
+                    # installed together. --find-links lets uv discover
+                    # the Radeon listing for any local lookup, and PyPI
+                    # (not disabled) provides transitive deps like
+                    # filelock / sympy / networkx which are not in the
+                    # Radeon listing.
+                    if [ -n "$_tri_whl" ]; then
+                        run_install_cmd "install triton + PyTorch" uv pip install --python "$_VENV_PY" \
+                            --find-links "$_RADEON_BASE_URL" \
+                            "$_tri_whl" "$_torch_whl" "$_tv_whl" "$_ta_whl"
+                    else
+                        run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
+                            --find-links "$_RADEON_BASE_URL" \
+                            "$_torch_whl" "$_tv_whl" "$_ta_whl"
+                    fi
+                fi
+            else
+                substep "[WARN] Radeon repo unavailable; falling back to ROCm index ($TORCH_INDEX_URL)" "$C_WARN"
+                run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
+                    "$TORCH_CONSTRAINT" torchvision torchaudio \
+                    --index-url "$TORCH_INDEX_URL"
+            fi
+        else
+            substep "[WARN] Radeon GPU detected but could not detect full ROCm version; falling back to ROCm index" "$C_WARN"
+            run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" \
+                "$TORCH_CONSTRAINT" torchvision torchaudio \
+                --index-url "$TORCH_INDEX_URL"
+        fi
     else
         substep "installing PyTorch ($TORCH_INDEX_URL)..."
         run_install_cmd "install PyTorch" uv pip install --python "$_VENV_PY" "$TORCH_CONSTRAINT" torchvision torchaudio \
             --index-url "$TORCH_INDEX_URL"
     fi
+    # AMD ROCm: install bitsandbytes (once, after torch, for all ROCm paths).
+    # Gate on SKIP_TORCH=false so a user running with --no-torch on a ROCm
+    # host stays in GGUF-only mode rather than pulling in bitsandbytes,
+    # which is only useful once torch is present for training.
+    if [ "$SKIP_TORCH" = false ]; then
+        case "$TORCH_INDEX_URL" in
+            */rocm*)
+                substep "installing bitsandbytes for AMD ROCm..."
+                run_install_cmd "install bitsandbytes (AMD)" uv pip install --python "$_VENV_PY" --force-reinstall --no-cache-dir "bitsandbytes>=0.49.1"
+                ;;
+        esac
+    fi
     # Fresh: Step 2 - install unsloth, preserving pre-installed torch
     substep "installing unsloth (this may take a few minutes)..."
     if [ "$SKIP_TORCH" = true ]; then
@@ -1070,7 +1491,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then
         # runtime deps (typer, safetensors, transformers, etc.) with --no-deps.
         run_install_cmd "install unsloth (no-torch)" uv pip install --python "$_VENV_PY" --no-deps \
             --upgrade-package unsloth --upgrade-package unsloth-zoo \
-            "unsloth>=2026.4.2" unsloth-zoo
+            "unsloth>=2026.4.4" unsloth-zoo
         _NO_TORCH_RT="$(_find_no_torch_runtime)"
         if [ -n "$_NO_TORCH_RT" ]; then
             run_install_cmd "install no-torch runtime deps" uv pip install --python "$_VENV_PY" --no-deps -r "$_NO_TORCH_RT"
@@ -1081,7 +1502,7 @@ elif [ -n "$TORCH_INDEX_URL" ]; then
         fi
     elif [ "$STUDIO_LOCAL_INSTALL" = true ]; then
         run_install_cmd "install unsloth (local)" uv pip install --python "$_VENV_PY" \
-            --upgrade-package unsloth "unsloth>=2026.4.2" unsloth-zoo
+            --upgrade-package unsloth "unsloth>=2026.4.4" unsloth-zoo
         substep "overlaying local repo (editable)..."
         run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps
     else
@@ -1092,7 +1513,7 @@ else
     # Fallback: GPU detection failed to produce a URL -- let uv resolve torch
     substep "installing unsloth (this may take a few minutes)..."
     if [ "$STUDIO_LOCAL_INSTALL" = true ]; then
-        run_install_cmd "install unsloth (auto torch backend)" uv pip install --python "$_VENV_PY" unsloth-zoo "unsloth>=2026.4.2" --torch-backend=auto
+        run_install_cmd "install unsloth (auto torch backend)" uv pip install --python "$_VENV_PY" unsloth-zoo "unsloth>=2026.4.4" --torch-backend=auto
         substep "overlaying local repo (editable)..."
         run_install_cmd "overlay local repo" uv pip install --python "$_VENV_PY" -e "$_REPO_ROOT" --no-deps
     else
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 0454eada89..ebf30c14ed 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -86,6 +86,7 @@ def _probe_causal_conv1d_env() -> dict[str, str] | None:
                     "'python_tag': f'cp{sys.version_info.major}{sys.version_info.minor}', "
                     "'torch_mm': torch_mm, "
                     "'cuda_major': str(int(str(torch.version.cuda).split('.', 1)[0])) if torch.version.cuda else '', "
+                    "'hip_version': str(torch.version.hip) if getattr(torch.version, 'hip', None) else '', "
                     "'cxx11abi': str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()"
                     "}))"
                 ),
@@ -237,28 +238,111 @@ def _install_package_wheel_first(
         else:
             logger.info("No published %s wheel found: %s", display_name, wheel_url)
 
-    _send_status(event_queue, f"Installing {display_name} from PyPI...")
-    pypi_cmd = [
-        sys.executable,
-        "-m",
-        "pip",
-        "install",
-        "--no-build-isolation",
-        "--no-deps",
-        "--no-cache-dir",
-        f"{pypi_name}=={pypi_version}",
-    ]
-    result = _sp.run(
-        pypi_cmd,
-        stdout = _sp.PIPE,
-        stderr = _sp.STDOUT,
-        text = True,
-    )
+    is_hip = env and env.get("hip_version")
+    if is_hip and not shutil.which("hipcc"):
+        logger.error(
+            "%s requires hipcc for source compilation on ROCm. "
+            "Install the ROCm HIP SDK: https://rocm.docs.amd.com",
+            display_name,
+        )
+        _send_status(
+            event_queue,
+            f"{display_name}: hipcc not found (ROCm HIP SDK required)",
+        )
+        return
+
+    if is_hip:
+        _send_status(
+            event_queue,
+            f"Compiling {display_name} from source for ROCm "
+            "(this may take several minutes)...",
+        )
+    else:
+        _send_status(event_queue, f"Installing {display_name} from PyPI...")
+
+    # Prefer uv for faster dependency resolution when available
+    if shutil.which("uv"):
+        pypi_cmd = [
+            "uv",
+            "pip",
+            "install",
+            "--python",
+            sys.executable,
+            "--no-build-isolation",
+            "--no-deps",
+        ]
+        # Avoid stale cache artifacts from partial HIP source builds
+        if is_hip:
+            pypi_cmd.append("--no-cache")
+        pypi_cmd.append(f"{pypi_name}=={pypi_version}")
+    else:
+        pypi_cmd = [
+            sys.executable,
+            "-m",
+            "pip",
+            "install",
+            "--no-build-isolation",
+            "--no-deps",
+            "--no-cache-dir",
+            f"{pypi_name}=={pypi_version}",
+        ]
+
+    # Source compilation on ROCm can take 10-30 minutes; use a generous
+    # timeout. Non-HIP installs preserve the pre-existing "no timeout"
+    # behaviour so unrelated slow installs (e.g. causal-conv1d source
+    # build on Linux aarch64 or unsupported torch/CUDA combinations)
+    # are not aborted at 5 minutes by this PR.
+    _run_kwargs: dict[str, Any] = {
+        "stdout": _sp.PIPE,
+        "stderr": _sp.STDOUT,
+        "text": True,
+    }
+    if is_hip:
+        _run_kwargs["timeout"] = 1800
+
+    try:
+        result = _sp.run(pypi_cmd, **_run_kwargs)
+    except _sp.TimeoutExpired:
+        logger.error(
+            "%s installation timed out after %ds",
+            display_name,
+            _run_kwargs.get("timeout"),
+        )
+        _send_status(
+            event_queue,
+            f"{display_name} installation timed out after "
+            f"{_run_kwargs.get('timeout')}s",
+        )
+        return
+
     if result.returncode != 0:
-        logger.error("Failed to install %s from PyPI:\n%s", display_name, result.stdout)
+        if is_hip:
+            # Surface a clear error for ROCm source build failures
+            error_lines = (result.stdout or "").strip().splitlines()
+            snippet = "\n".join(error_lines[-5:]) if error_lines else "(no output)"
+            logger.error(
+                "Failed to compile %s for ROCm:\n%s",
+                display_name,
+                result.stdout,
+            )
+            _send_status(
+                event_queue,
+                f"Failed to compile {display_name} for ROCm. "
+                "Check that hipcc and ROCm development headers are installed.\n"
+                f"{snippet}",
+            )
+        else:
+            logger.error(
+                "Failed to install %s from PyPI:\n%s",
+                display_name,
+                result.stdout,
+            )
         return
 
-    logger.info("Installed %s from PyPI", display_name)
+    if is_hip:
+        logger.info("Compiled and installed %s from source for ROCm", display_name)
+    else:
+        logger.info("Installed %s from PyPI", display_name)
 
 
 def _ensure_causal_conv1d_fast_path(event_queue: Any, model_name: str) -> None:
diff --git a/studio/backend/main.py b/studio/backend/main.py
index ad19ee9679..c2c0a0b6e4 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -237,6 +237,7 @@ async def get_system_info():
     import platform
     import psutil
     from utils.hardware import get_device
+    from utils.hardware.hardware import _backend_label
 
     visibility_info = get_backend_visible_gpu_info()
     gpu_info = {
@@ -250,7 +251,10 @@ async def get_system_info():
     return {
         "platform": platform.platform(),
         "python_version": platform.python_version(),
-        "device_backend": get_device().value,
+        # Use the centralized _backend_label helper so the /api/system
+        # endpoint reports "rocm" on AMD hosts instead of "cuda", matching
+        # the /api/hardware and /api/gpu-visibility endpoints.
+        "device_backend": _backend_label(get_device()),
         "cpu_count": psutil.cpu_count(),
         "memory": {
             "total_gb": round(memory.total / 1e9, 2),
diff --git a/studio/backend/utils/hardware/__init__.py b/studio/backend/utils/hardware/__init__.py
index aaa0452406..400b5dd066 100644
--- a/studio/backend/utils/hardware/__init__.py
+++ b/studio/backend/utils/hardware/__init__.py
@@ -5,6 +5,7 @@
 Hardware detection and GPU utilities
 """
 
+from . import hardware as _hardware
 from .hardware import (
     DeviceType,
     DEVICE,
@@ -49,6 +50,7 @@
     "DeviceType",
     "DEVICE",
     "CHAT_ONLY",
+    "IS_ROCM",
     "detect_hardware",
     "get_device",
     "is_apple_silicon",
@@ -81,3 +83,11 @@
     "extract_arch_config",
     "estimate_training_vram",
 ]
+
+
+def __getattr__(name: str):
+    """PEP 562 hook: serve ``IS_ROCM`` live from ``hardware`` on each access
+    (``detect_hardware()`` rebinds it); other names raise AttributeError."""
+    if name == "IS_ROCM":
+        return _hardware.IS_ROCM
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
new file mode 100644
index 0000000000..a2522c5ca1
--- /dev/null
+++ b/studio/backend/utils/hardware/amd.py
@@ -0,0 +1,373 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""AMD GPU monitoring via amd-smi.
+
+Mirrors the nvidia.py module structure so hardware.py can swap backends
+based on IS_ROCM. All functions return the same dict shapes as their
+nvidia.py counterparts.
+"""
+
+import json
+import math
+import os
+import re
+import subprocess
+from typing import Any, Optional
+
+from loggers import get_logger
+
+logger = get_logger(__name__)
+
+
+def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[Any]:
+    """Invoke ``amd-smi ARGS --json``; return the decoded JSON, or None on
+    launch failure, timeout, non-zero exit, blank stdout, or invalid JSON."""
+    command = ["amd-smi", *args, "--json"]
+    try:
+        proc = subprocess.run(
+            command, capture_output = True, text = True, timeout = timeout
+        )
+    except (OSError, subprocess.TimeoutExpired) as exc:
+        logger.warning("amd-smi query failed: %s", exc)
+        return None
+    # A zero exit status with whitespace-only stdout is rejected as well.
+    if proc.returncode != 0 or not proc.stdout.strip():
+        logger.warning("amd-smi returned code %d", proc.returncode)
+        return None
+    try:
+        return json.loads(proc.stdout)
+    except json.JSONDecodeError:
+        logger.warning("Failed to parse amd-smi JSON output")
+        return None
+
+
+def _parse_numeric(value: Any) -> Optional[float]:
+    """Coerce an amd-smi field (dict, int, float, or str) to a float, or
+    return None when it is missing, unparseable, or a non-finite number."""
+    # Newer amd-smi versions emit {"value": 10, "unit": "W"}; unwrap until a
+    # non-dict payload is reached.
+    while isinstance(value, dict):
+        value = value.get("value")
+    if isinstance(value, (int, float)):
+        number = float(value)
+        return number if math.isfinite(number) else None
+    if not isinstance(value, str):
+        return None
+    # Drop one trailing unit suffix such as "W", "C", "%", "MiB", or "GB".
+    text = re.sub(r"\s*[A-Za-z/%]+$", "", value.strip())
+    if text == "" or text.lower() in ("n/a", "none", "unknown"):
+        return None
+    try:
+        return float(text)
+    except (ValueError, TypeError):
+        return None
+
+
+def _parse_memory_mb(value: Any) -> Optional[float]:
+    """Convert an amd-smi memory field to megabytes.
+
+    Accepted shapes:
+      * bare numbers -- returned unchanged, i.e. interpreted as MB;
+      * dicts with explicit units, e.g. ``{"value": 192, "unit": "GiB"}``;
+      * plain strings with a unit suffix, e.g. ``"8192 MiB"``.
+
+    Unit handling is case-insensitive.  GB/GiB, MB/MiB and KB/KiB are each
+    treated as the same binary unit (factors of 1024), and ``b`` / ``byte``
+    / ``bytes`` are divided by 1024 * 1024.
+
+    Returns None when no number can be extracted from ``value``.
+    """
+    # Step 1: split the input into a lower-cased unit label and a number.
+    if isinstance(value, dict):
+        unit = str(value.get("unit", "")).strip().lower()
+        number = _parse_numeric(value.get("value"))
+    else:
+        unit = ""
+        if isinstance(value, str):
+            # Pick up the suffix of strings shaped like "192 GiB".
+            suffix = re.match(r"^\s*([\d.]+)\s*([A-Za-z]+)\s*$", value.strip())
+            if suffix:
+                unit = suffix.group(2).lower()
+        number = _parse_numeric(value)
+    if number is None:
+        return None
+
+    # Step 2: scale to MB.  GB / MB / KB are matched as substrings of the
+    # unit label; the byte spellings must match the label exactly.
+    if "gib" in unit or "gb" in unit:
+        return number * 1024
+    if "mib" in unit or "mb" in unit:
+        return number
+    if "kib" in unit or "kb" in unit:
+        return number / 1024
+    if unit in ("b", "byte", "bytes"):
+        return number / (1024 * 1024)
+
+    # Missing or unrecognized unit: the number is taken to be MB already,
+    # so it is returned as-is.
+    return number
+
+
+def _extract_gpu_metrics(gpu_data: dict) -> dict[str, Any]:
+    """Extract standardized metrics from a single GPU's amd-smi data.
+
+    Field names vary between amd-smi versions, so each metric is searched
+    under several candidate keys and the first candidate that parses to a
+    number wins.  A key that is present but unparseable (for instance
+    ``"edge": "N/A"``) therefore does not hide a later candidate such as
+    ``"hotspot"`` that holds a real reading.
+
+    Args:
+        gpu_data: One GPU record from ``amd-smi metric --json``.
+
+    Returns:
+        Dict with the keys ``gpu_utilization_pct``, ``temperature_c``,
+        ``vram_used_gb``, ``vram_total_gb``, ``vram_utilization_pct``,
+        ``power_draw_w``, ``power_limit_w`` and ``power_utilization_pct``.
+        Every value that cannot be determined is None.
+    """
+
+    def first_parsed(section: Any, keys: tuple, parse = _parse_numeric):
+        """Return the first ``parse(section[key])`` that is not None."""
+        if not isinstance(section, dict):
+            return None
+        for key in keys:
+            parsed = parse(section.get(key))
+            if parsed is not None:
+                return parsed
+        return None
+
+    def percent(part: Optional[float], whole: Optional[float]):
+        """Return ``part / whole`` as a percentage, or None if undefined."""
+        if part is None or whole is None or whole <= 0:
+            return None
+        return round((part / whole) * 100, 1)
+
+    # GPU utilization: a dict of activity counters or a bare scalar.
+    usage = gpu_data.get("usage", gpu_data.get("gpu_activity", {}))
+    if isinstance(usage, dict):
+        gpu_util = first_parsed(usage, ("gfx_activity", "gpu_use_percent"))
+    else:
+        gpu_util = _parse_numeric(usage)
+
+    # Temperature: prefer the edge sensor, then fall back to hotspot.
+    temp_data = gpu_data.get("temperature", {})
+    if isinstance(temp_data, dict):
+        temp = first_parsed(
+            temp_data,
+            ("edge", "temperature_edge", "hotspot", "temperature_hotspot"),
+        )
+    else:
+        temp = _parse_numeric(temp_data)
+
+    # Power: a non-dict section yields None for both draw and limit
+    # (first_parsed only searches dicts).
+    power_data = gpu_data.get("power", {})
+    power_draw = first_parsed(
+        power_data,
+        ("current_socket_power", "average_socket_power", "socket_power"),
+    )
+    power_limit = first_parsed(power_data, ("power_cap", "max_power_limit"))
+
+    # VRAM: unit-aware parsing, since values may arrive as
+    # {"value": 192, "unit": "GiB"} or as bare numbers.
+    vram_data = gpu_data.get("vram", gpu_data.get("fb_memory_usage", {}))
+    vram_used_mb = first_parsed(vram_data, ("vram_used", "used"), _parse_memory_mb)
+    vram_total_mb = first_parsed(vram_data, ("vram_total", "total"), _parse_memory_mb)
+
+    # Build the standardized dict (same shape as nvidia._build_gpu_metrics)
+    return {
+        "gpu_utilization_pct": gpu_util,
+        "temperature_c": temp,
+        "vram_used_gb": (
+            round(vram_used_mb / 1024, 2) if vram_used_mb is not None else None
+        ),
+        "vram_total_gb": (
+            round(vram_total_mb / 1024, 2) if vram_total_mb is not None else None
+        ),
+        "vram_utilization_pct": percent(vram_used_mb, vram_total_mb),
+        "power_draw_w": power_draw,
+        "power_limit_w": power_limit,
+        "power_utilization_pct": percent(power_draw, power_limit),
+    }
+
+
+def _has_real_metrics(metrics: dict[str, Any]) -> bool:
+    """Tell whether ``metrics`` carries at least one value that is not None.
+
+    A False result means ``_extract_gpu_metrics`` found nothing usable in
+    the amd-smi record; callers then report it as unavailable or skip it.
+    """
+    for reading in metrics.values():
+        if reading is not None:
+            return True
+    return False
+
+
+def get_physical_gpu_count() -> Optional[int]:
+    """Count physical AMD GPUs from ``amd-smi list``; None if undeterminable.
+
+    Accepts either a top-level JSON array or an object whose ``"gpu"`` (or,
+    when that key is absent, ``"gpus"``) entry holds the array.
+    """
+    listing = _run_amd_smi("list")
+    if isinstance(listing, dict):
+        # Object envelope used by some versions: unwrap the device array.
+        listing = listing.get("gpu", listing.get("gpus", []))
+    # Anything that is not a list at this point -- a failed query (None), a
+    # malformed scalar / string payload, or a non-list envelope entry --
+    # cannot be counted.
+    if not isinstance(listing, list):
+        return None
+    return len(listing)
+
+
+def _first_visible_amd_gpu_id() -> Optional[str]:
+    """Pick the GPU id handed to ``amd-smi -g`` for the 'primary' device.
+
+    The id is the first entry of ``numeric_ids`` in the spec returned by
+    ``hardware._get_parent_visible_gpu_spec()``.
+
+    Returns:
+        The id as a string; ``"0"`` when the helper cannot be imported or
+        reports ``numeric_ids`` as None (non-numeric ids such as UUIDs);
+        None when ``numeric_ids`` is empty, so callers can short-circuit
+        to ``available: False``.
+    """
+    try:
+        from .hardware import _get_parent_visible_gpu_spec as visible_spec
+    except ImportError:
+        return "0"
+
+    visible_ids = visible_spec().get("numeric_ids")
+    if visible_ids is None:
+        # Fall back to device 0 and let amd-smi resolve it.
+        return "0"
+    return str(visible_ids[0]) if len(visible_ids) > 0 else None
+
+
+def get_primary_gpu_utilization() -> dict[str, Any]:
+    """Return utilization metrics for the primary visible AMD GPU.
+
+    Returns:
+        The ``_extract_gpu_metrics`` dict plus ``"available": True``, or
+        ``{"available": False}`` when every GPU is hidden, the amd-smi
+        query fails, or the response holds no usable metric.
+    """
+    unavailable = {"available": False}
+    gpu_idx = _first_visible_amd_gpu_id()
+    if gpu_idx is None:
+        return unavailable
+    data = _run_amd_smi("metric", "-g", gpu_idx)
+    # Normalize the envelope to a list of records.  amd-smi may return a
+    # list, a dict wrapping a list / record under "gpus" / "gpu", or a bare
+    # record.  Records also use "gpu" for the device index (a scalar or a
+    # {"value": ..} wrapper); in that case ``data`` itself is the record.
+    if isinstance(data, dict):
+        wrapped = data.get("gpus", data.get("gpu"))
+        if isinstance(wrapped, list):
+            data = wrapped
+        elif isinstance(wrapped, dict) and "value" not in wrapped:
+            data = [wrapped]
+        else:
+            data = [data]
+    if not isinstance(data, list) or not data:
+        return unavailable
+    gpu_data = data[0]
+    if not isinstance(gpu_data, dict):
+        return unavailable
+    metrics = _extract_gpu_metrics(gpu_data)
+    # An all-None result would otherwise render as a ghost device in the UI.
+    if not _has_real_metrics(metrics):
+        return unavailable
+    metrics["available"] = True
+    return metrics
+
+
+def get_visible_gpu_utilization(
+    parent_visible_ids: Optional[list[int]],
+    parent_cuda_visible_devices: Optional[str] = None,
+) -> dict[str, Any]:
+    """Return utilization metrics for visible AMD GPUs.
+
+    Args:
+        parent_visible_ids: Physical GPU ids to report, in visibility
+            order, or None when the ids could not be resolved.
+        parent_cuda_visible_devices: Raw visibility env value, echoed back
+            as ``backend_cuda_visible_devices``.
+
+    Returns:
+        Dict with ``available``, ``backend_cuda_visible_devices``,
+        ``parent_visible_gpu_ids``, ``devices`` (one metrics dict per
+        visible GPU, extended with ``index``, ``index_kind`` and
+        ``visible_ordinal``) and ``index_kind``.
+    """
+    def response(devices: list, index_kind: str = "physical") -> dict[str, Any]:
+        return {
+            "available": len(devices) > 0,
+            "backend_cuda_visible_devices": parent_cuda_visible_devices,
+            "parent_visible_gpu_ids": parent_visible_ids or [],
+            "devices": devices,
+            "index_kind": index_kind,
+        }
+
+    if parent_visible_ids is None:
+        return response([], index_kind = "unresolved")
+
+    data = _run_amd_smi("metric")
+    if data is None:
+        return response([])
+
+    # Extract the device list from amd-smi's envelope: a JSON array, or a
+    # dict wrapping the array (or one record) under "gpus" / "gpu".
+    # Records also use "gpu" for the device index (a scalar or a
+    # {"value": ..} wrapper); in that case ``data`` itself is one record.
+    if isinstance(data, list):
+        gpu_list = data
+    elif isinstance(data, dict):
+        wrapped = data.get("gpus", data.get("gpu"))
+        if isinstance(wrapped, list):
+            gpu_list = wrapped
+        elif isinstance(wrapped, dict) and "value" not in wrapped:
+            gpu_list = [wrapped]
+        else:
+            gpu_list = [data]
+    else:
+        # Scalar / string output carries no device records.
+        gpu_list = []
+    visible_set = set(parent_visible_ids)
+    ordinal_map = {gpu_id: ordinal for ordinal, gpu_id in enumerate(parent_visible_ids)}
+
+    devices = []
+    for fallback_idx, gpu_data in enumerate(gpu_list):
+        # Skip non-dict entries: _extract_gpu_metrics needs .get().
+        if not isinstance(gpu_data, dict):
+            continue
+        # Prefer the AMD-reported GPU id over the enumeration index;
+        # _parse_numeric also unwraps the {"value": 0, "unit": "none"} shape.
+        raw_id = gpu_data.get(
+            "gpu", gpu_data.get("gpu_id", gpu_data.get("id", fallback_idx))
+        )
+        parsed_id = _parse_numeric(raw_id)
+        if parsed_id is None:
+            logger.debug(
+                "amd-smi GPU id %r could not be parsed; falling back to "
+                "enumeration index %d",
+                raw_id,
+                fallback_idx,
+            )
+            idx = fallback_idx
+        else:
+            idx = int(parsed_id)
+        if idx not in visible_set:
+            continue
+        metrics = _extract_gpu_metrics(gpu_data)
+        # Skip ghost entries whose every metric is None (error envelopes).
+        if not _has_real_metrics(metrics):
+            continue
+        metrics["index"] = idx
+        metrics["index_kind"] = "physical"
+        metrics["visible_ordinal"] = ordinal_map.get(idx, len(devices))
+        devices.append(metrics)
+    return response(devices)
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index b6d3faf6d7..3e340a4b53 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -43,6 +43,26 @@ class DeviceType(str, Enum):
 
 DEVICE: Optional[DeviceType] = None
 CHAT_ONLY: bool = True  # No CUDA GPU -> GGUF chat only (Mac, CPU-only, etc.)
+IS_ROCM: bool = (
+    False  # True when running on AMD ROCm (HIP) -- routes GPU monitoring to amd.py
+)
+
+
+def _backend_label(device: DeviceType) -> str:
+    """Map a ``DeviceType`` to the backend name shown in API responses.
+
+    ROCm hosts are still represented as ``DeviceType.CUDA`` internally
+    (ROCm torch reports ``torch.cuda.is_available() = True`` and reuses the
+    ``torch.cuda.*`` API), but "cuda" would be a misleading label on an AMD
+    machine.  When the module-level ``IS_ROCM`` flag is set, a CUDA device
+    is therefore labelled ``"rocm"``; every other combination yields
+    ``device.value``.
+
+    Centralizing the swap here spares callers from repeating the check.
+    """
+    if device != DeviceType.CUDA or not IS_ROCM:
+        return device.value
+    return "rocm"
 
 
 # ========== Detection ==========
@@ -85,10 +105,11 @@ def detect_hardware() -> DeviceType:
       2. MLX   (Apple Silicon via MLX framework)
       3. CPU   (fallback)
     """
-    global DEVICE, CHAT_ONLY
-    CHAT_ONLY = True  # reset -- only CUDA sets it to False
+    global DEVICE, CHAT_ONLY, IS_ROCM
+    CHAT_ONLY = True  # reset -- only CUDA/ROCm sets it to False
+    IS_ROCM = False
 
-    # --- CUDA: try PyTorch ---
+    # --- CUDA / ROCm: try PyTorch ---
     if _has_torch():
         import torch
 
@@ -96,7 +117,16 @@ def detect_hardware() -> DeviceType:
             DEVICE = DeviceType.CUDA
             CHAT_ONLY = False
             device_name = torch.cuda.get_device_properties(0).name
-            print(f"Hardware detected: CUDA — {device_name}")
+
+            # Distinguish AMD ROCm (HIP) from NVIDIA CUDA for display purposes.
+            # DeviceType stays CUDA since torch.cuda.* works on ROCm via HIP.
+            if getattr(torch.version, "hip", None) is not None:
+                IS_ROCM = True
+                print(
+                    f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}"
+                )
+            else:
+                print(f"Hardware detected: CUDA -- {device_name}")
             return DEVICE
 
     # --- XPU: Intel GPU ---
@@ -186,7 +216,7 @@ def get_gpu_memory_info() -> Dict[str, Any]:
 
             return {
                 "available": True,
-                "backend": device.value,
+                "backend": _backend_label(device),
                 "device": idx,
                 "device_name": props.name,
                 "total_gb": total / (1024**3),
@@ -197,7 +227,11 @@ def get_gpu_memory_info() -> Dict[str, Any]:
             }
         except Exception as e:
             logger.error(f"Error getting CUDA GPU info: {e}")
-            return {"available": False, "backend": device.value, "error": str(e)}
+            return {
+                "available": False,
+                "backend": _backend_label(device),
+                "error": str(e),
+            }
 
     # ---- XPU path (Intel GPU) ----
     if device == DeviceType.XPU:
@@ -213,7 +247,7 @@ def get_gpu_memory_info() -> Dict[str, Any]:
 
             return {
                 "available": True,
-                "backend": device.value,
+                "backend": _backend_label(device),
                 "device": idx,
                 "device_name": props.name,
                 "total_gb": total / (1024**3),
@@ -224,7 +258,11 @@ def get_gpu_memory_info() -> Dict[str, Any]:
             }
         except Exception as e:
             logger.error("Error getting XPU GPU info: %s", e)
-            return {"available": False, "backend": device.value, "error": str(e)}
+            return {
+                "available": False,
+                "backend": _backend_label(device),
+                "error": str(e),
+            }
 
     # ---- MLX path (Apple Silicon) ----
     if device == DeviceType.MLX:
@@ -239,7 +277,7 @@ def get_gpu_memory_info() -> Dict[str, Any]:
 
             return {
                 "available": True,
-                "backend": device.value,
+                "backend": _backend_label(device),
                 "device": 0,
                 "device_name": f"Apple Silicon ({platform.processor() or platform.machine()})",
                 "total_gb": total / (1024**3),
@@ -250,7 +288,11 @@ def get_gpu_memory_info() -> Dict[str, Any]:
             }
         except Exception as e:
             logger.error(f"Error getting MLX GPU info: {e}")
-            return {"available": False, "backend": device.value, "error": str(e)}
+            return {
+                "available": False,
+                "backend": _backend_label(device),
+                "error": str(e),
+            }
 
     # ---- CPU-only ----
     return {"available": False, "backend": "cpu"}
@@ -315,13 +357,15 @@ def get_package_versions() -> Dict[str, Optional[str]]:
         except PackageNotFoundError:
             versions[name] = None
 
-    # CUDA toolkit version bundled with torch
+    # GPU runtime version bundled with torch
     try:
         import torch
 
         versions["cuda"] = getattr(torch.version, "cuda", None)
+        versions["rocm"] = getattr(torch.version, "hip", None)
     except Exception:
         versions["cuda"] = None
+        versions["rocm"] = None
 
     return versions
 
@@ -387,26 +431,50 @@ def _torch_get_per_device_info(device_indices: list[int]) -> list[Dict[str, Any]
 # ========== Live GPU Utilization ==========
 
 
+def _smi_query(func_name: str, *args, **kwargs) -> Optional[Dict[str, Any]]:
+    """Call ``func_name`` on the SMI backend module that matches the host.
+
+    The ``amd`` module is used when ``IS_ROCM`` is set, ``nvidia``
+    otherwise; extra arguments are forwarded unchanged.
+
+    Returns the backend's result when its ``"available"`` entry is truthy.
+    Returns None when the module cannot be imported, the call raises, or
+    the result is not available; import and call failures are logged.
+    """
+    on_rocm = IS_ROCM
+    backend_name = "amd-smi" if on_rocm else "nvidia-smi"
+    try:
+        if on_rocm:
+            from . import amd as _backend
+        else:
+            from . import nvidia as _backend
+    except Exception as e:
+        logger.warning("%s import failed: %s", backend_name, e)
+        return None
+    try:
+        result = getattr(_backend, func_name)(*args, **kwargs)
+        if result.get("available"):
+            return result
+    except Exception as e:
+        logger.warning("%s %s query failed: %s", backend_name, func_name, e)
+    return None
+
+
 def get_gpu_utilization() -> Dict[str, Any]:
     """Return a live snapshot of device utilization information."""
     device = get_device()
 
     if device == DeviceType.CUDA:
-        try:
-            from . import nvidia
-
-            result = nvidia.get_primary_gpu_utilization()
-            if result.get("available"):
-                result["backend"] = device.value
-                return result
-        except Exception as e:
-            logger.warning("nvidia-smi utilization query failed: %s", e)
+        result = _smi_query("get_primary_gpu_utilization")
+        if result is not None:
+            result["backend"] = _backend_label(device)
+            return result
 
     mem = get_gpu_memory_info()
     if device != DeviceType.CPU and mem.get("available"):
         return {
             "available": True,
-            "backend": device.value,
+            "backend": _backend_label(device),
             "gpu_utilization_pct": None,
             "temperature_c": None,
             "vram_used_gb": round(mem.get("allocated_gb", 0), 2),
@@ -417,7 +485,7 @@ def get_gpu_utilization() -> Dict[str, Any]:
             "power_utilization_pct": None,
         }
 
-    return {"available": False, "backend": device.value}
+    return {"available": False, "backend": _backend_label(device)}
 
 
 def get_visible_gpu_utilization() -> Dict[str, Any]:
@@ -425,18 +493,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
 
     if device == DeviceType.CUDA:
         parent_visible_spec = _get_parent_visible_gpu_spec()
-        try:
-            from . import nvidia
-
-            result = nvidia.get_visible_gpu_utilization(
-                parent_visible_spec["numeric_ids"],
-                parent_cuda_visible_devices = parent_visible_spec["raw"],
-            )
-            if result.get("available"):
-                result["backend"] = device.value
-                return result
-        except Exception as e:
-            logger.warning("nvidia-smi visible GPU utilization query failed: %s", e)
+        result = _smi_query(
+            "get_visible_gpu_utilization",
+            parent_visible_spec["numeric_ids"],
+            parent_cuda_visible_devices = parent_visible_spec["raw"],
+        )
+        if result is not None:
+            result["backend"] = _backend_label(device)
+            return result
 
     # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel)
     if device in (DeviceType.CUDA, DeviceType.XPU):
@@ -475,7 +539,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
                 )
             return {
                 "available": True,
-                "backend": device.value,
+                "backend": _backend_label(device),
                 "parent_visible_gpu_ids": parent_ids,
                 "devices": devices,
                 "index_kind": index_kind,
@@ -486,14 +550,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
         if not mem.get("available"):
             return {
                 "available": False,
-                "backend": device.value,
+                "backend": _backend_label(device),
                 "parent_visible_gpu_ids": [],
                 "devices": [],
                 "index_kind": "relative",
             }
         return {
             "available": True,
-            "backend": device.value,
+            "backend": _backend_label(device),
             "parent_visible_gpu_ids": [0],
             "devices": [
                 {
@@ -515,7 +579,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
 
     return {
         "available": False,
-        "backend": device.value,
+        "backend": _backend_label(device),
         "parent_visible_gpu_ids": [],
         "devices": [],
         "index_kind": "relative",
@@ -528,7 +592,84 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
 _visible_gpu_count: Optional[int] = None
 
 
+def _parse_visible_ids(raw: str) -> tuple:
+    """Turn a CUDA/HIP/ROCR visibility string into ``(numeric_ids, ok)``.
+
+    Yields ``([int, ...], True)`` (empty list for ``""`` / ``"-1"``), or
+    ``(None, False)`` if any token is not an integer (UUID, BDF address).
+    """
+    spec = raw.strip()
+    if spec in ("", "-1"):
+        return [], True
+    try:
+        ids = list(map(int, filter(None, map(str.strip, spec.split(",")))))
+    except ValueError:
+        return None, False
+    return ids, True
+
+
 def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
+    # ── ROCm layered visibility ──
+    # On ROCm, ROCR_VISIBLE_DEVICES narrows the *physical* GPU set first.
+    # HIP_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES then select *ordinals*
+    # within that narrowed set (not physical IDs).  Example:
+    #   ROCR_VISIBLE_DEVICES=2,3  HIP_VISIBLE_DEVICES=1
+    #     -> physical set is [2,3], HIP ordinal 1 = physical GPU 3
+    # We must compose through both layers to report the correct physical IDs.
+    if IS_ROCM:
+        rocr_raw = os.environ.get("ROCR_VISIBLE_DEVICES")
+        hip_raw = os.environ.get("HIP_VISIBLE_DEVICES")
+        cuda_raw = os.environ.get("CUDA_VISIBLE_DEVICES")
+
+        # If any mask is explicitly empty / -1, all GPUs are hidden.
+        for val in (rocr_raw, hip_raw, cuda_raw):
+            if val is not None and val.strip() in ("", "-1"):
+                return {
+                    "raw": val.strip(),
+                    "numeric_ids": [],
+                    "supports_explicit_gpu_ids": True,
+                }
+
+        # Layer 1: ROCR narrows the physical set.
+        if rocr_raw is not None:
+            physical_ids, ok = _parse_visible_ids(rocr_raw)
+            if not ok:
+                return {
+                    "raw": rocr_raw,
+                    "numeric_ids": None,
+                    "supports_explicit_gpu_ids": False,
+                }
+        else:
+            physical_ids = list(range(get_physical_gpu_count()))
+
+        # Layer 2: HIP or CUDA selects ordinals within the ROCR set.
+        child_raw = hip_raw if hip_raw is not None else cuda_raw
+        if child_raw is not None:
+            ordinals, ok = _parse_visible_ids(child_raw)
+            if not ok:
+                return {
+                    "raw": child_raw,
+                    "numeric_ids": None,
+                    "supports_explicit_gpu_ids": False,
+                }
+            # Map ordinals back to physical IDs.
+            physical_ids = [
+                physical_ids[i] for i in ordinals
+                if 0 <= i < len(physical_ids)
+            ]
+            return {
+                "raw": child_raw,
+                "numeric_ids": physical_ids,
+                "supports_explicit_gpu_ids": True,
+            }
+
+        return {
+            "raw": rocr_raw,
+            "numeric_ids": physical_ids,
+            "supports_explicit_gpu_ids": True,
+        }
+
+    # ── NVIDIA / non-ROCm path (unchanged) ──
     cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
 
     if cuda_visible is None:
@@ -1109,15 +1250,17 @@ def get_physical_gpu_count() -> int:
 
     if device == DeviceType.CUDA:
         try:
-            from . import nvidia
-
-            count = nvidia.get_physical_gpu_count()
+            if IS_ROCM:
+                from . import amd as _smi_mod
+            else:
+                from . import nvidia as _smi_mod
+            count = _smi_mod.get_physical_gpu_count()
             if count is not None:
                 _physical_gpu_count = count
                 return _physical_gpu_count
         except Exception:
             pass
-        # nvidia-smi unavailable or failed — fall back to torch
+        # SMI tool unavailable or failed -- fall back to torch
         count = _torch_get_physical_gpu_count()
         _physical_gpu_count = count if count is not None else 1
         return _physical_gpu_count
@@ -1136,12 +1279,25 @@ def get_physical_gpu_count() -> int:
     return _physical_gpu_count
 
 
+def _backend_visible_devices_env() -> Optional[str]:
+    """Return the raw visibility env value reported for this backend.
+
+    ROCm: the ``"raw"`` entry of ``_get_parent_visible_gpu_spec()``.
+    Otherwise: the ``CUDA_VISIBLE_DEVICES`` environment variable (None when
+    unset).
+    """
+    if not IS_ROCM:
+        return os.environ.get("CUDA_VISIBLE_DEVICES")
+    spec = _get_parent_visible_gpu_spec()
+    return spec.get("raw")
+
+
 def get_backend_visible_gpu_info() -> Dict[str, Any]:
     device = get_device()
     if device in (DeviceType.CUDA, DeviceType.XPU):
         parent_visible_ids = get_parent_visible_gpu_ids()
-        # Try nvidia-smi first (NVIDIA only)
-        if device == DeviceType.CUDA:
+        # Try native SMI tool first (nvidia-smi for NVIDIA, skipped for ROCm)
+        if device == DeviceType.CUDA and not IS_ROCM:
             try:
                 from . import nvidia
 
@@ -1151,7 +1307,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
                     parent_visible_spec["raw"],
                 )
                 if result.get("available"):
-                    result["backend"] = device.value
+                    result["backend"] = _backend_label(device)
                     return result
             except Exception as e:
                 logger.warning("Backend GPU visibility query failed: %s", e)
@@ -1180,8 +1336,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
             ]
             return {
                 "available": True,
-                "backend": device.value,
-                "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"),
+                "backend": _backend_label(device),
+                "backend_cuda_visible_devices": _backend_visible_devices_env(),
                 "parent_visible_gpu_ids": parent_visible_ids,
                 "devices": devices,
                 "index_kind": index_kind,
@@ -1189,8 +1345,8 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
 
         return {
             "available": False,
-            "backend": device.value,
-            "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"),
+            "backend": _backend_label(device),
+            "backend_cuda_visible_devices": _backend_visible_devices_env(),
             "parent_visible_gpu_ids": parent_visible_ids,
             "devices": [],
             "index_kind": "physical",
@@ -1201,7 +1357,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
         if not mem.get("available"):
             return {
                 "available": False,
-                "backend": device.value,
+                "backend": _backend_label(device),
                 "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"),
                 "parent_visible_gpu_ids": [],
                 "devices": [],
@@ -1209,7 +1365,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
             }
         return {
             "available": True,
-            "backend": device.value,
+            "backend": _backend_label(device),
             "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"),
             "parent_visible_gpu_ids": [0],
             "devices": [
@@ -1226,7 +1382,7 @@ def get_backend_visible_gpu_info() -> Dict[str, Any]:
 
     return {
         "available": False,
-        "backend": device.value,
+        "backend": _backend_label(device),
         "backend_cuda_visible_devices": os.environ.get("CUDA_VISIBLE_DEVICES"),
         "parent_visible_gpu_ids": [],
         "devices": [],
@@ -1246,17 +1402,20 @@ def get_visible_gpu_count() -> int:
     if _visible_gpu_count is not None:
         return _visible_gpu_count
 
-    cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
-    if cuda_visible is not None:
-        # "" means zero GPUs, "0" means 1, "0,1,2" means 3
-        cuda_visible = cuda_visible.strip()
-        if cuda_visible == "" or cuda_visible == "-1":
+    # Use _get_parent_visible_gpu_spec() which already handles
+    # HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES on ROCm.
+    visible_spec = _get_parent_visible_gpu_spec()
+    if visible_spec["raw"] is not None:
+        raw = visible_spec["raw"].strip()
+        if raw == "" or raw == "-1":
             _visible_gpu_count = 0
+        elif visible_spec["numeric_ids"] is not None:
+            _visible_gpu_count = len(visible_spec["numeric_ids"])
         else:
-            _visible_gpu_count = len([x for x in cuda_visible.split(",") if x.strip()])
+            _visible_gpu_count = len([x for x in raw.split(",") if x.strip()])
         return _visible_gpu_count
 
-    # CUDA_VISIBLE_DEVICES not set -- try torch, fall back to physical count
+    # No visibility env var set -- try torch, fall back to physical count
     try:
         import torch
 
@@ -1288,8 +1447,39 @@ def apply_gpu_ids(gpu_ids) -> None:
         value = str(gpu_ids)
 
     os.environ["CUDA_VISIBLE_DEVICES"] = value
+    # Keep ROCm visibility env vars in sync so _get_parent_visible_gpu_spec()
+    # picks up the narrowed set on AMD systems. Workers can call
+    # apply_gpu_ids() before detect_hardware() runs (so IS_ROCM is still
+    # its default False), so also mirror the selection whenever the
+    # parent process already set a ROCm visibility variable -- that
+    # way a downstream ROCm process inherits the narrowed mask even
+    # before Studio's hardware detection has classified the host.
+    #
+    # ROCm layered visibility: ROCR_VISIBLE_DEVICES holds *physical* IDs,
+    # while HIP_VISIBLE_DEVICES / CUDA_VISIBLE_DEVICES hold *ordinals*
+    # within the ROCR set.  When narrowing to specific physical GPUs we
+    # set ROCR to the physical IDs and reset HIP/CUDA to a zero-based
+    # sequence so ordinals map 1:1 to the new ROCR set.
+    _inherits_rocm_visibility = (
+        "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ
+    )
+    if IS_ROCM or _inherits_rocm_visibility:
+        os.environ["ROCR_VISIBLE_DEVICES"] = value
+        # HIP/CUDA ordinals are relative to the ROCR set above.
+        n_gpus = len(value.split(",")) if value.strip() else 0
+        relative = ",".join(str(i) for i in range(n_gpus))
+        os.environ["HIP_VISIBLE_DEVICES"] = relative
+        os.environ["CUDA_VISIBLE_DEVICES"] = relative
     _visible_gpu_count = None
-    logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value)
+    if IS_ROCM or _inherits_rocm_visibility:
+        logger.info(
+            "Applied gpu_ids: ROCR_VISIBLE_DEVICES='%s', "
+            "HIP_VISIBLE_DEVICES='%s' (rocm)",
+            value,
+            os.environ.get("HIP_VISIBLE_DEVICES", ""),
+        )
+    else:
+        logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value)
 
 
 def get_device_map(
diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index 8d06c7d0e1..2ab5eadc45 100755
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -173,6 +173,7 @@ class HostInfo:
     visible_cuda_devices: str | None
     has_physical_nvidia: bool
     has_usable_nvidia: bool
+    has_rocm: bool = False
 
 
 @dataclass
@@ -2493,12 +2494,25 @@ def detect_host() -> HostInfo:
     has_physical_nvidia = False
     has_usable_nvidia = False
     if nvidia_smi:
+        # Require `nvidia-smi -L` to actually list a GPU before treating the
+        # host as NVIDIA. The banner text "NVIDIA-SMI ..." is printed even
+        # when the command fails to communicate with the driver (e.g. stale
+        # container leftovers), which would otherwise misclassify an AMD
+        # ROCm host as NVIDIA and short-circuit the ROCm path.
         try:
-            result = run_capture([nvidia_smi], timeout = 20)
-            merged = "\n".join(part for part in (result.stdout, result.stderr) if part)
-            if "NVIDIA-SMI" in merged:
+            listing = run_capture([nvidia_smi, "-L"], timeout = 20)
+            gpu_lines = [
+                line for line in listing.stdout.splitlines() if line.startswith("GPU ")
+            ]
+            if gpu_lines:
                 has_physical_nvidia = True
                 has_usable_nvidia = visible_device_tokens != []
+        except Exception:
+            pass
+
+        try:
+            result = run_capture([nvidia_smi], timeout = 20)
+            merged = "\n".join(part for part in (result.stdout, result.stderr) if part)
             for line in merged.splitlines():
                 if "CUDA Version:" in line:
                     raw = line.split("CUDA Version:", 1)[1].strip().split()[0]
@@ -2538,6 +2552,12 @@ def detect_host() -> HostInfo:
 
             if visible_gpu_rows:
                 has_usable_nvidia = True
+                # Older nvidia-smi versions (pre -L support) hit the
+                # except in the first try block but still succeed here,
+                # leaving has_physical_nvidia unset. Mirror the -L path so
+                # downstream diagnostics relying on that flag still run.
+                if not has_physical_nvidia:
+                    has_physical_nvidia = True
             elif visible_device_tokens == []:
                 has_usable_nvidia = False
             elif supports_explicit_visible_device_matching(visible_device_tokens):
@@ -2547,6 +2567,61 @@ def detect_host() -> HostInfo:
         except Exception:
             pass
 
+    # Detect AMD ROCm (HIP) -- require actual GPU, not just tools installed
+
+    def _amd_smi_has_gpu(stdout: str) -> bool:
+        """Check for 'GPU: <number>' data rows, not just a table header."""
+        return bool(re.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
+
+    # Honour GPU visibility masks so hidden GPUs are not detected.
+    # On ROCm, ROCR_VISIBLE_DEVICES narrows the physical set, then
+    # CUDA/HIP_VISIBLE_DEVICES further restricts within that. If ANY
+    # is empty or "-1", all GPUs are hidden.
+    _rocm_vis_enabled = True
+    for _env_name in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES"):
+        _env_raw = os.environ.get(_env_name)
+        if _env_raw is not None and _env_raw.strip() in {"", "-1"}:
+            _rocm_vis_enabled = False
+            break
+
+    has_rocm = False
+    if _rocm_vis_enabled and is_linux:
+        for _cmd, _check in (
+            # rocminfo: look for "gfxNNNN" with nonzero first digit (gfx000 is CPU agent)
+            (["rocminfo"], lambda out: bool(re.search(r"gfx[1-9]", out.lower()))),
+            (["amd-smi", "list"], _amd_smi_has_gpu),
+        ):
+            _exe = shutil.which(_cmd[0])
+            if not _exe:
+                continue
+            try:
+                _result = run_capture([_exe, *_cmd[1:]], timeout = 10)
+            except Exception:
+                continue
+            if _result.returncode == 0 and _result.stdout.strip():
+                if _check(_result.stdout):
+                    has_rocm = True
+                    break
+    elif _rocm_vis_enabled and is_windows:
+        # Windows: prefer active probes that validate GPU presence
+        for _cmd, _check in (
+            (["hipinfo"], lambda out: "gcnarchname" in out.lower()),
+            (["amd-smi", "list"], _amd_smi_has_gpu),
+        ):
+            _exe = shutil.which(_cmd[0])
+            if not _exe:
+                continue
+            try:
+                _result = run_capture([_exe, *_cmd[1:]], timeout = 10)
+            except Exception:
+                continue
+            if _result.returncode == 0 and _result.stdout.strip():
+                if _check(_result.stdout):
+                    has_rocm = True
+                    break
+        # Note: amdhip64.dll presence alone is NOT treated as GPU evidence
+        # since the HIP SDK can be installed without an AMD GPU.
+
     return HostInfo(
         system = system,
         machine = machine,
@@ -2561,6 +2636,7 @@ def detect_host() -> HostInfo:
         visible_cuda_devices = visible_cuda_devices,
         has_physical_nvidia = has_physical_nvidia,
         has_usable_nvidia = has_usable_nvidia,
+        has_rocm = has_rocm,
     )
 
 
@@ -2926,9 +3002,168 @@ def published_asset_choice_for_kind(
     return None
 
 
+def _detect_host_rocm_version() -> tuple[int, int] | None:
+    """Return (major, minor) of the installed ROCm runtime, or None.
+
+    Best-effort probes, first hit wins: the version files under ROCM_PATH
+    (default /opt/rocm), `amd-smi version`, `hipconfig --version`, then the
+    rocm-core package version from dpkg-query / rpm. Used to pick a
+    compatible upstream llama.cpp ROCm prebuilt instead of the newest one.
+    """
+    rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm"
+    for path in (
+        os.path.join(rocm_root, ".info", "version"),
+        os.path.join(rocm_root, "lib", "rocm_version"),
+    ):
+        try:
+            with open(path) as fh:
+                parts = fh.read().strip().split("-")[0].split(".")
+            # Explicit length guard avoids relying on the broad except
+            # below to swallow IndexError when the version file contains
+            # a single component (e.g. "6\n" on a partial install).
+            if len(parts) >= 2:
+                return int(parts[0]), int(parts[1])
+        except Exception:
+            pass
+    amd_smi = shutil.which("amd-smi")
+    if amd_smi:
+        try:
+            result = subprocess.run(
+                [amd_smi, "version"],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 5,
+            )
+            if result.returncode == 0:
+                m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout)
+                if m:
+                    return int(m.group(1)), int(m.group(2))
+        except Exception:
+            pass
+    hipconfig = shutil.which("hipconfig")
+    if hipconfig:
+        try:
+            result = subprocess.run(
+                [hipconfig, "--version"],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 5,
+            )
+            if result.returncode == 0:
+                raw = (result.stdout or "").strip().split("\n")[0]
+                parts = raw.split(".")
+                if (
+                    len(parts) >= 2
+                    and parts[0].isdigit()
+                    and parts[1].split("-")[0].isdigit()
+                ):
+                    return int(parts[0]), int(parts[1].split("-")[0])
+        except Exception:
+            pass
+
+    # Distro package-manager fallbacks. Mirrors install.sh::get_torch_index_url
+    # and _detect_rocm_version() in install_python_stack.py so package-managed
+    # ROCm hosts without /opt/rocm/.info/version still report a usable version
+    # and the <= host version filter in resolve_upstream_asset_choice picks
+    # the correct upstream prebuilt instead of the newest-regardless fallback.
+    for _cmd in (
+        ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"],
+        ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"],
+    ):
+        _exe = shutil.which(_cmd[0])
+        if not _exe:
+            continue
+        try:
+            _result = subprocess.run(
+                [_exe, *_cmd[1:]],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 5,
+            )
+        except Exception:
+            continue
+        if _result.returncode != 0 or not _result.stdout.strip():
+            continue
+        _raw = _result.stdout.strip()
+        # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing.
+        _raw = re.sub(r"^\d+:", "", _raw)
+        _m = re.match(r"(\d+)[.-](\d+)", _raw)
+        if _m:
+            return int(_m.group(1)), int(_m.group(2))
+    return None
+
+
 def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice:
     upstream_assets = github_release_assets(UPSTREAM_REPO, llama_tag)
     if host.is_linux and host.is_x86_64:
+        # AMD ROCm: try upstream ROCm prebuilt first, then fall back to source build.
+        # Source build (via setup.sh) compiles with -DGGML_HIP=ON and auto-detects
+        # the exact GPU target via rocminfo, which is more reliable for consumer
+        # GPUs (e.g. gfx1151) that may not be in the prebuilt.
+        if host.has_rocm and not host.has_usable_nvidia:
+            # Scan upstream assets for any rocm-<version> prebuilt. When the
+            # host ROCm runtime version is known, pick the newest candidate
+            # whose major.minor is <= host version -- otherwise a ROCm 6.4
+            # host would download the rocm-7.2 tarball, fail preflight, and
+            # fall back to a source build even though a compatible 6.4
+            # prebuilt exists. If no compatible candidate matches (e.g. host
+            # runtime is older than every published prebuilt), fall back to
+            # the numerically newest so we at least try something.
+            _rocm_pattern = re.compile(
+                rf"llama-{re.escape(llama_tag)}-bin-ubuntu-rocm-([0-9]+(?:\.[0-9]+)*)-x64\.tar\.gz"
+            )
+            rocm_candidates: list[tuple[tuple[int, ...], str]] = []
+            for _name in upstream_assets:
+                _m = _rocm_pattern.match(_name)
+                if _m is None:
+                    continue
+                _parts = tuple(int(p) for p in _m.group(1).split("."))
+                rocm_candidates.append((_parts, _name))
+            rocm_candidates.sort(reverse = True)
+            _host_rocm_version = _detect_host_rocm_version()
+            _compatible: list[tuple[tuple[int, ...], str]] = rocm_candidates
+            if _host_rocm_version is not None:
+                _compatible = [
+                    item
+                    for item in rocm_candidates
+                    if item[0][:2] <= _host_rocm_version
+                ]
+            if rocm_candidates and not _compatible:
+                # Fall back to the newest candidate so a source build is
+                # not forced when the host runtime is older than every
+                # published prebuilt: preflight will still catch a true
+                # incompatibility and trigger a fallback.
+                _compatible = rocm_candidates[:1]
+            if _compatible:
+                rocm_name = _compatible[0][1]
+                if _host_rocm_version is not None:
+                    log(
+                        f"AMD ROCm {_host_rocm_version[0]}.{_host_rocm_version[1]} "
+                        f"detected -- trying upstream prebuilt {rocm_name}"
+                    )
+                else:
+                    log(f"AMD ROCm detected -- trying upstream prebuilt {rocm_name}")
+                log(
+                    "Note: if your ROCm runtime version differs significantly, "
+                    "this may fail preflight and fall back to a source build (safe)"
+                )
+                return AssetChoice(
+                    repo = UPSTREAM_REPO,
+                    tag = llama_tag,
+                    name = rocm_name,
+                    url = upstream_assets[rocm_name],
+                    source_label = "upstream",
+                    install_kind = "linux-rocm",
+                )
+            # No ROCm prebuilt available -- fall back to source build
+            raise PrebuiltFallback(
+                "AMD ROCm detected but no upstream ROCm prebuilt found; "
+                "falling back to source build with HIP support"
+            )
+
         upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz"
         if upstream_name not in upstream_assets:
             raise PrebuiltFallback("upstream Linux CPU asset was not found")
@@ -2948,6 +3183,25 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                 return attempts[0]
             raise PrebuiltFallback("no compatible Windows CUDA asset was found")
 
+        # AMD ROCm on Windows: try HIP prebuilt
+        if host.has_rocm:
+            hip_name = f"llama-{llama_tag}-bin-win-hip-radeon-x64.zip"
+            if hip_name in upstream_assets:
+                log(
+                    f"AMD ROCm detected on Windows -- trying upstream HIP prebuilt {hip_name}"
+                )
+                return AssetChoice(
+                    repo = UPSTREAM_REPO,
+                    tag = llama_tag,
+                    name = hip_name,
+                    url = upstream_assets[hip_name],
+                    source_label = "upstream",
+                    install_kind = "windows-hip",
+                )
+            log(
+                "AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU"
+            )
+
         upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip"
         if upstream_name not in upstream_assets:
             raise PrebuiltFallback("upstream Windows CPU asset was not found")
@@ -3028,8 +3282,19 @@ def resolve_release_asset_choice(
         )
 
     published_choice: AssetChoice | None = None
-    if host.is_windows and host.is_x86_64:
-        published_choice = published_asset_choice_for_kind(release, "windows-cpu")
+    if host.is_linux and host.is_x86_64 and host.has_rocm and not host.has_usable_nvidia:
+        published_choice = published_asset_choice_for_kind(release, "linux-rocm")
+    elif host.is_windows and host.is_x86_64:
+        # AMD Windows hosts should prefer a hash-approved published
+        # Windows HIP bundle when one exists, but otherwise fall through
+        # to resolve_asset_choice() so the upstream HIP prebuilt is
+        # tried before the CPU fallback. Hard-pinning the published
+        # windows-cpu bundle here would make the new HIP path
+        # unreachable.
+        if host.has_rocm:
+            published_choice = published_asset_choice_for_kind(release, "windows-hip")
+        else:
+            published_choice = published_asset_choice_for_kind(release, "windows-cpu")
     elif host.is_macos and host.is_arm64:
         published_choice = published_asset_choice_for_kind(release, "macos-arm64")
     elif host.is_macos and host.is_x86_64:
@@ -3378,7 +3643,7 @@ def overlay_directory_for_choice(
 
 
 def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]:
-    if choice.install_kind in {"linux-cpu", "linux-cuda"}:
+    if choice.install_kind in {"linux-cpu", "linux-cuda", "linux-rocm"}:
         return [
             "llama-server",
             "llama-quantize",
@@ -3388,11 +3653,12 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]:
             "libmtmd.so*",
             "libggml-cpu-*.so*",
             "libggml-cuda.so*",
+            "libggml-hip.so*",
             "libggml-rpc.so*",
         ]
     if choice.install_kind in {"macos-arm64", "macos-x64"}:
         return ["llama-server", "llama-quantize", "lib*.dylib"]
-    if choice.install_kind in {"windows-cpu", "windows-cuda"}:
+    if choice.install_kind in {"windows-cpu", "windows-cuda", "windows-hip"}:
         return ["*.exe", "*.dll"]
     raise PrebuiltFallback(
         f"unsupported install kind for runtime overlay: {choice.install_kind}"
@@ -4117,6 +4383,7 @@ def validate_server(
     install_dir: Path,
     *,
     runtime_line: str | None = None,
+    install_kind: str | None = None,
 ) -> None:
     last_failure: PrebuiltFallback | None = None
     for port_attempt in range(1, SERVER_PORT_BIND_ATTEMPTS + 1):
@@ -4140,7 +4407,33 @@ def validate_server(
             "--batch-size",
             "32",
         ]
-        if host.has_usable_nvidia or (host.is_macos and host.is_arm64):
+        # Only enable GPU offload for assets that actually ship GPU code.
+        # Gating on `host.has_rocm` alone breaks the intentional CPU
+        # fallback on AMD Windows hosts without a HIP prebuilt: the CPU
+        # binary would be launched with `--n-gpu-layers 1` and fail
+        # validation. Use the resolved install_kind as the source of
+        # truth and fall back to host detection when the caller did not
+        # pass one (keeps backwards compatibility with older call sites).
+        _gpu_kinds = {
+            "linux-cuda",
+            "linux-rocm",
+            "windows-cuda",
+            "windows-hip",
+            "macos-arm64",
+        }
+        if install_kind is not None:
+            _enable_gpu_layers = install_kind in _gpu_kinds
+        else:
+            # Older call sites that don't pass install_kind: keep ROCm
+            # hosts in the GPU-validation path so an AMD-only Linux host
+            # is exercised against the actual hardware rather than the
+            # CPU fallback. NVIDIA and macOS-arm64 are already covered.
+            _enable_gpu_layers = (
+                host.has_usable_nvidia
+                or host.has_rocm
+                or (host.is_macos and host.is_arm64)
+            )
+        if _enable_gpu_layers:
             command.extend(["--n-gpu-layers", "1"])
 
         log_fd, log_name = tempfile.mkstemp(prefix = "llama-server-", suffix = ".log")
@@ -4664,10 +4957,21 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]:
             ["libggml*.dylib"],
             ["libmtmd*.dylib"],
         ]
+    if choice.install_kind == "linux-rocm":
+        return [
+            ["libllama.so*"],
+            ["libggml.so*"],
+            ["libggml-base.so*"],
+            ["libggml-cpu-*.so*"],
+            ["libmtmd.so*"],
+            ["libggml-hip.so*"],
+        ]
     if choice.install_kind == "windows-cpu":
         return [["llama.dll"]]
     if choice.install_kind == "windows-cuda":
         return [["llama.dll"], ["ggml-cuda.dll"]]
+    if choice.install_kind == "windows-hip":
+        return [["llama.dll"], ["*hip*.dll"]]
     return []
 
 
@@ -4839,6 +5143,7 @@ def validate_prebuilt_choice(
         host,
         install_dir,
         runtime_line = choice.runtime_line,
+        install_kind = choice.install_kind,
     )
     log(f"staged prebuilt validation succeeded for {choice.name}")
     return server_path, quantize_path
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index f2981ea665..cbbb81f913 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -25,6 +25,319 @@
 IS_MACOS = sys.platform == "darwin"
 IS_MAC_INTEL = IS_MACOS and platform.machine() == "x86_64"
 
+# ── ROCm / AMD GPU support ─────────────────────────────────────────────────────
+# Mapping from detected ROCm (major, minor) to the best PyTorch wheel tag on
+# download.pytorch.org.  Entries are checked newest-first (>=).
+# ROCm 7.2 only has torch 2.11.0 on download.pytorch.org, which exceeds the
+# current torch upper bound (<2.11.0).  Fall back to rocm7.1 (torch 2.10.0).
+# TODO: uncomment rocm7.2 when torch upper bound is bumped to >=2.11.0
+_ROCM_TORCH_INDEX: dict[tuple[int, int], str] = {
+    # (7, 2): "rocm7.2",  # torch 2.11.0 -- requires torch>=2.11
+    (7, 1): "rocm7.1",
+    (7, 0): "rocm7.0",
+    (6, 4): "rocm6.4",
+    (6, 3): "rocm6.3",
+    (6, 2): "rocm6.2",
+    (6, 1): "rocm6.1",
+    (6, 0): "rocm6.0",
+}
+_PYTORCH_WHL_BASE = "https://download.pytorch.org/whl"
+
+
+def _detect_rocm_version() -> tuple[int, int] | None:
+    """Return (major, minor) of the installed ROCm stack, or None if no probe succeeds."""
+    # Check the version files under ROCM_PATH (falls back to /opt/rocm)
+    rocm_root = os.environ.get("ROCM_PATH") or "/opt/rocm"
+    for path in (
+        os.path.join(rocm_root, ".info", "version"),
+        os.path.join(rocm_root, "lib", "rocm_version"),
+    ):
+        try:
+            with open(path) as fh:
+                parts = fh.read().strip().split("-")[0].split(".")
+            # Explicit length guard avoids relying on the broad except
+            # below to swallow IndexError when the version file contains
+            # a single component (e.g. "6\n" on a partial install).
+            if len(parts) >= 2:
+                return int(parts[0]), int(parts[1])
+        except Exception:
+            pass
+
+    # Try amd-smi version (outputs "... | ROCm version: X.Y.Z")
+    amd_smi = shutil.which("amd-smi")
+    if amd_smi:
+        try:
+            result = subprocess.run(
+                [amd_smi, "version"],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 5,
+            )
+            if result.returncode == 0:
+                import re
+
+                m = re.search(r"ROCm version:\s*(\d+)\.(\d+)", result.stdout)
+                if m:
+                    return int(m.group(1)), int(m.group(2))
+        except Exception:
+            pass
+
+    # Try hipconfig --version (outputs bare version like "6.3.21234.2")
+    hipconfig = shutil.which("hipconfig")
+    if hipconfig:
+        try:
+            result = subprocess.run(
+                [hipconfig, "--version"],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 5,
+            )
+            if result.returncode == 0:
+                raw = result.stdout.strip().split("\n")[0]
+                parts = raw.split(".")
+                if (
+                    len(parts) >= 2
+                    and parts[0].isdigit()
+                    and parts[1].split("-")[0].isdigit()
+                ):
+                    return int(parts[0]), int(parts[1].split("-")[0])
+        except Exception:
+            pass
+
+    # Distro package-manager fallbacks. Package-managed ROCm installs can
+    # expose GPUs via rocminfo / amd-smi but still lack /opt/rocm/.info/version
+    # and hipconfig, so probe dpkg (Debian/Ubuntu) and rpm (RHEL/Fedora/SUSE)
+    # for the rocm-core package version. Matches the chain in
+    # install.sh::get_torch_index_url so `unsloth studio update` behaves
+    # the same as a fresh `curl | sh` install.
+    import re as _re_pkg
+
+    for cmd in (
+        ["dpkg-query", "-W", "-f=${Version}\n", "rocm-core"],
+        ["rpm", "-q", "--qf", "%{VERSION}\n", "rocm-core"],
+    ):
+        exe = shutil.which(cmd[0])
+        if not exe:
+            continue
+        try:
+            result = subprocess.run(
+                [exe, *cmd[1:]],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 5,
+            )
+        except Exception:
+            continue
+        if result.returncode != 0 or not result.stdout.strip():
+            continue
+        raw = result.stdout.strip()
+        # dpkg can prepend an epoch ("1:6.3.0-1"); strip it before parsing.
+        raw = _re_pkg.sub(r"^\d+:", "", raw)
+        m = _re_pkg.match(r"(\d+)[.-](\d+)", raw)
+        if m:
+            return int(m.group(1)), int(m.group(2))
+
+    return None
+
+
+def _rocm_devices_enabled() -> bool:
+    """Report whether the visibility env vars leave any AMD GPU visible.
+
+    Returns False as soon as HIP_VISIBLE_DEVICES, ROCR_VISIBLE_DEVICES or
+    CUDA_VISIBLE_DEVICES is set to "" or "-1" (ignoring surrounding
+    whitespace); unset variables are ignored. Returns True otherwise.
+    """
+    hiding_values = ("", "-1")
+    mask_vars = ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES", "CUDA_VISIBLE_DEVICES")
+    masks = (os.environ.get(var) for var in mask_vars)
+    hidden = any(mask is not None and mask.strip() in hiding_values for mask in masks)
+    return not hidden
+
+
+def _has_rocm_gpu() -> bool:
+    """Return True only when rocminfo or amd-smi reports an actual AMD GPU."""
+    import re
+
+    if not _rocm_devices_enabled():
+        return False
+    # Each probe pairs a command line with the regex its stdout must match:
+    # rocminfo must mention a gfx target with a nonzero first digit (gfx000 is
+    # the CPU agent); amd-smi list needs a row starting "GPU:" / "GPU [" + digit.
+    probes = (
+        (["rocminfo"], lambda text: re.search(r"gfx[1-9]", text.lower())),
+        (["amd-smi", "list"], lambda text: re.search(r"(?im)^gpu\s*[:\[]\s*\d", text)),
+    )
+    for argv, matcher in probes:
+        tool = shutil.which(argv[0])
+        if not tool:
+            continue
+        try:
+            proc = subprocess.run(
+                [tool, *argv[1:]],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 10,
+            )
+        except Exception:
+            continue
+        if proc.returncode != 0 or not proc.stdout.strip():
+            continue
+        if matcher(proc.stdout):
+            return True
+    return False
+
+
+def _has_usable_nvidia_gpu() -> bool:
+    """Return True only when `nvidia-smi -L` prints at least one "GPU " row.
+
+    CUDA_VISIBLE_DEVICES="" or "-1" returns False so mixed NVIDIA+AMD hosts
+    that hide NVIDIA route to ROCm; text merely mentioning a GPU is ignored.
+    """
+    raw = os.environ.get("CUDA_VISIBLE_DEVICES")
+    if raw is not None and raw.strip() in {"", "-1"}:
+        return False
+    exe = shutil.which("nvidia-smi")
+    if not exe:
+        return False
+    try:
+        result = subprocess.run(
+            [exe, "-L"],
+            stdout = subprocess.PIPE,
+            stderr = subprocess.DEVNULL,
+            text = True,
+            timeout = 10,
+        )
+    except Exception:
+        return False
+    return result.returncode == 0 and any(row.startswith("GPU ") for row in result.stdout.splitlines())
+
+
+def _ensure_rocm_torch() -> None:
+    """Reinstall torch with ROCm wheels when the venv received CPU-only torch.
+
+    Does nothing on Windows / macOS, on non-x86_64 machines (PyTorch does
+    not publish ROCm wheels for aarch64 / arm64), when a usable NVIDIA GPU
+    is found (NVIDIA takes precedence on mixed hosts), when no AMD GPU is
+    visible, or when the ROCm version cannot be read.
+
+    Otherwise torch / torchvision / torchaudio are force-reinstalled from
+    the newest PyTorch ROCm index whose version is <= the installed ROCm,
+    unless torch already reports a HIP version. Whenever a HIP-enabled
+    torch is present at the end, the AMD bitsandbytes build is
+    force-reinstalled as well.
+
+    Uses pip_install() to respect uv, constraints, and --python targeting.
+    """
+
+    # Shared by the pre-install check and the post-install verification.
+    def _venv_torch_is_hip() -> bool:
+        """Check whether the torch in this interpreter is a HIP (ROCm) build.
+
+        Runs `import torch` in a child process of sys.executable and treats
+        a zero exit status with a non-empty torch.version.hip as HIP torch.
+        Any failure to run the probe or decode its output yields False.
+        """
+        try:
+            probe = subprocess.run(
+                [
+                    sys.executable,
+                    "-c",
+                    "import torch; print(getattr(torch.version,'hip','') or '')",
+                ],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                timeout = 30,
+            )
+            return probe.returncode == 0 and probe.stdout.decode().strip() != ""
+        except Exception:
+            # Best-effort probe: a launch failure, a timeout, or
+            # undecodable output all mean "HIP torch not confirmed".
+            return False
+
+    # Explicit OS / architecture guards so the helper is safe to call
+    # from any context -- PyTorch only publishes ROCm wheels for
+    # linux_x86_64, so aarch64 / arm64 hosts must skip this repair path
+    # instead of failing the update with a missing-wheel error.
+    if IS_WINDOWS or IS_MACOS:
+        return
+    if platform.machine().lower() not in {"x86_64", "amd64"}:
+        return
+    # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable
+    if _has_usable_nvidia_gpu():
+        return
+    # Rely on _has_rocm_gpu() (rocminfo / amd-smi GPU data rows) as the
+    # authoritative "is this actually an AMD ROCm host?" signal. The old
+    # gate required /opt/rocm or hipcc to exist, which breaks on
+    # runtime-only ROCm installs (package-managed minimal installs,
+    # Radeon software) that ship amd-smi/rocminfo without /opt/rocm or
+    # hipcc, and leaves `unsloth studio update` unable to repair a
+    # CPU-only venv on those systems.
+    if not _has_rocm_gpu():
+        return  # no AMD GPU visible
+
+    ver = _detect_rocm_version()
+    if ver is None:
+        print("   ROCm detected but version unreadable -- skipping torch reinstall")
+        return
+
+    # Probe whether torch already links against HIP (ROCm is already working).
+    # Do NOT skip for CUDA-only builds since they are unusable on AMD-only
+    # hosts (the NVIDIA check above already handled mixed AMD+NVIDIA setups).
+    rocm_torch_ready = _venv_torch_is_hip()
+
+    if not rocm_torch_ready:
+        # Select best matching wheel tag (newest ROCm version <= installed).
+        # ver and the index keys are (major, minor) pairs compared as tuples.
+        tag = next(
+            (
+                t
+                for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
+                if ver >= (maj, mn)
+            ),
+            None,
+        )
+        if tag is None:
+            print(
+                f"   No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- "
+                f"skipping torch reinstall"
+            )
+        else:
+            index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
+            print(f"   ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
+            pip_install(
+                f"ROCm torch ({tag})",
+                "--force-reinstall",
+                "--no-cache-dir",
+                "torch>=2.4,<2.11.0",
+                "torchvision<0.26.0",  # TODO: bump to <0.27.0 when rocm7.2 is uncommented
+                "torchaudio<2.11.0",
+                "--index-url",
+                index_url,
+                constrain = False,
+            )
+            # Re-probe: only mark ready if HIP torch is now actually present.
+            # pip_install() may have failed silently.
+            rocm_torch_ready = _venv_torch_is_hip()
+
+    # Install bitsandbytes only when the venv has a ROCm-compatible torch
+    # (either already present or just installed). Avoids leaving an AMD
+    # bitsandbytes on top of a CPU/CUDA torch on hosts where the ROCm
+    # runtime is older than any published torch wheel. Uses
+    # --force-reinstall so an existing CPU/CUDA bitsandbytes is replaced
+    # by the AMD build during upgrades.
+    if rocm_torch_ready:
+        pip_install(
+            "bitsandbytes (AMD)",
+            "--force-reinstall",
+            "--no-cache-dir",
+            "bitsandbytes>=0.49.1",
+            constrain = False,
+        )
+
 
 def _infer_no_torch() -> bool:
     """Determine whether to run in no-torch (GGUF-only) mode.
@@ -414,6 +727,9 @@ def install_python_stack() -> int:
     base_total = 10 if IS_WINDOWS else 11
     if IS_MACOS:
         base_total -= 1  # triton step is skipped on macOS
+    # ROCm torch check adds one progress step on non-Windows, non-macOS hosts unless NO_TORCH is set
+    if not IS_WINDOWS and not IS_MACOS and not NO_TORCH:
+        base_total += 1
     _TOTAL = (base_total - 1) if skip_base else base_total
 
     # 1. Try to use uv for faster installs (must happen before pip upgrade
@@ -537,6 +853,53 @@ def install_python_stack() -> int:
             req = REQ_ROOT / "base.txt",
         )
 
+    # 2b. AMD ROCm: reinstall torch with HIP wheels if the host has ROCm but the
+    #     venv received CPU-only torch (common when pip resolves torch from PyPI).
+    #     Must come immediately after base packages so torch is present for inspection.
+    if not IS_WINDOWS and not IS_MACOS and not NO_TORCH:  # same gate as the extra base_total step counted earlier
+        _progress("ROCm torch check")
+        _ensure_rocm_torch()
+
+    # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows.
+    # Detect and warn so users know manual steps are needed for GPU training.
+    if IS_WINDOWS and not NO_TORCH and not _has_usable_nvidia_gpu():
+        # Require real evidence of an AMD GPU: a probe must exit 0 and its output must match (tool existence is not enough)
+        import re as _re_win
+
+        def _win_amd_smi_has_gpu(stdout: str) -> bool:  # True when a line starts with "GPU:" / "GPU [" followed by a digit
+            return bool(_re_win.search(r"(?im)^gpu\s*[:\[]\s*\d", stdout))
+
+        _win_amd_gpu = False  # flipped to True by the first probe that reports a GPU
+        for _wcmd, _check_fn in (
+            (["hipinfo"], lambda out: "gcnarchname" in out.lower()),  # NOTE(review): assumes hipinfo prints a gcnArchName field per GPU -- confirm
+            (["amd-smi", "list"], _win_amd_smi_has_gpu),
+        ):
+            _wexe = shutil.which(_wcmd[0])
+            if not _wexe:
+                continue
+            try:
+                _wr = subprocess.run(
+                    [_wexe, *_wcmd[1:]],
+                    stdout = subprocess.PIPE,
+                    stderr = subprocess.DEVNULL,  # tool diagnostics are discarded
+                    text = True,
+                    timeout = 10,
+                )
+            except Exception:  # launch failure or timeout: fall through to the next tool
+                continue
+            if _wr.returncode == 0 and _check_fn(_wr.stdout):
+                _win_amd_gpu = True
+                break  # one positive probe is enough
+        if _win_amd_gpu:  # informational only: this branch prints a note and installs nothing
+            _safe_print(
+                _dim("  Note:"),
+                "AMD GPU detected on Windows. ROCm-enabled PyTorch must be",
+            )
+            _safe_print(
+                " " * 8,
+                "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
+            )
+
     # 3. Extra dependencies
     _progress("unsloth extras")
     pip_install(
diff --git a/unsloth/kernels/utils.py b/unsloth/kernels/utils.py
index 90f2d5d238..09e755e001 100644
--- a/unsloth/kernels/utils.py
+++ b/unsloth/kernels/utils.py
@@ -88,10 +88,27 @@ def is_cdna():
 
 @functools.lru_cache(1)
 def is_rdna():
-    """Detect ROCm-supported RDNA consumer/workstation GPUs (RDNA3, RDNA4)."""
+    """Detect RDNA GPUs: truthy only when is_hip() holds and Triton's active target arch is one of the gfx IDs below (RDNA2, RDNA3, RDNA3.5, RDNA4)."""
     return is_hip() and triton.runtime.driver.active.get_current_target().arch in (
+        # RDNA2 (Navi 21-24); NOTE(review): gfx1033 / gfx1035 / gfx1036 look like APU IDs rather than Navi parts -- confirm
+        "gfx1030",
+        "gfx1031",
+        "gfx1032",
+        "gfx1033",
+        "gfx1034",
+        "gfx1035",
+        "gfx1036",
+        # RDNA3 (Navi 31-33); NOTE(review): gfx1103 looks like an APU ID rather than a Navi part -- confirm
         "gfx1100",
         "gfx1101",
+        "gfx1102",
+        "gfx1103",
+        # RDNA3.5 (Strix Point / Strix Halo / Krackan Point)
+        "gfx1150",
+        "gfx1151",
+        "gfx1152",
+        "gfx1153",
+        # RDNA4 (Navi 48-44)
         "gfx1200",
         "gfx1201",
     )
diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index 8be6bb5a5a..0d9ef896e6 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -1103,7 +1103,16 @@ def patch_sft_trainer_tokenizer():
             "    a = np.array([int(x.decode('utf-8'))/1024 for x in a])\n"
             "except:\n"
             "    if not torch.cuda.is_available():\n"
-            "        raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"
+            "        raise RuntimeError('Unsloth: No GPU detected. AMD ROCm users: install ROCm-enabled PyTorch -- see https://docs.unsloth.ai/get-started/install-and-update/amd')\n"
+            "    # nvidia-smi unavailable but torch.cuda IS available -- we are on\n"
+            "    # a ROCm host (ROCm reuses the torch.cuda.* API surface, so\n"
+            "    # device_count() is authoritative) or on a CUDA host without\n"
+            "    # the CLI installed. Use the device count directly as a\n"
+            "    # conservative multi-GPU signal: any configuration with more\n"
+            "    # than one visible device is flagged as unsupported, matching\n"
+            "    # the spirit of the per-device memory check used on CUDA.\n"
+            "    if torch.cuda.device_count() > 1:\n"
+            "        raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"
             "if ((a - PRE_CHECK) >= 1).sum() > 1:\n"
             "    raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"
             "for _ in range(3):\n"