From 0539621fa15969d9f3922665f8a7115d2d6856d2 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 5 May 2026 13:08:38 -0500
Subject: [PATCH 001/165] fix(studio): set HIP_VISIBLE_DEVICES in apply_gpu_ids
 for ROCm training workers

Training workers are spawned via multiprocessing spawn before detect_hardware()
runs, so IS_ROCM is still False. If the user never set HIP_VISIBLE_DEVICES in
their shell, _inherits_rocm_visibility is also False, leaving the worker with
only CUDA_VISIBLE_DEVICES set. On ROCm hosts the HIP runtime honors
HIP_VISIBLE_DEVICES over CUDA_VISIBLE_DEVICES, so the worker saw the full
device list and torch raised "no usable HIP accelerator" on some setups.

Fall back to probing torch.version.hip (a build-time attribute, safe to read
before GPU init) to detect ROCm when neither IS_ROCM nor inherited env vars
are available. Mirrors the existing fix in llama_cpp.py for llama-server
subprocess GPU pinning.

Fixes https://github.com/unslothai/unsloth/issues/5180
---
 studio/backend/utils/hardware/hardware.py | 15 +++++++++--
 tests/studio/install/test_rocm_support.py | 33 +++++++++++++++++++++++
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index be31c00a78..7246a0519a 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1391,14 +1391,25 @@ def apply_gpu_ids(gpu_ids) -> None:
     # parent process already set a ROCm visibility variable -- that
     # way a downstream ROCm process inherits the narrowed mask even
     # before Studio's hardware detection has classified the host.
+    # As a final fallback, probe torch.version.hip directly so spawned
+    # training workers on AMD hosts where the user never set HIP_VISIBLE_DEVICES
+    # still get the correct ROCm visibility mask (mirrors the llama_cpp.py
+    # approach for llama-server subprocess GPU pinning).
     _inherits_rocm_visibility = (
         "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ
     )
-    if IS_ROCM or _inherits_rocm_visibility:
+    _is_rocm = IS_ROCM or _inherits_rocm_visibility
+    if not _is_rocm:
+        try:
+            import torch as _torch
+            _is_rocm = bool(getattr(_torch.version, "hip", None))
+        except Exception:
+            pass
+    if _is_rocm:
         os.environ["HIP_VISIBLE_DEVICES"] = value
         os.environ["ROCR_VISIBLE_DEVICES"] = value
     _visible_gpu_count = None
-    if IS_ROCM or _inherits_rocm_visibility:
+    if _is_rocm:
         logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s' (rocm)", value)
     else:
         logger.info("Applied gpu_ids: CUDA_VISIBLE_DEVICES='%s'", value)
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 48831fd57b..9ce6e53998 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1250,6 +1250,39 @@ def test_hardware_branches_on_is_rocm_for_physical_count(self):
         assert "amd.get_physical_gpu_count" in func_body
 
 
+# =============================================================================
+# TEST: hardware.py -- apply_gpu_ids ROCm fallback (issue #5180)
+# =============================================================================
+
+
+class TestApplyGpuIdsRocmFallback:
+    """Verify apply_gpu_ids sets HIP_VISIBLE_DEVICES on ROCm hosts even when
+    IS_ROCM is still False (worker subprocess before detect_hardware runs)."""
+
+    def test_apply_gpu_ids_source_checks_torch_version_hip(self):
+        """apply_gpu_ids should fall back to torch.version.hip when IS_ROCM is False."""
+        hw_path = (
+            PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+        )
+        source = hw_path.read_text()
+        func_start = source.find("def apply_gpu_ids")
+        func_body = source[func_start : source.find("\ndef ", func_start + 1)]
+        assert 'getattr(_torch.version, "hip", None)' in func_body or \
+               "getattr(torch.version, 'hip', None)" in func_body or \
+               "torch.version.hip" in func_body
+
+    def test_apply_gpu_ids_source_sets_hip_visible_devices(self):
+        """apply_gpu_ids should set HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES."""
+        hw_path = (
+            PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+        )
+        source = hw_path.read_text()
+        func_start = source.find("def apply_gpu_ids")
+        func_body = source[func_start : source.find("\ndef ", func_start + 1)]
+        assert "HIP_VISIBLE_DEVICES" in func_body
+        assert "ROCR_VISIBLE_DEVICES" in func_body
+
+
 # =============================================================================
 # TEST: install_python_stack.py -- Windows AMD warning
 # =============================================================================

From 14fccdee48bac943b8f12ec504068e5836a1193a Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 5 May 2026 13:30:56 -0500
Subject: [PATCH 002/165] test: tighten apply_gpu_ids ROCm fallback assertions

Replace loose OR chain with exact string matches, split into three
focused tests, and add a guard check for the try/except wrapper.
---
 tests/studio/install/test_rocm_support.py | 27 +++++++++++++++--------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 9ce6e53998..99bc9c11bc 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1259,28 +1259,37 @@ class TestApplyGpuIdsRocmFallback:
     """Verify apply_gpu_ids sets HIP_VISIBLE_DEVICES on ROCm hosts even when
     IS_ROCM is still False (worker subprocess before detect_hardware runs)."""
 
-    def test_apply_gpu_ids_source_checks_torch_version_hip(self):
-        """apply_gpu_ids should fall back to torch.version.hip when IS_ROCM is False."""
+    def test_apply_gpu_ids_falls_back_to_torch_version_hip(self):
+        """apply_gpu_ids should probe torch.version.hip when IS_ROCM is False and no ROCm env vars are set."""
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
         source = hw_path.read_text()
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
-        assert 'getattr(_torch.version, "hip", None)' in func_body or \
-               "getattr(torch.version, 'hip', None)" in func_body or \
-               "torch.version.hip" in func_body
+        assert 'getattr(_torch.version, "hip", None)' in func_body
 
-    def test_apply_gpu_ids_source_sets_hip_visible_devices(self):
-        """apply_gpu_ids should set HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES."""
+    def test_apply_gpu_ids_sets_hip_and_rocr_visible_devices(self):
+        """apply_gpu_ids should set both HIP_VISIBLE_DEVICES and ROCR_VISIBLE_DEVICES on ROCm."""
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
         source = hw_path.read_text()
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
-        assert "HIP_VISIBLE_DEVICES" in func_body
-        assert "ROCR_VISIBLE_DEVICES" in func_body
+        assert 'os.environ["HIP_VISIBLE_DEVICES"] = value' in func_body
+        assert 'os.environ["ROCR_VISIBLE_DEVICES"] = value' in func_body
+
+    def test_apply_gpu_ids_rocm_fallback_is_guarded_by_try_except(self):
+        """torch import in apply_gpu_ids must be wrapped in try/except so a missing torch never crashes."""
+        hw_path = (
+            PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+        )
+        source = hw_path.read_text()
+        func_start = source.find("def apply_gpu_ids")
+        func_body = source[func_start : source.find("\ndef ", func_start + 1)]
+        assert "import torch as _torch" in func_body
+        assert "except Exception" in func_body
 
 
 # =============================================================================

From e87c90f769589f6556a1f24475fc58d2661ab031 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 04:34:52 +0000
Subject: [PATCH 003/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 7246a0519a..3b1b9fe95e 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1402,6 +1402,7 @@ def apply_gpu_ids(gpu_ids) -> None:
     if not _is_rocm:
         try:
             import torch as _torch
+
             _is_rocm = bool(getattr(_torch.version, "hip", None))
         except Exception:
             pass

From 74c871d109fe2ff1a1bc9a40b5f683beaf718a27 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 5 May 2026 23:42:09 -0500
Subject: [PATCH 004/165] fix: detect ROCm unified memory (Strix Halo / AMD
 iGPU) via torch fallback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

amd-smi on iGPUs with shared/unified memory (e.g. Radeon 8060S on Strix
Halo) reports only the dedicated VRAM slice (~512 MB) in its metric output,
so get_visible_gpu_utilization() was returning usable_gb ≈ 0.35 GB instead
of the full GTT pool (~128 GB).  torch.cuda.mem_get_info() already surfaces
the correct unified-pool size.

Add _reconcile_rocm_unified_memory(): after amd-smi returns a valid result
on a ROCm device, cross-check each device's vram_total_gb against
torch.cuda.mem_get_info().  When torch reports a larger total, replace the
amd-smi VRAM fields in-place.  No-op for discrete AMD GPUs where the two
sources agree.

Fixes: "Falling back to all visible GPUs -- model may not fit" on AMD iGPU
machines even when 100+ GB of unified memory is available.
---
 studio/backend/utils/hardware/hardware.py | 49 +++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index c218b7b4b9..df105f7a59 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -488,6 +488,47 @@ def get_gpu_utilization() -> Dict[str, Any]:
     return {"available": False, "backend": _backend_label(device)}
 
 
+def _reconcile_rocm_unified_memory(
+    utilization: Dict[str, Any], device_indices: list[int]
+) -> None:
+    """Cross-check amd-smi VRAM data against torch mem_get_info for ROCm.
+
+    On AMD iGPUs with unified/shared memory (e.g. Strix Halo / Radeon 8060S),
+    amd-smi reports only the dedicated VRAM slice (typically 512 MB) in its
+    metric output, while torch.cuda.mem_get_info() surfaces the full GTT /
+    unified pool (~128 GB). When torch reports a larger total than amd-smi,
+    replace the per-device VRAM fields so auto_select_gpu_ids sees the real
+    usable memory instead of the tiny dedicated slice.
+    """
+    torch_devices = _torch_get_per_device_info(device_indices)
+    if not torch_devices:
+        return
+    torch_by_index = {td["index"]: td for td in torch_devices}
+    for dev in utilization.get("devices", []):
+        idx = dev.get("index")
+        td = torch_by_index.get(idx)
+        if td is None:
+            continue
+        torch_total_gb = td["total_gb"]
+        smi_total_gb = dev.get("vram_total_gb") or 0.0
+        if torch_total_gb > smi_total_gb:
+            torch_used_gb = td["used_gb"]
+            dev["vram_total_gb"] = torch_total_gb
+            dev["vram_used_gb"] = torch_used_gb
+            dev["vram_utilization_pct"] = (
+                round((torch_used_gb / torch_total_gb) * 100, 1)
+                if torch_total_gb > 0
+                else None
+            )
+            logger.debug(
+                "ROCm unified memory: replaced amd-smi VRAM (%.2f GB) with "
+                "torch mem_get_info total (%.2f GB) for device %d",
+                smi_total_gb,
+                torch_total_gb,
+                idx,
+            )
+
+
 def get_visible_gpu_utilization() -> Dict[str, Any]:
     device = get_device()
 
@@ -500,6 +541,14 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
         )
         if result is not None:
             result["backend"] = _backend_label(device)
+            if IS_ROCM:
+                # amd-smi on iGPUs with unified memory (e.g. Strix Halo)
+                # reports only the dedicated VRAM slice; torch mem_get_info
+                # sees the full unified pool. Reconcile so downstream GPU
+                # selection uses the real available memory.
+                _reconcile_rocm_unified_memory(
+                    result, parent_visible_spec["numeric_ids"]
+                )
             return result
 
     # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel)

From 0d58e42a103cd6c3348c1c7e375524b49b29fd8e Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 6 May 2026 12:21:39 +0000
Subject: [PATCH 005/165] Apply unified-memory reconciliation in
 get_gpu_utilization too

The visible-GPU path was already corrected for AMD iGPUs with unified memory
(Strix Halo / Radeon 8060S), but get_gpu_utilization was still returning the
raw 512 MB amd-smi VRAM slice. Studio's /api/train/hardware endpoint and the
live GPU monitor read from this primary path, so users continued seeing the
wrong total even after auto_select_gpu_ids picked the right device.

Refactor to share the per-device correction:
  * _apply_unified_memory_correction(metrics, torch_info) -- the actual
    replacement logic, in-place on a single metrics dict.
  * _reconcile_rocm_unified_memory(...)                   -- multi-device,
    iterates utilization["devices"] (visible-GPU path).
  * _reconcile_primary_rocm_unified_memory(...)           -- single flat
    metrics dict (primary-GPU path), uses parent_visible_spec to pick the
    primary index, falls back to ordinal 0 when no visibility env is set.

get_gpu_utilization now calls the primary reconciler under IS_ROCM, so both
endpoints surface the real unified-memory pool on iGPUs while leaving
discrete AMD GPUs untouched (torch_total <= smi_total -> no replace).
---
 studio/backend/utils/hardware/hardware.py | 85 +++++++++++++++++------
 1 file changed, 65 insertions(+), 20 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index df105f7a59..70026ba080 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -468,6 +468,16 @@ def get_gpu_utilization() -> Dict[str, Any]:
         result = _smi_query("get_primary_gpu_utilization")
         if result is not None:
             result["backend"] = _backend_label(device)
+            if IS_ROCM:
+                # Mirror the unified-memory reconciliation done in the
+                # visible-GPU path. amd-smi on AMD iGPUs (Strix Halo etc.)
+                # reports only the dedicated VRAM slice; torch.mem_get_info
+                # sees the full GTT pool. Without this the /api/train/hardware
+                # endpoint and the live GPU monitor still display the wrong
+                # VRAM total even after auto-selection has been corrected.
+                _reconcile_primary_rocm_unified_memory(
+                    result, _get_parent_visible_gpu_spec()
+                )
             return result
 
     mem = get_gpu_memory_info()
@@ -488,6 +498,35 @@ def get_gpu_utilization() -> Dict[str, Any]:
     return {"available": False, "backend": _backend_label(device)}
 
 
+def _apply_unified_memory_correction(
+    device_metrics: Dict[str, Any], torch_info: Dict[str, Any]
+) -> None:
+    """Per-device reconciliation: when torch reports a larger memory total
+    than amd-smi, overwrite the smi VRAM fields in place.
+
+    Used by both the multi-device and primary-device reconciliation helpers
+    so the two endpoints stay in sync on AMD iGPUs with unified memory.
+    """
+    torch_total_gb = torch_info["total_gb"]
+    smi_total_gb = device_metrics.get("vram_total_gb") or 0.0
+    if torch_total_gb > smi_total_gb:
+        torch_used_gb = torch_info["used_gb"]
+        device_metrics["vram_total_gb"] = torch_total_gb
+        device_metrics["vram_used_gb"] = torch_used_gb
+        device_metrics["vram_utilization_pct"] = (
+            round((torch_used_gb / torch_total_gb) * 100, 1)
+            if torch_total_gb > 0
+            else None
+        )
+        logger.debug(
+            "ROCm unified memory: replaced amd-smi VRAM (%.2f GB) with "
+            "torch mem_get_info total (%.2f GB) for device %s",
+            smi_total_gb,
+            torch_total_gb,
+            torch_info.get("index"),
+        )
+
+
 def _reconcile_rocm_unified_memory(
     utilization: Dict[str, Any], device_indices: list[int]
 ) -> None:
@@ -505,28 +544,34 @@ def _reconcile_rocm_unified_memory(
         return
     torch_by_index = {td["index"]: td for td in torch_devices}
     for dev in utilization.get("devices", []):
-        idx = dev.get("index")
-        td = torch_by_index.get(idx)
+        td = torch_by_index.get(dev.get("index"))
         if td is None:
             continue
-        torch_total_gb = td["total_gb"]
-        smi_total_gb = dev.get("vram_total_gb") or 0.0
-        if torch_total_gb > smi_total_gb:
-            torch_used_gb = td["used_gb"]
-            dev["vram_total_gb"] = torch_total_gb
-            dev["vram_used_gb"] = torch_used_gb
-            dev["vram_utilization_pct"] = (
-                round((torch_used_gb / torch_total_gb) * 100, 1)
-                if torch_total_gb > 0
-                else None
-            )
-            logger.debug(
-                "ROCm unified memory: replaced amd-smi VRAM (%.2f GB) with "
-                "torch mem_get_info total (%.2f GB) for device %d",
-                smi_total_gb,
-                torch_total_gb,
-                idx,
-            )
+        _apply_unified_memory_correction(dev, td)
+
+
+def _reconcile_primary_rocm_unified_memory(
+    utilization: Dict[str, Any], parent_visible_spec: Dict[str, Any]
+) -> None:
+    """Primary-GPU variant of the unified-memory reconciliation.
+
+    ``get_primary_gpu_utilization`` returns a flat metrics dict (no nested
+    ``devices`` list) for the first visible AMD GPU. Run the same correction
+    against torch.mem_get_info for that single device so the live training
+    hardware endpoint and the GPU monitor surface the real unified-memory
+    pool on Strix Halo and similar iGPUs.
+    """
+    numeric_ids = parent_visible_spec.get("numeric_ids")
+    if numeric_ids:
+        primary_idx = [int(numeric_ids[0])]
+    else:
+        # No CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES set: torch ordinal 0
+        # is the primary visible device.
+        primary_idx = [0]
+    torch_devices = _torch_get_per_device_info(primary_idx)
+    if not torch_devices:
+        return
+    _apply_unified_memory_correction(utilization, torch_devices[0])
 
 
 def get_visible_gpu_utilization() -> Dict[str, Any]:

From cb0edfc56c4bc653f557639872dbf73da85e32fc Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 6 May 2026 12:29:20 +0000
Subject: [PATCH 006/165] Use 'is not None' and log debug on torch.version.hip
 probe failures

Two small follow-ups to the apply_gpu_ids ROCm fallback:

1. Match detect_hardware()'s 'getattr(torch.version, "hip", None) is not None'
   form so the entire codebase has one canonical 'this torch was built with
   HIP' check. On every shipping torch wheel hip is either None or a non-empty
   version string, so the new form agrees with the old bool() form on every
   real install.

2. Log the probe failure at debug level instead of swallowing it silently.
   The broad 'except Exception' is intentional (we never want apply_gpu_ids
   to crash a worker over a probe), but the silent pass made it impossible
   to tell whether the fallback was firing or being skipped.
---
 studio/backend/utils/hardware/hardware.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 3b1b9fe95e..1cb985539a 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1400,12 +1400,24 @@ def apply_gpu_ids(gpu_ids) -> None:
     )
     _is_rocm = IS_ROCM or _inherits_rocm_visibility
     if not _is_rocm:
+        # Use ``is not None`` here to match the detect_hardware() check at
+        # module top -- torch ships HIP version as a non-empty string on
+        # ROCm builds and None on CUDA builds, so the two forms agree on
+        # every shipping torch wheel; the ``is not None`` form is the one
+        # the rest of the codebase reads for "this torch was built with
+        # HIP". Keep the broad ``except`` as a safety net (we never want
+        # apply_gpu_ids to crash a worker over a probe failure) but log at
+        # debug level so the skip is observable when needed.
         try:
             import torch as _torch
 
-            _is_rocm = bool(getattr(_torch.version, "hip", None))
-        except Exception:
-            pass
+            _is_rocm = getattr(_torch.version, "hip", None) is not None
+        except Exception as e:
+            logger.debug(
+                "apply_gpu_ids: torch.version.hip probe skipped (%s: %s)",
+                type(e).__name__,
+                e,
+            )
     if _is_rocm:
         os.environ["HIP_VISIBLE_DEVICES"] = value
         os.environ["ROCR_VISIBLE_DEVICES"] = value

From 9a83a74bbfdef5233ce1a8850b1709aa2cea6929 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 12:49:19 -0500
Subject: [PATCH 007/165] fix(studio): honour HIP_VISIBLE_DEVICES in
 _get_parent_visible_gpu_spec before IS_ROCM is set

When a user has HIP_VISIBLE_DEVICES set in their shell (e.g. "1" to select
GPU 1) but detect_hardware() has not yet run in the Studio parent process,
IS_ROCM is still False.  _get_parent_visible_gpu_spec() was gated on IS_ROCM
so it fell through to CUDA_VISIBLE_DEVICES (unset), saw all physical GPUs,
and auto-selected index 0.  apply_gpu_ids then overwrote HIP_VISIBLE_DEVICES
with "0", making the intended GPU invisible to ROCm torch in the worker,
which triggered the "no usable HIP accelerator" error (issue #5180).

Apply the same _inherits_rocm_visibility pattern already used in
apply_gpu_ids: check for HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES in the
environment regardless of IS_ROCM so the correct GPU index is preserved.
---
 studio/backend/utils/hardware/hardware.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 1cb985539a..487ab1e850 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -599,7 +599,10 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
     # Use explicit None checks (not `or`) so empty string "" is honoured
     # as "no visible GPUs" rather than falling through to CUDA_VISIBLE_DEVICES.
     cuda_visible = None
-    if IS_ROCM:
+    _is_rocm_spec = IS_ROCM or (
+        "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ
+    )
+    if _is_rocm_spec:
         hip_vis = os.environ.get("HIP_VISIBLE_DEVICES")
         rocr_vis = os.environ.get("ROCR_VISIBLE_DEVICES")
         if hip_vis is not None:

From 9bcd0ed4401a806584bdd5f94b8f3db1ad7cf49f Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 5 May 2026 21:27:10 -0700
Subject: [PATCH 008/165] fix(install): harden AMD ROCm GPU detection for
 multi-GPU and env-filtered setups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous rocminfo awk pattern could miss discrete GPUs on machines
where HIP_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES is used to mask an
integrated GPU — the env vars filter rocminfo output but may not
propagate into the install script subprocess, causing detection to
fail entirely.

Two changes:
- Tighten rocminfo pattern from /gfx[0-9]/ && !/gfx000/ to
  /gfx[1-9][0-9]/ — simpler and correctly excludes the CPU agent
  (gfx000) without needing a second, negated pattern match
- Add sysfs KFD topology fallback: reads
  /sys/class/kfd/kfd/topology/nodes/*/gpu_id which is a kernel-level
  view unaffected by HIP_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES

Fixes detection failure reported in Discord by Chains (gfx1201 + iGPU
machine where env var exclusion of the iGPU caused rocminfo to return
no usable device).
---
 install.sh | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/install.sh b/install.sh
index 9046a9bdf6..305e45917f 100755
--- a/install.sh
+++ b/install.sh
@@ -1478,15 +1478,21 @@ _find_no_torch_runtime() {
 
 # ── AMD ROCm GPU detection helper ──
 # Returns 0 (true) if an actual AMD GPU is present, 1 (false) otherwise.
-# Checks rocminfo for gfx[1-9]* (excludes gfx000 CPU agent) and
-# amd-smi list for GPU data rows (excludes header-only output).
+# Checks rocminfo for gfx[1-9][0-9]+ (excludes gfx000 CPU agent),
+# amd-smi list for GPU data rows, and falls back to sysfs KFD topology
+# which is env-var-independent (works even when HIP_VISIBLE_DEVICES or
+# ROCR_VISIBLE_DEVICES hides devices from rocminfo/amd-smi).
 _has_amd_rocm_gpu() {
     if command -v rocminfo >/dev/null 2>&1 && \
-       rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[0-9]/ && !/Name:[[:space:]]*gfx000/{found=1} END{exit !found}'; then
+       rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9][0-9]/{found=1} END{exit !found}'; then
         return 0
     elif command -v amd-smi >/dev/null 2>&1 && \
          amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[][[:space:]]*[0-9]/{ found=1 } END{ exit !found }'; then
         return 0
+    elif [ -e /dev/kfd ] && \
+         awk '/gpu_id/{ if ($2+0 > 0) found=1 } END{ exit !found }' \
+             /sys/class/kfd/kfd/topology/nodes/*/gpu_id 2>/dev/null; then
+        return 0
     fi
     return 1
 }

From 3241eb32779d48cb3961e38cfe4fcd091f4448fa Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Wed, 6 May 2026 12:18:11 +0000
Subject: [PATCH 009/165] Fix KFD sysfs awk fallback to read properties file

The fallback added by this PR reads /sys/class/kfd/kfd/topology/nodes/*/gpu_id
files but matches the literal token 'gpu_id' against their content. Those
files contain only a single decimal value (e.g. '0' for CPU agents, '50432'
for GPU agents), so the regex never matches and 'found' stays 0, making the
fallback a no-op on every host. The properties file in the same directory
contains key/value lines like 'gpu_id 50432' which is what the existing awk
pattern expects.

Reproduced with a synthetic sysfs layout: against gpu_id files awk exits 1;
against properties files awk exits 0 when any node reports gpu_id > 0.
---
 install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.sh b/install.sh
index 305e45917f..fa2921b5ad 100755
--- a/install.sh
+++ b/install.sh
@@ -1491,7 +1491,7 @@ _has_amd_rocm_gpu() {
         return 0
     elif [ -e /dev/kfd ] && \
          awk '/gpu_id/{ if ($2+0 > 0) found=1 } END{ exit !found }' \
-             /sys/class/kfd/kfd/topology/nodes/*/gpu_id 2>/dev/null; then
+             /sys/class/kfd/kfd/topology/nodes/*/properties 2>/dev/null; then
         return 0
     fi
     return 1

From d2da8ce4d312c08808d6147070ea6427f8080ded Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 15:47:59 -0500
Subject: [PATCH 010/165] fix(setup.ps1): detect AMD ROCm GPU on Windows, bring
 to parity with setup.sh

setup.ps1 only checked nvidia-smi and fell straight to "gpu: none" on AMD
machines. setup.sh already probed rocminfo/amd-smi/hipconfig/hipinfo.

Add three-tier detection mirroring install_llama_prebuilt.py's detect_host():
1. hipinfo: gcnArchName in output confirms a real HIP GPU (not just SDK)
2. amd-smi list: "GPU: <digit>" data rows as fallback
3. WMI Win32_VideoController: last resort -- detects AMD GPU even without
   HIP SDK, then guides user to install it rather than silently going CPU

Also corrects the "none" message to mention AMD ROCm alongside NVIDIA so
users with AMD hardware understand the requirement.

Fixes: rohit-style install where Strix Halo (Radeon 8060S) showed
"gpu: none" even with the HIP SDK present.
---
 studio/setup.ps1 | 60 +++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 3 deletions(-)

diff --git a/studio/setup.ps1 b/studio/setup.ps1
index f2753d5c88..bbc8427e16 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -661,13 +661,67 @@ if (-not $HasNvidiaSmi) {
         }
     }
 }
+# ── AMD ROCm detection (Windows) ────────────────────────────────────────────
+# Mirror setup.sh: probe hipinfo then amd-smi for an actual GPU, not just
+# tool presence. amdhip64.dll alone is NOT treated as GPU evidence.
+$HasROCm = $false
+$ROCmGpuLabel = $null
 if (-not $HasNvidiaSmi) {
+    # hipinfo: present + output contains gcnArchName → real HIP GPU
+    $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
+    if ($hipinfoExe) {
+        try {
+            $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
+            if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
+                $HasROCm = $true
+                if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
+                    $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
+                } else {
+                    $ROCmGpuLabel = "AMD ROCm"
+                }
+            }
+        } catch {}
+    }
+    # amd-smi list fallback: look for "GPU: <digit>" data rows
+    if (-not $HasROCm) {
+        $amdSmiExe = Get-Command "amd-smi" -ErrorAction SilentlyContinue
+        if ($amdSmiExe) {
+            try {
+                $smiOut = & $amdSmiExe.Source list 2>&1 | Out-String
+                if ($LASTEXITCODE -eq 0 -and $smiOut -match "(?im)^GPU\s*[:\[]\s*\d") {
+                    $HasROCm = $true
+                    $ROCmGpuLabel = "AMD ROCm"
+                }
+            } catch {}
+        }
+    }
+    # WMI fallback: AMD GPU in device list but no HIP SDK → guide the user
+    if (-not $HasROCm) {
+        try {
+            $wmiGpu = Get-WmiObject Win32_VideoController -ErrorAction SilentlyContinue |
+                Where-Object { $_.Name -match "AMD|Radeon" } |
+                Select-Object -First 1
+            if ($wmiGpu) { $ROCmGpuLabel = $wmiGpu.Name }
+        } catch {}
+    }
+}
+
+if ($HasNvidiaSmi) {
+    step "gpu" "NVIDIA GPU detected"
+} elseif ($HasROCm) {
+    step "gpu" $ROCmGpuLabel
+} elseif ($ROCmGpuLabel) {
     Write-Host ""
-    step "gpu" "none (chat-only / GGUF)" "Yellow"
-    substep "Training and GPU inference require an NVIDIA GPU with drivers installed." "Yellow"
+    step "gpu" "AMD GPU detected -- HIP SDK not found" "Yellow"
+    substep "Detected: $ROCmGpuLabel" "Yellow"
+    substep "Install the HIP SDK for ROCm GPU inference:" "Yellow"
+    substep "https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" "Yellow"
     Write-Host ""
 } else {
-    step "gpu" "NVIDIA GPU detected"
+    Write-Host ""
+    step "gpu" "none (chat-only / GGUF)" "Yellow"
+    substep "Training and GPU inference require an NVIDIA or AMD ROCm GPU." "Yellow"
+    Write-Host ""
 }
 
 # ============================================

From f84a7236cafb19ec606c5a54710577aac30f8719 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 16:03:54 -0500
Subject: [PATCH 011/165] fix(install.ps1): detect AMD ROCm GPU on Windows,
 bring to parity with setup.ps1

install.ps1 had the same nvidia-smi-only GPU detection that setup.ps1 had
before its fix. Apply the same three-tier AMD detection:
1. hipinfo: gcnArchName confirms real HIP GPU
2. amd-smi list: GPU data rows as fallback
3. WMI Win32_VideoController: detects AMD GPU without HIP SDK and guides
   user to install it

Fixes: install.ps1 showing "gpu: none" while setup.ps1 correctly showed
"AMD GPU detected" on the same machine (reported by rohit, RX 7600 XT).
---
 install.ps1 | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index ef87c5ed08..fa5cfa991c 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1206,11 +1206,58 @@ shell.Run cmd, 0, False
             }
         }
     }
+    # ── AMD ROCm detection (Windows) — mirrors setup.ps1 ──
+    $HasROCm = $false
+    $ROCmGpuLabel = $null
+    if (-not $HasNvidiaSmi) {
+        $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
+        if ($hipinfoExe) {
+            try {
+                $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
+                if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
+                    $HasROCm = $true
+                    if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
+                        $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
+                    } else {
+                        $ROCmGpuLabel = "AMD ROCm"
+                    }
+                }
+            } catch {}
+        }
+        if (-not $HasROCm) {
+            $amdSmiExe = Get-Command "amd-smi" -ErrorAction SilentlyContinue
+            if ($amdSmiExe) {
+                try {
+                    $smiOut = & $amdSmiExe.Source list 2>&1 | Out-String
+                    if ($LASTEXITCODE -eq 0 -and $smiOut -match "(?im)^GPU\s*[:\[]\s*\d") {
+                        $HasROCm = $true
+                        $ROCmGpuLabel = "AMD ROCm"
+                    }
+                } catch {}
+            }
+        }
+        if (-not $HasROCm) {
+            try {
+                $wmiGpu = Get-WmiObject Win32_VideoController -ErrorAction SilentlyContinue |
+                    Where-Object { $_.Name -match "AMD|Radeon" } |
+                    Select-Object -First 1
+                if ($wmiGpu) { $ROCmGpuLabel = $wmiGpu.Name }
+            } catch {}
+        }
+    }
+
     if ($HasNvidiaSmi) {
         step "gpu" "NVIDIA GPU detected"
+    } elseif ($HasROCm) {
+        step "gpu" $ROCmGpuLabel
+    } elseif ($ROCmGpuLabel) {
+        step "gpu" "AMD GPU detected -- HIP SDK not found" "Yellow"
+        substep "Detected: $ROCmGpuLabel" "Yellow"
+        substep "Install the HIP SDK for ROCm GPU inference:" "Yellow"
+        substep "https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" "Yellow"
     } else {
         step "gpu" "none (chat-only / GGUF)" "Yellow"
-        substep "Training and GPU inference require an NVIDIA GPU with drivers installed." "Yellow"
+        substep "Training and GPU inference require an NVIDIA or AMD ROCm GPU." "Yellow"
     }
 
     # ── Choose the correct PyTorch index URL based on driver CUDA version ──

From 5d5ae5656661f1ed6bbfe269c6001af851652299 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 16:11:17 -0500
Subject: [PATCH 012/165] fix(install.ps1): suppress 'No NVIDIA GPU detected'
 when AMD GPU is present

---
 install.ps1 | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index fa5cfa991c..413abfc6cc 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1288,7 +1288,11 @@ shell.Run cmd, 0, False
     # ── Print CPU-only hint when no GPU detected ──
     if (-not $SkipTorch -and $TorchIndexUrl -like "*/cpu") {
         Write-Host ""
-        substep "No NVIDIA GPU detected." "Yellow"
+        if ($HasROCm -or $ROCmGpuLabel) {
+            substep "Installing CPU-only PyTorch (ROCm wheels require the HIP SDK)." "Yellow"
+        } else {
+            substep "No NVIDIA GPU detected." "Yellow"
+        }
         substep "Installing CPU-only PyTorch. If you only need GGUF chat/inference," "Yellow"
         substep "re-run with --no-torch for a faster, lighter install:" "Yellow"
         substep ".\install.ps1 --no-torch" "Yellow"

From 270b2dd9b0c4b564d6cfdc972066f69155fb6aea Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 16:30:35 -0500
Subject: [PATCH 013/165] feat: add Windows AMD ROCm PyTorch wheel installation

install_python_stack.py:
- Add _ROCM_WINDOWS_WHEEL_BASE and _ROCM_WINDOWS_RELEASES constants
  pointing to AMD repo.radeon.com (ROCm 7.2 -> torch 2.9.1+rocm7.2.1)
- Extend _ensure_rocm_torch() with a Windows branch: detects ROCm via
  _has_rocm_gpu() / _detect_rocm_version(), requires Python 3.12 (cp312
  is the only ABI AMD publishes for Windows), installs the direct wheel
  URL from repo.radeon.com

install.ps1:
- Capture ROCmVersion during AMD detection via hipconfig --version /
  amd-smi version (needed for wheel URL selection)
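- After Get-TorchIndexUrl, add an AMD wheel override block: when HasROCm is
  set, Python 3.12 is detected, and the ROCm version is 7.2, set
  ROCmTorchWheelUrl to the AMD wheel URL and clear TorchIndexUrl; otherwise
  keep the index URL returned by Get-TorchIndexUrl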
- Expand torch install branch to handle ROCmTorchWheelUrl with
  uv pip install --force-reinstall --no-cache-dir
---
 install.ps1                    | 65 ++++++++++++++++++++++++--
 studio/install_python_stack.py | 83 +++++++++++++++++++++++++++++-----
 2 files changed, 134 insertions(+), 14 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 413abfc6cc..345b8fa9a3 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1209,6 +1209,7 @@ shell.Run cmd, 0, False
     # ── AMD ROCm detection (Windows) — mirrors setup.ps1 ──
     $HasROCm = $false
     $ROCmGpuLabel = $null
+    $ROCmVersion = $null
     if (-not $HasNvidiaSmi) {
         $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
         if ($hipinfoExe) {
@@ -1244,6 +1245,29 @@ shell.Run cmd, 0, False
                 if ($wmiGpu) { $ROCmGpuLabel = $wmiGpu.Name }
             } catch {}
         }
+        # Capture ROCm version for wheel selection (hipconfig, then amd-smi)
+        if ($HasROCm) {
+            $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
+            if ($hipConfigExe) {
+                try {
+                    $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
+                    if ($LASTEXITCODE -eq 0 -and $hipVerOut -match '(\d+\.\d+)') {
+                        $ROCmVersion = $Matches[1]
+                    }
+                } catch {}
+            }
+            if (-not $ROCmVersion) {
+                $amdSmiVer = Get-Command "amd-smi" -ErrorAction SilentlyContinue
+                if ($amdSmiVer) {
+                    try {
+                        $smiVerOut = & $amdSmiVer.Source version 2>&1 | Out-String
+                        if ($LASTEXITCODE -eq 0 -and $smiVerOut -match 'ROCm version:\s*(\d+\.\d+)') {
+                            $ROCmVersion = $Matches[1]
+                        }
+                    } catch {}
+                }
+            }
+        }
     }
 
     if ($HasNvidiaSmi) {
@@ -1281,12 +1305,39 @@ shell.Run cmd, 0, False
         return "$baseUrl/cu126"
     }
     $TorchIndexUrl = Get-TorchIndexUrl
-    $TorchIndexFamily = Get-TauriTorchIndexFamily $TorchIndexUrl
+
+    # ── AMD Windows ROCm wheel override ──
+    # AMD publishes direct torch wheels for Windows (cp312 only) at repo.radeon.com.
+    # When the HIP SDK is present and Python 3.12 is in use, swap in the AMD wheel
+    # URL and clear $TorchIndexUrl so the standard --index-url path is skipped.
+    $ROCmTorchWheelUrl = $null
+    if ($HasROCm -and -not $SkipTorch) {
+        $pyMajMin = if ($DetectedPython) { ($DetectedPython.Version -split '\.')[0..1] -join '.' } else { "" }
+        if ($pyMajMin -eq "3.12") {
+            $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
+            if ($ROCmVersion -and $ROCmVersion -match '^7\.2') {
+                $ROCmTorchWheelUrl = "$amdWheelBase/rocm-rel-7.2.1/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                $TorchIndexUrl = $null
+            }
+            if ($ROCmTorchWheelUrl) {
+                substep "AMD ROCm $ROCmVersion (Python 3.12) -- AMD Windows torch wheel selected" "Cyan"
+            } elseif ($ROCmVersion) {
+                substep "No AMD Windows torch wheel for ROCm $ROCmVersion -- falling back to CPU-only PyTorch" "Yellow"
+            } else {
+                substep "ROCm version unknown -- falling back to CPU-only PyTorch" "Yellow"
+            }
+        } else {
+            substep "AMD Windows ROCm wheels require Python 3.12 (detected: $pyMajMin) -- using CPU-only PyTorch" "Yellow"
+            substep "To enable ROCm training, reinstall with Python 3.12." "Yellow"
+        }
+    }
+
+    $TorchIndexFamily = Get-TauriTorchIndexFamily $(if ($ROCmTorchWheelUrl) { "rocm7.2" } else { $TorchIndexUrl })
     $GpuBranch = Get-TauriGpuBranch $TorchIndexFamily
     Write-TauriDiag -GpuBranch $GpuBranch -TorchIndexFamily $TorchIndexFamily -PythonVersionForDiag $DetectedPython.Version
 
     # ── Print CPU-only hint when no GPU detected ──
-    if (-not $SkipTorch -and $TorchIndexUrl -like "*/cpu") {
+    if (-not $SkipTorch -and -not $ROCmTorchWheelUrl -and $TorchIndexUrl -like "*/cpu") {
         Write-Host ""
         if ($HasROCm -or $ROCmGpuLabel) {
             substep "Installing CPU-only PyTorch (ROCm wheels require the HIP SDK)." "Yellow"
@@ -1364,9 +1415,17 @@ shell.Run cmd, 0, False
                 return (Exit-InstallFailure "Failed to overlay unsloth-zoo (exit code $zooOverlayExit)" $zooOverlayExit)
             }
         }
-    } elseif ($TorchIndexUrl) {
+    } elseif ($TorchIndexUrl -or $ROCmTorchWheelUrl) {
         if ($SkipTorch) {
             substep "skipping PyTorch (--no-torch flag set)." "Yellow"
+        } elseif ($ROCmTorchWheelUrl) {
+            Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
+            substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir $ROCmTorchWheelUrl }
+            if ($torchInstallExit -ne 0) {
+                Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
+                return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
+            }
         } else {
             Write-TauriLog "STEP" "Installing PyTorch"
             substep "installing PyTorch ($TorchIndexUrl)..."
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 3fd1e6af66..0c6960adf2 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -57,6 +57,17 @@
     os.environ.get("UNSLOTH_PYTORCH_MIRROR") or "https://download.pytorch.org/whl"
 ).rstrip("/")
 
+# AMD Windows ROCm wheels — repo.radeon.com (cp312 only; AMD does not publish
+# Windows ROCm wheels for other Python versions)
+_ROCM_WINDOWS_WHEEL_BASE = (
+    os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR")
+    or "https://repo.radeon.com/rocm/windows"
+).rstrip("/")
+# Maps (major, minor) → (release_folder, torch_version_string)
+_ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, str]] = {
+    (7, 2): ("rocm-rel-7.2.1", "2.9.1+rocm7.2.1"),
+}
+
 # bitsandbytes continuous-release_main wheels with the ROCm 4-bit GEMV fix
 # (bnb PR #1887, post-0.49.2). bnb <= 0.49.2 NaNs at decode shape on every
 # AMD GPU. Drop the pin once bnb 0.50+ ships on PyPI.
@@ -241,20 +252,70 @@ def _has_usable_nvidia_gpu() -> bool:
 def _ensure_rocm_torch() -> None:
     """Reinstall torch with ROCm wheels when the venv received CPU-only torch.
 
-    Runs only on Linux x86_64 hosts where an AMD GPU is present and the
-    ROCm runtime is detectable (rocminfo / amd-smi / hipconfig /
-    rocm-core package).  No-op when torch already links against HIP
-    (ROCm), on Windows / macOS, on non-x86_64 Linux (PyTorch does not
-    publish ROCm wheels for aarch64 / arm64), or on mixed AMD+NVIDIA
-    hosts (NVIDIA takes precedence).
+    On Linux x86_64: uses pytorch.org ROCm wheel index tags.
+    On Windows (cp312 only): uses AMD's repo.radeon.com direct wheel releases.
+    No-op on macOS, non-x86_64 Linux, NVIDIA-primary hosts, or when torch
+    already links against HIP.
     Uses pip_install() to respect uv, constraints, and --python targeting.
     """
-    # Explicit OS / architecture guards so the helper is safe to call
-    # from any context -- PyTorch only publishes ROCm wheels for
-    # linux_x86_64, so aarch64 / arm64 hosts must skip this repair path
-    # instead of failing the update with a missing-wheel error.
-    if IS_WINDOWS or IS_MACOS:
+    if IS_MACOS:
+        return
+
+    if IS_WINDOWS:
+        # AMD only publishes Windows ROCm wheels for Python 3.12 (cp312)
+        if sys.version_info[:2] != (3, 12):
+            print(
+                f"   ROCm torch on Windows requires Python 3.12 "
+                f"(current: {sys.version_info[0]}.{sys.version_info[1]}) -- skipping"
+            )
+            return
+        if _has_usable_nvidia_gpu():
+            return
+        if not _has_rocm_gpu():
+            return
+        try:
+            probe = subprocess.run(
+                [sys.executable, "-c", "import torch; print(getattr(torch.version,'hip','') or '')"],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                timeout = 30,
+            )
+            if probe.returncode == 0 and probe.stdout.decode().strip():
+                return  # already ROCm torch
+        except (OSError, subprocess.TimeoutExpired):
+            pass
+        ver = _detect_rocm_version()
+        if ver is None:
+            print("   ROCm detected but version unreadable -- skipping torch reinstall")
+            return
+        entry = next(
+            ((rt, tv) for (maj, mn), (rt, tv) in sorted(_ROCM_WINDOWS_RELEASES.items(), reverse = True)
+             if ver >= (maj, mn)),
+            None,
+        )
+        if entry is None:
+            print(
+                f"   No AMD Windows torch wheel for ROCm {ver[0]}.{ver[1]} -- skipping"
+            )
+            return
+        rel_tag, torch_ver = entry
+        wheel_url = (
+            f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}/"
+            f"torch-{torch_ver}-cp312-cp312-win_amd64.whl"
+        )
+        print(f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {wheel_url}")
+        pip_install(
+            f"ROCm torch (Windows, {rel_tag})",
+            "--force-reinstall",
+            "--no-cache-dir",
+            wheel_url,
+            constrain = False,
+        )
         return
+
+    # ── Linux x86_64 path ──────────────────────────────────────────────────────
+    # PyTorch only publishes ROCm wheels for linux_x86_64; skip aarch64 / arm64
+    # to avoid a missing-wheel error on `unsloth studio update`.
     if platform.machine().lower() not in {"x86_64", "amd64"}:
         return
     # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable

From 14f655917c653b42e96eddc241368e7dfd5a5b13 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 21:32:22 +0000
Subject: [PATCH 014/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/install_python_stack.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 0c6960adf2..0be4607380 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -275,7 +275,11 @@ def _ensure_rocm_torch() -> None:
             return
         try:
             probe = subprocess.run(
-                [sys.executable, "-c", "import torch; print(getattr(torch.version,'hip','') or '')"],
+                [
+                    sys.executable,
+                    "-c",
+                    "import torch; print(getattr(torch.version,'hip','') or '')",
+                ],
                 stdout = subprocess.PIPE,
                 stderr = subprocess.DEVNULL,
                 timeout = 30,
@@ -289,8 +293,13 @@ def _ensure_rocm_torch() -> None:
             print("   ROCm detected but version unreadable -- skipping torch reinstall")
             return
         entry = next(
-            ((rt, tv) for (maj, mn), (rt, tv) in sorted(_ROCM_WINDOWS_RELEASES.items(), reverse = True)
-             if ver >= (maj, mn)),
+            (
+                (rt, tv)
+                for (maj, mn), (rt, tv) in sorted(
+                    _ROCM_WINDOWS_RELEASES.items(), reverse = True
+                )
+                if ver >= (maj, mn)
+            ),
             None,
         )
         if entry is None:
@@ -303,7 +312,9 @@ def _ensure_rocm_torch() -> None:
             f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}/"
             f"torch-{torch_ver}-cp312-cp312-win_amd64.whl"
         )
-        print(f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {wheel_url}")
+        print(
+            f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {wheel_url}"
+        )
         pip_install(
             f"ROCm torch (Windows, {rel_tag})",
             "--force-reinstall",

From 8299842271a164eae73eeada5454c0c35a9192c6 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 16:32:35 -0500
Subject: [PATCH 015/165] fix: also install torchvision and torchaudio from AMD
 Windows repo

AMD publishes matching torchvision-0.24.1+rocm7.2.1 and
torchaudio-2.9.1+rocm7.2.1 cp312 wheels in the same repo.radeon.com
release folder. Install all three wheels (torch, torchvision, torchaudio)
in the Windows ROCm path of both install.ps1 and install_python_stack.py.
---
 install.ps1                    |  7 +++++--
 studio/install_python_stack.py | 28 ++++++++++++++--------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 345b8fa9a3..4dfd023af5 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1316,7 +1316,10 @@ shell.Run cmd, 0, False
         if ($pyMajMin -eq "3.12") {
             $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
             if ($ROCmVersion -and $ROCmVersion -match '^7\.2') {
-                $ROCmTorchWheelUrl = "$amdWheelBase/rocm-rel-7.2.1/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                $amdRelBase = "$amdWheelBase/rocm-rel-7.2.1"
+                $ROCmTorchWheelUrl = "$amdRelBase/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                $ROCmTorchVisionUrl = "$amdRelBase/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                $ROCmTorchAudioUrl = "$amdRelBase/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
                 $TorchIndexUrl = $null
             }
             if ($ROCmTorchWheelUrl) {
@@ -1421,7 +1424,7 @@ shell.Run cmd, 0, False
         } elseif ($ROCmTorchWheelUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
             substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir $ROCmTorchWheelUrl }
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir $ROCmTorchWheelUrl $ROCmTorchVisionUrl $ROCmTorchAudioUrl }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 0be4607380..4b9159529d 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -63,9 +63,9 @@
     os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR")
     or "https://repo.radeon.com/rocm/windows"
 ).rstrip("/")
-# Maps (major, minor) → (release_folder, torch_version_string)
-_ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, str]] = {
-    (7, 2): ("rocm-rel-7.2.1", "2.9.1+rocm7.2.1"),
+# Maps (major, minor) → (release_folder, torch_ver, torchvision_ver, torchaudio_ver)
+_ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, str, str, str]] = {
+    (7, 2): ("rocm-rel-7.2.1", "2.9.1+rocm7.2.1", "0.24.1+rocm7.2.1", "2.9.1+rocm7.2.1"),
 }
 
 # bitsandbytes continuous-release_main wheels with the ROCm 4-bit GEMV fix
@@ -294,8 +294,8 @@ def _ensure_rocm_torch() -> None:
             return
         entry = next(
             (
-                (rt, tv)
-                for (maj, mn), (rt, tv) in sorted(
+                v
+                for (maj, mn), v in sorted(
                     _ROCM_WINDOWS_RELEASES.items(), reverse = True
                 )
                 if ver >= (maj, mn)
@@ -307,19 +307,19 @@ def _ensure_rocm_torch() -> None:
                 f"   No AMD Windows torch wheel for ROCm {ver[0]}.{ver[1]} -- skipping"
             )
             return
-        rel_tag, torch_ver = entry
-        wheel_url = (
-            f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}/"
-            f"torch-{torch_ver}-cp312-cp312-win_amd64.whl"
-        )
-        print(
-            f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {wheel_url}"
-        )
+        rel_tag, torch_ver, tv_ver, ta_ver = entry
+        base = f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}"
+        torch_url = f"{base}/torch-{torch_ver}-cp312-cp312-win_amd64.whl"
+        tv_url = f"{base}/torchvision-{tv_ver}-cp312-cp312-win_amd64.whl"
+        ta_url = f"{base}/torchaudio-{ta_ver}-cp312-cp312-win_amd64.whl"
+        print(f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {base}/")
         pip_install(
             f"ROCm torch (Windows, {rel_tag})",
             "--force-reinstall",
             "--no-cache-dir",
-            wheel_url,
+            torch_url,
+            tv_url,
+            ta_url,
             constrain = False,
         )
         return

From ec40e9f207a68aab42c1e2e919126204797cd1a3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 6 May 2026 21:34:34 +0000
Subject: [PATCH 016/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/install_python_stack.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 4b9159529d..78885bfeef 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -65,7 +65,12 @@
 ).rstrip("/")
 # Maps (major, minor) → (release_folder, torch_ver, torchvision_ver, torchaudio_ver)
 _ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, str, str, str]] = {
-    (7, 2): ("rocm-rel-7.2.1", "2.9.1+rocm7.2.1", "0.24.1+rocm7.2.1", "2.9.1+rocm7.2.1"),
+    (7, 2): (
+        "rocm-rel-7.2.1",
+        "2.9.1+rocm7.2.1",
+        "0.24.1+rocm7.2.1",
+        "2.9.1+rocm7.2.1",
+    ),
 }
 
 # bitsandbytes continuous-release_main wheels with the ROCm 4-bit GEMV fix
@@ -295,9 +300,7 @@ def _ensure_rocm_torch() -> None:
         entry = next(
             (
                 v
-                for (maj, mn), v in sorted(
-                    _ROCM_WINDOWS_RELEASES.items(), reverse = True
-                )
+                for (maj, mn), v in sorted(_ROCM_WINDOWS_RELEASES.items(), reverse = True)
                 if ver >= (maj, mn)
             ),
             None,

From 7e2943597ddbadffd0e394e382a054e0ef66fed3 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 17:55:23 -0500
Subject: [PATCH 017/165] feat: add ROCm 7.1.1 Windows wheel mapping

AMD uses a different version string for 7.1.1 wheels:
2.9.0+rocmsdk20251116 (date-tagged) instead of +rocm7.1.1.
Adds the 7.1.1 release folder to both install.ps1 and
install_python_stack.py so users with ROCm 7.1 get ROCm
torch instead of falling back to CPU.
---
 install.ps1                    | 6 ++++++
 studio/install_python_stack.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/install.ps1 b/install.ps1
index 4dfd023af5..ce0211027e 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1321,6 +1321,12 @@ shell.Run cmd, 0, False
                 $ROCmTorchVisionUrl = "$amdRelBase/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
                 $ROCmTorchAudioUrl = "$amdRelBase/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
                 $TorchIndexUrl = $null
+            } elseif ($ROCmVersion -and $ROCmVersion -match '^7\.1') {
+                $amdRelBase = "$amdWheelBase/rocm-rel-7.1.1"
+                $ROCmTorchWheelUrl = "$amdRelBase/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+                $ROCmTorchVisionUrl = "$amdRelBase/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+                $ROCmTorchAudioUrl = "$amdRelBase/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+                $TorchIndexUrl = $null
             }
             if ($ROCmTorchWheelUrl) {
                 substep "AMD ROCm $ROCmVersion (Python 3.12) -- AMD Windows torch wheel selected" "Cyan"
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 78885bfeef..a3e84d5885 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -71,6 +71,12 @@
         "0.24.1+rocm7.2.1",
         "2.9.1+rocm7.2.1",
     ),
+    (7, 1): (
+        "rocm-rel-7.1.1",
+        "2.9.0+rocmsdk20251116",
+        "0.24.0+rocmsdk20251116",
+        "2.9.0+rocmsdk20251116",
+    ),
 }
 
 # bitsandbytes continuous-release_main wheels with the ROCm 4-bit GEMV fix

From e53b543ef0010af2d5c0fb535bcc57a76c5b4ada Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 18:08:01 -0500
Subject: [PATCH 018/165] fix: install rocm_sdk_core and
 rocm_sdk_libraries_custom alongside torch

The AMD Windows torch wheels declare rocm[libraries]==<ver> as a hard
dependency. Without installing rocm_sdk_core and rocm_sdk_libraries_custom
from the same AMD release folder, uv cannot resolve the dependency and
fails with 'No solution found'. Include all 5 wheels in one install call.
---
 install.ps1                    | 24 ++++++++++++++++-------
 studio/install_python_stack.py | 36 ++++++++++++++++++++--------------
 2 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/install.ps1 b/install.ps1
index ce0211027e..cad8734bb8 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1317,15 +1317,25 @@ shell.Run cmd, 0, False
             $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
             if ($ROCmVersion -and $ROCmVersion -match '^7\.2') {
                 $amdRelBase = "$amdWheelBase/rocm-rel-7.2.1"
-                $ROCmTorchWheelUrl = "$amdRelBase/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
-                $ROCmTorchVisionUrl = "$amdRelBase/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
-                $ROCmTorchAudioUrl = "$amdRelBase/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                $ROCmAllWheelUrls = @(
+                    "$amdRelBase/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+                    "$amdRelBase/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
+                    "$amdRelBase/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                    "$amdRelBase/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                    "$amdRelBase/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                )
+                $ROCmTorchWheelUrl = $ROCmAllWheelUrls[2]
                 $TorchIndexUrl = $null
             } elseif ($ROCmVersion -and $ROCmVersion -match '^7\.1') {
                 $amdRelBase = "$amdWheelBase/rocm-rel-7.1.1"
-                $ROCmTorchWheelUrl = "$amdRelBase/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
-                $ROCmTorchVisionUrl = "$amdRelBase/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
-                $ROCmTorchAudioUrl = "$amdRelBase/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+                $ROCmAllWheelUrls = @(
+                    "$amdRelBase/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
+                    "$amdRelBase/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
+                    "$amdRelBase/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                    "$amdRelBase/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                    "$amdRelBase/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+                )
+                $ROCmTorchWheelUrl = $ROCmAllWheelUrls[2]
                 $TorchIndexUrl = $null
             }
             if ($ROCmTorchWheelUrl) {
@@ -1430,7 +1440,7 @@ shell.Run cmd, 0, False
         } elseif ($ROCmTorchWheelUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
             substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir $ROCmTorchWheelUrl $ROCmTorchVisionUrl $ROCmTorchAudioUrl }
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir @ROCmAllWheelUrls }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index a3e84d5885..8ffd9a1974 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -63,19 +63,29 @@
     os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR")
     or "https://repo.radeon.com/rocm/windows"
 ).rstrip("/")
-# Maps (major, minor) → (release_folder, torch_ver, torchvision_ver, torchaudio_ver)
-_ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, str, str, str]] = {
+# Maps (major, minor) → (release_folder, [wheel_filename, ...])
+# Includes rocm_sdk_core and rocm_sdk_libraries_custom because the torch
+# wheels declare them as hard dependencies (rocm[libraries]==<ver>).
+_ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, list[str]]] = {
     (7, 2): (
         "rocm-rel-7.2.1",
-        "2.9.1+rocm7.2.1",
-        "0.24.1+rocm7.2.1",
-        "2.9.1+rocm7.2.1",
+        [
+            "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+            "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
+            "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+            "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+            "torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+        ],
     ),
     (7, 1): (
         "rocm-rel-7.1.1",
-        "2.9.0+rocmsdk20251116",
-        "0.24.0+rocmsdk20251116",
-        "2.9.0+rocmsdk20251116",
+        [
+            "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
+            "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
+            "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+            "torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+            "torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+        ],
     ),
 }
 
@@ -316,19 +326,15 @@ def _ensure_rocm_torch() -> None:
                 f"   No AMD Windows torch wheel for ROCm {ver[0]}.{ver[1]} -- skipping"
             )
             return
-        rel_tag, torch_ver, tv_ver, ta_ver = entry
+        rel_tag, wheel_files = entry
         base = f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}"
-        torch_url = f"{base}/torch-{torch_ver}-cp312-cp312-win_amd64.whl"
-        tv_url = f"{base}/torchvision-{tv_ver}-cp312-cp312-win_amd64.whl"
-        ta_url = f"{base}/torchaudio-{ta_ver}-cp312-cp312-win_amd64.whl"
+        wheel_urls = [f"{base}/{fn}" for fn in wheel_files]
         print(f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {base}/")
         pip_install(
             f"ROCm torch (Windows, {rel_tag})",
             "--force-reinstall",
             "--no-cache-dir",
-            torch_url,
-            tv_url,
-            ta_url,
+            *wheel_urls,
             constrain = False,
         )
         return

From a74cb8b94e3b4c678a402ce882543b8aa5c8f9fa Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 18:12:42 -0500
Subject: [PATCH 019/165] fix: expand ROCm wheel array to scalars for
 Invoke-InstallCommand

@array splatting inside a scriptblock only works when the native command
itself is prefixed with '&'. Invoke-InstallCommand runs the block via
'& $Command', which applies '&' to the scriptblock rather than to the
'uv' call inside it, so @ROCmAllWheelUrls was not being expanded. Extract
the five URLs into scalar variables $rw0 through $rw4, which are captured
correctly by the closure.
---
 install.ps1 | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index cad8734bb8..2563ddffae 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1440,7 +1440,12 @@ shell.Run cmd, 0, False
         } elseif ($ROCmTorchWheelUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
             substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir @ROCmAllWheelUrls }
+            # Expand array to scalars — @array splatting requires & and doesn't
+            # work reliably inside scriptblocks passed to Invoke-InstallCommand.
+            $rw0 = $ROCmAllWheelUrls[0]; $rw1 = $ROCmAllWheelUrls[1]
+            $rw2 = $ROCmAllWheelUrls[2]; $rw3 = $ROCmAllWheelUrls[3]
+            $rw4 = $ROCmAllWheelUrls[4]
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir $rw0 $rw1 $rw2 $rw3 $rw4 }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)

From 79670ab4a9fb685f2d8c80b87d7e7ecf78b6c969 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 18:48:57 -0500
Subject: [PATCH 020/165] fix: use --no-deps for AMD Windows torch wheel
 install

uv's resolver looks up rocm[libraries]==0.1.dev0 on PyPI during
dependency resolution before downloading any wheels, and fails because
the package doesn't exist on PyPI. --no-deps skips resolution entirely
and installs all 5 AMD wheels directly. The GPU runtime dependency is
satisfied by the HIP SDK, not a Python package.
---
 install.ps1                    | 2 +-
 studio/install_python_stack.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index 2563ddffae..9b55ff1492 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1445,7 +1445,7 @@ shell.Run cmd, 0, False
             $rw0 = $ROCmAllWheelUrls[0]; $rw1 = $ROCmAllWheelUrls[1]
             $rw2 = $ROCmAllWheelUrls[2]; $rw3 = $ROCmAllWheelUrls[3]
             $rw4 = $ROCmAllWheelUrls[4]
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir $rw0 $rw1 $rw2 $rw3 $rw4 }
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir --no-deps $rw0 $rw1 $rw2 $rw3 $rw4 }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 8ffd9a1974..a339d273bf 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -334,6 +334,7 @@ def _ensure_rocm_torch() -> None:
             f"ROCm torch (Windows, {rel_tag})",
             "--force-reinstall",
             "--no-cache-dir",
+            "--no-deps",
             *wheel_urls,
             constrain = False,
         )

From b550948b39fb758a5f53052e19d4f5c7a151b203 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 19:07:17 -0500
Subject: [PATCH 021/165] fix: setup.ps1 and install_python_stack.py now
 install ROCm torch on Windows

setup.ps1 was always setting CuTag='cpu' for non-NVIDIA hosts and installing
cpu-only PyTorch, overwriting the ROCm torch installed by install.ps1.
Adds the same AMD wheel selection logic (ROCm version detection, Python 3.12
check, 5-wheel install with --no-deps) to setup.ps1's torch install block.

install_python_stack.py: remove IS_WINDOWS guard from _ensure_rocm_torch()
call site so the Windows path in _ensure_rocm_torch() is reachable during
'unsloth studio update' as well.
---
 studio/install_python_stack.py |  6 ++--
 studio/setup.ps1               | 66 ++++++++++++++++++++++++++++++++--
 2 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index a339d273bf..c47d1e98b0 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -1140,12 +1140,12 @@ def install_python_stack() -> int:
     # 2b. AMD ROCm: reinstall torch with HIP wheels if the host has ROCm but the
     #     venv received CPU-only torch (common when pip resolves torch from PyPI).
     #     Must come immediately after base packages so torch is present for inspection.
-    if not IS_WINDOWS and not IS_MACOS and not NO_TORCH:
+    if not IS_MACOS and not NO_TORCH:
         _progress("ROCm torch check")
         _ensure_rocm_torch()
 
-    # Windows + AMD GPU: PyTorch does not publish ROCm wheels for Windows.
-    # Detect and warn so users know manual steps are needed for GPU training.
+    # Windows + AMD GPU: if ROCm torch was not installed (wrong Python version
+    # or unknown ROCm version), warn the user.
     if IS_WINDOWS and not NO_TORCH and not _has_usable_nvidia_gpu():
         # Validate actual AMD GPU presence (not just tool existence)
         import re as _re_win
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index bbc8427e16..96318349c6 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1823,9 +1823,71 @@ if ($HasNvidiaSmi) {
     $CuTag = "cpu"
 }
 
+# ── AMD Windows ROCm torch override ──────────────────────────────────────────
+# When ROCm HIP SDK is present and Python 3.12 is in use, install AMD's direct
+# torch wheels instead of CPU-only PyTorch.
+$ROCmVersion = $null
+$ROCmTorchWheelUrls = $null
+if ($HasROCm -and $CuTag -eq "cpu") {
+    # Detect ROCm version via hipconfig, then amd-smi
+    $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
+    if ($hipConfigExe) {
+        try {
+            $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
+            if ($LASTEXITCODE -eq 0 -and $hipVerOut -match '(\d+\.\d+)') { $ROCmVersion = $Matches[1] }
+        } catch {}
+    }
+    if (-not $ROCmVersion) {
+        $amdSmiVer = Get-Command "amd-smi" -ErrorAction SilentlyContinue
+        if ($amdSmiVer) {
+            try {
+                $smiVerOut = & $amdSmiVer.Source version 2>&1 | Out-String
+                if ($LASTEXITCODE -eq 0 -and $smiVerOut -match 'ROCm version:\s*(\d+\.\d+)') { $ROCmVersion = $Matches[1] }
+            } catch {}
+        }
+    }
+    $pyVer = (& python --version 2>&1 | Out-String) -replace '[^0-9.]',''
+    $pyMajMin = ($pyVer.Trim() -split '\.')[0..1] -join '.'
+    $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
+    if ($pyMajMin -eq "3.12" -and $ROCmVersion) {
+        if ($ROCmVersion -match '^7\.2') {
+            $rb = "$amdWheelBase/rocm-rel-7.2.1"
+            $ROCmTorchWheelUrls = @(
+                "$rb/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+                "$rb/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
+                "$rb/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                "$rb/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                "$rb/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+            )
+        } elseif ($ROCmVersion -match '^7\.1') {
+            $rb = "$amdWheelBase/rocm-rel-7.1.1"
+            $ROCmTorchWheelUrls = @(
+                "$rb/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
+                "$rb/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
+                "$rb/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                "$rb/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                "$rb/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+            )
+        }
+    }
+}
+
 $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR.TrimEnd('/') } else { "https://download.pytorch.org/whl" }
 
-if ($CuTag -eq "cpu") {
+if ($ROCmTorchWheelUrls) {
+    substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
+    $sw0 = $ROCmTorchWheelUrls[0]; $sw1 = $ROCmTorchWheelUrls[1]
+    $sw2 = $ROCmTorchWheelUrls[2]; $sw3 = $ROCmTorchWheelUrls[3]; $sw4 = $ROCmTorchWheelUrls[4]
+    $output = Fast-Install --force-reinstall --no-cache-dir --no-deps $sw0 $sw1 $sw2 $sw3 $sw4 | Out-String
+    $torchInstallExit = $LASTEXITCODE
+    if ($torchInstallExit -ne 0) {
+        Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow
+        Write-Host $output -ForegroundColor Yellow
+        $ROCmTorchWheelUrls = $null
+    }
+}
+
+if (-not $ROCmTorchWheelUrls -and $CuTag -eq "cpu") {
     substep "installing PyTorch (CPU-only)..."
     if ($script:UnslothVerbose) {
         Fast-Install torch torchvision torchaudio --index-url "$PyTorchWhlBase/cpu"
@@ -1840,7 +1902,7 @@ if ($CuTag -eq "cpu") {
         Write-Host $output -ForegroundColor Red
         exit 1
     }
-} else {
+} elseif (-not $ROCmTorchWheelUrls) {
     substep "installing PyTorch with CUDA support ($CuTag)..."
     substep "(This download is ~2.8 GB -- may take a few minutes)"
     if ($script:UnslothVerbose) {

From 6f792137bb4f1dee3aecd010f6c685d4d4e581cf Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 19:24:46 -0500
Subject: [PATCH 022/165] fix: suppress manual-install warning when ROCm torch
 already present; fix progress counter

- Gate the 'must be installed manually' warning on torch.version.hip being empty
  so it doesn't fire when our ROCm torch install succeeded
- Update _TOTAL counter to include the 3 ROCm steps on Windows now that
  _ensure_rocm_torch() is called there (fixes 10/9 display)
---
 studio/install_python_stack.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index c47d1e98b0..0a9175abef 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -991,7 +991,7 @@ def install_python_stack() -> int:
     base_total = 10 if IS_WINDOWS else 11
     if IS_MACOS:
         base_total -= 1  # triton step is skipped on macOS
-    if not IS_WINDOWS and not IS_MACOS and not NO_TORCH:
+    if not IS_MACOS and not NO_TORCH:
         base_total += 3
     _TOTAL = (base_total - 1) if skip_base else base_total
 
@@ -1175,14 +1175,24 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
                 _win_amd_gpu = True
                 break
         if _win_amd_gpu:
-            _safe_print(
-                _dim("  Note:"),
-                "AMD GPU detected on Windows. ROCm-enabled PyTorch must be",
-            )
-            _safe_print(
-                " " * 8,
-                "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
-            )
+            # Only warn if torch doesn't already have ROCm (HIP) support
+            try:
+                _hip_ver = subprocess.run(
+                    [sys.executable, "-c", "import torch; print(getattr(torch.version,'hip','') or '')"],
+                    stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, timeout = 20,
+                )
+                _has_rocm_torch = _hip_ver.returncode == 0 and _hip_ver.stdout.decode().strip() != ""
+            except Exception:
+                _has_rocm_torch = False
+            if not _has_rocm_torch:
+                _safe_print(
+                    _dim("  Note:"),
+                    "AMD GPU detected on Windows. ROCm-enabled PyTorch must be",
+                )
+                _safe_print(
+                    " " * 8,
+                    "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
+                )
 
     # 3. Extra dependencies
     _progress("unsloth extras")

From b9c6882df49dc09c03a7c18376eab3b5c9fab3f9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 00:27:25 +0000
Subject: [PATCH 023/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/install_python_stack.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 0a9175abef..d580a163e4 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -1178,10 +1178,18 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
             # Only warn if torch doesn't already have ROCm (HIP) support
             try:
                 _hip_ver = subprocess.run(
-                    [sys.executable, "-c", "import torch; print(getattr(torch.version,'hip','') or '')"],
-                    stdout = subprocess.PIPE, stderr = subprocess.DEVNULL, timeout = 20,
+                    [
+                        sys.executable,
+                        "-c",
+                        "import torch; print(getattr(torch.version,'hip','') or '')",
+                    ],
+                    stdout = subprocess.PIPE,
+                    stderr = subprocess.DEVNULL,
+                    timeout = 20,
+                )
+                _has_rocm_torch = (
+                    _hip_ver.returncode == 0 and _hip_ver.stdout.decode().strip() != ""
                 )
-                _has_rocm_torch = _hip_ver.returncode == 0 and _hip_ver.stdout.decode().strip() != ""
             except Exception:
                 _has_rocm_torch = False
             if not _has_rocm_torch:

From 67d8b7481a1c6bf54ea7ff9ff8bb6cff13fdee8e Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 19:50:49 -0500
Subject: [PATCH 024/165] feat: add rocm step display in setup.ps1; fix warning
 and progress counter

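- Add a 'rocm' step after 'cuda' in setup.ps1 that shows the detected ROCm
  version, or reports that the HIP SDK was not found
- Move ROCm version detection up into the GPU detection block so it is
  available early, and reuse the result for wheel selection
- Suppress 'must be installed manually' warning when torch.version.hip is set
- Fix _TOTAL counter to include ROCm steps on Windows (fixes 10/9 display)

Note: the diff in this commit only touches studio/setup.ps1. The last two
items describe the studio/install_python_stack.py changes made in the
earlier "suppress manual-install warning when ROCm torch already present"
commit.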
---
 studio/setup.ps1 | 46 ++++++++++++++++++++++++++++------------------
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 96318349c6..267fd56106 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -704,6 +704,26 @@ if (-not $HasNvidiaSmi) {
             if ($wmiGpu) { $ROCmGpuLabel = $wmiGpu.Name }
         } catch {}
     }
+    # Capture ROCm version early for display and wheel selection
+    if ($HasROCm) {
+        $script:ROCmVersion = $null
+        $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
+        if ($hipConfigExe) {
+            try {
+                $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
+                if ($LASTEXITCODE -eq 0 -and $hipVerOut -match '(\d+\.\d+)') { $script:ROCmVersion = $Matches[1] }
+            } catch {}
+        }
+        if (-not $script:ROCmVersion) {
+            $amdSmiVer = Get-Command "amd-smi" -ErrorAction SilentlyContinue
+            if ($amdSmiVer) {
+                try {
+                    $smiVerOut = & $amdSmiVer.Source version 2>&1 | Out-String
+                    if ($LASTEXITCODE -eq 0 -and $smiVerOut -match 'ROCm version:\s*(\d+\.\d+)') { $script:ROCmVersion = $Matches[1] }
+                } catch {}
+            }
+        }
+    }
 }
 
 if ($HasNvidiaSmi) {
@@ -1130,6 +1150,13 @@ if (-not $CudaArch) {
     step "cuda" "skipped (no NVIDIA GPU detected)" "Yellow"
 }
 
+if ($HasROCm) {
+    $rocmVerLabel = if ($ROCmVersion) { "ROCm $ROCmVersion" } else { "ROCm (version unknown)" }
+    step "rocm" $rocmVerLabel
+} elseif ($ROCmGpuLabel) {
+    step "rocm" "HIP SDK not found -- GPU-accelerated training unavailable" "Yellow"
+}
+
 # ============================================
 # 1f. Node.js / npm (skip if pip-installed or Tauri -- only needed for frontend build)
 # ============================================
@@ -1826,26 +1853,9 @@ if ($HasNvidiaSmi) {
 # ── AMD Windows ROCm torch override ──────────────────────────────────────────
 # When ROCm HIP SDK is present and Python 3.12 is in use, install AMD's direct
 # torch wheels instead of CPU-only PyTorch.
-$ROCmVersion = $null
+$ROCmVersion = $script:ROCmVersion
 $ROCmTorchWheelUrls = $null
 if ($HasROCm -and $CuTag -eq "cpu") {
-    # Detect ROCm version via hipconfig, then amd-smi
-    $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
-    if ($hipConfigExe) {
-        try {
-            $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
-            if ($LASTEXITCODE -eq 0 -and $hipVerOut -match '(\d+\.\d+)') { $ROCmVersion = $Matches[1] }
-        } catch {}
-    }
-    if (-not $ROCmVersion) {
-        $amdSmiVer = Get-Command "amd-smi" -ErrorAction SilentlyContinue
-        if ($amdSmiVer) {
-            try {
-                $smiVerOut = & $amdSmiVer.Source version 2>&1 | Out-String
-                if ($LASTEXITCODE -eq 0 -and $smiVerOut -match 'ROCm version:\s*(\d+\.\d+)') { $ROCmVersion = $Matches[1] }
-            } catch {}
-        }
-    }
     $pyVer = (& python --version 2>&1 | Out-String) -replace '[^0-9.]',''
     $pyMajMin = ($pyVer.Trim() -split '\.')[0..1] -join '.'
     $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }

From f036ee0022a02d5d794b53bc25907caf0d61620e Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:14:14 -0500
Subject: [PATCH 025/165] fix: detect AMD SDK ROCm torch via __version__ when
 torch.version.hip is unset

AMD's repo.radeon.com wheels (e.g. 2.9.0+rocmsdk20251116) do not set
torch.version.hip, leaving it None. All three probes that relied solely on
torch.version.hip now also check for 'rocm' in torch.__version__.lower():

- hardware.py detect_hardware(): IS_ROCM was never set, causing the studio
  to report 'Hardware detected: CPU' even after AMD wheels were installed
  and HIP DLLs were on PATH.
- install_python_stack.py _ensure_rocm_torch(): skip-if-already-installed
  probe would always reinstall on subsequent runs.
- install_python_stack.py Windows AMD warning: suppression check always
  failed, so the 'must be installed manually' note kept appearing after
  a successful AMD wheel install.
---
 studio/backend/utils/hardware/hardware.py |  8 ++++++--
 studio/install_python_stack.py            | 25 +++++++++++++++++------
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 3b1c2a54dc..c4d6584e2b 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -120,10 +120,14 @@ def detect_hardware() -> DeviceType:
 
             # Distinguish AMD ROCm (HIP) from NVIDIA CUDA for display purposes.
             # DeviceType stays CUDA since torch.cuda.* works on ROCm via HIP.
-            if getattr(torch.version, "hip", None) is not None:
+            # AMD's repo.radeon.com SDK wheels (e.g. 2.9.0+rocmsdk20251116) do
+            # not set torch.version.hip, so fall back to checking __version__.
+            _hip_ver = getattr(torch.version, "hip", None)
+            if _hip_ver is not None or "rocm" in torch.__version__.lower():
                 IS_ROCM = True
+                _hip_label = _hip_ver or torch.__version__
                 print(
-                    f"Hardware detected: ROCm (HIP {torch.version.hip}) -- {device_name}"
+                    f"Hardware detected: ROCm (HIP {_hip_label}) -- {device_name}"
                 )
             else:
                 print(f"Hardware detected: CUDA -- {device_name}")
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index d580a163e4..29fe70cdbb 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -299,13 +299,18 @@ def _ensure_rocm_torch() -> None:
                 [
                     sys.executable,
                     "-c",
-                    "import torch; print(getattr(torch.version,'hip','') or '')",
+                    (
+                        "import torch; "
+                        "hip=getattr(torch.version,'hip','') or ''; "
+                        "ver=torch.__version__; "
+                        "print('yes' if hip or 'rocm' in ver.lower() else '')"
+                    ),
                 ],
                 stdout = subprocess.PIPE,
                 stderr = subprocess.DEVNULL,
                 timeout = 30,
             )
-            if probe.returncode == 0 and probe.stdout.decode().strip():
+            if probe.returncode == 0 and probe.stdout.decode().strip() == "yes":
                 return  # already ROCm torch
         except (OSError, subprocess.TimeoutExpired):
             pass
@@ -1175,20 +1180,28 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
                 _win_amd_gpu = True
                 break
         if _win_amd_gpu:
-            # Only warn if torch doesn't already have ROCm (HIP) support
+            # Only warn if torch doesn't already have ROCm (HIP) support.
+            # AMD SDK wheels (e.g. 2.9.0+rocmsdk20251116) don't set
+            # torch.version.hip, so also check for "rocm" in __version__.
             try:
-                _hip_ver = subprocess.run(
+                _rocm_probe = subprocess.run(
                     [
                         sys.executable,
                         "-c",
-                        "import torch; print(getattr(torch.version,'hip','') or '')",
+                        (
+                            "import torch; "
+                            "hip=getattr(torch.version,'hip','') or ''; "
+                            "ver=torch.__version__; "
+                            "print('yes' if hip or 'rocm' in ver.lower() else '')"
+                        ),
                     ],
                     stdout = subprocess.PIPE,
                     stderr = subprocess.DEVNULL,
                     timeout = 20,
                 )
                 _has_rocm_torch = (
-                    _hip_ver.returncode == 0 and _hip_ver.stdout.decode().strip() != ""
+                    _rocm_probe.returncode == 0
+                    and _rocm_probe.stdout.decode().strip() == "yes"
                 )
             except Exception:
                 _has_rocm_torch = False

From 7d3de8b3172ccdc96fc83ed4d2c2c6fdb5aa6498 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 01:14:31 +0000
Subject: [PATCH 026/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index c4d6584e2b..1b990b6adf 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -126,9 +126,7 @@ def detect_hardware() -> DeviceType:
             if _hip_ver is not None or "rocm" in torch.__version__.lower():
                 IS_ROCM = True
                 _hip_label = _hip_ver or torch.__version__
-                print(
-                    f"Hardware detected: ROCm (HIP {_hip_label}) -- {device_name}"
-                )
+                print(f"Hardware detected: ROCm (HIP {_hip_label}) -- {device_name}")
             else:
                 print(f"Hardware detected: CUDA -- {device_name}")
             return DEVICE

From 77f7adef62cd77b54624e26c4a975681189b7978 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:16:01 -0500
Subject: [PATCH 027/165] perf: drop --no-cache-dir from AMD ROCm torch wheel
 installs

uv caches downloaded wheels by default; passing --no-cache-dir forced a
full redownload of the ~2 GB torch wheel on every install run. CUDA installs
never had this flag -- AMD was the only path affected.
---
 install.ps1      | 2 +-
 studio/setup.ps1 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 9b55ff1492..5228c7ba30 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1445,7 +1445,7 @@ shell.Run cmd, 0, False
             $rw0 = $ROCmAllWheelUrls[0]; $rw1 = $ROCmAllWheelUrls[1]
             $rw2 = $ROCmAllWheelUrls[2]; $rw3 = $ROCmAllWheelUrls[3]
             $rw4 = $ROCmAllWheelUrls[4]
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-cache-dir --no-deps $rw0 $rw1 $rw2 $rw3 $rw4 }
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps $rw0 $rw1 $rw2 $rw3 $rw4 }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 267fd56106..21d99869e5 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1888,7 +1888,7 @@ if ($ROCmTorchWheelUrls) {
     substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
     $sw0 = $ROCmTorchWheelUrls[0]; $sw1 = $ROCmTorchWheelUrls[1]
     $sw2 = $ROCmTorchWheelUrls[2]; $sw3 = $ROCmTorchWheelUrls[3]; $sw4 = $ROCmTorchWheelUrls[4]
-    $output = Fast-Install --force-reinstall --no-cache-dir --no-deps $sw0 $sw1 $sw2 $sw3 $sw4 | Out-String
+    $output = Fast-Install --force-reinstall --no-deps $sw0 $sw1 $sw2 $sw3 $sw4 | Out-String
     $torchInstallExit = $LASTEXITCODE
     if ($torchInstallExit -ne 0) {
         Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow

From 9439657ee82b164d151160537c4711eb95291f0b Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:29:06 -0500
Subject: [PATCH 028/165] fix: use install-state flag instead of subprocess
 probe for AMD Windows warning

Replace the subprocess torch probe in the post-install warning block with a
module-level _rocm_windows_torch_installed flag set by _ensure_rocm_torch().
Subprocess re-import of torch is unnecessary and fragile -- the install
function already knows whether it succeeded.
---
 studio/install_python_stack.py | 54 ++++++++++++----------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 29fe70cdbb..b2c6f9bd1a 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -270,6 +270,12 @@ def _has_usable_nvidia_gpu() -> bool:
     return result.returncode == 0 and "GPU " in result.stdout
 
 
+# Set to True by _ensure_rocm_torch() when AMD Windows wheels are installed
+# successfully. Used by the post-install warning block to skip the "must be
+# installed manually" note without spawning a subprocess.
+_rocm_windows_torch_installed: bool = False
+
+
 def _ensure_rocm_torch() -> None:
     """Reinstall torch with ROCm wheels when the venv received CPU-only torch.
 
@@ -311,6 +317,8 @@ def _ensure_rocm_torch() -> None:
                 timeout = 30,
             )
             if probe.returncode == 0 and probe.stdout.decode().strip() == "yes":
+                global _rocm_windows_torch_installed
+                _rocm_windows_torch_installed = True
                 return  # already ROCm torch
         except (OSError, subprocess.TimeoutExpired):
             pass
@@ -343,6 +351,8 @@ def _ensure_rocm_torch() -> None:
             *wheel_urls,
             constrain = False,
         )
+        global _rocm_windows_torch_installed
+        _rocm_windows_torch_installed = True
         return
 
     # ── Linux x86_64 path ──────────────────────────────────────────────────────
@@ -1179,41 +1189,15 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
             if _wr.returncode == 0 and _check_fn(_wr.stdout):
                 _win_amd_gpu = True
                 break
-        if _win_amd_gpu:
-            # Only warn if torch doesn't already have ROCm (HIP) support.
-            # AMD SDK wheels (e.g. 2.9.0+rocmsdk20251116) don't set
-            # torch.version.hip, so also check for "rocm" in __version__.
-            try:
-                _rocm_probe = subprocess.run(
-                    [
-                        sys.executable,
-                        "-c",
-                        (
-                            "import torch; "
-                            "hip=getattr(torch.version,'hip','') or ''; "
-                            "ver=torch.__version__; "
-                            "print('yes' if hip or 'rocm' in ver.lower() else '')"
-                        ),
-                    ],
-                    stdout = subprocess.PIPE,
-                    stderr = subprocess.DEVNULL,
-                    timeout = 20,
-                )
-                _has_rocm_torch = (
-                    _rocm_probe.returncode == 0
-                    and _rocm_probe.stdout.decode().strip() == "yes"
-                )
-            except Exception:
-                _has_rocm_torch = False
-            if not _has_rocm_torch:
-                _safe_print(
-                    _dim("  Note:"),
-                    "AMD GPU detected on Windows. ROCm-enabled PyTorch must be",
-                )
-                _safe_print(
-                    " " * 8,
-                    "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
-                )
+        if _win_amd_gpu and not _rocm_windows_torch_installed:
+            _safe_print(
+                _dim("  Note:"),
+                "AMD GPU detected on Windows. ROCm-enabled PyTorch must be",
+            )
+            _safe_print(
+                " " * 8,
+                "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
+            )
 
     # 3. Extra dependencies
     _progress("unsloth extras")

From 4b2f7fb8e21c575036f05862a92a1c0af2445518 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:31:55 -0500
Subject: [PATCH 029/165] fix: hoist global declaration to top of
 _ensure_rocm_torch

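Python requires the global statement to appear before any use of, or
assignment to, the variable within a function. The previous commit added
a second declaration after the variable had already been assigned earlier
in _ensure_rocm_torch(), which raised a SyntaxError at line 354 of
studio/install_python_stack.py. Moving a single declaration to the top of
the function fixes it.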
---
 studio/install_python_stack.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index b2c6f9bd1a..101978b362 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -285,6 +285,7 @@ def _ensure_rocm_torch() -> None:
     already links against HIP.
     Uses pip_install() to respect uv, constraints, and --python targeting.
     """
+    global _rocm_windows_torch_installed
     if IS_MACOS:
         return
 
@@ -317,7 +318,6 @@ def _ensure_rocm_torch() -> None:
                 timeout = 30,
             )
             if probe.returncode == 0 and probe.stdout.decode().strip() == "yes":
-                global _rocm_windows_torch_installed
                 _rocm_windows_torch_installed = True
                 return  # already ROCm torch
         except (OSError, subprocess.TimeoutExpired):
@@ -351,7 +351,6 @@ def _ensure_rocm_torch() -> None:
             *wheel_urls,
             constrain = False,
         )
-        global _rocm_windows_torch_installed
         _rocm_windows_torch_installed = True
         return
 

From 7fbdce171c7dbb550047f2d1b0837ffdbc8cace1 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:37:44 -0500
Subject: [PATCH 030/165] fix: pass AMD torch install status via env var to
 suppress false warning

setup.ps1 now sets UNSLOTH_ROCM_TORCH_INSTALLED=1 after a successful AMD
wheel install. install_python_stack.py reads this at the top of
_ensure_rocm_torch() to skip both the subprocess probe and the warning --
no re-import of torch needed, and the warning message now correctly says
'could not be auto-installed' rather than 'must be installed manually'.
---
 studio/install_python_stack.py | 10 ++++++++--
 studio/setup.ps1               |  4 ++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 101978b362..cedbb8058d 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -286,6 +286,12 @@ def _ensure_rocm_torch() -> None:
     Uses pip_install() to respect uv, constraints, and --python targeting.
     """
     global _rocm_windows_torch_installed
+    # setup.ps1 sets this env var when it successfully installs AMD wheels
+    # before calling install_python_stack.py, so we can skip the subprocess
+    # probe and avoid reinstalling what was just installed.
+    if os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
+        _rocm_windows_torch_installed = True
+        return
     if IS_MACOS:
         return
 
@@ -1191,11 +1197,11 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
         if _win_amd_gpu and not _rocm_windows_torch_installed:
             _safe_print(
                 _dim("  Note:"),
-                "AMD GPU detected on Windows. ROCm-enabled PyTorch must be",
+                "AMD GPU detected but ROCm PyTorch could not be auto-installed.",
             )
             _safe_print(
                 " " * 8,
-                "installed manually. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
+                "Manual install may be required. See: https://docs.unsloth.ai/get-started/install-and-update/amd",
             )
 
     # 3. Extra dependencies
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 21d99869e5..bd6a6ac101 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1894,6 +1894,10 @@ if ($ROCmTorchWheelUrls) {
         Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow
         Write-Host $output -ForegroundColor Yellow
         $ROCmTorchWheelUrls = $null
+    } else {
+        # Signal to install_python_stack.py that AMD wheels are already installed
+        # so it skips the subprocess probe and suppresses the manual-install warning.
+        $env:UNSLOTH_ROCM_TORCH_INSTALLED = "1"
     }
 }
 

From f09a4240a1ab24312b3cf1191022d5ca2be75305 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:43:18 -0500
Subject: [PATCH 031/165] fix: register ROCm DLL directory before torch import
 on Windows

Python 3.8+ ignores PATH when resolving the DLL dependencies of extension
modules on Windows; the directory containing amdhip64.dll and the other HIP
runtime DLLs must be registered via os.add_dll_directory().
Without this, torch.cuda.is_available() always returns False on AMD ROCm
Windows even when HIP_PATH is correctly set in system environment variables.

Reads HIP_PATH / ROCM_PATH env vars first, then falls back to scanning
common ROCm install roots (C:\Program Files\AMD\ROCm, F:\ROCm, C:\ROCm).
---
 studio/backend/main.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index cd901327db..0979f48865 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -12,6 +12,40 @@
 # Suppress annoying C-level dependency warnings globally
 os.environ["PYTHONWARNINGS"] = "ignore"
 
+# ── Windows AMD ROCm DLL injection ──────────────────────────────────────────
+# On Windows, Python 3.8+ uses a secure DLL search that ignores PATH for
+# extension modules. torch's HIP backend (amdhip64.dll etc.) won't be found
+# even if F:\ROCm\...\bin is in PATH unless we explicitly register the
+# directory with os.add_dll_directory(). Do this before any torch import.
+if sys.platform == "win32":
+    import ctypes as _ctypes
+
+    def _add_rocm_dll_dirs() -> None:
+        hip_path = os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH")
+        candidates = []
+        if hip_path:
+            candidates.append(os.path.join(hip_path, "bin"))
+        # Also scan common install roots in case HIP_PATH is not set
+        for _root in (r"C:\Program Files\AMD\ROCm", r"F:\ROCm", r"C:\ROCm"):
+            try:
+                if os.path.isdir(_root):
+                    for _ver in sorted(os.listdir(_root), reverse=True):
+                        _bin = os.path.join(_root, _ver, "bin")
+                        if os.path.isdir(_bin):
+                            candidates.append(_bin)
+                            break
+            except OSError:
+                pass
+        for _d in candidates:
+            if os.path.isdir(_d):
+                try:
+                    os.add_dll_directory(_d)
+                except (OSError, AttributeError):
+                    pass
+
+    _add_rocm_dll_dirs()
+    del _add_rocm_dll_dirs, _ctypes
+
 # Ensure backend dir is on sys.path so _platform_compat is importable when
 # main.py is launched directly (e.g. `uvicorn main:app`).
 _backend_dir = str(_Path(__file__).parent)

From 05f5cda29507ecf8721932de872310253e168881 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 01:45:08 +0000
Subject: [PATCH 032/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 0979f48865..701791f0a7 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -29,7 +29,7 @@ def _add_rocm_dll_dirs() -> None:
         for _root in (r"C:\Program Files\AMD\ROCm", r"F:\ROCm", r"C:\ROCm"):
             try:
                 if os.path.isdir(_root):
-                    for _ver in sorted(os.listdir(_root), reverse=True):
+                    for _ver in sorted(os.listdir(_root), reverse = True):
                         _bin = os.path.join(_root, _ver, "bin")
                         if os.path.isdir(_bin):
                             candidates.append(_bin)

From 0facfbc4bcb27cf37477e574c80738a7a3a1a42b Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:45:31 -0500
Subject: [PATCH 033/165] fix: remove hardcoded non-standard ROCm paths from
 DLL directory scan

Only use HIP_PATH/ROCM_PATH (set by AMD installer) and the standard
C:\Program Files\AMD\ROCm\<version>\bin location. Custom drive paths
like F:\ROCm are user-specific and should not be hardcoded.
---
 studio/backend/main.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 701791f0a7..2b6619ac37 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -21,21 +21,25 @@
     import ctypes as _ctypes
 
     def _add_rocm_dll_dirs() -> None:
-        hip_path = os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH")
         candidates = []
-        if hip_path:
-            candidates.append(os.path.join(hip_path, "bin"))
-        # Also scan common install roots in case HIP_PATH is not set
-        for _root in (r"C:\Program Files\AMD\ROCm", r"F:\ROCm", r"C:\ROCm"):
-            try:
-                if os.path.isdir(_root):
-                    for _ver in sorted(os.listdir(_root), reverse = True):
-                        _bin = os.path.join(_root, _ver, "bin")
-                        if os.path.isdir(_bin):
-                            candidates.append(_bin)
-                            break
-            except OSError:
-                pass
+        # 1. HIP_PATH / ROCM_PATH -- set by the AMD HIP SDK installer
+        for _var in ("HIP_PATH", "ROCM_PATH"):
+            _val = os.environ.get(_var)
+            if _val:
+                candidates.append(os.path.join(_val, "bin"))
+        # 2. Standard AMD installer location: C:\Program Files\AMD\ROCm\<ver>\bin
+        #    Scan all installed versions, newest first.
+        _default_root = os.path.join(
+            os.environ.get("ProgramFiles", r"C:\Program Files"), "AMD", "ROCm"
+        )
+        try:
+            if os.path.isdir(_default_root):
+                for _ver in sorted(os.listdir(_default_root), reverse=True):
+                    _bin = os.path.join(_default_root, _ver, "bin")
+                    if os.path.isdir(_bin):
+                        candidates.append(_bin)
+        except OSError:
+            pass
         for _d in candidates:
             if os.path.isdir(_d):
                 try:

From cc77737b78c0c143fd5eaf6b700de938b5390271 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 01:46:32 +0000
Subject: [PATCH 034/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 2b6619ac37..812ddd45b9 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -34,7 +34,7 @@ def _add_rocm_dll_dirs() -> None:
         )
         try:
             if os.path.isdir(_default_root):
-                for _ver in sorted(os.listdir(_default_root), reverse=True):
+                for _ver in sorted(os.listdir(_default_root), reverse = True):
                     _bin = os.path.join(_default_root, _ver, "bin")
                     if os.path.isdir(_bin):
                         candidates.append(_bin)

From efcaccbbcf84fab72163b8e89cd9b7980dcb8543 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 20:53:54 -0500
Subject: [PATCH 035/165] fix: prevent torchao overrides step from overwriting
 AMD ROCm torch

torchao==0.14.0 in overrides.txt declares torch as a dependency. Without
--no-deps, uv resolves torch from PyPI and installs 2.11.0+cpu on top of
the AMD ROCm wheels (2.9.0+rocmsdk20251116). This was the root cause of
'Hardware detected: CPU' -- the AMD wheels were installed but then
immediately overwritten by the overrides step.

When _rocm_windows_torch_installed is True, add --no-deps to the overrides
pip_install call so torchao is installed without pulling in CPU torch.
---
 studio/install_python_stack.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index cedbb8058d..1a774e1f5c 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -1228,10 +1228,17 @@ def _win_amd_smi_has_gpu(stdout: str) -> bool:
         _progress("dependency overrides (skipped, no torch)")
     else:
         _progress("dependency overrides")
+        _override_extra_args: tuple[str, ...] = ()
+        if _rocm_windows_torch_installed:
+            # torchao in overrides.txt declares torch as a dependency; without
+            # --no-deps uv would resolve and install CPU torch from PyPI,
+            # overwriting the AMD ROCm wheels we just installed.
+            _override_extra_args = ("--no-deps",)
         pip_install(
             "Installing dependency overrides",
             "--force-reinstall",
             "--no-cache-dir",
+            *_override_extra_args,
             req = REQ_ROOT / "overrides.txt",
         )
 

From 301d6c0aa740988030aa16f01e7bcd3ad8aa51eb Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 21:31:44 -0500
Subject: [PATCH 036/165] fix: add rocm_sdk namespace tarball to Windows ROCm
 wheel installs

torch/_rocm_init.py calls `import rocm_sdk` at startup, which requires
the rocm namespace tarball (rocm-*.tar.gz) in addition to the SDK wheel
packages. This tarball was missing from both install.ps1 and setup.ps1,
causing ModuleNotFoundError on first torch import.

- Add rocm-0.1.dev0.tar.gz to ROCm 7.1.1 install (provides rocm_sdk namespace)
- Add rocm-7.2.1.tar.gz + rocm_sdk_devel to ROCm 7.2.1 install
- Install tarball in a dedicated step before main SDK/torch wheels
- Switch to @array splatting in install.ps1 scriptblock for dynamic wheel count
- Remove --no-cache-dir from Python-side ROCm wheel install (prevents ~2GB redownload)
---
 install.ps1                    | 27 ++++++++++++++++++++-------
 studio/install_python_stack.py |  6 +++++-
 studio/setup.ps1               | 22 +++++++++++++++++++---
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 5228c7ba30..bf279b5850 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1311,23 +1311,31 @@ shell.Run cmd, 0, False
     # When the HIP SDK is present and Python 3.12 is in use, swap in the AMD wheel
     # URL and clear $TorchIndexUrl so the standard --index-url path is skipped.
     $ROCmTorchWheelUrl = $null
+    $ROCmTarballUrl    = $null
     if ($HasROCm -and -not $SkipTorch) {
         $pyMajMin = if ($DetectedPython) { ($DetectedPython.Version -split '\.')[0..1] -join '.' } else { "" }
         if ($pyMajMin -eq "3.12") {
             $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
             if ($ROCmVersion -and $ROCmVersion -match '^7\.2') {
                 $amdRelBase = "$amdWheelBase/rocm-rel-7.2.1"
+                # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
+                # torch/_rocm_init.py imports at startup.
+                $ROCmTarballUrl = "$amdRelBase/rocm-7.2.1.tar.gz"
                 $ROCmAllWheelUrls = @(
                     "$amdRelBase/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+                    "$amdRelBase/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
                     "$amdRelBase/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
                     "$amdRelBase/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
                     "$amdRelBase/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
                     "$amdRelBase/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
                 )
-                $ROCmTorchWheelUrl = $ROCmAllWheelUrls[2]
+                $ROCmTorchWheelUrl = $ROCmAllWheelUrls[3]
                 $TorchIndexUrl = $null
             } elseif ($ROCmVersion -and $ROCmVersion -match '^7\.1') {
                 $amdRelBase = "$amdWheelBase/rocm-rel-7.1.1"
+                # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
+                # torch/_rocm_init.py imports at startup.
+                $ROCmTarballUrl = "$amdRelBase/rocm-0.1.dev0.tar.gz"
                 $ROCmAllWheelUrls = @(
                     "$amdRelBase/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
                     "$amdRelBase/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
@@ -1440,12 +1448,17 @@ shell.Run cmd, 0, False
         } elseif ($ROCmTorchWheelUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
             substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
-            # Expand array to scalars — @array splatting requires & and doesn't
-            # work reliably inside scriptblocks passed to Invoke-InstallCommand.
-            $rw0 = $ROCmAllWheelUrls[0]; $rw1 = $ROCmAllWheelUrls[1]
-            $rw2 = $ROCmAllWheelUrls[2]; $rw3 = $ROCmAllWheelUrls[3]
-            $rw4 = $ROCmAllWheelUrls[4]
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps $rw0 $rw1 $rw2 $rw3 $rw4 }
+            # Install the rocm namespace tarball first (provides the 'rocm_sdk'
+            # Python package that torch/_rocm_init.py imports at startup).
+            if ($ROCmTarballUrl) {
+                $tarballExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps $ROCmTarballUrl }
+                if ($tarballExit -ne 0) {
+                    Write-Host "[WARN] ROCm namespace tarball install failed (exit $tarballExit) -- continuing" -ForegroundColor Yellow
+                }
+            }
+            # Install remaining SDK + torch wheels.  @array splatting inside a
+            # scriptblock works in PS 5.1 because & $Command runs in-scope.
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps @ROCmAllWheelUrls }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 1a774e1f5c..39968ce484 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -70,7 +70,10 @@
     (7, 2): (
         "rocm-rel-7.2.1",
         [
+            # rocm tarball provides the 'rocm_sdk' Python namespace package
+            "rocm-7.2.1.tar.gz",
             "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+            "rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
             "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
             "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
             "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
@@ -80,6 +83,8 @@
     (7, 1): (
         "rocm-rel-7.1.1",
         [
+            # rocm tarball provides the 'rocm_sdk' Python namespace package
+            "rocm-0.1.dev0.tar.gz",
             "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
             "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
             "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
@@ -352,7 +357,6 @@ def _ensure_rocm_torch() -> None:
         pip_install(
             f"ROCm torch (Windows, {rel_tag})",
             "--force-reinstall",
-            "--no-cache-dir",
             "--no-deps",
             *wheel_urls,
             constrain = False,
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index bd6a6ac101..fdffaa30b8 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1855,6 +1855,7 @@ if ($HasNvidiaSmi) {
 # torch wheels instead of CPU-only PyTorch.
 $ROCmVersion = $script:ROCmVersion
 $ROCmTorchWheelUrls = $null
+$ROCmTarballUrl     = $null
 if ($HasROCm -and $CuTag -eq "cpu") {
     $pyVer = (& python --version 2>&1 | Out-String) -replace '[^0-9.]',''
     $pyMajMin = ($pyVer.Trim() -split '\.')[0..1] -join '.'
@@ -1862,8 +1863,12 @@ if ($HasROCm -and $CuTag -eq "cpu") {
     if ($pyMajMin -eq "3.12" -and $ROCmVersion) {
         if ($ROCmVersion -match '^7\.2') {
             $rb = "$amdWheelBase/rocm-rel-7.2.1"
+            # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
+            # torch/_rocm_init.py imports at startup.
+            $ROCmTarballUrl = "$rb/rocm-7.2.1.tar.gz"
             $ROCmTorchWheelUrls = @(
                 "$rb/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+                "$rb/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
                 "$rb/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
                 "$rb/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
                 "$rb/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
@@ -1871,6 +1876,9 @@ if ($HasROCm -and $CuTag -eq "cpu") {
             )
         } elseif ($ROCmVersion -match '^7\.1') {
             $rb = "$amdWheelBase/rocm-rel-7.1.1"
+            # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
+            # torch/_rocm_init.py imports at startup.
+            $ROCmTarballUrl = "$rb/rocm-0.1.dev0.tar.gz"
             $ROCmTorchWheelUrls = @(
                 "$rb/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
                 "$rb/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
@@ -1886,9 +1894,17 @@ $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR
 
 if ($ROCmTorchWheelUrls) {
     substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
-    $sw0 = $ROCmTorchWheelUrls[0]; $sw1 = $ROCmTorchWheelUrls[1]
-    $sw2 = $ROCmTorchWheelUrls[2]; $sw3 = $ROCmTorchWheelUrls[3]; $sw4 = $ROCmTorchWheelUrls[4]
-    $output = Fast-Install --force-reinstall --no-deps $sw0 $sw1 $sw2 $sw3 $sw4 | Out-String
+    # Install the rocm namespace tarball first (provides the 'rocm_sdk' Python
+    # package that torch/_rocm_init.py imports at startup).
+    if ($ROCmTarballUrl) {
+        $tarballOut = Fast-Install --force-reinstall --no-deps $ROCmTarballUrl | Out-String
+        if ($LASTEXITCODE -ne 0) {
+            Write-Host "[WARN] ROCm namespace tarball install failed -- continuing" -ForegroundColor Yellow
+            Write-Host $tarballOut -ForegroundColor Yellow
+        }
+    }
+    # Install remaining SDK + torch wheels using array splatting.
+    $output = Fast-Install --force-reinstall --no-deps @ROCmTorchWheelUrls | Out-String
     $torchInstallExit = $LASTEXITCODE
     if ($torchInstallExit -ne 0) {
         Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow

From 6fe91e7749eb904e3b7fba7c73e7006669b1c570 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 6 May 2026 21:44:50 -0500
Subject: [PATCH 037/165] feat: enable ROCm 7.2 torch install + warn on gfx1151
 with ROCm < 7.2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chigoma333 (AMD Radeon 8060S / gfx1151, Strix Halo) confirmed that ROCm
7.1 segfaults when tensors are moved to GPU, but ROCm 7.2 + torch
2.11.0+rocm7.2 works fully including training.

Changes:
- Uncomment (7,2): "rocm7.2" in _ROCM_TORCH_INDEX (was blocked by <2.11.0)
- Add _ROCM_TORCH_PKG_SPECS dict with per-tag version bounds:
  rocm7.2 → torch>=2.11.0,<2.12.0; all older tags → <2.11.0
- Add _detect_amd_gfx_codes() helper that parses rocminfo output
- Warn on gfx1151 (Strix Halo) and gfx1150 (Strix Point) when ROCm < 7.2 is
  installed, pointing users at the known segfault and recommending upgrade
- install.sh get_torch_index_url(): enable rocm7.2 case (previously capped
  to rocm7.1), cap unknown future tags to rocm7.2
- install.sh: override TORCH_CONSTRAINT to >=2.11.0,<2.12.0 when rocm7.2
  index is selected, so pip can actually resolve torch 2.11.0
---
 install.sh                     | 26 ++++++------
 studio/install_python_stack.py | 75 ++++++++++++++++++++++++++++++----
 2 files changed, 80 insertions(+), 21 deletions(-)

diff --git a/install.sh b/install.sh
index fa2921b5ad..f1462088d9 100755
--- a/install.sh
+++ b/install.sh
@@ -1573,26 +1573,20 @@ get_torch_index_url() {
             case "$_rocm_tag" in
                 rocm[1-5].*) echo "$_base/cpu"; return ;;
             esac
-            # ROCm 7.2 only has torch 2.11.0 which exceeds current bounds
-            # (<2.11.0).  Fall back to rocm7.1 index which has torch 2.10.0.
-            # Enumerate explicit versions rather than matching rocm6.* so
-            # a host on ROCm 6.5 or 6.6 (no PyTorch wheels published) is
-            # clipped down to the last supported 6.x (rocm6.4) instead of
-            # constructing https://download.pytorch.org/whl/rocm6.5 which
-            # returns HTTP 403. PyTorch only ships: rocm5.7, 6.0, 6.1, 6.2,
-            # 6.3, 6.4, 7.0, 7.1, 7.2 (and 5.7 is below our minimum).
-            # TODO: uncomment rocm7.2 when the torch upper bound is bumped
-            # to >=2.11.0.
+            # Enumerate explicit supported ROCm wheel tags.  A host on ROCm
+            # 6.5+ (no published PyTorch wheels) is clipped to rocm6.4.
+            # PyTorch publishes: rocm5.7, 6.0, 6.1, 6.2, 6.3, 6.4, 7.0, 7.1,
+            # 7.2 (5.7 is below our minimum; rocm7.2 ships torch 2.11.0).
             case "$_rocm_tag" in
-                rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*)
+                rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*|rocm7.2|rocm7.2.*)
                     echo "$_base/$_rocm_tag" ;;
                 rocm6.*)
                     # ROCm 6.5+ (no published PyTorch wheels): clip down
                     # to the last supported 6.x wheel set.
                     echo "$_base/rocm6.4" ;;
                 *)
-                    # ROCm 7.2+ (including future 10.x+): cap to rocm7.1
-                    echo "$_base/rocm7.1" ;;
+                    # ROCm 7.3+ (future): cap to rocm7.2 (latest known)
+                    echo "$_base/rocm7.2" ;;
             esac
             return
         fi
@@ -1731,6 +1725,12 @@ _pick_radeon_wheel() {
 
 TORCH_INDEX_URL=$(get_torch_index_url)
 
+# rocm7.2 ships torch 2.11.0 -- adjust the constraint to allow it.
+# All other ROCm tags and CUDA stay within <2.11.0.
+case "$TORCH_INDEX_URL" in
+    */rocm7.2) TORCH_CONSTRAINT="torch>=2.11.0,<2.12.0" ;;
+esac
+
 # Auto-detect GPU for AMD ROCm based
 # get_torch_index_url must have chosen */rocm*
 # (gfx in rocminfo or amd-smi list). Then require rocminfo "Marketing Name:.*Radeon".
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 39968ce484..9251827b50 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -40,12 +40,9 @@
 # ── ROCm / AMD GPU support ─────────────────────────────────────────────────────
 # Mapping from detected ROCm (major, minor) to the best PyTorch wheel tag on
 # download.pytorch.org.  Entries are checked newest-first (>=).
-# ROCm 7.2 only has torch 2.11.0 on download.pytorch.org, which exceeds the
-# current torch upper bound (<2.11.0).  Fall back to rocm7.1 (torch 2.10.0).
-# TODO: uncomment rocm7.2 when torch upper bound is bumped to >=2.11.0
 _ROCM_TORCH_INDEX: dict[tuple[int, int], str] = {
-    # (7, 2): "rocm7.2",  # torch 2.11.0 -- requires torch>=2.11
-    (7, 1): "rocm7.1",
+    (7, 2): "rocm7.2",   # torch 2.11.0
+    (7, 1): "rocm7.1",   # torch 2.10.0
     (7, 0): "rocm7.0",
     (6, 4): "rocm6.4",
     (6, 3): "rocm6.3",
@@ -53,6 +50,23 @@
     (6, 1): "rocm6.1",
     (6, 0): "rocm6.0",
 }
+
+# Per-tag torch/torchvision/torchaudio version specs for pip.
+# rocm7.2 ships torch 2.11.0 which is a major version bump; older tags top out
+# at 2.10.x.  These specs prevent uv from picking an incompatible minor.
+_ROCM_TORCH_PKG_SPECS: dict[str, tuple[str, str, str]] = {
+    "rocm7.2": (
+        "torch>=2.11.0,<2.12.0",
+        "torchvision>=0.26.0,<0.27.0",
+        "torchaudio>=2.11.0,<2.12.0",
+    ),
+    # Default for rocm7.1 and earlier: torch 2.x below 2.11
+    "_default": (
+        "torch>=2.4,<2.11.0",
+        "torchvision<0.26.0",
+        "torchaudio<2.11.0",
+    ),
+}
 _PYTORCH_WHL_BASE = (
     os.environ.get("UNSLOTH_PYTORCH_MIRROR") or "https://download.pytorch.org/whl"
 ).rstrip("/")
@@ -275,6 +289,35 @@ def _has_usable_nvidia_gpu() -> bool:
     return result.returncode == 0 and "GPU " in result.stdout
 
 
+def _detect_amd_gfx_codes() -> list[str]:
+    """Return the list of AMD gfx ISA strings visible to ROCm (e.g. ['gfx1151']).
+
+    Parses ``rocminfo`` output for ``ISA Info`` / ``gfx`` entries.  Returns an
+    empty list when rocminfo is not found or no GPU agents are present.
+    """
+    import re
+
+    exe = shutil.which("rocminfo")
+    if not exe:
+        return []
+    try:
+        result = subprocess.run(
+            [exe],
+            stdout = subprocess.PIPE,
+            stderr = subprocess.DEVNULL,
+            text = True,
+            timeout = 15,
+        )
+    except Exception:
+        return []
+    if result.returncode != 0:
+        return []
+    # Match lines like "  Name:                    gfx1151" or ISA strings
+    # "amdgcn-amd-amdhsa--gfx1151".  Exclude the CPU agent (gfx000).
+    codes = re.findall(r"gfx([1-9][0-9a-z]{2,3})", result.stdout.lower())
+    return list(dict.fromkeys(f"gfx{c}" for c in codes))  # deduplicate, preserve order
+
+
 # Set to True by _ensure_rocm_torch() when AMD Windows wheels are installed
 # successfully. Used by the post-install warning block to skip the "must be
 # installed manually" note without spawning a subprocess.
@@ -411,6 +454,19 @@ def _ensure_rocm_torch() -> None:
 
     rocm_torch_ready = has_hip_torch
 
+    # Strix Halo (gfx1151) segfaults under ROCm 7.1 due to a ROCm driver bug
+    # fixed in ROCm 7.2.  Warn early so users know why training may crash.
+    if ver < (7, 2):
+        gfx_codes = _detect_amd_gfx_codes()
+        _strix_gfx = {"gfx1151", "gfx1150"}
+        if _strix_gfx.intersection(gfx_codes):
+            _gfx_str = ", ".join(sorted(_strix_gfx.intersection(gfx_codes)))
+            print(
+                f"\n   ⚠️  {_gfx_str} (AMD Strix Halo) detected with ROCm {ver[0]}.{ver[1]}.\n"
+                f"   ROCm 7.1 has a known segfault on this GPU when tensors are\n"
+                f"   moved to the GPU.  Upgrade to ROCm 7.2+ to enable training.\n"
+            )
+
     if not has_hip_torch:
         # Select best matching wheel tag (newest ROCm version <= installed)
         tag = next(
@@ -429,13 +485,16 @@ def _ensure_rocm_torch() -> None:
         else:
             index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
             print(f"   ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
+            _torch_pkg, _vision_pkg, _audio_pkg = _ROCM_TORCH_PKG_SPECS.get(
+                tag, _ROCM_TORCH_PKG_SPECS["_default"]
+            )
             pip_install(
                 f"ROCm torch ({tag})",
                 "--force-reinstall",
                 "--no-cache-dir",
-                "torch>=2.4,<2.11.0",
-                "torchvision<0.26.0",
-                "torchaudio<2.11.0",
+                _torch_pkg,
+                _vision_pkg,
+                _audio_pkg,
                 "--index-url",
                 index_url,
                 constrain = False,

From 1680dacfee42aaa2709e594e7fb4ae4f09741ad9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 7 May 2026 02:45:13 +0000
Subject: [PATCH 038/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/install_python_stack.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 9251827b50..c6bbfda385 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -41,8 +41,8 @@
 # Mapping from detected ROCm (major, minor) to the best PyTorch wheel tag on
 # download.pytorch.org.  Entries are checked newest-first (>=).
 _ROCM_TORCH_INDEX: dict[tuple[int, int], str] = {
-    (7, 2): "rocm7.2",   # torch 2.11.0
-    (7, 1): "rocm7.1",   # torch 2.10.0
+    (7, 2): "rocm7.2",  # torch 2.11.0
+    (7, 1): "rocm7.1",  # torch 2.10.0
     (7, 0): "rocm7.0",
     (6, 4): "rocm6.4",
     (6, 3): "rocm6.3",

From 5deb230cfb6661f11a41a6a997ea2497e04f48b6 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 8 May 2026 03:05:12 -0500
Subject: [PATCH 039/165] fix: prefer Python 3.12 for AMD ROCm users when 3.13
 is also installed

After GPU detection, if ROCm HIP SDK is found and the selected Python
is not 3.12, run a second pass to locate a 3.12 install via py.exe and
PATH (catches uv-managed installs). Switch $DetectedPython to 3.12 so
the venv is created with a compatible interpreter for the cp312-only AMD
Windows torch wheels.

NVIDIA and Intel GPU paths are unaffected -- the re-detection block only
runs when $HasROCm is true.

Fixes: #5301
---
 install.ps1 | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/install.ps1 b/install.ps1
index bf279b5850..f2646ce092 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1284,6 +1284,53 @@ shell.Run cmd, 0, False
         substep "Training and GPU inference require an NVIDIA or AMD ROCm GPU." "Yellow"
     }
 
+    # ── AMD ROCm: prefer Python 3.12 (Windows wheels are cp312-only) ──
+    # Python detection runs before GPU detection, so if AMD ROCm is found and the
+    # selected Python is not 3.12, try to locate a 3.12 install now.  This lets
+    # users who have both 3.13 and 3.12 installed get ROCm support automatically
+    # without having to uninstall 3.13.  3.13 remains the default for NVIDIA.
+    if ($HasROCm -and $DetectedPython -and ($DetectedPython.Version -split '\.')[0..1] -join '.' -ne "3.12") {
+        $py312 = $null
+        # 1. Try py launcher (official CPython installs)
+        $pyLauncher = Get-Command py -CommandType Application -ErrorAction SilentlyContinue
+        if ($pyLauncher -and $pyLauncher.Source -notmatch $script:CondaSkipPattern) {
+            try {
+                $out = & $pyLauncher.Source "-3.12" --version 2>&1 | Out-String
+                if ($out -match "Python 3\.12\.\d+") {
+                    $resolvedExe = (& $pyLauncher.Source "-3.12" -c "import sys; print(sys.executable)" 2>$null | Out-String).Trim()
+                    if ($resolvedExe -and (Test-Path $resolvedExe) -and -not (Test-IsCondaPython $resolvedExe)) {
+                        $py312 = @{ Version = "3.12"; Path = $resolvedExe }
+                    }
+                }
+            } catch {}
+        }
+        # 2. Try PATH (catches uv-managed or manually placed python3.12 / python)
+        if (-not $py312) {
+            foreach ($name in @("python3.12", "python3", "python")) {
+                foreach ($cmd in @(Get-Command $name -All -ErrorAction SilentlyContinue)) {
+                    if (-not $cmd.Source) { continue }
+                    if ($cmd.Source -like "*\WindowsApps\*") { continue }
+                    if (Test-IsCondaPython $cmd.Source) { continue }
+                    try {
+                        $out = & $cmd.Source --version 2>&1 | Out-String
+                        if ($out -match "Python 3\.12\.\d+") {
+                            $py312 = @{ Version = "3.12"; Path = $cmd.Source }
+                            break
+                        }
+                    } catch {}
+                }
+                if ($py312) { break }
+            }
+        }
+        if ($py312) {
+            $DetectedPython = $py312
+            substep "AMD ROCm detected -- switching to Python 3.12 (ROCm wheels are cp312-only)" "Cyan"
+        } else {
+            substep "AMD ROCm detected but Python 3.12 not found -- ROCm GPU training requires Python 3.12" "Yellow"
+            substep "Install Python 3.12 from python.org and re-run." "Yellow"
+        }
+    }
+
     # ── Choose the correct PyTorch index URL based on driver CUDA version ──
     # Mirrors Get-PytorchCudaTag in setup.ps1.
     function Get-TorchIndexUrl {

From bafb3f54df8996dd4fc0d5834f84b04b42f56bf4 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 8 May 2026 03:20:06 -0500
Subject: [PATCH 040/165] fix: also check uv-managed Python 3.12 for AMD ROCm
 #5301

---
 install.ps1 | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index f2646ce092..1b668375d2 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1304,7 +1304,7 @@ shell.Run cmd, 0, False
                 }
             } catch {}
         }
-        # 2. Try PATH (catches uv-managed or manually placed python3.12 / python)
+        # 2. Try PATH (catches manually placed python3.12 / python)
         if (-not $py312) {
             foreach ($name in @("python3.12", "python3", "python")) {
                 foreach ($cmd in @(Get-Command $name -All -ErrorAction SilentlyContinue)) {
@@ -1322,6 +1322,21 @@ shell.Run cmd, 0, False
                 if ($py312) { break }
             }
         }
+        # 3. Ask uv for its managed Python 3.12 (uv installs don't appear in PATH or py.exe)
+        if (-not $py312) {
+            $uvCmd = Get-Command uv -ErrorAction SilentlyContinue
+            if ($uvCmd) {
+                try {
+                    $uvPy = (& $uvCmd.Source python find 3.12 2>$null | Out-String).Trim()
+                    if ($uvPy -and (Test-Path $uvPy) -and -not (Test-IsCondaPython $uvPy)) {
+                        $verOut = (& $uvPy --version 2>&1 | Out-String)
+                        if ($verOut -match "Python 3\.12\.\d+") {
+                            $py312 = @{ Version = "3.12"; Path = $uvPy }
+                        }
+                    }
+                } catch {}
+            }
+        }
         if ($py312) {
             $DetectedPython = $py312
             substep "AMD ROCm detected -- switching to Python 3.12 (ROCm wheels are cp312-only)" "Cyan"

From 2de2c29d3a7a762ae3739c1a69a764ab75868bf5 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 8 May 2026 03:50:46 -0500
Subject: [PATCH 041/165] fix: hide amd-smi console popups on Windows, guard
 torch.distributed.is_initialized for ROCm #5301

---
 studio/backend/utils/hardware/amd.py | 3 +++
 unsloth/models/loader_utils.py       | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index fdb1ab4520..0219407471 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -13,10 +13,12 @@
 import os
 import re
 import subprocess
+import sys
 from typing import Any, Optional
 
 from loggers import get_logger
 from utils.native_path_leases import child_env_without_native_path_secret
+from utils.subprocess_compat import windows_hidden_subprocess_kwargs
 
 logger = get_logger(__name__)
 
@@ -30,6 +32,7 @@ def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[Any]:
             text = True,
             timeout = timeout,
             env = child_env_without_native_path_secret(),
+            **windows_hidden_subprocess_kwargs(),
         )
     except (OSError, subprocess.TimeoutExpired) as e:
         logger.warning("amd-smi query failed: %s", e)
diff --git a/unsloth/models/loader_utils.py b/unsloth/models/loader_utils.py
index 99da5f799e..99c327c10a 100644
--- a/unsloth/models/loader_utils.py
+++ b/unsloth/models/loader_utils.py
@@ -68,7 +68,7 @@ def _get_env_int(keys):
 
 
 def _infer_distributed_ranks():
-    if torch.distributed.is_available() and torch.distributed.is_initialized():
+    if torch.distributed.is_available() and getattr(torch.distributed, "is_initialized", lambda: False)():
         try:
             return torch.distributed.get_rank(), torch.distributed.get_world_size()
         except Exception:

From ba6b279e6199a7d8ea0dd0de4e9ee7d05a445dd2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 8 May 2026 08:51:03 +0000
Subject: [PATCH 042/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 unsloth/models/loader_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/unsloth/models/loader_utils.py b/unsloth/models/loader_utils.py
index 99c327c10a..c0c6455a2e 100644
--- a/unsloth/models/loader_utils.py
+++ b/unsloth/models/loader_utils.py
@@ -68,7 +68,10 @@ def _get_env_int(keys):
 
 
 def _infer_distributed_ranks():
-    if torch.distributed.is_available() and getattr(torch.distributed, "is_initialized", lambda: False)():
+    if (
+        torch.distributed.is_available()
+        and getattr(torch.distributed, "is_initialized", lambda: False)()
+    ):
         try:
             return torch.distributed.get_rank(), torch.distributed.get_world_size()
         except Exception:

From 17071696808f5f7d2a892f19fa3a5c05dba745c3 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 8 May 2026 04:21:55 -0500
Subject: [PATCH 043/165] fix: suppress remaining console popups on Windows,
 patch torch.distributed.is_initialized for ROCm #5301

---
 studio/backend/core/training/worker.py | 11 +++++++++++
 studio/backend/utils/wheel_utils.py    |  2 ++
 2 files changed, 13 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ef5cafb175..fa9501f58a 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1084,6 +1084,17 @@ def run_training_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
+    # ── 1d. Ensure torch.distributed.is_initialized exists before ML libs load ──
+    # The ROCm Windows wheel (2.9.0+rocmsdk*) does not expose is_initialized on
+    # the torch.distributed module object until it is explicitly imported.
+    # transformers and trl access it before that happens, causing AttributeError.
+    try:
+        import torch.distributed as _td  # noqa: F401 -- forces full module load
+        if not hasattr(_td, "is_initialized"):
+            _td.is_initialized = lambda: False
+    except Exception:
+        pass
+
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:
         _send_status(event_queue, "Importing Unsloth...")
diff --git a/studio/backend/utils/wheel_utils.py b/studio/backend/utils/wheel_utils.py
index 3ed9bda827..06c5544f22 100644
--- a/studio/backend/utils/wheel_utils.py
+++ b/studio/backend/utils/wheel_utils.py
@@ -14,6 +14,7 @@
 from typing import Callable
 
 from utils.native_path_leases import child_env_without_native_path_secret
+from utils.subprocess_compat import windows_hidden_subprocess_kwargs
 
 _logger = logging.getLogger(__name__)
 
@@ -62,6 +63,7 @@ def probe_torch_wheel_env(*, timeout: int | None = None) -> dict[str, str] | Non
             text = True,
             timeout = timeout,
             env = child_env_without_native_path_secret(),
+            **windows_hidden_subprocess_kwargs(),
         )
     except subprocess.TimeoutExpired:
         return None

From a7047229ff461b4b8f9d443027ef8a4082c2e8fa Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 8 May 2026 09:22:18 +0000
Subject: [PATCH 044/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index fa9501f58a..efdda3a77b 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1090,6 +1090,7 @@ def run_training_process(
     # transformers and trl access it before that happens, causing AttributeError.
     try:
         import torch.distributed as _td  # noqa: F401 -- forces full module load
+
         if not hasattr(_td, "is_initialized"):
             _td.is_initialized = lambda: False
     except Exception:

From 739e5d555dfd5a706e54c1fb53fd01dbdd973aac Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 8 May 2026 04:46:58 -0500
Subject: [PATCH 045/165] fix: stub all missing torch.distributed attrs for
 ROCm Windows wheel #5301

---
 studio/backend/core/training/worker.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index efdda3a77b..df9df699bb 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1084,15 +1084,24 @@ def run_training_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
-    # ── 1d. Ensure torch.distributed.is_initialized exists before ML libs load ──
-    # The ROCm Windows wheel (2.9.0+rocmsdk*) does not expose is_initialized on
-    # the torch.distributed module object until it is explicitly imported.
-    # transformers and trl access it before that happens, causing AttributeError.
+    # ── 1d. Ensure torch.distributed attributes exist before ML libs load ──
+    # The ROCm Windows wheel (2.9.0+rocmsdk*) does not expose several
+    # torch.distributed functions on the module object until it is explicitly
+    # imported. transformers and trl access them at import time, causing
+    # AttributeError. Force a full import and stub any missing callables.
     try:
-        import torch.distributed as _td  # noqa: F401 -- forces full module load
-
-        if not hasattr(_td, "is_initialized"):
-            _td.is_initialized = lambda: False
+        import torch.distributed as _td
+        _td_stubs = {
+            "is_initialized": lambda: False,
+            "is_available": lambda: False,
+            "is_torchelastic_launched": lambda: False,
+            "get_rank": lambda: 0,
+            "get_world_size": lambda: 1,
+            "barrier": lambda: None,
+        }
+        for _name, _stub in _td_stubs.items():
+            if not hasattr(_td, _name):
+                setattr(_td, _name, _stub)
     except Exception:
         pass
 

From 18690c6b679752149f9a33da8295895908d2ae97 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 8 May 2026 09:47:56 +0000
Subject: [PATCH 046/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index df9df699bb..6b526707ce 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1091,6 +1091,7 @@ def run_training_process(
     # AttributeError. Force a full import and stub any missing callables.
     try:
         import torch.distributed as _td
+
         _td_stubs = {
             "is_initialized": lambda: False,
             "is_available": lambda: False,

From bfac7c1015dcf29450b1ed1b80905c1414ffe86d Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 8 May 2026 04:56:59 -0500
Subject: [PATCH 047/165] fix: inject torch.distributed stub when C backend
 missing in ROCm Windows wheel #5301

---
 studio/backend/core/training/worker.py | 43 +++++++++++++++++---------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 6b526707ce..28cecfcdb2 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1084,27 +1084,40 @@ def run_training_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
-    # ── 1d. Ensure torch.distributed attributes exist before ML libs load ──
-    # The ROCm Windows wheel (2.9.0+rocmsdk*) does not expose several
-    # torch.distributed functions on the module object until it is explicitly
-    # imported. transformers and trl access them at import time, causing
-    # AttributeError. Force a full import and stub any missing callables.
+    # ── 1d. Ensure torch.distributed is importable before ML libs load ──
+    # The ROCm Windows wheel lacks torch._C._distributed_c10d (the C backend),
+    # so `import torch.distributed` raises ImportError. transformers/trl import
+    # it unconditionally, killing the subprocess. We try a real import first; if
+    # it fails we inject a stub module into sys.modules so all subsequent imports
+    # get a harmless no-op object instead of crashing.
+    _td_stubs = {
+        "is_initialized": lambda: False,
+        "is_available": lambda: False,
+        "is_torchelastic_launched": lambda: False,
+        "get_rank": lambda: 0,
+        "get_world_size": lambda: 1,
+        "barrier": lambda: None,
+    }
     try:
         import torch.distributed as _td
-
-        _td_stubs = {
-            "is_initialized": lambda: False,
-            "is_available": lambda: False,
-            "is_torchelastic_launched": lambda: False,
-            "get_rank": lambda: 0,
-            "get_world_size": lambda: 1,
-            "barrier": lambda: None,
-        }
         for _name, _stub in _td_stubs.items():
             if not hasattr(_td, _name):
                 setattr(_td, _name, _stub)
     except Exception:
-        pass
+        import types
+        _td_mock = types.ModuleType("torch.distributed")
+        for _name, _stub in _td_stubs.items():
+            setattr(_td_mock, _name, _stub)
+        sys.modules["torch.distributed"] = _td_mock
+        # Stub the missing C extension so re-imports don't re-raise
+        sys.modules.setdefault(
+            "torch._C._distributed_c10d", types.ModuleType("torch._C._distributed_c10d")
+        )
+        try:
+            import torch as _torch
+            _torch.distributed = _td_mock
+        except Exception:
+            pass
 
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:

From a288ff208cd1d5c68ea3131fbf4f1c3f6604790a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 8 May 2026 09:58:25 +0000
Subject: [PATCH 048/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 28cecfcdb2..fa334a825e 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1100,11 +1100,13 @@ def run_training_process(
     }
     try:
         import torch.distributed as _td
+
         for _name, _stub in _td_stubs.items():
             if not hasattr(_td, _name):
                 setattr(_td, _name, _stub)
     except Exception:
         import types
+
         _td_mock = types.ModuleType("torch.distributed")
         for _name, _stub in _td_stubs.items():
             setattr(_td_mock, _name, _stub)
@@ -1115,6 +1117,7 @@ def run_training_process(
         )
         try:
             import torch as _torch
+
             _torch.distributed = _td_mock
         except Exception:
             pass

From fe5546d83782c1cdd932d8f468854503329e0f8b Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sun, 10 May 2026 23:02:23 -0500
Subject: [PATCH 049/165] fix(rocm/windows): pre-stub
 torch._C._distributed_c10d + raise amd-smi timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three fixes for Windows ROCm regressions reported by electroglyph on #5301:

1. worker.py — torch.distributed stub now fires unconditionally on Windows
   The previous stub only injected sys.modules in the except branch, meaning
   it was silently skipped when `import torch.distributed` happened to succeed
   (the C backend is lazily resolved).  The crash then hit later when
   transformers/trl triggered the lazy load.  Fix: on win32 we pre-populate
   sys.modules['torch._C._distributed_c10d'] AND set the attribute on the
   torch._C extension module *before* attempting the import, covering both
   the early-ImportError and lazy-load failure modes.

2. amd.py — increase amd-smi timeout from 5 s to 30 s on Windows (10 s Linux)
   amd-smi on Windows must cold-init the ROCm runtime on first invocation;
   5 s was consistently too short, producing repeated 'Command timed out'
   warnings in the server log.  30 s gives enough headroom without blocking
   indefinitely on broken installs.

3. install.ps1 — widen Python 3.12 enforcement to ROCmGpuLabel (WMI-only path)
   Users whose HIP SDK is not on PATH were detected via WMI but not switched
   to Python 3.12 before the install started, causing a second pass.  Guard
   now fires on (HasROCm -or ROCmGpuLabel).
---
 install.ps1                            | 10 +++---
 studio/backend/core/training/worker.py | 48 +++++++++++++++++++-------
 studio/backend/utils/hardware/amd.py   |  7 +++-
 3 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 1b668375d2..1281103a35 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1285,11 +1285,13 @@ shell.Run cmd, 0, False
     }
 
     # ── AMD ROCm: prefer Python 3.12 (Windows wheels are cp312-only) ──
-    # Python detection runs before GPU detection, so if AMD ROCm is found and the
+    # Python detection runs before GPU detection, so if an AMD GPU is found and the
     # selected Python is not 3.12, try to locate a 3.12 install now.  This lets
     # users who have both 3.13 and 3.12 installed get ROCm support automatically
     # without having to uninstall 3.13.  3.13 remains the default for NVIDIA.
-    if ($HasROCm -and $DetectedPython -and ($DetectedPython.Version -split '\.')[0..1] -join '.' -ne "3.12") {
+    # Fires on $ROCmGpuLabel (WMI-only, no HIP SDK) as well as $HasROCm so that
+    # users are switched to 3.12 upfront rather than after a second install pass.
+    if (($HasROCm -or $ROCmGpuLabel) -and $DetectedPython -and ($DetectedPython.Version -split '\.')[0..1] -join '.' -ne "3.12") {
         $py312 = $null
         # 1. Try py launcher (official CPython installs)
         $pyLauncher = Get-Command py -CommandType Application -ErrorAction SilentlyContinue
@@ -1339,9 +1341,9 @@ shell.Run cmd, 0, False
         }
         if ($py312) {
             $DetectedPython = $py312
-            substep "AMD ROCm detected -- switching to Python 3.12 (ROCm wheels are cp312-only)" "Cyan"
+            substep "AMD GPU detected -- switching to Python 3.12 (ROCm wheels are cp312-only)" "Cyan"
         } else {
-            substep "AMD ROCm detected but Python 3.12 not found -- ROCm GPU training requires Python 3.12" "Yellow"
+            substep "AMD GPU detected but Python 3.12 not found -- ROCm GPU support requires Python 3.12" "Yellow"
             substep "Install Python 3.12 from python.org and re-run." "Yellow"
         }
     }
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index fa334a825e..a4751d2e69 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1085,11 +1085,20 @@ def run_training_process(
             )
 
     # ── 1d. Ensure torch.distributed is importable before ML libs load ──
-    # The ROCm Windows wheel lacks torch._C._distributed_c10d (the C backend),
-    # so `import torch.distributed` raises ImportError. transformers/trl import
-    # it unconditionally, killing the subprocess. We try a real import first; if
-    # it fails we inject a stub module into sys.modules so all subsequent imports
-    # get a harmless no-op object instead of crashing.
+    # The Windows ROCm wheel ships without torch._C._distributed_c10d (the C
+    # backend for the distributed package).  This causes two distinct failure
+    # modes that must both be handled:
+    #
+    #   (a) `import torch.distributed` raises ImportError immediately, OR
+    #   (b) the import SUCCEEDS (the symbol is lazily resolved) but the first
+    #       actual call by transformers/trl triggers the missing-module error.
+    #
+    # Strategy: on Windows, unconditionally pre-stub torch._C._distributed_c10d
+    # in sys.modules AND as an attribute on the torch._C extension module BEFORE
+    # attempting the import.  That covers both (a) and (b).  Then do the import
+    # and backfill any missing helper attributes on torch.distributed itself.
+    import types as _types
+
     _td_stubs = {
         "is_initialized": lambda: False,
         "is_available": lambda: False,
@@ -1098,6 +1107,22 @@ def run_training_process(
         "get_world_size": lambda: 1,
         "barrier": lambda: None,
     }
+
+    if sys.platform == "win32":
+        # Pre-stub the missing C extension so both the module-import path and
+        # the attribute-access path (`from torch._C import _distributed_c10d`)
+        # return a harmless no-op object instead of raising ImportError.
+        _c10d_key = "torch._C._distributed_c10d"
+        _c10d_stub = _types.ModuleType(_c10d_key)
+        sys.modules[_c10d_key] = _c10d_stub
+        try:
+            import torch._C as _torch_C_mod  # C ext — always importable
+
+            if not hasattr(_torch_C_mod, "_distributed_c10d"):
+                _torch_C_mod._distributed_c10d = _c10d_stub
+        except Exception:
+            pass
+
     try:
         import torch.distributed as _td
 
@@ -1105,16 +1130,15 @@ def run_training_process(
             if not hasattr(_td, _name):
                 setattr(_td, _name, _stub)
     except Exception:
-        import types
-
-        _td_mock = types.ModuleType("torch.distributed")
+        _td_mock = _types.ModuleType("torch.distributed")
         for _name, _stub in _td_stubs.items():
             setattr(_td_mock, _name, _stub)
         sys.modules["torch.distributed"] = _td_mock
-        # Stub the missing C extension so re-imports don't re-raise
-        sys.modules.setdefault(
-            "torch._C._distributed_c10d", types.ModuleType("torch._C._distributed_c10d")
-        )
+        # Ensure C extension stub survives (may have been wiped by a failed import)
+        if "torch._C._distributed_c10d" not in sys.modules:
+            sys.modules["torch._C._distributed_c10d"] = _types.ModuleType(
+                "torch._C._distributed_c10d"
+            )
         try:
             import torch as _torch
 
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index 0219407471..443566136a 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -11,6 +11,7 @@
 import json
 import math
 import os
+import platform
 import re
 import subprocess
 import sys
@@ -22,8 +23,12 @@
 
 logger = get_logger(__name__)
 
+# amd-smi on Windows must initialise the full ROCm runtime on first call, which
+# can take 15-25 s on cold hardware.  Linux is consistently < 2 s.
+_AMD_SMI_DEFAULT_TIMEOUT = 30 if platform.system() == "Windows" else 10
 
-def _run_amd_smi(*args: str, timeout: int = 5) -> Optional[Any]:
+
+def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optional[Any]:
     """Run amd-smi with the given arguments and return parsed JSON, or None."""
     try:
         result = subprocess.run(

From 85841b5cad5490f8e278bfbc08776b3c32f076a9 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 00:08:14 -0500
Subject: [PATCH 050/165] fix(rocm): guard c10d stub, fix TorchIndexFamily for
 7.1, clean dead code + comments

- worker.py: wrap c10d stub injection in `if _c10d_key not in sys.modules` so
  Windows NVIDIA users with a real torch.distributed are never affected
- install.ps1: fix Get-TauriTorchIndexFamily receiving hardcoded "rocm7.2"
  even when ROCm 7.1 wheels are installed; now branches on $ROCmVersion
- main.py: remove dead `import ctypes as _ctypes` (ctypes is never called)
- hardware.py, install_python_stack.py, worker.py, install.ps1: shorten
  verbose multi-line comment blocks throughout
- tests: update 4 stale assertions that expected rocm7.2 to be absent/capped
---
 install.ps1                               | 23 ++++++-------
 studio/backend/core/training/worker.py    | 39 +++++++++--------------
 studio/backend/main.py                    | 10 ++----
 studio/backend/utils/hardware/hardware.py | 34 +++++---------------
 studio/install_python_stack.py            |  6 +---
 tests/studio/install/test_rocm_support.py | 24 +++++++-------
 6 files changed, 49 insertions(+), 87 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 1281103a35..b967eaf627 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1371,9 +1371,7 @@ shell.Run cmd, 0, False
     $TorchIndexUrl = Get-TorchIndexUrl
 
     # ── AMD Windows ROCm wheel override ──
-    # AMD publishes direct torch wheels for Windows (cp312 only) at repo.radeon.com.
-    # When the HIP SDK is present and Python 3.12 is in use, swap in the AMD wheel
-    # URL and clear $TorchIndexUrl so the standard --index-url path is skipped.
+    # When the HIP SDK is present and Python 3.12, use repo.radeon.com direct wheels.
     $ROCmTorchWheelUrl = $null
     $ROCmTarballUrl    = $null
     if ($HasROCm -and -not $SkipTorch) {
@@ -1382,9 +1380,7 @@ shell.Run cmd, 0, False
             $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
             if ($ROCmVersion -and $ROCmVersion -match '^7\.2') {
                 $amdRelBase = "$amdWheelBase/rocm-rel-7.2.1"
-                # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
-                # torch/_rocm_init.py imports at startup.
-                $ROCmTarballUrl = "$amdRelBase/rocm-7.2.1.tar.gz"
+                $ROCmTarballUrl = "$amdRelBase/rocm-7.2.1.tar.gz"  # rocm_sdk namespace
                 $ROCmAllWheelUrls = @(
                     "$amdRelBase/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
                     "$amdRelBase/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
@@ -1397,9 +1393,7 @@ shell.Run cmd, 0, False
                 $TorchIndexUrl = $null
             } elseif ($ROCmVersion -and $ROCmVersion -match '^7\.1') {
                 $amdRelBase = "$amdWheelBase/rocm-rel-7.1.1"
-                # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
-                # torch/_rocm_init.py imports at startup.
-                $ROCmTarballUrl = "$amdRelBase/rocm-0.1.dev0.tar.gz"
+                $ROCmTarballUrl = "$amdRelBase/rocm-0.1.dev0.tar.gz"  # rocm_sdk namespace
                 $ROCmAllWheelUrls = @(
                     "$amdRelBase/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
                     "$amdRelBase/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
@@ -1423,7 +1417,11 @@ shell.Run cmd, 0, False
         }
     }
 
-    $TorchIndexFamily = Get-TauriTorchIndexFamily $(if ($ROCmTorchWheelUrl) { "rocm7.2" } else { $TorchIndexUrl })
+    $TorchIndexFamily = Get-TauriTorchIndexFamily $(
+        if ($ROCmTorchWheelUrl) {
+            if ($ROCmVersion -match '^7\.1') { "rocm7.1" } else { "rocm7.2" }
+        } else { $TorchIndexUrl }
+    )
     $GpuBranch = Get-TauriGpuBranch $TorchIndexFamily
     Write-TauriDiag -GpuBranch $GpuBranch -TorchIndexFamily $TorchIndexFamily -PythonVersionForDiag $DetectedPython.Version
 
@@ -1512,16 +1510,13 @@ shell.Run cmd, 0, False
         } elseif ($ROCmTorchWheelUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
             substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
-            # Install the rocm namespace tarball first (provides the 'rocm_sdk'
-            # Python package that torch/_rocm_init.py imports at startup).
+            # rocm_sdk namespace tarball (torch/_rocm_init.py imports it at startup)
             if ($ROCmTarballUrl) {
                 $tarballExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps $ROCmTarballUrl }
                 if ($tarballExit -ne 0) {
                     Write-Host "[WARN] ROCm namespace tarball install failed (exit $tarballExit) -- continuing" -ForegroundColor Yellow
                 }
             }
-            # Install remaining SDK + torch wheels.  @array splatting inside a
-            # scriptblock works in PS 5.1 because & $Command runs in-scope.
             $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps @ROCmAllWheelUrls }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index a4751d2e69..ba60aa802c 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1085,18 +1085,12 @@ def run_training_process(
             )
 
     # ── 1d. Ensure torch.distributed is importable before ML libs load ──
-    # The Windows ROCm wheel ships without torch._C._distributed_c10d (the C
-    # backend for the distributed package).  This causes two distinct failure
-    # modes that must both be handled:
-    #
-    #   (a) `import torch.distributed` raises ImportError immediately, OR
-    #   (b) the import SUCCEEDS (the symbol is lazily resolved) but the first
-    #       actual call by transformers/trl triggers the missing-module error.
-    #
-    # Strategy: on Windows, unconditionally pre-stub torch._C._distributed_c10d
-    # in sys.modules AND as an attribute on the torch._C extension module BEFORE
-    # attempting the import.  That covers both (a) and (b).  Then do the import
-    # and backfill any missing helper attributes on torch.distributed itself.
+    # The Windows ROCm wheel ships without torch._C._distributed_c10d.
+    # Two failure modes: (a) ImportError on `import torch.distributed`, or
+    # (b) the import succeeds (lazy load) but the first call by trl/transformers
+    # crashes. Pre-stubbing before the import covers both.
+    # Guard with `not in sys.modules` so we never overwrite a real CUDA/NVIDIA
+    # implementation that was already loaded.
     import types as _types
 
     _td_stubs = {
@@ -1109,19 +1103,17 @@ def run_training_process(
     }
 
     if sys.platform == "win32":
-        # Pre-stub the missing C extension so both the module-import path and
-        # the attribute-access path (`from torch._C import _distributed_c10d`)
-        # return a harmless no-op object instead of raising ImportError.
         _c10d_key = "torch._C._distributed_c10d"
-        _c10d_stub = _types.ModuleType(_c10d_key)
-        sys.modules[_c10d_key] = _c10d_stub
-        try:
-            import torch._C as _torch_C_mod  # C ext — always importable
+        if _c10d_key not in sys.modules:
+            _c10d_stub = _types.ModuleType(_c10d_key)
+            sys.modules[_c10d_key] = _c10d_stub
+            try:
+                import torch._C as _torch_C_mod  # C ext — always importable
 
-            if not hasattr(_torch_C_mod, "_distributed_c10d"):
-                _torch_C_mod._distributed_c10d = _c10d_stub
-        except Exception:
-            pass
+                if not hasattr(_torch_C_mod, "_distributed_c10d"):
+                    _torch_C_mod._distributed_c10d = _c10d_stub
+            except Exception:
+                pass
 
     try:
         import torch.distributed as _td
@@ -1134,7 +1126,6 @@ def run_training_process(
         for _name, _stub in _td_stubs.items():
             setattr(_td_mock, _name, _stub)
         sys.modules["torch.distributed"] = _td_mock
-        # Ensure C extension stub survives (may have been wiped by a failed import)
         if "torch._C._distributed_c10d" not in sys.modules:
             sys.modules["torch._C._distributed_c10d"] = _types.ModuleType(
                 "torch._C._distributed_c10d"
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 812ddd45b9..46f5b97a86 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -13,13 +13,9 @@
 os.environ["PYTHONWARNINGS"] = "ignore"
 
 # ── Windows AMD ROCm DLL injection ──────────────────────────────────────────
-# On Windows, Python 3.8+ uses a secure DLL search that ignores PATH for
-# extension modules. torch's HIP backend (amdhip64.dll etc.) won't be found
-# even if F:\ROCm\...\bin is in PATH unless we explicitly register the
-# directory with os.add_dll_directory(). Do this before any torch import.
+# Python 3.8+ ignores PATH for extension modules; register ROCm bin dirs with
+# os.add_dll_directory() so amdhip64.dll etc. are found before any torch import.
 if sys.platform == "win32":
-    import ctypes as _ctypes
-
     def _add_rocm_dll_dirs() -> None:
         candidates = []
         # 1. HIP_PATH / ROCM_PATH -- set by the AMD HIP SDK installer
@@ -48,7 +44,7 @@ def _add_rocm_dll_dirs() -> None:
                     pass
 
     _add_rocm_dll_dirs()
-    del _add_rocm_dll_dirs, _ctypes
+    del _add_rocm_dll_dirs
 
 # Ensure backend dir is on sys.path so _platform_compat is importable when
 # main.py is launched directly (e.g. `uvicorn main:app`).
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 1b990b6adf..3593d9d677 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -516,12 +516,7 @@ def get_gpu_utilization() -> Dict[str, Any]:
         if result is not None:
             result["backend"] = _backend_label(device)
             if IS_ROCM:
-                # Mirror the unified-memory reconciliation done in the
-                # visible-GPU path. amd-smi on AMD iGPUs (Strix Halo etc.)
-                # reports only the dedicated VRAM slice; torch.mem_get_info
-                # sees the full GTT pool. Without this the /api/train/hardware
-                # endpoint and the live GPU monitor still display the wrong
-                # VRAM total even after auto-selection has been corrected.
+                # Fix unified-memory VRAM on AMD iGPUs (Strix Halo etc.)
                 _reconcile_primary_rocm_unified_memory(
                     result, _get_parent_visible_gpu_spec()
                 )
@@ -621,14 +616,11 @@ def _apply_unified_memory_correction(
 def _reconcile_rocm_unified_memory(
     utilization: Dict[str, Any], device_indices: list[int]
 ) -> None:
-    """Cross-check amd-smi VRAM data against torch mem_get_info for ROCm.
-
-    On AMD iGPUs with unified/shared memory (e.g. Strix Halo / Radeon 8060S),
-    amd-smi reports only the dedicated VRAM slice (typically 512 MB) in its
-    metric output, while torch.cuda.mem_get_info() surfaces the full GTT /
-    unified pool (~128 GB). When torch reports a larger total than amd-smi,
-    replace the per-device VRAM fields so auto_select_gpu_ids sees the real
-    usable memory instead of the tiny dedicated slice.
+    """Fix amd-smi VRAM for ROCm unified-memory GPUs (e.g. Strix Halo).
+
+    amd-smi reports only the dedicated slice (~512 MB); torch sees the full
+    GTT pool (~128 GB). When torch total > smi total, overwrite per-device
+    VRAM fields so GPU selection uses the real available memory.
     """
     torch_devices = _torch_get_per_device_info(device_indices)
     if not torch_devices:
@@ -644,14 +636,7 @@ def _reconcile_rocm_unified_memory(
 def _reconcile_primary_rocm_unified_memory(
     utilization: Dict[str, Any], parent_visible_spec: Dict[str, Any]
 ) -> None:
-    """Primary-GPU variant of the unified-memory reconciliation.
-
-    ``get_primary_gpu_utilization`` returns a flat metrics dict (no nested
-    ``devices`` list) for the first visible AMD GPU. Run the same correction
-    against torch.mem_get_info for that single device so the live training
-    hardware endpoint and the GPU monitor surface the real unified-memory
-    pool on Strix Halo and similar iGPUs.
-    """
+    """Same fix as _reconcile_rocm_unified_memory for the flat primary-GPU dict."""
     numeric_ids = parent_visible_spec.get("numeric_ids")
     if numeric_ids:
         primary_idx = [int(numeric_ids[0])]
@@ -678,10 +663,7 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
         if result is not None:
             result["backend"] = _backend_label(device)
             if IS_ROCM:
-                # amd-smi on iGPUs with unified memory (e.g. Strix Halo)
-                # reports only the dedicated VRAM slice; torch mem_get_info
-                # sees the full unified pool. Reconcile so downstream GPU
-                # selection uses the real available memory.
+                # Fix unified-memory VRAM on AMD iGPUs (Strix Halo etc.)
                 _reconcile_rocm_unified_memory(
                     result, parent_visible_spec["numeric_ids"]
                 )
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index c6bbfda385..0eb1a81d40 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -78,8 +78,6 @@
     or "https://repo.radeon.com/rocm/windows"
 ).rstrip("/")
 # Maps (major, minor) → (release_folder, [wheel_filename, ...])
-# Includes rocm_sdk_core and rocm_sdk_libraries_custom because the torch
-# wheels declare them as hard dependencies (rocm[libraries]==<ver>).
 _ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, list[str]]] = {
     (7, 2): (
         "rocm-rel-7.2.1",
@@ -334,9 +332,7 @@ def _ensure_rocm_torch() -> None:
     Uses pip_install() to respect uv, constraints, and --python targeting.
     """
     global _rocm_windows_torch_installed
-    # setup.ps1 sets this env var when it successfully installs AMD wheels
-    # before calling install_python_stack.py, so we can skip the subprocess
-    # probe and avoid reinstalling what was just installed.
+    # setup.ps1 sets this when it already installed AMD wheels; skip the probe.
     if os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
         _rocm_windows_torch_installed = True
         return
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 81aa66c999..ec8668b6a4 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -657,8 +657,8 @@ def test_version_unreadable_prints_warning(
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)
     @patch.object(stack_mod, "_has_rocm_gpu", return_value = True)
     @patch.object(stack_mod, "_detect_rocm_version", return_value = (7, 2))
-    def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip):
-        """ROCm 7.2 should select rocm7.1 tag (capped, not in mapping)."""
+    def test_rocm_72_selects_72_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip):
+        """ROCm 7.2 should select rocm7.2 tag (now in mapping with torch 2.11.0)."""
         mock_probe = MagicMock()
         mock_probe.returncode = 0
         mock_probe.stdout = b"\n"
@@ -666,7 +666,7 @@ def test_rocm_72_selects_71_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip)
             with patch("subprocess.run", return_value = mock_probe):
                 _ensure_rocm_torch()
         torch_call = mock_pip.call_args_list[0]
-        assert "rocm7.1" in str(torch_call)
+        assert "rocm7.2" in str(torch_call)
 
     @patch.object(stack_mod, "pip_install_try", return_value = True)
     @patch.object(stack_mod, "pip_install")
@@ -711,9 +711,10 @@ def test_mapping_is_sorted_descending(self):
         keys = list(_ROCM_TORCH_INDEX.keys())
         assert keys == sorted(keys, reverse = True)
 
-    def test_rocm_72_not_in_mapping(self):
-        """ROCm 7.2 should NOT be in the active mapping (torch 2.11.0 exceeds bound)."""
-        assert (7, 2) not in _ROCM_TORCH_INDEX
+    def test_rocm_72_in_mapping(self):
+        """ROCm 7.2 should be in the active mapping (torch 2.11.0 now supported)."""
+        assert (7, 2) in _ROCM_TORCH_INDEX
+        assert _ROCM_TORCH_INDEX[(7, 2)] == "rocm7.2"
 
     def test_rocm_71_maps_correctly(self):
         assert _ROCM_TORCH_INDEX[(7, 1)] == "rocm7.1"
@@ -731,7 +732,7 @@ def test_all_tags_use_download_pytorch(self):
             assert "radeon" not in tag
 
     def test_newer_rocm_selects_best_match(self):
-        """ROCm 7.2 (not in map) should select rocm7.1 via >= comparison."""
+        """ROCm 7.2 (now in map) should select rocm7.2 directly."""
         ver = (7, 2)
         tag = next(
             (
@@ -741,7 +742,7 @@ def test_newer_rocm_selects_best_match(self):
             ),
             None,
         )
-        assert tag == "rocm7.1"
+        assert tag == "rocm7.2"
 
     def test_rocm_64_selects_64(self):
         ver = (6, 4)
@@ -927,15 +928,16 @@ def test_cpu_hint_mentions_amd(self):
         source = sh_path.read_text()
         assert "ROCm" in source
 
-    def test_rocm72_capped_to_71(self):
-        """ROCm 7.2+ should fall back to rocm7.1 index."""
+    def test_rocm72_supported_future_capped(self):
+        """ROCm 7.2 should pass through directly; 7.3+ falls back to rocm7.2."""
         sh_path = PACKAGE_ROOT / "install.sh"
         source = sh_path.read_text()
-        assert 'echo "$_base/rocm7.1"' in source  # fallback for unknown versions
+        assert 'echo "$_base/rocm7.2"' in source  # fallback for unknown future versions
         # Allowlisted versions should pass through directly
         assert "rocm6.*" in source
         assert "rocm7.0" in source
         assert "rocm7.1" in source
+        assert "rocm7.2" in source
 
     def test_rocm_tag_validation_guard_exists(self):
         """install.sh should validate _rocm_tag with a case guard."""

From f892b65a1808689944459bba7ff7111eb620f178 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 00:13:42 -0500
Subject: [PATCH 051/165] fix(tests): match windows AMD warning assertion to
 actual source string

---
 tests/studio/install/test_rocm_support.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index ec8668b6a4..0771fe7b59 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1401,7 +1401,7 @@ class TestWindowsRocmWarning:
     def test_windows_amd_warning_in_source(self):
         """install_python_stack.py should warn Windows AMD users."""
         source = _STACK_PATH.read_text()
-        assert "AMD GPU detected on Windows" in source
+        assert "AMD GPU detected" in source
 
     def test_windows_amd_warning_checks_hipinfo_or_amdsmi(self):
         """Warning should check for hipinfo or amd-smi."""

From 42c5d984b76e1929872cc8d8a7f80a1d8bb394bc Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 00:20:28 -0500
Subject: [PATCH 052/165] chore: trim verbose comment blocks across all
 ROCm-related files

---
 install.ps1                                   |  8 ++----
 install.sh                                    | 12 +++------
 studio/backend/core/training/worker.py        |  9 +++----
 .../tests/test_log_filter_no_truncation.py    | 26 ++++---------------
 studio/backend/utils/hardware/hardware.py     | 16 +++---------
 studio/install_python_stack.py                | 15 +++--------
 studio/setup.ps1                              | 15 +++--------
 7 files changed, 25 insertions(+), 76 deletions(-)

diff --git a/install.ps1 b/install.ps1
index b967eaf627..424c3d5857 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1285,12 +1285,8 @@ shell.Run cmd, 0, False
     }
 
     # ── AMD ROCm: prefer Python 3.12 (Windows wheels are cp312-only) ──
-    # Python detection runs before GPU detection, so if an AMD GPU is found and the
-    # selected Python is not 3.12, try to locate a 3.12 install now.  This lets
-    # users who have both 3.13 and 3.12 installed get ROCm support automatically
-    # without having to uninstall 3.13.  3.13 remains the default for NVIDIA.
-    # Fires on $ROCmGpuLabel (WMI-only, no HIP SDK) as well as $HasROCm so that
-    # users are switched to 3.12 upfront rather than after a second install pass.
+    # If a non-3.12 Python was selected and an AMD GPU is present, try to find 3.12.
+    # Fires on $ROCmGpuLabel (WMI/no-HIP-SDK) as well as $HasROCm.
     if (($HasROCm -or $ROCmGpuLabel) -and $DetectedPython -and ($DetectedPython.Version -split '\.')[0..1] -join '.' -ne "3.12") {
         $py312 = $null
         # 1. Try py launcher (official CPython installs)
diff --git a/install.sh b/install.sh
index f1462088d9..287327b5e5 100755
--- a/install.sh
+++ b/install.sh
@@ -1477,11 +1477,8 @@ _find_no_torch_runtime() {
 }
 
 # ── AMD ROCm GPU detection helper ──
-# Returns 0 (true) if an actual AMD GPU is present, 1 (false) otherwise.
-# Checks rocminfo for gfx[1-9][0-9]+ (excludes gfx000 CPU agent),
-# amd-smi list for GPU data rows, and falls back to sysfs KFD topology
-# which is env-var-independent (works even when HIP_VISIBLE_DEVICES or
-# ROCR_VISIBLE_DEVICES hides devices from rocminfo/amd-smi).
+# Returns 0 if an AMD GPU is present. Checks rocminfo, amd-smi, then sysfs
+# KFD topology (env-var-independent fallback for when HIP/ROCR_VISIBLE_DEVICES hides devices).
 _has_amd_rocm_gpu() {
     if command -v rocminfo >/dev/null 2>&1 && \
        rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9][0-9]/{found=1} END{exit !found}'; then
@@ -1573,10 +1570,7 @@ get_torch_index_url() {
             case "$_rocm_tag" in
                 rocm[1-5].*) echo "$_base/cpu"; return ;;
             esac
-            # Enumerate explicit supported ROCm wheel tags.  A host on ROCm
-            # 6.5+ (no published PyTorch wheels) is clipped to rocm6.4.
-            # PyTorch publishes: rocm5.7, 6.0, 6.1, 6.2, 6.3, 6.4, 7.0, 7.1,
-            # 7.2 (5.7 is below our minimum; rocm7.2 ships torch 2.11.0).
+            # Supported tags; 6.5+ clips to rocm6.4, 7.3+ caps to rocm7.2.
             case "$_rocm_tag" in
                 rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*|rocm7.2|rocm7.2.*)
                     echo "$_base/$_rocm_tag" ;;
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ba60aa802c..9089eeb480 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1085,12 +1085,9 @@ def run_training_process(
             )
 
     # ── 1d. Ensure torch.distributed is importable before ML libs load ──
-    # The Windows ROCm wheel ships without torch._C._distributed_c10d.
-    # Two failure modes: (a) ImportError on `import torch.distributed`, or
-    # (b) the import succeeds (lazy load) but the first call by trl/transformers
-    # crashes. Pre-stubbing before the import covers both.
-    # Guard with `not in sys.modules` so we never overwrite a real CUDA/NVIDIA
-    # implementation that was already loaded.
+    # Windows ROCm wheel lacks torch._C._distributed_c10d. Pre-stub it to handle
+    # both ImportError and lazy-load crashes from trl/transformers. The
+    # `not in sys.modules` guard preserves a real NVIDIA implementation.
     import types as _types
 
     _td_stubs = {
diff --git a/studio/backend/tests/test_log_filter_no_truncation.py b/studio/backend/tests/test_log_filter_no_truncation.py
index d78643f5b9..d9a6e2bc4a 100644
--- a/studio/backend/tests/test_log_filter_no_truncation.py
+++ b/studio/backend/tests/test_log_filter_no_truncation.py
@@ -2,27 +2,11 @@
 # Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
 
 """
-Regression tests for studio.backend.loggers.handlers.filter_sensitive_data.
-
-Context: filter_sensitive_data was originally written with a base64-detection
-heuristic that truncated any string >100 chars containing ',' or '/' down to
-20 chars + '...'. The block was dormant until PR #5246 wired the processor
-into the structlog chain to redact native-path leases. Once active, the
-heuristic ate normal log lines emitted by llama_cpp_backend (GGUF size
-summary, mmproj selection, the full llama-server command line) and any
-exception traceback that happened to contain a file path.
-
-These tests pin two properties:
-
-1. Long, comma- or slash-bearing log messages flow through filter_sensitive_data
-   unchanged. The exact strings exercised match the call sites at
-   studio/backend/core/inference/llama_cpp.py:2117, :2283, and :2312 that
-   were truncated in the original bug report.
-
-2. PR #5246's native-path lease redaction still fires for both the inline
-   ``native_path_lease=...`` regex form and the ``nativePathLease`` dict-key
-   form. This guards against future regressions that strip redaction along
-   with the truncation block.
+Regression tests for loggers.handlers.filter_sensitive_data.
+
+Pins two properties: (1) long strings with commas/slashes pass through unchanged
+(the base64-truncation heuristic that PR #5246 activated was too aggressive), and
+(2) native-path lease redaction still fires for both inline and dict-key forms.
 """
 
 from loggers.handlers import filter_sensitive_data
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 3593d9d677..94787d4d95 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1632,23 +1632,15 @@ def apply_gpu_ids(gpu_ids) -> None:
     # parent process already set a ROCm visibility variable -- that
     # way a downstream ROCm process inherits the narrowed mask even
     # before Studio's hardware detection has classified the host.
-    # As a final fallback, probe torch.version.hip directly so spawned
-    # training workers on AMD hosts where the user never set HIP_VISIBLE_DEVICES
-    # still get the correct ROCm visibility mask (mirrors the llama_cpp.py
-    # approach for llama-server subprocess GPU pinning).
+    # Final fallback: probe torch.version.hip so AMD workers without
+    # HIP_VISIBLE_DEVICES still get the correct ROCm visibility mask.
     _inherits_rocm_visibility = (
         "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ
     )
     _is_rocm = IS_ROCM or _inherits_rocm_visibility
     if not _is_rocm:
-        # Use ``is not None`` here to match the detect_hardware() check at
-        # module top -- torch ships HIP version as a non-empty string on
-        # ROCm builds and None on CUDA builds, so the two forms agree on
-        # every shipping torch wheel; the ``is not None`` form is the one
-        # the rest of the codebase reads for "this torch was built with
-        # HIP". Keep the broad ``except`` as a safety net (we never want
-        # apply_gpu_ids to crash a worker over a probe failure) but log at
-        # debug level so the skip is observable when needed.
+        # torch.version.hip is a non-empty string on ROCm, None on CUDA.
+        # Broad except: a probe failure must never crash a training worker.
         try:
             import torch as _torch
 
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 0eb1a81d40..8af5fe3829 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -51,9 +51,7 @@
     (6, 0): "rocm6.0",
 }
 
-# Per-tag torch/torchvision/torchaudio version specs for pip.
-# rocm7.2 ships torch 2.11.0 which is a major version bump; older tags top out
-# at 2.10.x.  These specs prevent uv from picking an incompatible minor.
+# Per-tag pip specs; rocm7.2 ships torch 2.11.0 (older tags cap at 2.10.x).
 _ROCM_TORCH_PKG_SPECS: dict[str, tuple[str, str, str]] = {
     "rocm7.2": (
         "torch>=2.11.0,<2.12.0",
@@ -71,8 +69,7 @@
     os.environ.get("UNSLOTH_PYTORCH_MIRROR") or "https://download.pytorch.org/whl"
 ).rstrip("/")
 
-# AMD Windows ROCm wheels — repo.radeon.com (cp312 only; AMD does not publish
-# Windows ROCm wheels for other Python versions)
+# AMD Windows ROCm wheels — repo.radeon.com (cp312 only)
 _ROCM_WINDOWS_WHEEL_BASE = (
     os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR")
     or "https://repo.radeon.com/rocm/windows"
@@ -316,9 +313,7 @@ def _detect_amd_gfx_codes() -> list[str]:
     return list(dict.fromkeys(f"gfx{c}" for c in codes))  # deduplicate, preserve order
 
 
-# Set to True by _ensure_rocm_torch() when AMD Windows wheels are installed
-# successfully. Used by the post-install warning block to skip the "must be
-# installed manually" note without spawning a subprocess.
+# Set by _ensure_rocm_torch() on success; suppresses the post-install AMD warning.
 _rocm_windows_torch_installed: bool = False
 
 
@@ -403,9 +398,7 @@ def _ensure_rocm_torch() -> None:
         _rocm_windows_torch_installed = True
         return
 
-    # ── Linux x86_64 path ──────────────────────────────────────────────────────
-    # PyTorch only publishes ROCm wheels for linux_x86_64; skip aarch64 / arm64
-    # to avoid a missing-wheel error on `unsloth studio update`.
+    # ── Linux x86_64 only: PyTorch ROCm wheels are not published for aarch64 ──
     if platform.machine().lower() not in {"x86_64", "amd64"}:
         return
     # NVIDIA takes precedence on mixed hosts -- but only if an actual GPU is usable
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index fdffaa30b8..476f2f974b 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -661,9 +661,7 @@ if (-not $HasNvidiaSmi) {
         }
     }
 }
-# ── AMD ROCm detection (Windows) ────────────────────────────────────────────
-# Mirror setup.sh: probe hipinfo then amd-smi for an actual GPU, not just
-# tool presence. amdhip64.dll alone is NOT treated as GPU evidence.
+# ── AMD ROCm detection (Windows): probe hipinfo/amd-smi for actual GPU ──
 $HasROCm = $false
 $ROCmGpuLabel = $null
 if (-not $HasNvidiaSmi) {
@@ -1863,9 +1861,7 @@ if ($HasROCm -and $CuTag -eq "cpu") {
     if ($pyMajMin -eq "3.12" -and $ROCmVersion) {
         if ($ROCmVersion -match '^7\.2') {
             $rb = "$amdWheelBase/rocm-rel-7.2.1"
-            # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
-            # torch/_rocm_init.py imports at startup.
-            $ROCmTarballUrl = "$rb/rocm-7.2.1.tar.gz"
+            $ROCmTarballUrl = "$rb/rocm-7.2.1.tar.gz"  # rocm_sdk namespace
             $ROCmTorchWheelUrls = @(
                 "$rb/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
                 "$rb/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
@@ -1876,9 +1872,7 @@ if ($HasROCm -and $CuTag -eq "cpu") {
             )
         } elseif ($ROCmVersion -match '^7\.1') {
             $rb = "$amdWheelBase/rocm-rel-7.1.1"
-            # rocm tarball (14 KB) provides the 'rocm_sdk' Python namespace that
-            # torch/_rocm_init.py imports at startup.
-            $ROCmTarballUrl = "$rb/rocm-0.1.dev0.tar.gz"
+            $ROCmTarballUrl = "$rb/rocm-0.1.dev0.tar.gz"  # rocm_sdk namespace
             $ROCmTorchWheelUrls = @(
                 "$rb/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
                 "$rb/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
@@ -1911,8 +1905,7 @@ if ($ROCmTorchWheelUrls) {
         Write-Host $output -ForegroundColor Yellow
         $ROCmTorchWheelUrls = $null
     } else {
-        # Signal to install_python_stack.py that AMD wheels are already installed
-        # so it skips the subprocess probe and suppresses the manual-install warning.
+        # Tell install_python_stack.py to skip probe + suppress manual-install warning.
         $env:UNSLOTH_ROCM_TORCH_INSTALLED = "1"
     }
 }

From 265a09a341739b8cce03004b0132e50458f45432 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 00:27:10 -0500
Subject: [PATCH 053/165] fix: guard reconcile call against None numeric_ids;
 add torchvision lower bounds

---
 studio/backend/utils/hardware/hardware.py | 7 +++----
 studio/install_python_stack.py            | 4 ++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 94787d4d95..747ff194ab 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -662,11 +662,10 @@ def get_visible_gpu_utilization() -> Dict[str, Any]:
         )
         if result is not None:
             result["backend"] = _backend_label(device)
-            if IS_ROCM:
+            numeric_ids = parent_visible_spec.get("numeric_ids")
+            if IS_ROCM and numeric_ids is not None:
                 # Fix unified-memory VRAM on AMD iGPUs (Strix Halo etc.)
-                _reconcile_rocm_unified_memory(
-                    result, parent_visible_spec["numeric_ids"]
-                )
+                _reconcile_rocm_unified_memory(result, numeric_ids)
             return result
 
     # Torch-based fallback for CUDA (nvidia-smi unavailable, AMD ROCm) and XPU (Intel)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 8af5fe3829..1e71ee8661 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -61,8 +61,8 @@
     # Default for rocm7.1 and earlier: torch 2.x below 2.11
     "_default": (
         "torch>=2.4,<2.11.0",
-        "torchvision<0.26.0",
-        "torchaudio<2.11.0",
+        "torchvision>=0.19,<0.26.0",
+        "torchaudio>=2.4,<2.11.0",
     ),
 }
 _PYTORCH_WHL_BASE = (

From 643a7979694e63d60e3069bf07c05df33ced4df7 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 01:59:14 -0500
Subject: [PATCH 054/165] fix(install.ps1): recreate venv with Python 3.12
 after ROCm switch

Venv was created with 3.13 before GPU detection ran; switching
$DetectedPython to 3.12 had no effect since $VenvPython still
pointed to the 3.13 interpreter inside the already-created venv.
---
 install.ps1 | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/install.ps1 b/install.ps1
index 424c3d5857..1d8ffb8040 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1338,6 +1338,16 @@ shell.Run cmd, 0, False
         if ($py312) {
             $DetectedPython = $py312
             substep "AMD GPU detected -- switching to Python 3.12 (ROCm wheels are cp312-only)" "Cyan"
+            # Recreate the venv with Python 3.12 (it was just created with 3.13 above).
+            if (Test-Path -LiteralPath $VenvDir) {
+                Remove-Item -LiteralPath $VenvDir -Recurse -Force -ErrorAction SilentlyContinue
+            }
+            step "venv" "creating Python 3.12 virtual environment"
+            substep "$VenvDir"
+            $venvExit = Invoke-InstallCommand { uv venv $VenvDir --python "$($DetectedPython.Path)" }
+            if ($venvExit -ne 0) {
+                return (Exit-InstallFailure "Failed to create Python 3.12 virtual environment (exit code $venvExit)" $venvExit)
+            }
         } else {
             substep "AMD GPU detected but Python 3.12 not found -- ROCm GPU support requires Python 3.12" "Yellow"
             substep "Install Python 3.12 from python.org and re-run." "Yellow"

From 7b38f0a9625f8d5952fd1c133729b3e7d79b6ab8 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 02:25:52 -0500
Subject: [PATCH 055/165] ux: detect AMD GPU before Python selection to avoid
 double venv creation

- Early hipinfo + WMI probe runs before Find-CompatiblePython so Python
  3.12 is selected upfront when AMD is detected; venv is now created
  exactly once instead of 3.13 then immediately 3.12.
- Post-venv recreation block replaced with a simple warning for the rare
  case where AMD was missed by the early probe.
- setup.ps1: show venv's actual Python version (e.g. 3.12) instead of
  the system Python found by the pre-activation search (was showing 3.13).
---
 install.ps1      | 135 +++++++++++++++++++++++------------------------
 studio/setup.ps1 |   9 +++-
 2 files changed, 74 insertions(+), 70 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 1d8ffb8040..dd18b047ea 100644
--- a/install.ps1
+++ b/install.ps1
@@ -950,12 +950,71 @@ shell.Run cmd, 0, False
         return $null
     }
 
+    # ── Quick AMD GPU probe (before Python selection) ──
+    # Checks hipinfo and WMI now so we can pick Python 3.12 upfront if AMD is
+    # present (ROCm Windows wheels are cp312-only). The full GPU detection with
+    # version strings and display labels runs after venv creation below.
+    $_EarlyAmdDetected = $false
+    try {
+        $hipinfoEarly = Get-Command hipinfo -ErrorAction SilentlyContinue
+        if ($hipinfoEarly) {
+            $hipEarlyOut = & $hipinfoEarly.Source 2>&1 | Out-String
+            if ($LASTEXITCODE -eq 0 -and $hipEarlyOut -match "(?i)gcnArchName") {
+                $_EarlyAmdDetected = $true
+            }
+        }
+    } catch {}
+    if (-not $_EarlyAmdDetected) {
+        try {
+            $wmiGpuEarly = Get-WmiObject Win32_VideoController -ErrorAction SilentlyContinue |
+                Where-Object { $_.Name -match "AMD|Radeon" } | Select-Object -First 1
+            if ($wmiGpuEarly) { $_EarlyAmdDetected = $true }
+        } catch {}
+    }
+
     # ── Install Python if no compatible version (3.11-3.13) found ──
     # Find-CompatiblePython returns @{ Version = "3.13"; Path = "C:\...\python.exe" } or $null.
     Write-TauriLog "STEP" "Installing Python"
     $DetectedPython = Find-CompatiblePython
+
+    # If AMD GPU is present and we didn't land on Python 3.12, find 3.12 now
+    # before creating the venv -- avoids creating the environment twice.
+    if ($_EarlyAmdDetected -and $DetectedPython -and (($DetectedPython.Version -split '\.')[0..1] -join '.') -ne "3.12") {
+        $py312Pre = $null
+        $pyLauncherPre = Get-Command py -CommandType Application -ErrorAction SilentlyContinue
+        if ($pyLauncherPre -and $pyLauncherPre.Source -notmatch $script:CondaSkipPattern) {
+            try {
+                $out312 = & $pyLauncherPre.Source "-3.12" --version 2>&1 | Out-String
+                if ($out312 -match "Python 3\.12\.\d+") {
+                    $resolvedExe312 = (& $pyLauncherPre.Source "-3.12" -c "import sys; print(sys.executable)" 2>$null | Out-String).Trim()
+                    if ($resolvedExe312 -and (Test-Path $resolvedExe312) -and -not (Test-IsCondaPython $resolvedExe312)) {
+                        $py312Pre = @{ Version = "3.12"; Path = $resolvedExe312 }
+                    }
+                }
+            } catch {}
+        }
+        if (-not $py312Pre) {
+            foreach ($name312 in @("python3.12", "python3", "python")) {
+                foreach ($cmd312 in @(Get-Command $name312 -All -ErrorAction SilentlyContinue)) {
+                    if (-not $cmd312.Source -or $cmd312.Source -like "*\WindowsApps\*") { continue }
+                    if (Test-IsCondaPython $cmd312.Source) { continue }
+                    try {
+                        $out312 = & $cmd312.Source --version 2>&1 | Out-String
+                        if ($out312 -match "Python 3\.12\.\d+") { $py312Pre = @{ Version = "3.12"; Path = $cmd312.Source }; break }
+                    } catch {}
+                }
+                if ($py312Pre) { break }
+            }
+        }
+        if ($py312Pre) { $DetectedPython = $py312Pre }
+    }
+
     if ($DetectedPython) {
-        step "python" "Python $($DetectedPython.Version) already installed"
+        $pyStepLabel = "Python $($DetectedPython.Version) already installed"
+        if ($_EarlyAmdDetected -and (($DetectedPython.Version -split '\.')[0..1] -join '.') -eq "3.12") {
+            $pyStepLabel = "Python 3.12 selected (ROCm wheels are cp312-only)"
+        }
+        step "python" $pyStepLabel
     }
     if (-not $DetectedPython) {
         substep "installing Python ${PythonVersion}..."
@@ -1284,74 +1343,12 @@ shell.Run cmd, 0, False
         substep "Training and GPU inference require an NVIDIA or AMD ROCm GPU." "Yellow"
     }
 
-    # ── AMD ROCm: prefer Python 3.12 (Windows wheels are cp312-only) ──
-    # If a non-3.12 Python was selected and an AMD GPU is present, try to find 3.12.
-    # Fires on $ROCmGpuLabel (WMI/no-HIP-SDK) as well as $HasROCm.
-    if (($HasROCm -or $ROCmGpuLabel) -and $DetectedPython -and ($DetectedPython.Version -split '\.')[0..1] -join '.' -ne "3.12") {
-        $py312 = $null
-        # 1. Try py launcher (official CPython installs)
-        $pyLauncher = Get-Command py -CommandType Application -ErrorAction SilentlyContinue
-        if ($pyLauncher -and $pyLauncher.Source -notmatch $script:CondaSkipPattern) {
-            try {
-                $out = & $pyLauncher.Source "-3.12" --version 2>&1 | Out-String
-                if ($out -match "Python 3\.12\.\d+") {
-                    $resolvedExe = (& $pyLauncher.Source "-3.12" -c "import sys; print(sys.executable)" 2>$null | Out-String).Trim()
-                    if ($resolvedExe -and (Test-Path $resolvedExe) -and -not (Test-IsCondaPython $resolvedExe)) {
-                        $py312 = @{ Version = "3.12"; Path = $resolvedExe }
-                    }
-                }
-            } catch {}
-        }
-        # 2. Try PATH (catches manually placed python3.12 / python)
-        if (-not $py312) {
-            foreach ($name in @("python3.12", "python3", "python")) {
-                foreach ($cmd in @(Get-Command $name -All -ErrorAction SilentlyContinue)) {
-                    if (-not $cmd.Source) { continue }
-                    if ($cmd.Source -like "*\WindowsApps\*") { continue }
-                    if (Test-IsCondaPython $cmd.Source) { continue }
-                    try {
-                        $out = & $cmd.Source --version 2>&1 | Out-String
-                        if ($out -match "Python 3\.12\.\d+") {
-                            $py312 = @{ Version = "3.12"; Path = $cmd.Source }
-                            break
-                        }
-                    } catch {}
-                }
-                if ($py312) { break }
-            }
-        }
-        # 3. Ask uv for its managed Python 3.12 (uv installs don't appear in PATH or py.exe)
-        if (-not $py312) {
-            $uvCmd = Get-Command uv -ErrorAction SilentlyContinue
-            if ($uvCmd) {
-                try {
-                    $uvPy = (& $uvCmd.Source python find 3.12 2>$null | Out-String).Trim()
-                    if ($uvPy -and (Test-Path $uvPy) -and -not (Test-IsCondaPython $uvPy)) {
-                        $verOut = (& $uvPy --version 2>&1 | Out-String)
-                        if ($verOut -match "Python 3\.12\.\d+") {
-                            $py312 = @{ Version = "3.12"; Path = $uvPy }
-                        }
-                    }
-                } catch {}
-            }
-        }
-        if ($py312) {
-            $DetectedPython = $py312
-            substep "AMD GPU detected -- switching to Python 3.12 (ROCm wheels are cp312-only)" "Cyan"
-            # Recreate the venv with Python 3.12 (it was just created with 3.13 above).
-            if (Test-Path -LiteralPath $VenvDir) {
-                Remove-Item -LiteralPath $VenvDir -Recurse -Force -ErrorAction SilentlyContinue
-            }
-            step "venv" "creating Python 3.12 virtual environment"
-            substep "$VenvDir"
-            $venvExit = Invoke-InstallCommand { uv venv $VenvDir --python "$($DetectedPython.Path)" }
-            if ($venvExit -ne 0) {
-                return (Exit-InstallFailure "Failed to create Python 3.12 virtual environment (exit code $venvExit)" $venvExit)
-            }
-        } else {
-            substep "AMD GPU detected but Python 3.12 not found -- ROCm GPU support requires Python 3.12" "Yellow"
-            substep "Install Python 3.12 from python.org and re-run." "Yellow"
-        }
+    # Warn if AMD GPU is present but the venv still isn't Python 3.12.
+    # The early probe above covers the normal case; this fires only when the
+    # full GPU detection reveals AMD that the early probe missed (e.g. hipinfo
+    # not yet on PATH) and 3.12 still wasn't found.
+    if (($HasROCm -or $ROCmGpuLabel) -and $DetectedPython -and (($DetectedPython.Version -split '\.')[0..1] -join '.') -ne "3.12") {
+        substep "AMD GPU requires Python 3.12 for ROCm wheels -- install it from python.org and re-run." "Yellow"
     }
 
     # ── Choose the correct PyTorch index URL based on driver CUDA version ──
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 476f2f974b..d27e050526 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1569,7 +1569,7 @@ if (-not $PythonCmd) {
     exit 1
 }
 
-substep "Using $PythonCmd ($(& $PythonCmd --version 2>&1))"
+substep "Python found: $PythonCmd"
 
 # The venv must already exist (created by install.ps1); this script only
 # updates packages. UNSLOTH_STUDIO_HOME (or STUDIO_HOME alias) overrides the
@@ -1737,6 +1737,13 @@ if (-not (Test-Path -LiteralPath $VenvDir)) {
     exit 1
 } else {
     substep "reusing existing virtual environment at $VenvDir"
+    $_venvPyExe = Join-Path $VenvDir "Scripts\python.exe"
+    if (Test-Path -LiteralPath $_venvPyExe) {
+        try {
+            $_venvPyVer = (& $_venvPyExe --version 2>&1 | Out-String).Trim()
+            if ($_venvPyVer) { substep $_venvPyVer }
+        } catch {}
+    }
 }
 
 # pip and python write to stderr even on success (progress bars, warnings).

From 1ba91d73dcca3b4d1fac212f6a1c9d85a58ac9ff Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 03:17:24 -0500
Subject: [PATCH 056/165] fix(rocm/win): auto-stub all _distributed_c10d
 symbols via PEP-562 __getattr__

The bare ModuleType stub caused ImportError when torch._dynamo was imported
(triggered by trainer.py accessing torch._dynamo.config at load time).
torch._dynamo pulls in torch.distributed.fsdp._flat_param which does:
  from torch._C._distributed_c10d import FakeProcessGroup
and potentially other symbols. Adding module __getattr__ auto-creates a
stub class for any missing symbol so all such imports succeed without
enumerating every individual symbol. Applied to both the primary stub
and the fallback stub in the except branch.
---
 studio/backend/core/training/worker.py | 31 ++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 9089eeb480..e7f8b7c79e 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1101,8 +1101,23 @@ def run_training_process(
 
     if sys.platform == "win32":
         _c10d_key = "torch._C._distributed_c10d"
-        if _c10d_key not in sys.modules:
+        if _c10d_key not in sys.modules:  # guard: never overwrite real NVIDIA impl
             _c10d_stub = _types.ModuleType(_c10d_key)
+
+            # ROCm Windows wheels omit the _distributed_c10d C extension.
+            # torch._dynamo imports torch.distributed.fsdp at load time which
+            # pulls in FakeProcessGroup (and potentially other symbols) from
+            # this module. PEP-562 module __getattr__ auto-stubs any missing
+            # symbol so every `from torch._C._distributed_c10d import X`
+            # succeeds; each stub is a plain class cached after first access.
+            def _c10d_stub_getattr(_attr):
+                if _attr.startswith("__"):
+                    raise AttributeError(_attr)
+                _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
+                setattr(_c10d_stub, _attr, _cls)  # cache — next access hits __dict__
+                return _cls
+
+            _c10d_stub.__getattr__ = _c10d_stub_getattr
             sys.modules[_c10d_key] = _c10d_stub
             try:
                 import torch._C as _torch_C_mod  # C ext — always importable
@@ -1124,9 +1139,17 @@ def run_training_process(
             setattr(_td_mock, _name, _stub)
         sys.modules["torch.distributed"] = _td_mock
         if "torch._C._distributed_c10d" not in sys.modules:
-            sys.modules["torch._C._distributed_c10d"] = _types.ModuleType(
-                "torch._C._distributed_c10d"
-            )
+            _c10d_fb = _types.ModuleType("torch._C._distributed_c10d")
+
+            def _c10d_fb_getattr(_attr):
+                if _attr.startswith("__"):
+                    raise AttributeError(_attr)
+                _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
+                setattr(_c10d_fb, _attr, _cls)
+                return _cls
+
+            _c10d_fb.__getattr__ = _c10d_fb_getattr
+            sys.modules["torch._C._distributed_c10d"] = _c10d_fb
         try:
             import torch as _torch
 

From fac005be27e29d0036d0c91c8c5894fce3aefabf Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 03:22:40 -0500
Subject: [PATCH 057/165] chore: trim c10d stub comment

---
 studio/backend/core/training/worker.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index e7f8b7c79e..4593873383 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1104,12 +1104,8 @@ def run_training_process(
         if _c10d_key not in sys.modules:  # guard: never overwrite real NVIDIA impl
             _c10d_stub = _types.ModuleType(_c10d_key)
 
-            # ROCm Windows wheels omit the _distributed_c10d C extension.
-            # torch._dynamo imports torch.distributed.fsdp at load time which
-            # pulls in FakeProcessGroup (and potentially other symbols) from
-            # this module. PEP-562 module __getattr__ auto-stubs any missing
-            # symbol so every `from torch._C._distributed_c10d import X`
-            # succeeds; each stub is a plain class cached after first access.
+            # ROCm Windows wheels omit this C extension; auto-stub every
+            # missing symbol so torch._dynamo's fsdp imports don't crash.
             def _c10d_stub_getattr(_attr):
                 if _attr.startswith("__"):
                     raise AttributeError(_attr)

From a3d9bac77966941b9ab5ab56b66e9077fafadeb7 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 03:32:40 -0500
Subject: [PATCH 058/165] =?UTF-8?q?fix(rocm/win):=20auto-stub=20missing=20?=
 =?UTF-8?q?torch.distributed=20attrs=20(Store,=20ProcessGroup,=20=E2=80=A6?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 studio/backend/core/training/worker.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 4593873383..785125e9d7 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1129,10 +1129,30 @@ def _c10d_stub_getattr(_attr):
         for _name, _stub in _td_stubs.items():
             if not hasattr(_td, _name):
                 setattr(_td, _name, _stub)
+        # ROCm Windows wheels omit C-extension-backed distributed classes
+        # (Store, ProcessGroup, …). Auto-stub any missing attribute so
+        # torch._dynamo's fake_pg class definitions don't crash at import time.
+        if not hasattr(_td, "__getattr__"):
+            def _td_getattr(_attr):
+                if _attr.startswith("__"):
+                    raise AttributeError(_attr)
+                _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
+                setattr(_td, _attr, _cls)
+                return _cls
+            _td.__getattr__ = _td_getattr
     except Exception:
         _td_mock = _types.ModuleType("torch.distributed")
         for _name, _stub in _td_stubs.items():
             setattr(_td_mock, _name, _stub)
+
+        def _td_mock_getattr(_attr):
+            if _attr.startswith("__"):
+                raise AttributeError(_attr)
+            _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
+            setattr(_td_mock, _attr, _cls)
+            return _cls
+
+        _td_mock.__getattr__ = _td_mock_getattr
         sys.modules["torch.distributed"] = _td_mock
         if "torch._C._distributed_c10d" not in sys.modules:
             _c10d_fb = _types.ModuleType("torch._C._distributed_c10d")

From 73ae40c34b637c5b408de4a30347205008ec4d70 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 03:39:55 -0500
Subject: [PATCH 059/165] fix(rocm/win): pre-stub fsdp submodules in
 sys.modules; fix __getattr__ subpackage clash

---
 studio/backend/core/training/worker.py | 63 +++++++++++++++-----------
 1 file changed, 36 insertions(+), 27 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 785125e9d7..ee0d680f75 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1099,6 +1099,18 @@ def run_training_process(
         "barrier": lambda: None,
     }
 
+    # Helper: build a ModuleType stub whose __getattr__ auto-creates child stubs.
+    def _make_mod_stub(mod_name):
+        m = _types.ModuleType(mod_name)
+        def _ga(attr, _m=m, _n=mod_name):
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child = _make_mod_stub(f"{_n}.{attr}")
+            setattr(_m, attr, child)
+            return child
+        m.__getattr__ = _ga
+        return m
+
     if sys.platform == "win32":
         _c10d_key = "torch._C._distributed_c10d"
         if _c10d_key not in sys.modules:  # guard: never overwrite real NVIDIA impl
@@ -1110,65 +1122,62 @@ def _c10d_stub_getattr(_attr):
                 if _attr.startswith("__"):
                     raise AttributeError(_attr)
                 _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
-                setattr(_c10d_stub, _attr, _cls)  # cache — next access hits __dict__
+                setattr(_c10d_stub, _attr, _cls)
                 return _cls
 
             _c10d_stub.__getattr__ = _c10d_stub_getattr
             sys.modules[_c10d_key] = _c10d_stub
             try:
                 import torch._C as _torch_C_mod  # C ext — always importable
-
                 if not hasattr(_torch_C_mod, "_distributed_c10d"):
                     _torch_C_mod._distributed_c10d = _c10d_stub
             except Exception:
                 pass
 
+            # Pre-register torch.distributed.fsdp submodules as stubs so
+            # torch._dynamo's module-level fsdp import short-circuits before
+            # the real package loads (it has a circular import on ROCm Windows).
+            for _fsdp_name in (
+                "torch.distributed.fsdp",
+                "torch.distributed.fsdp._flat_param",
+                "torch.distributed.fsdp._fully_shard",
+                "torch.distributed.fsdp._fsdp_param_group",
+                "torch.distributed.fsdp._common_utils",
+            ):
+                if _fsdp_name not in sys.modules:
+                    sys.modules[_fsdp_name] = _make_mod_stub(_fsdp_name)
+
     try:
         import torch.distributed as _td
 
         for _name, _stub in _td_stubs.items():
             if not hasattr(_td, _name):
                 setattr(_td, _name, _stub)
-        # ROCm Windows wheels omit C-extension-backed distributed classes
-        # (Store, ProcessGroup, …). Auto-stub any missing attribute so
-        # torch._dynamo's fake_pg class definitions don't crash at import time.
+        # Stub C-extension-backed class attrs (Store, ProcessGroup, …) that
+        # the ROCm Windows wheel omits. __getattr__ checks sys.modules first
+        # so it never intercepts real subpackage lookups as plain classes.
         if not hasattr(_td, "__getattr__"):
             def _td_getattr(_attr):
                 if _attr.startswith("__"):
                     raise AttributeError(_attr)
+                _full = f"torch.distributed.{_attr}"
+                if _full in sys.modules:
+                    _mod = sys.modules[_full]
+                    setattr(_td, _attr, _mod)
+                    return _mod
                 _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
                 setattr(_td, _attr, _cls)
                 return _cls
             _td.__getattr__ = _td_getattr
     except Exception:
-        _td_mock = _types.ModuleType("torch.distributed")
+        _td_mock = _make_mod_stub("torch.distributed")
         for _name, _stub in _td_stubs.items():
             setattr(_td_mock, _name, _stub)
-
-        def _td_mock_getattr(_attr):
-            if _attr.startswith("__"):
-                raise AttributeError(_attr)
-            _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
-            setattr(_td_mock, _attr, _cls)
-            return _cls
-
-        _td_mock.__getattr__ = _td_mock_getattr
         sys.modules["torch.distributed"] = _td_mock
         if "torch._C._distributed_c10d" not in sys.modules:
-            _c10d_fb = _types.ModuleType("torch._C._distributed_c10d")
-
-            def _c10d_fb_getattr(_attr):
-                if _attr.startswith("__"):
-                    raise AttributeError(_attr)
-                _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
-                setattr(_c10d_fb, _attr, _cls)
-                return _cls
-
-            _c10d_fb.__getattr__ = _c10d_fb_getattr
-            sys.modules["torch._C._distributed_c10d"] = _c10d_fb
+            sys.modules["torch._C._distributed_c10d"] = _make_mod_stub("torch._C._distributed_c10d")
         try:
             import torch as _torch
-
             _torch.distributed = _td_mock
         except Exception:
             pass

From ea510b5936a9a85c14e0eeec420b2ad066012678 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 04:35:20 -0500
Subject: [PATCH 060/165] feat(rocm/win): arch-aware wheel selector always
 picks newest ROCm release
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace HIP-SDK-version-gated wheel selection with GPU arch-based logic.
Select-ROCmWheelRelease (PS) and _select_windows_rocm_release (Python) map
gcnArchName → minimum ROCm version, then pick the newest available release
that satisfies it (currently always rocm-rel-7.2.1 for any supported GPU).
Wheels bundle their own ROCm runtime so the installed HIP SDK 7.1 does not
prevent using 7.2.1 wheels on gfx1200 (RX 9060 XT) and similar RDNA 4 GPUs.

Also installs the bitsandbytes Windows ROCm continuous-release wheel and sets
BNB_ROCM_VERSION=72 in worker.py before ML imports so bnb loads the
libbitsandbytes_rocm72.dll that ships in that wheel.
---
 install.ps1                            | 118 +++++++++++++++++--------
 studio/backend/core/training/worker.py |   6 ++
 studio/install_python_stack.py         | 106 ++++++++++++++++++----
 studio/setup.ps1                       | 103 +++++++++++++++------
 4 files changed, 251 insertions(+), 82 deletions(-)

diff --git a/install.ps1 b/install.ps1
index dd18b047ea..b3dfed5d65 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1269,6 +1269,7 @@ shell.Run cmd, 0, False
     $HasROCm = $false
     $ROCmGpuLabel = $null
     $ROCmVersion = $null
+    $ROCmGfxArch = $null
     if (-not $HasNvidiaSmi) {
         $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
         if ($hipinfoExe) {
@@ -1277,7 +1278,8 @@ shell.Run cmd, 0, False
                 if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                     $HasROCm = $true
                     if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
-                        $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
+                        $ROCmGfxArch  = $Matches[1].Trim()
+                        $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
                     } else {
                         $ROCmGpuLabel = "AMD ROCm"
                     }
@@ -1373,46 +1375,90 @@ shell.Run cmd, 0, False
     }
     $TorchIndexUrl = Get-TorchIndexUrl
 
+    # ── GPU arch → newest compatible Windows ROCm wheel release ──
+    # Wheels bundle their own ROCm runtime; the installed HIP SDK version does
+    # not constrain which release to use.  Always picks the newest release that
+    # supports the GPU architecture.
+    function Select-ROCmWheelRelease {
+        param([string]$GfxArch)
+
+        # Available releases, newest first.
+        $releases = @(
+            @{
+                Rel     = "rocm-rel-7.2.1"
+                Tag     = "rocm7.2"
+                RocmVer = @(7, 2)
+                Tarball = "rocm-7.2.1.tar.gz"
+                Wheels  = @(
+                    "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+                    "rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
+                    "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
+                    "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                    "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                    "torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+                )
+            },
+            @{
+                Rel     = "rocm-rel-7.1.1"
+                Tag     = "rocm7.1"
+                RocmVer = @(7, 1)
+                Tarball = "rocm-0.1.dev0.tar.gz"
+                Wheels  = @(
+                    "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
+                    "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
+                    "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                    "torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                    "torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+                )
+            }
+        )
+
+        # GPU arch → minimum (major, minor) ROCm release needed.
+        $archMin = @{
+            "gfx1201" = @(7,1); "gfx1200" = @(7,1)   # RDNA 4
+            "gfx1151" = @(7,1); "gfx1150" = @(7,1)   # RDNA 3.5 (Strix Halo/Point)
+            "gfx1103" = @(6,4); "gfx1102" = @(6,4); "gfx1101" = @(6,4); "gfx1100" = @(6,4)  # RDNA 3
+            "gfx1036" = @(6,4); "gfx1035" = @(6,4); "gfx1034" = @(6,4); "gfx1033" = @(6,4)  # RDNA 2
+            "gfx1032" = @(6,4); "gfx1031" = @(6,4); "gfx1030" = @(6,4)
+            "gfx1011" = @(6,4); "gfx1010" = @(6,4)   # RDNA 1
+            "gfx906"  = @(6,4); "gfx908"  = @(6,4); "gfx90a" = @(6,4)   # Vega/MI
+        }
+        $minVer = if ($GfxArch -and $archMin.ContainsKey($GfxArch)) {
+            $archMin[$GfxArch]
+        } else {
+            @(6, 4)  # unknown arch: try the latest (7.2.1 supports all modern GPUs)
+        }
+
+        foreach ($r in $releases) {
+            $rv = $r.RocmVer
+            $ok = ($rv[0] -gt $minVer[0]) -or ($rv[0] -eq $minVer[0] -and $rv[1] -ge $minVer[1])
+            if ($ok) { return $r }
+        }
+        return $null
+    }
+
     # ── AMD Windows ROCm wheel override ──
-    # When the HIP SDK is present and Python 3.12, use repo.radeon.com direct wheels.
+    # Selects the newest wheel release compatible with the GPU arch (HIP SDK
+    # version is irrelevant; wheels bundle their own ROCm runtime).
     $ROCmTorchWheelUrl = $null
     $ROCmTarballUrl    = $null
+    $ROCmWheelTag      = $null
     if ($HasROCm -and -not $SkipTorch) {
         $pyMajMin = if ($DetectedPython) { ($DetectedPython.Version -split '\.')[0..1] -join '.' } else { "" }
         if ($pyMajMin -eq "3.12") {
             $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
-            if ($ROCmVersion -and $ROCmVersion -match '^7\.2') {
-                $amdRelBase = "$amdWheelBase/rocm-rel-7.2.1"
-                $ROCmTarballUrl = "$amdRelBase/rocm-7.2.1.tar.gz"  # rocm_sdk namespace
-                $ROCmAllWheelUrls = @(
-                    "$amdRelBase/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
-                    "$amdRelBase/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
-                    "$amdRelBase/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
-                    "$amdRelBase/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                    "$amdRelBase/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                    "$amdRelBase/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
-                )
-                $ROCmTorchWheelUrl = $ROCmAllWheelUrls[3]
-                $TorchIndexUrl = $null
-            } elseif ($ROCmVersion -and $ROCmVersion -match '^7\.1') {
-                $amdRelBase = "$amdWheelBase/rocm-rel-7.1.1"
-                $ROCmTarballUrl = "$amdRelBase/rocm-0.1.dev0.tar.gz"  # rocm_sdk namespace
-                $ROCmAllWheelUrls = @(
-                    "$amdRelBase/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
-                    "$amdRelBase/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
-                    "$amdRelBase/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                    "$amdRelBase/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                    "$amdRelBase/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
-                )
-                $ROCmTorchWheelUrl = $ROCmAllWheelUrls[2]
-                $TorchIndexUrl = $null
-            }
-            if ($ROCmTorchWheelUrl) {
-                substep "AMD ROCm $ROCmVersion (Python 3.12) -- AMD Windows torch wheel selected" "Cyan"
-            } elseif ($ROCmVersion) {
-                substep "No AMD Windows torch wheel for ROCm $ROCmVersion -- falling back to CPU-only PyTorch" "Yellow"
+            $sel = Select-ROCmWheelRelease -GfxArch $ROCmGfxArch
+            if ($sel) {
+                $rb               = "$amdWheelBase/$($sel.Rel)"
+                $ROCmTarballUrl   = "$rb/$($sel.Tarball)"
+                $ROCmAllWheelUrls = $sel.Wheels | ForEach-Object { "$rb/$_" }
+                $ROCmTorchWheelUrl = @($ROCmAllWheelUrls | Where-Object { $_ -match '/torch-' })[0]  # @(): a lone match unwraps to a string, and [0] on a string yields its first char, not the URL
+                $ROCmWheelTag      = $sel.Tag
+                $TorchIndexUrl     = $null
+                $archLabel = if ($ROCmGfxArch) { $ROCmGfxArch } else { "AMD GPU" }
+                substep "$archLabel -- Windows torch wheel $($sel.Rel) selected" "Cyan"
             } else {
-                substep "ROCm version unknown -- falling back to CPU-only PyTorch" "Yellow"
+                substep "No AMD Windows torch wheel for GPU arch $ROCmGfxArch -- falling back to CPU-only PyTorch" "Yellow"
             }
         } else {
             substep "AMD Windows ROCm wheels require Python 3.12 (detected: $pyMajMin) -- using CPU-only PyTorch" "Yellow"
@@ -1421,9 +1467,7 @@ shell.Run cmd, 0, False
     }
 
     $TorchIndexFamily = Get-TauriTorchIndexFamily $(
-        if ($ROCmTorchWheelUrl) {
-            if ($ROCmVersion -match '^7\.1') { "rocm7.1" } else { "rocm7.2" }
-        } else { $TorchIndexUrl }
+        if ($ROCmTorchWheelUrl) { $ROCmWheelTag } else { $TorchIndexUrl }
     )
     $GpuBranch = Get-TauriGpuBranch $TorchIndexFamily
     Write-TauriDiag -GpuBranch $GpuBranch -TorchIndexFamily $TorchIndexFamily -PythonVersionForDiag $DetectedPython.Version
@@ -1512,7 +1556,7 @@ shell.Run cmd, 0, False
             substep "skipping PyTorch (--no-torch flag set)." "Yellow"
         } elseif ($ROCmTorchWheelUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
-            substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
+            substep "installing PyTorch ($ROCmWheelTag)..."
             # rocm_sdk namespace tarball (torch/_rocm_init.py imports it at startup)
             if ($ROCmTarballUrl) {
                 $tarballExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps $ROCmTarballUrl }
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ee0d680f75..ad62ad459a 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1182,6 +1182,12 @@ def _td_getattr(_attr):
         except Exception:
             pass
 
+    # ── 1e. Point bitsandbytes at the ROCm 7.2 DLL on Windows ──
+    # The AMD continuous-release wheel ships libbitsandbytes_rocm72.dll.
+    # BNB_ROCM_VERSION overrides the version string bnb uses to locate the DLL.
+    if sys.platform == "win32" and os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
+        os.environ.setdefault("BNB_ROCM_VERSION", "72")
+
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:
         _send_status(event_queue, "Importing Unsloth...")
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 1e71ee8661..a2c15b4542 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -117,6 +117,13 @@
         "download/continuous-release_main/"
         "bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl"
     ),
+    # Windows ROCm wheel — ships libbitsandbytes_rocm72.dll.
+    # BNB_ROCM_VERSION=72 must be set in the environment before importing bnb.
+    "win_amd64": (
+        "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/"
+        "download/continuous-release_main/"
+        "bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl"
+    ),
 }
 _BNB_ROCM_PYPI_FALLBACK = "bitsandbytes>=0.49.1"
 
@@ -228,6 +235,56 @@ def _detect_rocm_version() -> tuple[int, int] | None:
     return None
 
 
+# GPU arch → minimum (major, minor) ROCm release that supports it on Windows.
+# Wheels bundle their own ROCm runtime, so the installed HIP SDK version does
+# not constrain selection — only the GPU's architecture minimum matters.
+_GFX_MIN_ROCM_WINDOWS: dict[str, tuple[int, int]] = {
+    "gfx1201": (7, 1), "gfx1200": (7, 1),  # RDNA 4
+    "gfx1151": (7, 1), "gfx1150": (7, 1),  # RDNA 3.5 (Strix Halo/Point)
+    "gfx1103": (6, 4), "gfx1102": (6, 4), "gfx1101": (6, 4), "gfx1100": (6, 4),  # RDNA 3
+    "gfx1036": (6, 4), "gfx1035": (6, 4), "gfx1034": (6, 4), "gfx1033": (6, 4),  # RDNA 2
+    "gfx1032": (6, 4), "gfx1031": (6, 4), "gfx1030": (6, 4),
+    "gfx1011": (6, 4), "gfx1010": (6, 4),  # RDNA 1
+    "gfx906": (6, 4), "gfx908": (6, 4), "gfx90a": (6, 4),  # Vega/MI
+}
+
+
+def _detect_windows_gfx_arch() -> str | None:
+    """Return the gcnArchName from hipinfo on Windows (e.g. 'gfx1200'), or None."""
+    import re
+
+    hipinfo = shutil.which("hipinfo")
+    if not hipinfo:
+        return None
+    try:
+        result = subprocess.run(
+            [hipinfo],
+            stdout = subprocess.PIPE,
+            stderr = subprocess.DEVNULL,
+            timeout = 10,
+        )
+        if result.returncode != 0:
+            return None
+        text = result.stdout.decode(errors = "replace")
+        m = re.search(r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text)
+        return m.group(1).strip() if m else None
+    except Exception:
+        return None
+
+
+def _select_windows_rocm_release(gfx_arch: str | None) -> tuple[str, list[str]] | None:
+    """Pick the best available Windows ROCm release for the given GPU arch.
+
+    Always selects the newest available release whose ROCm version meets the
+    GPU's minimum requirement.  Returns None when no release qualifies.
+    """
+    min_ver = _GFX_MIN_ROCM_WINDOWS.get(gfx_arch or "", (6, 4))
+    for (maj, mn), entry in sorted(_ROCM_WINDOWS_RELEASES.items(), reverse = True):
+        if (maj, mn) >= min_ver:
+            return entry
+    return None
+
+
 def _has_rocm_gpu() -> bool:
     """Return True only if an actual AMD GPU is visible (not just ROCm tools installed)."""
     import re
@@ -344,8 +401,9 @@ def _ensure_rocm_torch() -> None:
             return
         if _has_usable_nvidia_gpu():
             return
-        if not _has_rocm_gpu():
-            return
+        gfx_arch = _detect_windows_gfx_arch()
+        if not gfx_arch:
+            return  # no AMD GPU visible via hipinfo
         try:
             probe = subprocess.run(
                 [
@@ -367,34 +425,44 @@ def _ensure_rocm_torch() -> None:
                 return  # already ROCm torch
         except (OSError, subprocess.TimeoutExpired):
             pass
-        ver = _detect_rocm_version()
-        if ver is None:
-            print("   ROCm detected but version unreadable -- skipping torch reinstall")
-            return
-        entry = next(
-            (
-                v
-                for (maj, mn), v in sorted(_ROCM_WINDOWS_RELEASES.items(), reverse = True)
-                if ver >= (maj, mn)
-            ),
-            None,
-        )
+        entry = _select_windows_rocm_release(gfx_arch)
         if entry is None:
-            print(
-                f"   No AMD Windows torch wheel for ROCm {ver[0]}.{ver[1]} -- skipping"
-            )
+            print(f"   No AMD Windows torch wheel for GPU arch {gfx_arch} -- skipping")
             return
         rel_tag, wheel_files = entry
         base = f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}"
         wheel_urls = [f"{base}/{fn}" for fn in wheel_files]
-        print(f"   ROCm {ver[0]}.{ver[1]} (Windows) -- installing torch from {base}/")
+        print(f"   {gfx_arch} (Windows) -- installing torch from {base}/")
+        # Install rocm namespace tarball first (torch/_rocm_init.py imports it)
+        tarball_url = next((u for u in wheel_urls if u.endswith(".tar.gz")), None)
+        whl_urls = [u for u in wheel_urls if not u.endswith(".tar.gz")]
+        if tarball_url:
+            pip_install(
+                f"ROCm namespace ({rel_tag})",
+                "--force-reinstall",
+                "--no-deps",
+                tarball_url,
+                constrain = False,
+            )
         pip_install(
             f"ROCm torch (Windows, {rel_tag})",
             "--force-reinstall",
             "--no-deps",
-            *wheel_urls,
+            *whl_urls,
             constrain = False,
         )
+        # bitsandbytes Windows ROCm wheel (ships libbitsandbytes_rocm72.dll).
+        # BNB_ROCM_VERSION=72 is set in worker.py before the bnb import.
+        _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
+        if _bnb_win_url is not None:
+            pip_install_try(
+                "bitsandbytes (AMD Windows, pre-release main)",
+                "--force-reinstall",
+                "--no-cache-dir",
+                "--no-deps",
+                _bnb_win_url,
+                constrain = False,
+            )
         _rocm_windows_torch_installed = True
         return
 
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index d27e050526..f1c7c10801 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -664,6 +664,7 @@ if (-not $HasNvidiaSmi) {
 # ── AMD ROCm detection (Windows): probe hipinfo/amd-smi for actual GPU ──
 $HasROCm = $false
 $ROCmGpuLabel = $null
+$script:ROCmGfxArch = $null
 if (-not $HasNvidiaSmi) {
     # hipinfo: present + output contains gcnArchName → real HIP GPU
     $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
@@ -673,7 +674,8 @@ if (-not $HasNvidiaSmi) {
             if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                 $HasROCm = $true
                 if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
-                    $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
+                    $script:ROCmGfxArch = $Matches[1].Trim()
+                    $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
                 } else {
                     $ROCmGpuLabel = "AMD ROCm"
                 }
@@ -1855,38 +1857,87 @@ if ($HasNvidiaSmi) {
     $CuTag = "cpu"
 }
 
+# ── GPU arch → newest compatible Windows ROCm wheel release ──
+# Wheels bundle their own ROCm runtime; the installed HIP SDK version does
+# not constrain which release to use.  Always picks the newest release that
+# supports the GPU architecture.
+function Select-ROCmWheelRelease {
+    param([string]$GfxArch)
+
+    # Available releases, newest first.
+    $releases = @(
+        @{
+            Rel     = "rocm-rel-7.2.1"
+            Tag     = "rocm7.2"
+            RocmVer = @(7, 2)
+            Tarball = "rocm-7.2.1.tar.gz"
+            Wheels  = @(
+                "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
+                "rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
+                "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
+                "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
+                "torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
+            )
+        },
+        @{
+            Rel     = "rocm-rel-7.1.1"
+            Tag     = "rocm7.1"
+            RocmVer = @(7, 1)
+            Tarball = "rocm-0.1.dev0.tar.gz"
+            Wheels  = @(
+                "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
+                "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
+                "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                "torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
+                "torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
+            )
+        }
+    )
+
+    # GPU arch → minimum (major, minor) ROCm release needed.
+    $archMin = @{
+        "gfx1201" = @(7,1); "gfx1200" = @(7,1)   # RDNA 4
+        "gfx1151" = @(7,1); "gfx1150" = @(7,1)   # RDNA 3.5 (Strix Halo/Point)
+        "gfx1103" = @(6,4); "gfx1102" = @(6,4); "gfx1101" = @(6,4); "gfx1100" = @(6,4)  # RDNA 3
+        "gfx1036" = @(6,4); "gfx1035" = @(6,4); "gfx1034" = @(6,4); "gfx1033" = @(6,4)  # RDNA 2
+        "gfx1032" = @(6,4); "gfx1031" = @(6,4); "gfx1030" = @(6,4)
+        "gfx1011" = @(6,4); "gfx1010" = @(6,4)   # RDNA 1
+        "gfx906"  = @(6,4); "gfx908"  = @(6,4); "gfx90a" = @(6,4)   # Vega/MI
+    }
+    $minVer = if ($GfxArch -and $archMin.ContainsKey($GfxArch)) {
+        $archMin[$GfxArch]
+    } else {
+        @(6, 4)  # unknown arch: try the latest (7.2.1 supports all modern GPUs)
+    }
+
+    foreach ($r in $releases) {
+        $rv = $r.RocmVer
+        $ok = ($rv[0] -gt $minVer[0]) -or ($rv[0] -eq $minVer[0] -and $rv[1] -ge $minVer[1])
+        if ($ok) { return $r }
+    }
+    return $null
+}
+
 # ── AMD Windows ROCm torch override ──────────────────────────────────────────
-# When ROCm HIP SDK is present and Python 3.12 is in use, install AMD's direct
-# torch wheels instead of CPU-only PyTorch.
+# Selects the newest wheel release compatible with the GPU arch (HIP SDK
+# version is irrelevant; wheels bundle their own ROCm runtime).
 $ROCmVersion = $script:ROCmVersion
+$ROCmGfxArch = $script:ROCmGfxArch
 $ROCmTorchWheelUrls = $null
 $ROCmTarballUrl     = $null
+$ROCmWheelTag       = $null
 if ($HasROCm -and $CuTag -eq "cpu") {
     $pyVer = (& python --version 2>&1 | Out-String) -replace '[^0-9.]',''
     $pyMajMin = ($pyVer.Trim() -split '\.')[0..1] -join '.'
     $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
-    if ($pyMajMin -eq "3.12" -and $ROCmVersion) {
-        if ($ROCmVersion -match '^7\.2') {
-            $rb = "$amdWheelBase/rocm-rel-7.2.1"
-            $ROCmTarballUrl = "$rb/rocm-7.2.1.tar.gz"  # rocm_sdk namespace
-            $ROCmTorchWheelUrls = @(
-                "$rb/rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
-                "$rb/rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
-                "$rb/rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
-                "$rb/torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                "$rb/torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                "$rb/torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
-            )
-        } elseif ($ROCmVersion -match '^7\.1') {
-            $rb = "$amdWheelBase/rocm-rel-7.1.1"
-            $ROCmTarballUrl = "$rb/rocm-0.1.dev0.tar.gz"  # rocm_sdk namespace
-            $ROCmTorchWheelUrls = @(
-                "$rb/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
-                "$rb/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
-                "$rb/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                "$rb/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                "$rb/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
-            )
+    if ($pyMajMin -eq "3.12") {
+        $sel = Select-ROCmWheelRelease -GfxArch $ROCmGfxArch
+        if ($sel) {
+            $rb               = "$amdWheelBase/$($sel.Rel)"
+            $ROCmTarballUrl   = "$rb/$($sel.Tarball)"
+            $ROCmTorchWheelUrls = $sel.Wheels | ForEach-Object { "$rb/$_" }
+            $ROCmWheelTag     = $sel.Tag
         }
     }
 }
@@ -1894,7 +1945,7 @@ if ($HasROCm -and $CuTag -eq "cpu") {
 $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR.TrimEnd('/') } else { "https://download.pytorch.org/whl" }
 
 if ($ROCmTorchWheelUrls) {
-    substep "installing PyTorch (AMD ROCm $ROCmVersion)..."
+    substep "installing PyTorch ($ROCmWheelTag)..."
     # Install the rocm namespace tarball first (provides the 'rocm_sdk' Python
     # package that torch/_rocm_init.py imports at startup).
     if ($ROCmTarballUrl) {

From 4d09cbbda4dcd5e1ce683d1c3e36f512268524f8 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 05:22:03 -0500
Subject: [PATCH 061/165] fix(rocm/win): stub class metaclass for
 ProcessGroup.BackendType; amd-smi circuit breaker

torchao.float8.inference accesses ProcessGroup.BackendType as a class-level
attribute.  Plain type() stubs have no __getattr__ on the metaclass so this
raises AttributeError.  Introduce _StubClassMeta whose __getattr__ returns
child stub classes, fixing the torchao import chain.

Add an amd-smi circuit breaker in amd.py: after 3 consecutive failures the
module stops spawning the process, eliminating the repeated Windows UAC /
DiskPart elevation prompts caused by polling a non-functional amd-smi.

Also guard BNB_ROCM_VERSION=72 behind a DLL existence check so that, when the
Windows ROCm bnb wheel is not yet installed, bitsandbytes fails with its own
detection message rather than a harder "DLL not found" error.
---
 studio/backend/core/training/worker.py | 29 ++++++++++++++++++++++----
 studio/backend/utils/hardware/amd.py   | 19 +++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ad62ad459a..7d4202fe4b 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1111,6 +1111,20 @@ def _ga(attr, _m=m, _n=mod_name):
         m.__getattr__ = _ga
         return m
 
+    # Metaclass for stub *classes* so class-level attribute access works too.
+    # e.g. torchao does ProcessGroup.BackendType — plain type() has no __getattr__
+    # on the metaclass, so we need this to avoid AttributeError on class attrs.
+    class _StubClassMeta(type):
+        def __getattr__(cls, attr):
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child = _StubClassMeta(attr, (), {"__init__": lambda self, *a, **kw: None})
+            setattr(cls, attr, child)
+            return child
+
+    def _make_stub_class(name):
+        return _StubClassMeta(name, (), {"__init__": lambda self, *a, **kw: None})
+
     if sys.platform == "win32":
         _c10d_key = "torch._C._distributed_c10d"
         if _c10d_key not in sys.modules:  # guard: never overwrite real NVIDIA impl
@@ -1121,7 +1135,7 @@ def _ga(attr, _m=m, _n=mod_name):
             def _c10d_stub_getattr(_attr):
                 if _attr.startswith("__"):
                     raise AttributeError(_attr)
-                _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
+                _cls = _make_stub_class(_attr)
                 setattr(_c10d_stub, _attr, _cls)
                 return _cls
 
@@ -1165,7 +1179,7 @@ def _td_getattr(_attr):
                     _mod = sys.modules[_full]
                     setattr(_td, _attr, _mod)
                     return _mod
-                _cls = type(_attr, (), {"__init__": lambda self, *a, **kw: None})
+                _cls = _make_stub_class(_attr)
                 setattr(_td, _attr, _cls)
                 return _cls
             _td.__getattr__ = _td_getattr
@@ -1184,9 +1198,16 @@ def _td_getattr(_attr):
 
     # ── 1e. Point bitsandbytes at the ROCm 7.2 DLL on Windows ──
     # The AMD continuous-release wheel ships libbitsandbytes_rocm72.dll.
-    # BNB_ROCM_VERSION overrides the version string bnb uses to locate the DLL.
+    # Only set BNB_ROCM_VERSION when that DLL is actually present — setting it
+    # when the DLL is absent makes bnb fail harder than the default detection.
     if sys.platform == "win32" and os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
-        os.environ.setdefault("BNB_ROCM_VERSION", "72")
+        import importlib.util as _ilu
+        _bnb_spec = _ilu.find_spec("bitsandbytes")
+        if _bnb_spec and _bnb_spec.origin:
+            import pathlib as _pl
+            _bnb_dll = _pl.Path(_bnb_spec.origin).parent / "libbitsandbytes_rocm72.dll"
+            if _bnb_dll.exists():
+                os.environ.setdefault("BNB_ROCM_VERSION", "72")
 
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index 443566136a..d13cbdb545 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -27,9 +27,19 @@
 # can take 15-25 s on cold hardware.  Linux is consistently < 2 s.
 _AMD_SMI_DEFAULT_TIMEOUT = 30 if platform.system() == "Windows" else 10
 
+# Circuit breaker: stop calling amd-smi after this many consecutive failures.
+# On Windows, each failed call spawns a process that may show a UAC/DiskPart
+# elevation prompt.  Once we know amd-smi doesn't work we stop polling it.
+_AMD_SMI_FAILURE_LIMIT = 3
+_amd_smi_consecutive_failures = 0
+_amd_smi_disabled = False
+
 
 def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optional[Any]:
     """Run amd-smi with the given arguments and return parsed JSON, or None."""
+    global _amd_smi_consecutive_failures, _amd_smi_disabled
+    if _amd_smi_disabled:
+        return None
     try:
         result = subprocess.run(
             ["amd-smi", *args, "--json"],
@@ -41,10 +51,19 @@ def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optiona
         )
     except (OSError, subprocess.TimeoutExpired) as e:
         logger.warning("amd-smi query failed: %s", e)
+        _amd_smi_consecutive_failures += 1
+        if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
+            logger.warning("amd-smi unavailable -- disabling GPU polling to avoid repeated prompts")
+            _amd_smi_disabled = True
         return None
     if result.returncode != 0 or not result.stdout.strip():
         logger.warning("amd-smi returned code %d", result.returncode)
+        _amd_smi_consecutive_failures += 1
+        if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
+            logger.warning("amd-smi unavailable -- disabling GPU polling to avoid repeated prompts")
+            _amd_smi_disabled = True
         return None
+    _amd_smi_consecutive_failures = 0  # reset on success
     try:
         return json.loads(result.stdout)
     except json.JSONDecodeError:

From e64c1966dc9adfc4856ef29e8deeedc900e80103 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 05:32:13 -0500
Subject: [PATCH 062/165] fix: stub __members__ so torchao float8 enum check
 doesn't crash on ROCm Windows

torchao.float8.inference accesses ProcessGroup.BackendType.__members__
expecting a Python Enum registry dict. _StubClassMeta.__getattr__ was
blocking all dunder attributes, causing AttributeError. Return {} for
__members__ specifically so the isinstance/iteration checks pass cleanly.
---
 studio/backend/core/training/worker.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 7d4202fe4b..a9589bb65e 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1112,12 +1112,28 @@ def _ga(attr, _m=m, _n=mod_name):
         return m
 
     # Metaclass for stub *classes* so class-level attribute access works too.
-    # e.g. torchao does ProcessGroup.BackendType — plain type() has no __getattr__
-    # on the metaclass, so we need this to avoid AttributeError on class attrs.
+    # e.g. torchao / distributed_c10d does ProcessGroup.BackendType.NCCL —
+    # plain type() has no __getattr__ on the metaclass, so we need this to
+    # avoid AttributeError on arbitrary class-level attribute access.
+    #
+    # We intentionally do NOT use a real enum.Enum here: the C++ BackendType
+    # enum gains new members across PyTorch versions (XCCL was added in 2.6+)
+    # and hard-coding the list means every new member causes another crash.
+    # Instead _StubClassMeta auto-creates child stubs for any attr access, and
+    # the __members__ safety net satisfies Enum-duck-typing checks in torchao.
     class _StubClassMeta(type):
         def __getattr__(cls, attr):
+            if attr == "__members__":
+                # torchao checks ProcessGroup.BackendType.__members__ (Enum
+                # interface).  Return an empty dict — we have no real members
+                # to enumerate and the caller just iterates / checks membership.
+                return {}
             if attr.startswith("__"):
                 raise AttributeError(attr)
+            # Auto-create a child stub for any member access (BackendType.NCCL,
+            # BackendType.XCCL, BackendType.UNDEFINED, …).  We cache it on the
+            # class so repeated accesses return the same object (identity
+            # comparisons stay consistent).
             child = _StubClassMeta(attr, (), {"__init__": lambda self, *a, **kw: None})
             setattr(cls, attr, child)
             return child

From 26f073d642cdd30ab8e285d99fe0539d0625f4f3 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 05:53:18 -0500
Subject: [PATCH 063/165] fix: stub distributed tensor/functional_collectives
 to prevent missing C++ op crash on ROCm Windows
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

torch._dynamo.trace_rules eagerly loads torch.distributed.tensor at import
time, which pulls in _functional_collectives.py. That file registers Meta
kernels for _c10d_functional C++ ops, but those ops are only registered
by torch._C._distributed_c10d — a C extension absent from ROCm Windows
wheels. Pre-stubbing the affected modules in sys.modules prevents the real
import chain from running and avoids the "operator does not exist" crash.
---
 studio/backend/core/training/worker.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index a9589bb65e..a654b51f9b 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1177,6 +1177,27 @@ def _c10d_stub_getattr(_attr):
                 if _fsdp_name not in sys.modules:
                     sys.modules[_fsdp_name] = _make_mod_stub(_fsdp_name)
 
+            # torch._dynamo.trace_rules.get_torch_obj_rule_map() eagerly loads
+            # torch.distributed.tensor, which in turn imports
+            # torch.distributed._functional_collectives.  That module registers
+            # Meta kernels for ops in the _c10d_functional C++ namespace, but
+            # that namespace only exists when torch._C._distributed_c10d (the
+            # C extension absent from ROCm Windows wheels) has been loaded.
+            # Without it the impl() call raises "operator does not exist".
+            # Pre-stubbing these modules short-circuits the real import so
+            # torch._dynamo gets empty stub objects instead of crashing.
+            for _dist_name in (
+                "torch.distributed._functional_collectives",
+                "torch.distributed._functional_collectives_impl",
+                "torch.distributed.tensor",
+                "torch.distributed.tensor._ops",
+                "torch.distributed.tensor._ops._conv_ops",
+                "torch.distributed.tensor._dtensor_spec",
+                "torch.distributed.tensor.placement_types",
+            ):
+                if _dist_name not in sys.modules:
+                    sys.modules[_dist_name] = _make_mod_stub(_dist_name)
+
     try:
         import torch.distributed as _td
 

From b0732018e75d22f2e26fa7284bcd5d21b98cd362 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 05:59:13 -0500
Subject: [PATCH 064/165] fix: give mod stubs __path__ and pre-stub _tensor to
 fix 'not a package' import error

_make_mod_stub now sets __path__=[] so Python treats stub modules as
packages. Without it, any import of a submodule raises "is not a package".
Also pre-stub torch.distributed._tensor and its submodules so that
_tensor/__init__.py (which re-exports from torch.distributed.tensor) never
runs and torchao's `from torch.distributed._tensor import DTensor` gets a
harmless stub instead of crashing.
---
 studio/backend/core/training/worker.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index a654b51f9b..c2009b1230 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1100,8 +1100,14 @@ def run_training_process(
     }
 
     # Helper: build a ModuleType stub whose __getattr__ auto-creates child stubs.
+    # __path__ is set to [] so Python treats the stub as a package — without it,
+    # importing a submodule that is not already in sys.modules (e.g.
+    # "torch.distributed.tensor._foo") raises "is not a package", because the
+    # import system requires the parent module to expose a __path__ attribute.
     def _make_mod_stub(mod_name):
         m = _types.ModuleType(mod_name)
+        m.__path__ = []        # marks this as a package to the import system
+        m.__package__ = mod_name
         def _ga(attr, _m=m, _n=mod_name):
             if attr.startswith("__"):
                 raise AttributeError(attr)
@@ -1194,6 +1200,15 @@ def _c10d_stub_getattr(_attr):
                 "torch.distributed.tensor._ops._conv_ops",
                 "torch.distributed.tensor._dtensor_spec",
                 "torch.distributed.tensor.placement_types",
+                # torch.distributed._tensor is the legacy private alias of
+                # torch.distributed.tensor; its __init__.py re-exports
+                # submodules from torch.distributed.tensor (which we stubbed
+                # above), causing "is not a package" errors.  Stubbing _tensor
+                # directly short-circuits that __init__ so torchao's
+                # `from torch.distributed._tensor import DTensor` gets a stub.
+                "torch.distributed._tensor",
+                "torch.distributed._tensor.placement_types",
+                "torch.distributed._tensor.api",
             ):
                 if _dist_name not in sys.modules:
                     sys.modules[_dist_name] = _make_mod_stub(_dist_name)

From ce9098ae32633d411e0d78ff59532c33c883a742 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 06:07:50 -0500
Subject: [PATCH 065/165] fix: stub torch.ops._c10d_functional namespace with
 hashable op sentinels

torchao.dtypes.nf4tensor uses _c10d_functional ops as dict keys at import
time (all_gather_into_tensor.default, wait_tensor.default) and
torch.ops.c10d.scatter_.default. None of these ops are registered on ROCm
Windows because torch._C._distributed_c10d (the C extension) doesn't ship.
Replace the whole _c10d_functional namespace with a custom stub whose ops
return hashable .default objects, so dict-key construction doesn't crash.
Also inject a scatter_ stub into torch.ops.c10d if it's missing.
---
 studio/backend/core/training/worker.py | 65 +++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index c2009b1230..e8bdd59af3 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1248,7 +1248,70 @@ def _td_getattr(_attr):
         except Exception:
             pass
 
-    # ── 1e. Point bitsandbytes at the ROCm 7.2 DLL on Windows ──
+    # ── 1e. Stub torch.ops._c10d_functional (and c10d.scatter_) ──
+    # torchao.dtypes.nf4tensor accesses these at *import time* as dict keys:
+    #   NF4_OPS_TABLE = {
+    #       torch.ops._c10d_functional.all_gather_into_tensor.default: ...,
+    #       torch.ops._c10d_functional.wait_tensor.default: ...,   (decorator)
+    #       torch.ops.c10d.scatter_.default: ...,
+    #   }
+    # The real _c10d_functional ops are registered by torch._C._distributed_c10d
+    # (absent on ROCm Windows).  We replace the whole namespace with a stub
+    # whose op objects are hashable so dict-key usage doesn't crash.
+    try:
+        import torch as _torch_ops
+
+        class _C10dFunctionalOpDefault:
+            """Hashable stub for op.default — used as dict keys."""
+            __slots__ = ("_name",)
+            def __init__(self, name):
+                self._name = name
+            def __hash__(self):
+                return hash(("_c10d_functional_stub", self._name))
+            def __eq__(self, other):
+                return (type(other) is _C10dFunctionalOpDefault
+                        and self._name == other._name)
+            def __call__(self, *a, **kw):
+                return a[0] if a else None
+            def __repr__(self):
+                return f"torch.ops._c10d_functional.{self._name}.default"
+
+        class _C10dFunctionalOp:
+            """Stub for a single _c10d_functional op (has a .default attr)."""
+            __slots__ = ("_name", "default")
+            def __init__(self, name):
+                self._name = name
+                self.default = _C10dFunctionalOpDefault(name)
+            def __call__(self, *a, **kw):
+                return self.default(*a, **kw)
+            def __repr__(self):
+                return f"torch.ops._c10d_functional.{self._name}"
+
+        class _C10dFunctionalNamespace:
+            """Drop-in for torch.ops._c10d_functional; auto-stubs every op."""
+            def __getattr__(self, name):
+                if name.startswith("_"):
+                    raise AttributeError(name)
+                op = _C10dFunctionalOp(name)
+                object.__setattr__(self, name, op)
+                return op
+
+        _torch_ops.ops._c10d_functional = _C10dFunctionalNamespace()
+
+        # Also stub torch.ops.c10d.scatter_ if it's missing (same root cause).
+        try:
+            _ = _torch_ops.ops.c10d.scatter_.default
+        except AttributeError:
+            # c10d namespace exists but scatter_ op isn't registered; inject stub.
+            _c10d_scatter_stub = _C10dFunctionalOp("scatter_")
+            try:
+                setattr(_torch_ops.ops.c10d, "scatter_", _c10d_scatter_stub)
+            except Exception:
+                pass
+    except Exception:
+        pass
+
+    # ── 1g. Point bitsandbytes at the ROCm 7.2 DLL on Windows ──
     # The AMD continuous-release wheel ships libbitsandbytes_rocm72.dll.
     # Only set BNB_ROCM_VERSION when that DLL is actually present — setting it
     # when the DLL is absent makes bnb fail harder than the default detection.

From e778e0e39152857a581f5f5b78c154f8eb9bc1bf Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 06:09:51 -0500
Subject: [PATCH 066/165] fix: stub entire torchao package on ROCm Windows
 instead of individual ops

torchao is not supported on ROCm Windows and its import chain transitively
requires torch._C._distributed_c10d (absent from the ROCm Windows wheel).
Rather than stub each missing op one by one, stub the whole torchao package
upfront. Unsloth uses bitsandbytes for quantization, not torchao, so this
has no functional impact. transformers gracefully handles an importable-but-
empty torchao by disabling TorchAoHfQuantizer.
---
 studio/backend/core/training/worker.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index e8bdd59af3..8db2686f15 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1148,6 +1148,22 @@ def _make_stub_class(name):
         return _StubClassMeta(name, (), {"__init__": lambda self, *a, **kw: None})
 
     if sys.platform == "win32":
+        # torchao is not supported on ROCm Windows and its import chain
+        # transitively pulls in torch._C._distributed_c10d (absent from the
+        # ROCm Windows wheel), causing cascading AttributeErrors.  We don't
+        # use torchao quantization (unsloth uses bitsandbytes), so stub the
+        # entire package up-front.  transformers falls back gracefully when
+        # torchao is importable but empty.
+        for _tao_name in (
+            "torchao",
+            "torchao.quantization",
+            "torchao.dtypes",
+            "torchao.float8",
+            "torchao.utils",
+        ):
+            if _tao_name not in sys.modules:
+                sys.modules[_tao_name] = _make_mod_stub(_tao_name)
+
         _c10d_key = "torch._C._distributed_c10d"
         if _c10d_key not in sys.modules:  # guard: never overwrite real NVIDIA impl
             _c10d_stub = _types.ModuleType(_c10d_key)

From 3e57133c84a2a505bcb9bbe5ef1fc3dba30d35c2 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 06:16:05 -0500
Subject: [PATCH 067/165] fix: set __spec__ on mod stubs so
 importlib.util.find_spec doesn't raise

Manually-injected sys.modules entries have __spec__=None by default.
importlib.util.find_spec() raises ValueError when it finds a module in
sys.modules with __spec__=None (transformers.utils.import_utils hits this
when checking if torchao is available). Give every stub a minimal
ModuleSpec(name, loader=None, is_package=True) to satisfy find_spec.
---
 studio/backend/core/training/worker.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 8db2686f15..8701b16f1e 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1104,10 +1104,17 @@ def run_training_process(
     # any attempt to import a submodule (e.g. "torch.distributed.tensor._foo")
     # raises "is not a package" because Python checks __path__ before looking
     # in sys.modules for the child.
+    import importlib.machinery as _ilm
+
     def _make_mod_stub(mod_name):
         m = _types.ModuleType(mod_name)
         m.__path__ = []        # marks this as a package to the import system
         m.__package__ = mod_name
+        # importlib.util.find_spec() raises ValueError when __spec__ is None
+        # on a module that's already in sys.modules (our manually-injected
+        # stubs).  Give every stub a minimal ModuleSpec so find_spec() returns
+        # a spec object (non-None) instead of raising.
+        m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
         def _ga(attr, _m=m, _n=mod_name):
             if attr.startswith("__"):
                 raise AttributeError(attr)

From cf1021500888f06dea38d03f7ca304b1dd7913ea Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 06:25:32 -0500
Subject: [PATCH 068/165] fix: add meta path finder to auto-stub subpackages of
 stub modules

`import torchao.prototype` goes through the import machinery, not
__getattr__, so an empty __path__ means ModuleNotFoundError. Rather than
list every submodule explicitly, register a MetaPathFinder that intercepts
any import whose parent is one of our stubs (detected by loader=None in the
parent's ModuleSpec). Real installed packages always have a non-None loader
(SourceFileLoader or similar), so they are never intercepted. Also register
child stubs in sys.modules from __getattr__ as a belt-and-suspenders measure.
---
 studio/backend/core/training/worker.py | 40 +++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 8701b16f1e..a2af71b800 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1105,6 +1105,7 @@ def run_training_process(
     # raises "is not a package" because Python checks __path__ before looking
     # in sys.modules for the child.
     import importlib.machinery as _ilm
+    import importlib.abc as _ilabc
 
     def _make_mod_stub(mod_name):
         m = _types.ModuleType(mod_name)
@@ -1114,16 +1115,53 @@ def _make_mod_stub(mod_name):
         # on a module that's already in sys.modules (our manually-injected
         # stubs).  Give every stub a minimal ModuleSpec so find_spec() returns
         # a spec object (non-None) instead of raising.
+        # loader=None is our sentinel: _StubSubpackageFinder uses it to detect
+        # stub modules and auto-stub their children via the import machinery.
         m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
         def _ga(attr, _m=m, _n=mod_name):
             if attr.startswith("__"):
                 raise AttributeError(attr)
-            child = _make_mod_stub(f"{_n}.{attr}")
+            child_name = f"{_n}.{attr}"
+            child = _make_mod_stub(child_name)
+            sys.modules.setdefault(child_name, child)
             setattr(_m, attr, child)
             return child
         m.__getattr__ = _ga
         return m
 
+    class _StubSubpackageLoader(_ilabc.Loader):
+        """Creates a stub module for any subpackage of a stub package."""
+        def __init__(self, mod_name):
+            self._mod_name = mod_name
+        def create_module(self, spec):
+            return _make_mod_stub(self._mod_name)
+        def exec_module(self, module):
+            pass  # stub is fully initialised in create_module
+
+    class _StubSubpackageFinder(_ilabc.MetaPathFinder):
+        """Auto-stubs any subpackage import whose parent is one of our stubs.
+
+        Identified by parent.__spec__.loader is None (our sentinel).  Real
+        installed packages always have a SourceFileLoader or similar.  This
+        means we never accidentally intercept real subpackage loads.
+        """
+        def find_spec(self, fullname, path, target=None):
+            if "." not in fullname:
+                return None
+            parent_name = fullname.rsplit(".", 1)[0]
+            parent = sys.modules.get(parent_name)
+            if parent is None:
+                return None
+            parent_spec = getattr(parent, "__spec__", None)
+            if not isinstance(parent_spec, _ilm.ModuleSpec):
+                return None
+            if parent_spec.loader is not None:
+                return None  # real module — don't intercept
+            loader = _StubSubpackageLoader(fullname)
+            return _ilm.ModuleSpec(fullname, loader, is_package=True)
+
+    sys.meta_path.append(_StubSubpackageFinder())
+
     # Metaclass for stub *classes* so class-level attribute access works too.
     # e.g. torchao / distributed_c10d does ProcessGroup.BackendType.NCCL —
     # plain type() has no __getattr__ on the metaclass, so we need this to

From 32643198b2cd07b4d4ff14c00ea77b5e2f982e6e Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Mon, 11 May 2026 06:30:04 -0500
Subject: [PATCH 069/165] fix: use _unsloth_stub sentinel instead of
 loader=None for stub detection

The import machinery overwrites module.__spec__ with the spec returned by
find_spec (which has loader=_StubSubpackageLoader, not None), so the
loader=None check broke for second-level subpackages. Switch to a custom
_unsloth_stub object identity sentinel set directly on each stub module --
it survives __spec__ being replaced and correctly identifies stubs at any
depth (torchao.prototype.safetensors, etc.).
---
 studio/backend/core/training/worker.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index a2af71b800..246f7630c2 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1107,16 +1107,17 @@ def run_training_process(
     import importlib.machinery as _ilm
     import importlib.abc as _ilabc
 
+    _STUB_SENTINEL = object()  # identity tag placed on every stub module
+
     def _make_mod_stub(mod_name):
         m = _types.ModuleType(mod_name)
         m.__path__ = []        # marks this as a package to the import system
         m.__package__ = mod_name
-        # importlib.util.find_spec() raises ValueError when __spec__ is None
-        # on a module that's already in sys.modules (our manually-injected
-        # stubs).  Give every stub a minimal ModuleSpec so find_spec() returns
-        # a spec object (non-None) instead of raising.
-        # loader=None is our sentinel: _StubSubpackageFinder uses it to detect
-        # stub modules and auto-stub their children via the import machinery.
+        # _STUB_SENTINEL survives __spec__ being replaced by the import
+        # machinery (which overwrites __spec__ with the spec returned by
+        # find_spec, breaking the loader=None sentinel we used before).
+        m._unsloth_stub = _STUB_SENTINEL
+        # importlib.util.find_spec() raises ValueError when __spec__ is None.
         m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
         def _ga(attr, _m=m, _n=mod_name):
             if attr.startswith("__"):
@@ -1141,9 +1142,9 @@ def exec_module(self, module):
     class _StubSubpackageFinder(_ilabc.MetaPathFinder):
         """Auto-stubs any subpackage import whose parent is one of our stubs.
 
-        Identified by parent.__spec__.loader is None (our sentinel).  Real
-        installed packages always have a SourceFileLoader or similar.  This
-        means we never accidentally intercept real subpackage loads.
+        Uses _unsloth_stub sentinel on the module object — NOT __spec__.loader,
+        which the import machinery overwrites with the loader from find_spec,
+        breaking the loader=None check for second-level subpackages.
         """
         def find_spec(self, fullname, path, target=None):
             if "." not in fullname:
@@ -1152,11 +1153,8 @@ def find_spec(self, fullname, path, target=None):
             parent = sys.modules.get(parent_name)
             if parent is None:
                 return None
-            parent_spec = getattr(parent, "__spec__", None)
-            if not isinstance(parent_spec, _ilm.ModuleSpec):
-                return None
-            if parent_spec.loader is not None:
-                return None  # real module — don't intercept
+            if getattr(parent, "_unsloth_stub", None) is not _STUB_SENTINEL:
+                return None  # real installed module — don't intercept
             loader = _StubSubpackageLoader(fullname)
             return _ilm.ModuleSpec(fullname, loader, is_package=True)
 

From d731c5fc6385a69c4c77cc7ea8e1a5f5c38bf1f0 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 12:42:04 -0500
Subject: [PATCH 070/165] refactor(rocm/win): switch to repo.amd.com arch-aware
 index, remove stubs

AMD recommends repo.amd.com/rocm/whl/{arch}/ as the Windows ROCm wheel
source. These wheels bundle their own ROCm runtime, support all Python
versions (not just cp312), and include the full torch._C extension set
(including _distributed_c10d) that the old repo.radeon.com wheel omitted.

Changes:
- install.ps1: remove Select-ROCmWheelRelease + hardcoded cp312 wheel
  URLs; remove Python 3.12 forced-preference logic; install via
  --index-url repo.amd.com/rocm/whl/{arch-family}/
- studio/setup.ps1: same -- remove Select-ROCmWheelRelease, switch to
  repo.amd.com arch-aware index URL
- studio/install_python_stack.py: replace _ROCM_WINDOWS_RELEASES /
  _select_windows_rocm_release with _windows_rocm_index_url() using the
  _GFX_TO_AMD_INDEX_ARCH map; drop Python 3.12 restriction
- studio/backend/core/training/worker.py: remove all stub machinery
  (_make_mod_stub, _StubSubpackageFinder, _StubSubpackageLoader,
  _StubClassMeta, torchao/fsdp/dtensor stubs, _c10d_functional ops
  stubs, BNB DLL detection) -- no longer needed with new wheel source
---
 install.ps1                            | 195 +++--------------
 studio/backend/core/training/worker.py | 277 +------------------------
 studio/install_python_stack.py         | 115 +++-------
 studio/setup.ps1                       | 111 ++--------
 4 files changed, 85 insertions(+), 613 deletions(-)

diff --git a/install.ps1 b/install.ps1
index b3dfed5d65..53c0eea4a1 100644
--- a/install.ps1
+++ b/install.ps1
@@ -950,71 +950,13 @@ shell.Run cmd, 0, False
         return $null
     }
 
-    # ── Quick AMD GPU probe (before Python selection) ──
-    # Checks hipinfo and WMI now so we can pick Python 3.12 upfront if AMD is
-    # present (ROCm Windows wheels are cp312-only). The full GPU detection with
-    # version strings and display labels runs after venv creation below.
-    $_EarlyAmdDetected = $false
-    try {
-        $hipinfoEarly = Get-Command hipinfo -ErrorAction SilentlyContinue
-        if ($hipinfoEarly) {
-            $hipEarlyOut = & $hipinfoEarly.Source 2>&1 | Out-String
-            if ($LASTEXITCODE -eq 0 -and $hipEarlyOut -match "(?i)gcnArchName") {
-                $_EarlyAmdDetected = $true
-            }
-        }
-    } catch {}
-    if (-not $_EarlyAmdDetected) {
-        try {
-            $wmiGpuEarly = Get-WmiObject Win32_VideoController -ErrorAction SilentlyContinue |
-                Where-Object { $_.Name -match "AMD|Radeon" } | Select-Object -First 1
-            if ($wmiGpuEarly) { $_EarlyAmdDetected = $true }
-        } catch {}
-    }
-
     # ── Install Python if no compatible version (3.11-3.13) found ──
     # Find-CompatiblePython returns @{ Version = "3.13"; Path = "C:\...\python.exe" } or $null.
     Write-TauriLog "STEP" "Installing Python"
     $DetectedPython = Find-CompatiblePython
 
-    # If AMD GPU is present and we didn't land on Python 3.12, find 3.12 now
-    # before creating the venv -- avoids creating the environment twice.
-    if ($_EarlyAmdDetected -and $DetectedPython -and (($DetectedPython.Version -split '\.')[0..1] -join '.') -ne "3.12") {
-        $py312Pre = $null
-        $pyLauncherPre = Get-Command py -CommandType Application -ErrorAction SilentlyContinue
-        if ($pyLauncherPre -and $pyLauncherPre.Source -notmatch $script:CondaSkipPattern) {
-            try {
-                $out312 = & $pyLauncherPre.Source "-3.12" --version 2>&1 | Out-String
-                if ($out312 -match "Python 3\.12\.\d+") {
-                    $resolvedExe312 = (& $pyLauncherPre.Source "-3.12" -c "import sys; print(sys.executable)" 2>$null | Out-String).Trim()
-                    if ($resolvedExe312 -and (Test-Path $resolvedExe312) -and -not (Test-IsCondaPython $resolvedExe312)) {
-                        $py312Pre = @{ Version = "3.12"; Path = $resolvedExe312 }
-                    }
-                }
-            } catch {}
-        }
-        if (-not $py312Pre) {
-            foreach ($name312 in @("python3.12", "python3", "python")) {
-                foreach ($cmd312 in @(Get-Command $name312 -All -ErrorAction SilentlyContinue)) {
-                    if (-not $cmd312.Source -or $cmd312.Source -like "*\WindowsApps\*") { continue }
-                    if (Test-IsCondaPython $cmd312.Source) { continue }
-                    try {
-                        $out312 = & $cmd312.Source --version 2>&1 | Out-String
-                        if ($out312 -match "Python 3\.12\.\d+") { $py312Pre = @{ Version = "3.12"; Path = $cmd312.Source }; break }
-                    } catch {}
-                }
-                if ($py312Pre) { break }
-            }
-        }
-        if ($py312Pre) { $DetectedPython = $py312Pre }
-    }
-
     if ($DetectedPython) {
-        $pyStepLabel = "Python $($DetectedPython.Version) already installed"
-        if ($_EarlyAmdDetected -and (($DetectedPython.Version -split '\.')[0..1] -join '.') -eq "3.12") {
-            $pyStepLabel = "Python 3.12 selected (ROCm wheels are cp312-only)"
-        }
-        step "python" $pyStepLabel
+        step "python" "Python $($DetectedPython.Version) already installed"
     }
     if (-not $DetectedPython) {
         substep "installing Python ${PythonVersion}..."
@@ -1345,14 +1287,6 @@ shell.Run cmd, 0, False
         substep "Training and GPU inference require an NVIDIA or AMD ROCm GPU." "Yellow"
     }
 
-    # Warn if AMD GPU is present but the venv still isn't Python 3.12.
-    # The early probe above covers the normal case; this fires only when the
-    # full GPU detection reveals AMD that the early probe missed (e.g. hipinfo
-    # not yet on PATH) and 3.12 still wasn't found.
-    if (($HasROCm -or $ROCmGpuLabel) -and $DetectedPython -and (($DetectedPython.Version -split '\.')[0..1] -join '.') -ne "3.12") {
-        substep "AMD GPU requires Python 3.12 for ROCm wheels -- install it from python.org and re-run." "Yellow"
-    }
-
     # ── Choose the correct PyTorch index URL based on driver CUDA version ──
     # Mirrors Get-PytorchCudaTag in setup.ps1.
     function Get-TorchIndexUrl {
@@ -1379,101 +1313,41 @@ shell.Run cmd, 0, False
     # Wheels bundle their own ROCm runtime; the installed HIP SDK version does
     # not constrain which release to use.  Always picks the newest release that
     # supports the GPU architecture.
-    function Select-ROCmWheelRelease {
-        param([string]$GfxArch)
-
-        # Available releases, newest first.
-        $releases = @(
-            @{
-                Rel     = "rocm-rel-7.2.1"
-                Tag     = "rocm7.2"
-                RocmVer = @(7, 2)
-                Tarball = "rocm-7.2.1.tar.gz"
-                Wheels  = @(
-                    "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
-                    "rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
-                    "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
-                    "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                    "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                    "torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
-                )
-            },
-            @{
-                Rel     = "rocm-rel-7.1.1"
-                Tag     = "rocm7.1"
-                RocmVer = @(7, 1)
-                Tarball = "rocm-0.1.dev0.tar.gz"
-                Wheels  = @(
-                    "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
-                    "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
-                    "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                    "torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                    "torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
-                )
-            }
-        )
-
-        # GPU arch → minimum (major, minor) ROCm release needed.
-        $archMin = @{
-            "gfx1201" = @(7,1); "gfx1200" = @(7,1)   # RDNA 4
-            "gfx1151" = @(7,1); "gfx1150" = @(7,1)   # RDNA 3.5 (Strix Halo/Point)
-            "gfx1103" = @(6,4); "gfx1102" = @(6,4); "gfx1101" = @(6,4); "gfx1100" = @(6,4)  # RDNA 3
-            "gfx1036" = @(6,4); "gfx1035" = @(6,4); "gfx1034" = @(6,4); "gfx1033" = @(6,4)  # RDNA 2
-            "gfx1032" = @(6,4); "gfx1031" = @(6,4); "gfx1030" = @(6,4)
-            "gfx1011" = @(6,4); "gfx1010" = @(6,4)   # RDNA 1
-            "gfx906"  = @(6,4); "gfx908"  = @(6,4); "gfx90a" = @(6,4)   # Vega/MI
-        }
-        $minVer = if ($GfxArch -and $archMin.ContainsKey($GfxArch)) {
-            $archMin[$GfxArch]
+    # ── AMD Windows ROCm: arch-aware pip index (repo.amd.com) ──
+    # Wheels bundle their own ROCm runtime and support all Python versions.
+    # Override with UNSLOTH_ROCM_WINDOWS_MIRROR for air-gapped / mirror installs.
+    $ROCmIndexUrl = $null
+    if ($HasROCm -and $TorchIndexUrl -like "*/cpu" -and -not $SkipTorch) {
+        $amdIndexBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.amd.com/rocm/whl" }
+        $archFamilyMap = @{
+            "gfx1201" = "gfx120X-all"; "gfx1200" = "gfx120X-all"  # RDNA 4
+            "gfx1151" = "gfx1151";     "gfx1150" = "gfx1150"       # RDNA 3.5 (Strix Halo/Point)
+            "gfx1103" = "gfx110X-all"; "gfx1102" = "gfx110X-all"   # RDNA 3
+            "gfx1101" = "gfx110X-all"; "gfx1100" = "gfx110X-all"
+            "gfx90a"  = "gfx90a";      "gfx908"  = "gfx908"        # MI200/MI100
+        }
+        $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
+        if ($archFamily) {
+            $ROCmIndexUrl = "$amdIndexBase/$archFamily/"
+            $archLabel = if ($ROCmGfxArch) { $ROCmGfxArch } else { "AMD GPU" }
+            substep "$archLabel -- AMD repo.amd.com index selected" "Cyan"
+        } elseif ($ROCmGfxArch) {
+            substep "AMD GPU ($ROCmGfxArch) not in supported arch list -- falling back to CPU-only PyTorch" "Yellow"
         } else {
-            @(6, 4)  # unknown arch: try the latest (7.2.1 supports all modern GPUs)
+            substep "AMD GPU detected but arch unknown -- falling back to CPU-only PyTorch" "Yellow"
         }
-
-        foreach ($r in $releases) {
-            $rv = $r.RocmVer
-            $ok = ($rv[0] -gt $minVer[0]) -or ($rv[0] -eq $minVer[0] -and $rv[1] -ge $minVer[1])
-            if ($ok) { return $r }
-        }
-        return $null
     }
 
-    # ── AMD Windows ROCm wheel override ──
-    # Selects the newest wheel release compatible with the GPU arch (HIP SDK
-    # version is irrelevant; wheels bundle their own ROCm runtime).
-    $ROCmTorchWheelUrl = $null
-    $ROCmTarballUrl    = $null
-    $ROCmWheelTag      = $null
-    if ($HasROCm -and -not $SkipTorch) {
-        $pyMajMin = if ($DetectedPython) { ($DetectedPython.Version -split '\.')[0..1] -join '.' } else { "" }
-        if ($pyMajMin -eq "3.12") {
-            $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
-            $sel = Select-ROCmWheelRelease -GfxArch $ROCmGfxArch
-            if ($sel) {
-                $rb               = "$amdWheelBase/$($sel.Rel)"
-                $ROCmTarballUrl   = "$rb/$($sel.Tarball)"
-                $ROCmAllWheelUrls = $sel.Wheels | ForEach-Object { "$rb/$_" }
-                $ROCmTorchWheelUrl = ($ROCmAllWheelUrls | Where-Object { $_ -match '/torch-' })[0]
-                $ROCmWheelTag      = $sel.Tag
-                $TorchIndexUrl     = $null
-                $archLabel = if ($ROCmGfxArch) { $ROCmGfxArch } else { "AMD GPU" }
-                substep "$archLabel -- Windows torch wheel $($sel.Rel) selected" "Cyan"
-            } else {
-                substep "No AMD Windows torch wheel for GPU arch $ROCmGfxArch -- falling back to CPU-only PyTorch" "Yellow"
-            }
-        } else {
-            substep "AMD Windows ROCm wheels require Python 3.12 (detected: $pyMajMin) -- using CPU-only PyTorch" "Yellow"
-            substep "To enable ROCm training, reinstall with Python 3.12." "Yellow"
-        }
+    if ($ROCmIndexUrl) {
+        $TorchIndexFamily = "rocm"
+    } else {
+        $TorchIndexFamily = Get-TauriTorchIndexFamily $TorchIndexUrl
     }
-
-    $TorchIndexFamily = Get-TauriTorchIndexFamily $(
-        if ($ROCmTorchWheelUrl) { $ROCmWheelTag } else { $TorchIndexUrl }
-    )
     $GpuBranch = Get-TauriGpuBranch $TorchIndexFamily
     Write-TauriDiag -GpuBranch $GpuBranch -TorchIndexFamily $TorchIndexFamily -PythonVersionForDiag $DetectedPython.Version
 
     # ── Print CPU-only hint when no GPU detected ──
-    if (-not $SkipTorch -and -not $ROCmTorchWheelUrl -and $TorchIndexUrl -like "*/cpu") {
+    if (-not $SkipTorch -and -not $ROCmIndexUrl -and $TorchIndexUrl -like "*/cpu") {
         Write-Host ""
         if ($HasROCm -or $ROCmGpuLabel) {
             substep "Installing CPU-only PyTorch (ROCm wheels require the HIP SDK)." "Yellow"
@@ -1551,20 +1425,13 @@ shell.Run cmd, 0, False
                 return (Exit-InstallFailure "Failed to overlay unsloth-zoo (exit code $zooOverlayExit)" $zooOverlayExit)
             }
         }
-    } elseif ($TorchIndexUrl -or $ROCmTorchWheelUrl) {
+    } elseif ($TorchIndexUrl -or $ROCmIndexUrl) {
         if ($SkipTorch) {
             substep "skipping PyTorch (--no-torch flag set)." "Yellow"
-        } elseif ($ROCmTorchWheelUrl) {
+        } elseif ($ROCmIndexUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
-            substep "installing PyTorch ($ROCmWheelTag)..."
-            # rocm_sdk namespace tarball (torch/_rocm_init.py imports it at startup)
-            if ($ROCmTarballUrl) {
-                $tarballExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps $ROCmTarballUrl }
-                if ($tarballExit -ne 0) {
-                    Write-Host "[WARN] ROCm namespace tarball install failed (exit $tarballExit) -- continuing" -ForegroundColor Yellow
-                }
-            }
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --no-deps @ROCmAllWheelUrls }
+            substep "installing PyTorch from $ROCmIndexUrl..."
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --index-url $ROCmIndexUrl torch torchvision torchaudio }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 246f7630c2..f6bebaaa0e 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1084,10 +1084,10 @@ def run_training_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
-    # ── 1d. Ensure torch.distributed is importable before ML libs load ──
-    # Windows ROCm wheel lacks torch._C._distributed_c10d. Pre-stub it to handle
-    # both ImportError and lazy-load crashes from trl/transformers. The
-    # `not in sys.modules` guard preserves a real NVIDIA implementation.
+    # ── 1d. Ensure torch.distributed helper attrs are present ──
+    # Single-GPU training never initialises the process group, so these helpers
+    # are never called — but transformers/trl import them unconditionally at the
+    # module level and crash when they're missing.
     import types as _types
 
     _td_stubs = {
@@ -1099,290 +1099,23 @@ def run_training_process(
         "barrier": lambda: None,
     }
 
-    # Helper: build a ModuleType stub whose __getattr__ auto-creates child stubs.
-    # __path__ is set to [] so Python treats the stub as a package — without it,
-    # any attempt to import a submodule (e.g. "torch.distributed.tensor._foo")
-    # raises "is not a package" because Python checks __path__ before looking
-    # in sys.modules for the child.
-    import importlib.machinery as _ilm
-    import importlib.abc as _ilabc
-
-    _STUB_SENTINEL = object()  # identity tag placed on every stub module
-
-    def _make_mod_stub(mod_name):
-        m = _types.ModuleType(mod_name)
-        m.__path__ = []        # marks this as a package to the import system
-        m.__package__ = mod_name
-        # _STUB_SENTINEL survives __spec__ being replaced by the import
-        # machinery (which overwrites __spec__ with the spec returned by
-        # find_spec, breaking the loader=None sentinel we used before).
-        m._unsloth_stub = _STUB_SENTINEL
-        # importlib.util.find_spec() raises ValueError when __spec__ is None.
-        m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
-        def _ga(attr, _m=m, _n=mod_name):
-            if attr.startswith("__"):
-                raise AttributeError(attr)
-            child_name = f"{_n}.{attr}"
-            child = _make_mod_stub(child_name)
-            sys.modules.setdefault(child_name, child)
-            setattr(_m, attr, child)
-            return child
-        m.__getattr__ = _ga
-        return m
-
-    class _StubSubpackageLoader(_ilabc.Loader):
-        """Creates a stub module for any subpackage of a stub package."""
-        def __init__(self, mod_name):
-            self._mod_name = mod_name
-        def create_module(self, spec):
-            return _make_mod_stub(self._mod_name)
-        def exec_module(self, module):
-            pass  # stub is fully initialised in create_module
-
-    class _StubSubpackageFinder(_ilabc.MetaPathFinder):
-        """Auto-stubs any subpackage import whose parent is one of our stubs.
-
-        Uses _unsloth_stub sentinel on the module object — NOT __spec__.loader,
-        which the import machinery overwrites with the loader from find_spec,
-        breaking the loader=None check for second-level subpackages.
-        """
-        def find_spec(self, fullname, path, target=None):
-            if "." not in fullname:
-                return None
-            parent_name = fullname.rsplit(".", 1)[0]
-            parent = sys.modules.get(parent_name)
-            if parent is None:
-                return None
-            if getattr(parent, "_unsloth_stub", None) is not _STUB_SENTINEL:
-                return None  # real installed module — don't intercept
-            loader = _StubSubpackageLoader(fullname)
-            return _ilm.ModuleSpec(fullname, loader, is_package=True)
-
-    sys.meta_path.append(_StubSubpackageFinder())
-
-    # Metaclass for stub *classes* so class-level attribute access works too.
-    # e.g. torchao / distributed_c10d does ProcessGroup.BackendType.NCCL —
-    # plain type() has no __getattr__ on the metaclass, so we need this to
-    # avoid AttributeError on arbitrary class-level attribute access.
-    #
-    # We intentionally do NOT use a real enum.Enum here: the C++ BackendType
-    # enum gains new members across PyTorch versions (XCCL was added in 2.6+)
-    # and hard-coding the list means every new member causes another crash.
-    # Instead _StubClassMeta auto-creates child stubs for any attr access, and
-    # the __members__ safety net satisfies Enum-duck-typing checks in torchao.
-    class _StubClassMeta(type):
-        def __getattr__(cls, attr):
-            if attr == "__members__":
-                # torchao checks ProcessGroup.BackendType.__members__ (Enum
-                # interface).  Return an empty dict — we have no real members
-                # to enumerate and the caller just iterates / checks membership.
-                return {}
-            if attr.startswith("__"):
-                raise AttributeError(attr)
-            # Auto-create a child stub for any member access (BackendType.NCCL,
-            # BackendType.XCCL, BackendType.UNDEFINED, …).  We cache it on the
-            # class so repeated accesses return the same object (identity
-            # comparisons stay consistent).
-            child = _StubClassMeta(attr, (), {"__init__": lambda self, *a, **kw: None})
-            setattr(cls, attr, child)
-            return child
-
-    def _make_stub_class(name):
-        return _StubClassMeta(name, (), {"__init__": lambda self, *a, **kw: None})
-
-    if sys.platform == "win32":
-        # torchao is not supported on ROCm Windows and its import chain
-        # transitively pulls in torch._C._distributed_c10d (absent from the
-        # ROCm Windows wheel), causing cascading AttributeErrors.  We don't
-        # use torchao quantization (unsloth uses bitsandbytes), so stub the
-        # entire package up-front.  transformers falls back gracefully when
-        # torchao is importable but empty.
-        for _tao_name in (
-            "torchao",
-            "torchao.quantization",
-            "torchao.dtypes",
-            "torchao.float8",
-            "torchao.utils",
-        ):
-            if _tao_name not in sys.modules:
-                sys.modules[_tao_name] = _make_mod_stub(_tao_name)
-
-        _c10d_key = "torch._C._distributed_c10d"
-        if _c10d_key not in sys.modules:  # guard: never overwrite real NVIDIA impl
-            _c10d_stub = _types.ModuleType(_c10d_key)
-
-            # ROCm Windows wheels omit this C extension; auto-stub every
-            # missing symbol so torch._dynamo's fsdp imports don't crash.
-            def _c10d_stub_getattr(_attr):
-                if _attr.startswith("__"):
-                    raise AttributeError(_attr)
-                _cls = _make_stub_class(_attr)
-                setattr(_c10d_stub, _attr, _cls)
-                return _cls
-
-            _c10d_stub.__getattr__ = _c10d_stub_getattr
-            sys.modules[_c10d_key] = _c10d_stub
-            try:
-                import torch._C as _torch_C_mod  # C ext — always importable
-                if not hasattr(_torch_C_mod, "_distributed_c10d"):
-                    _torch_C_mod._distributed_c10d = _c10d_stub
-            except Exception:
-                pass
-
-            # Pre-register torch.distributed.fsdp submodules as stubs so
-            # torch._dynamo's module-level fsdp import short-circuits before
-            # the real package loads (it has a circular import on ROCm Windows).
-            for _fsdp_name in (
-                "torch.distributed.fsdp",
-                "torch.distributed.fsdp._flat_param",
-                "torch.distributed.fsdp._fully_shard",
-                "torch.distributed.fsdp._fsdp_param_group",
-                "torch.distributed.fsdp._common_utils",
-            ):
-                if _fsdp_name not in sys.modules:
-                    sys.modules[_fsdp_name] = _make_mod_stub(_fsdp_name)
-
-            # torch._dynamo.trace_rules.get_torch_obj_rule_map() eagerly loads
-            # torch.distributed.tensor, which in turn imports
-            # torch.distributed._functional_collectives.  That module registers
-            # Meta kernels for ops in the _c10d_functional C++ namespace, but
-            # that namespace only exists when torch._C._distributed_c10d (the
-            # C extension absent from ROCm Windows wheels) has been loaded.
-            # Without it the impl() call raises "operator does not exist".
-            # Pre-stubbing these modules short-circuits the real import so
-            # torch._dynamo gets empty stub objects instead of crashing.
-            for _dist_name in (
-                "torch.distributed._functional_collectives",
-                "torch.distributed._functional_collectives_impl",
-                "torch.distributed.tensor",
-                "torch.distributed.tensor._ops",
-                "torch.distributed.tensor._ops._conv_ops",
-                "torch.distributed.tensor._dtensor_spec",
-                "torch.distributed.tensor.placement_types",
-                # torch.distributed._tensor is the canonical private package;
-                # its __init__.py tries to re-export submodules from
-                # torch.distributed.tensor (which we stubbed above), causing
-                # "is not a package" errors.  Stubbing _tensor directly
-                # short-circuits that __init__ so torchao's
-                # `from torch.distributed._tensor import DTensor` gets a stub.
-                "torch.distributed._tensor",
-                "torch.distributed._tensor.placement_types",
-                "torch.distributed._tensor.api",
-            ):
-                if _dist_name not in sys.modules:
-                    sys.modules[_dist_name] = _make_mod_stub(_dist_name)
-
     try:
         import torch.distributed as _td
 
         for _name, _stub in _td_stubs.items():
             if not hasattr(_td, _name):
                 setattr(_td, _name, _stub)
-        # Stub C-extension-backed class attrs (Store, ProcessGroup, …) that
-        # the ROCm Windows wheel omits. __getattr__ checks sys.modules first
-        # so it never intercepts real subpackage lookups as plain classes.
-        if not hasattr(_td, "__getattr__"):
-            def _td_getattr(_attr):
-                if _attr.startswith("__"):
-                    raise AttributeError(_attr)
-                _full = f"torch.distributed.{_attr}"
-                if _full in sys.modules:
-                    _mod = sys.modules[_full]
-                    setattr(_td, _attr, _mod)
-                    return _mod
-                _cls = _make_stub_class(_attr)
-                setattr(_td, _attr, _cls)
-                return _cls
-            _td.__getattr__ = _td_getattr
     except Exception:
-        _td_mock = _make_mod_stub("torch.distributed")
+        _td_mock = _types.ModuleType("torch.distributed")
         for _name, _stub in _td_stubs.items():
             setattr(_td_mock, _name, _stub)
         sys.modules["torch.distributed"] = _td_mock
-        if "torch._C._distributed_c10d" not in sys.modules:
-            sys.modules["torch._C._distributed_c10d"] = _make_mod_stub("torch._C._distributed_c10d")
         try:
             import torch as _torch
             _torch.distributed = _td_mock
         except Exception:
             pass
 
-    # ── 1e. Stub torch.ops._c10d_functional (and c10d.scatter_) ──
-    # torchao.dtypes.nf4tensor accesses these at *import time* as dict keys:
-    #   NF4_OPS_TABLE = {
-    #       torch.ops._c10d_functional.all_gather_into_tensor.default: ...,
-    #       torch.ops._c10d_functional.wait_tensor.default: ...,   (decorator)
-    #       torch.ops.c10d.scatter_.default: ...,
-    #   }
-    # The real _c10d_functional ops are registered by torch._C._distributed_c10d
-    # (absent on ROCm Windows).  We replace the whole namespace with a stub
-    # whose op objects are hashable so dict-key usage doesn't crash.
-    try:
-        import torch as _torch_ops
-
-        class _C10dFunctionalOpDefault:
-            """Hashable stub for op.default — used as dict keys."""
-            __slots__ = ("_name",)
-            def __init__(self, name):
-                self._name = name
-            def __hash__(self):
-                return hash(("_c10d_functional_stub", self._name))
-            def __eq__(self, other):
-                return (type(other) is _C10dFunctionalOpDefault
-                        and self._name == other._name)
-            def __call__(self, *a, **kw):
-                return a[0] if a else None
-            def __repr__(self):
-                return f"torch.ops._c10d_functional.{self._name}.default"
-
-        class _C10dFunctionalOp:
-            """Stub for a single _c10d_functional op (has a .default attr)."""
-            __slots__ = ("_name", "default")
-            def __init__(self, name):
-                self._name = name
-                self.default = _C10dFunctionalOpDefault(name)
-            def __call__(self, *a, **kw):
-                return self.default(*a, **kw)
-            def __repr__(self):
-                return f"torch.ops._c10d_functional.{self._name}"
-
-        class _C10dFunctionalNamespace:
-            """Drop-in for torch.ops._c10d_functional; auto-stubs every op."""
-            def __getattr__(self, name):
-                if name.startswith("_"):
-                    raise AttributeError(name)
-                op = _C10dFunctionalOp(name)
-                object.__setattr__(self, name, op)
-                return op
-
-        _torch_ops.ops._c10d_functional = _C10dFunctionalNamespace()
-
-        # Also stub torch.ops.c10d.scatter_ if it's missing (same root cause).
-        try:
-            _ = _torch_ops.ops.c10d.scatter_.default
-        except AttributeError:
-            # c10d namespace exists but scatter_ op isn't registered; inject stub.
-            _c10d_scatter_stub = _C10dFunctionalOp("scatter_")
-            try:
-                setattr(_torch_ops.ops.c10d, "scatter_", _c10d_scatter_stub)
-            except Exception:
-                pass
-    except Exception:
-        pass
-
-    # ── 1g. Point bitsandbytes at the ROCm 7.2 DLL on Windows ──
-    # The AMD continuous-release wheel ships libbitsandbytes_rocm72.dll.
-    # Only set BNB_ROCM_VERSION when that DLL is actually present — setting it
-    # when the DLL is absent makes bnb fail harder than the default detection.
-    if sys.platform == "win32" and os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
-        import importlib.util as _ilu
-        _bnb_spec = _ilu.find_spec("bitsandbytes")
-        if _bnb_spec and _bnb_spec.origin:
-            import pathlib as _pl
-            _bnb_dll = _pl.Path(_bnb_spec.origin).parent / "libbitsandbytes_rocm72.dll"
-            if _bnb_dll.exists():
-                os.environ.setdefault("BNB_ROCM_VERSION", "72")
-
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:
         _send_status(event_queue, "Importing Unsloth...")
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index a2c15b4542..bdd80ef4b4 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -69,38 +69,22 @@
     os.environ.get("UNSLOTH_PYTORCH_MIRROR") or "https://download.pytorch.org/whl"
 ).rstrip("/")
 
-# AMD Windows ROCm wheels — repo.radeon.com (cp312 only)
-_ROCM_WINDOWS_WHEEL_BASE = (
+# AMD Windows ROCm wheels — repo.amd.com (arch-specific pip index)
+# Format: https://repo.amd.com/rocm/whl/{arch_family}/
+# Override with UNSLOTH_ROCM_WINDOWS_MIRROR for air-gapped / mirror installs.
+_ROCM_WINDOWS_INDEX_BASE = (
     os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR")
-    or "https://repo.radeon.com/rocm/windows"
+    or "https://repo.amd.com/rocm/whl"
 ).rstrip("/")
-# Maps (major, minor) → (release_folder, [wheel_filename, ...])
-_ROCM_WINDOWS_RELEASES: dict[tuple[int, int], tuple[str, list[str]]] = {
-    (7, 2): (
-        "rocm-rel-7.2.1",
-        [
-            # rocm tarball provides the 'rocm_sdk' Python namespace package
-            "rocm-7.2.1.tar.gz",
-            "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
-            "rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
-            "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
-            "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-            "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-            "torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-        ],
-    ),
-    (7, 1): (
-        "rocm-rel-7.1.1",
-        [
-            # rocm tarball provides the 'rocm_sdk' Python namespace package
-            "rocm-0.1.dev0.tar.gz",
-            "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
-            "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
-            "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-            "torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-            "torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-        ],
-    ),
+
+# Maps gfx arch → AMD index arch-family suffix.
+# Each family is a separate pip index on repo.amd.com.
+_GFX_TO_AMD_INDEX_ARCH: dict[str, str] = {
+    "gfx1201": "gfx120X-all", "gfx1200": "gfx120X-all",  # RDNA 4
+    "gfx1151": "gfx1151",     "gfx1150": "gfx1150",       # RDNA 3.5 (Strix Halo/Point)
+    "gfx1103": "gfx110X-all", "gfx1102": "gfx110X-all",   # RDNA 3
+    "gfx1101": "gfx110X-all", "gfx1100": "gfx110X-all",
+    "gfx90a":  "gfx90a",      "gfx908":  "gfx908",        # MI200/MI100
 }
 
 # bitsandbytes continuous-release_main wheels with the ROCm 4-bit GEMV fix
@@ -235,20 +219,6 @@ def _detect_rocm_version() -> tuple[int, int] | None:
     return None
 
 
-# GPU arch → minimum (major, minor) ROCm release that supports it on Windows.
-# Wheels bundle their own ROCm runtime, so the installed HIP SDK version does
-# not constrain selection — only the GPU's architecture minimum matters.
-_GFX_MIN_ROCM_WINDOWS: dict[str, tuple[int, int]] = {
-    "gfx1201": (7, 1), "gfx1200": (7, 1),  # RDNA 4
-    "gfx1151": (7, 1), "gfx1150": (7, 1),  # RDNA 3.5 (Strix Halo/Point)
-    "gfx1103": (6, 4), "gfx1102": (6, 4), "gfx1101": (6, 4), "gfx1100": (6, 4),  # RDNA 3
-    "gfx1036": (6, 4), "gfx1035": (6, 4), "gfx1034": (6, 4), "gfx1033": (6, 4),  # RDNA 2
-    "gfx1032": (6, 4), "gfx1031": (6, 4), "gfx1030": (6, 4),
-    "gfx1011": (6, 4), "gfx1010": (6, 4),  # RDNA 1
-    "gfx906": (6, 4), "gfx908": (6, 4), "gfx90a": (6, 4),  # Vega/MI
-}
-
-
 def _detect_windows_gfx_arch() -> str | None:
     """Return the gcnArchName from hipinfo on Windows (e.g. 'gfx1200'), or None."""
     import re
@@ -272,17 +242,12 @@ def _detect_windows_gfx_arch() -> str | None:
         return None
 
 
-def _select_windows_rocm_release(gfx_arch: str | None) -> tuple[str, list[str]] | None:
-    """Pick the best available Windows ROCm release for the given GPU arch.
-
-    Always selects the newest available release whose ROCm version meets the
-    GPU's minimum requirement.  Returns None when no release qualifies.
-    """
-    min_ver = _GFX_MIN_ROCM_WINDOWS.get(gfx_arch or "", (6, 4))
-    for (maj, mn), entry in sorted(_ROCM_WINDOWS_RELEASES.items(), reverse = True):
-        if (maj, mn) >= min_ver:
-            return entry
-    return None
+def _windows_rocm_index_url(gfx_arch: str | None) -> str | None:
+    """Return the AMD pip index URL for the given GPU arch, or None if unsupported."""
+    arch_family = _GFX_TO_AMD_INDEX_ARCH.get(gfx_arch or "")
+    if arch_family is None:
+        return None
+    return f"{_ROCM_WINDOWS_INDEX_BASE}/{arch_family}/"
 
 
 def _has_rocm_gpu() -> bool:
@@ -378,7 +343,7 @@ def _ensure_rocm_torch() -> None:
     """Reinstall torch with ROCm wheels when the venv received CPU-only torch.
 
     On Linux x86_64: uses pytorch.org ROCm wheel index tags.
-    On Windows (cp312 only): uses AMD's repo.radeon.com direct wheel releases.
+    On Windows: uses AMD's repo.amd.com arch-specific pip index.
     No-op on macOS, non-x86_64 Linux, NVIDIA-primary hosts, or when torch
     already links against HIP.
     Uses pip_install() to respect uv, constraints, and --python targeting.
@@ -392,13 +357,6 @@ def _ensure_rocm_torch() -> None:
         return
 
     if IS_WINDOWS:
-        # AMD only publishes Windows ROCm wheels for Python 3.12 (cp312)
-        if sys.version_info[:2] != (3, 12):
-            print(
-                f"   ROCm torch on Windows requires Python 3.12 "
-                f"(current: {sys.version_info[0]}.{sys.version_info[1]}) -- skipping"
-            )
-            return
         if _has_usable_nvidia_gpu():
             return
         gfx_arch = _detect_windows_gfx_arch()
@@ -425,34 +383,19 @@ def _ensure_rocm_torch() -> None:
                 return  # already ROCm torch
         except (OSError, subprocess.TimeoutExpired):
             pass
-        entry = _select_windows_rocm_release(gfx_arch)
-        if entry is None:
-            print(f"   No AMD Windows torch wheel for GPU arch {gfx_arch} -- skipping")
+        index_url = _windows_rocm_index_url(gfx_arch)
+        if index_url is None:
+            print(f"   No AMD Windows torch index for GPU arch {gfx_arch} -- skipping")
             return
-        rel_tag, wheel_files = entry
-        base = f"{_ROCM_WINDOWS_WHEEL_BASE}/{rel_tag}"
-        wheel_urls = [f"{base}/{fn}" for fn in wheel_files]
-        print(f"   {gfx_arch} (Windows) -- installing torch from {base}/")
-        # Install rocm namespace tarball first (torch/_rocm_init.py imports it)
-        tarball_url = next((u for u in wheel_urls if u.endswith(".tar.gz")), None)
-        whl_urls = [u for u in wheel_urls if not u.endswith(".tar.gz")]
-        if tarball_url:
-            pip_install(
-                f"ROCm namespace ({rel_tag})",
-                "--force-reinstall",
-                "--no-deps",
-                tarball_url,
-                constrain = False,
-            )
+        print(f"   {gfx_arch} (Windows) -- installing torch from {index_url}")
         pip_install(
-            f"ROCm torch (Windows, {rel_tag})",
+            f"ROCm torch (Windows, {gfx_arch})",
             "--force-reinstall",
-            "--no-deps",
-            *whl_urls,
+            "--index-url", index_url,
+            "torch", "torchvision", "torchaudio",
             constrain = False,
         )
-        # bitsandbytes Windows ROCm wheel (ships libbitsandbytes_rocm72.dll).
-        # BNB_ROCM_VERSION=72 is set in worker.py before the bnb import.
+        # bitsandbytes Windows ROCm wheel.
         _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
         if _bnb_win_url is not None:
             pip_install_try(
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index f1c7c10801..6519384a1a 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1861,114 +1861,43 @@ if ($HasNvidiaSmi) {
 # Wheels bundle their own ROCm runtime; the installed HIP SDK version does
 # not constrain which release to use.  Always picks the newest release that
 # supports the GPU architecture.
-function Select-ROCmWheelRelease {
-    param([string]$GfxArch)
-
-    # Available releases, newest first.
-    $releases = @(
-        @{
-            Rel     = "rocm-rel-7.2.1"
-            Tag     = "rocm7.2"
-            RocmVer = @(7, 2)
-            Tarball = "rocm-7.2.1.tar.gz"
-            Wheels  = @(
-                "rocm_sdk_core-7.2.1-py3-none-win_amd64.whl",
-                "rocm_sdk_devel-7.2.1-py3-none-win_amd64.whl",
-                "rocm_sdk_libraries_custom-7.2.1-py3-none-win_amd64.whl",
-                "torch-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                "torchvision-0.24.1+rocm7.2.1-cp312-cp312-win_amd64.whl",
-                "torchaudio-2.9.1+rocm7.2.1-cp312-cp312-win_amd64.whl"
-            )
-        },
-        @{
-            Rel     = "rocm-rel-7.1.1"
-            Tag     = "rocm7.1"
-            RocmVer = @(7, 1)
-            Tarball = "rocm-0.1.dev0.tar.gz"
-            Wheels  = @(
-                "rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl",
-                "rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl",
-                "torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                "torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl",
-                "torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl"
-            )
-        }
-    )
-
-    # GPU arch → minimum (major, minor) ROCm release needed.
-    $archMin = @{
-        "gfx1201" = @(7,1); "gfx1200" = @(7,1)   # RDNA 4
-        "gfx1151" = @(7,1); "gfx1150" = @(7,1)   # RDNA 3.5 (Strix Halo/Point)
-        "gfx1103" = @(6,4); "gfx1102" = @(6,4); "gfx1101" = @(6,4); "gfx1100" = @(6,4)  # RDNA 3
-        "gfx1036" = @(6,4); "gfx1035" = @(6,4); "gfx1034" = @(6,4); "gfx1033" = @(6,4)  # RDNA 2
-        "gfx1032" = @(6,4); "gfx1031" = @(6,4); "gfx1030" = @(6,4)
-        "gfx1011" = @(6,4); "gfx1010" = @(6,4)   # RDNA 1
-        "gfx906"  = @(6,4); "gfx908"  = @(6,4); "gfx90a" = @(6,4)   # Vega/MI
-    }
-    $minVer = if ($GfxArch -and $archMin.ContainsKey($GfxArch)) {
-        $archMin[$GfxArch]
-    } else {
-        @(6, 4)  # unknown arch: try the latest (7.2.1 supports all modern GPUs)
-    }
-
-    foreach ($r in $releases) {
-        $rv = $r.RocmVer
-        $ok = ($rv[0] -gt $minVer[0]) -or ($rv[0] -eq $minVer[0] -and $rv[1] -ge $minVer[1])
-        if ($ok) { return $r }
-    }
-    return $null
-}
-
 # ── AMD Windows ROCm torch override ──────────────────────────────────────────
-# Selects the newest wheel release compatible with the GPU arch (HIP SDK
-# version is irrelevant; wheels bundle their own ROCm runtime).
-$ROCmVersion = $script:ROCmVersion
+# Uses AMD's arch-specific pip index (repo.amd.com/rocm/whl/{arch}/).
+# Wheels bundle their own ROCm runtime; HIP SDK version is irrelevant.
 $ROCmGfxArch = $script:ROCmGfxArch
-$ROCmTorchWheelUrls = $null
-$ROCmTarballUrl     = $null
-$ROCmWheelTag       = $null
+$ROCmIndexUrl = $null
 if ($HasROCm -and $CuTag -eq "cpu") {
-    $pyVer = (& python --version 2>&1 | Out-String) -replace '[^0-9.]',''
-    $pyMajMin = ($pyVer.Trim() -split '\.')[0..1] -join '.'
-    $amdWheelBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.radeon.com/rocm/windows" }
-    if ($pyMajMin -eq "3.12") {
-        $sel = Select-ROCmWheelRelease -GfxArch $ROCmGfxArch
-        if ($sel) {
-            $rb               = "$amdWheelBase/$($sel.Rel)"
-            $ROCmTarballUrl   = "$rb/$($sel.Tarball)"
-            $ROCmTorchWheelUrls = $sel.Wheels | ForEach-Object { "$rb/$_" }
-            $ROCmWheelTag     = $sel.Tag
-        }
+    $amdIndexBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.amd.com/rocm/whl" }
+    $archFamilyMap = @{
+        "gfx1201" = "gfx120X-all"; "gfx1200" = "gfx120X-all"  # RDNA 4
+        "gfx1151" = "gfx1151";     "gfx1150" = "gfx1150"       # RDNA 3.5 (Strix Halo/Point)
+        "gfx1103" = "gfx110X-all"; "gfx1102" = "gfx110X-all"   # RDNA 3
+        "gfx1101" = "gfx110X-all"; "gfx1100" = "gfx110X-all"
+        "gfx90a"  = "gfx90a";      "gfx908"  = "gfx908"        # MI200/MI100
+    }
+    $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
+    if ($archFamily) {
+        $ROCmIndexUrl = "$amdIndexBase/$archFamily/"
     }
 }
 
 $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR.TrimEnd('/') } else { "https://download.pytorch.org/whl" }
 
-if ($ROCmTorchWheelUrls) {
-    substep "installing PyTorch ($ROCmWheelTag)..."
-    # Install the rocm namespace tarball first (provides the 'rocm_sdk' Python
-    # package that torch/_rocm_init.py imports at startup).
-    if ($ROCmTarballUrl) {
-        $tarballOut = Fast-Install --force-reinstall --no-deps $ROCmTarballUrl | Out-String
-        if ($LASTEXITCODE -ne 0) {
-            Write-Host "[WARN] ROCm namespace tarball install failed -- continuing" -ForegroundColor Yellow
-            Write-Host $tarballOut -ForegroundColor Yellow
-        }
-    }
-    # Install remaining SDK + torch wheels using array splatting.
-    $output = Fast-Install --force-reinstall --no-deps @ROCmTorchWheelUrls | Out-String
+if ($ROCmIndexUrl) {
+    substep "installing PyTorch (AMD ROCm, $ROCmGfxArch)..."
+    $output = Fast-Install --force-reinstall --index-url $ROCmIndexUrl torch torchvision torchaudio | Out-String
     $torchInstallExit = $LASTEXITCODE
     if ($torchInstallExit -ne 0) {
         Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow
         Write-Host $output -ForegroundColor Yellow
-        $ROCmTorchWheelUrls = $null
+        $ROCmIndexUrl = $null
     } else {
         # Tell install_python_stack.py to skip probe + suppress manual-install warning.
         $env:UNSLOTH_ROCM_TORCH_INSTALLED = "1"
     }
 }
 
-if (-not $ROCmTorchWheelUrls -and $CuTag -eq "cpu") {
+if (-not $ROCmIndexUrl -and $CuTag -eq "cpu") {
     substep "installing PyTorch (CPU-only)..."
     if ($script:UnslothVerbose) {
         Fast-Install torch torchvision torchaudio --index-url "$PyTorchWhlBase/cpu"
@@ -1983,7 +1912,7 @@ if (-not $ROCmTorchWheelUrls -and $CuTag -eq "cpu") {
         Write-Host $output -ForegroundColor Red
         exit 1
     }
-} elseif (-not $ROCmTorchWheelUrls) {
+} elseif (-not $ROCmIndexUrl) {
     substep "installing PyTorch with CUDA support ($CuTag)..."
     substep "(This download is ~2.8 GB -- may take a few minutes)"
     if ($script:UnslothVerbose) {

From 9c9d462ad61cea5c406d6dccb335cf35b91b8629 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 12:54:52 -0500
Subject: [PATCH 071/165] fix(rocm/win): restore _distributed_c10d + torchao
 stubs; fix BNB install
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

repo.amd.com torch wheels also omit torch._C._distributed_c10d on Windows
(RCCL is not shipped on Windows). torch/distributed/__init__.py imports
from it unconditionally at module level, so the stub must land in
sys.modules before any torch.distributed import.

torchao (pulled in by transformers.quantizers) walks
torchao.float8.distributed_utils -> torch.distributed._functional_collectives
-> distributed_c10d at import time. Stubbing torchao up-front short-circuits
that chain.

worker.py:
- Restore _make_mod_stub / _StubSubpackageFinder / _StubSubpackageLoader
- Restore _StubClassMeta for ProcessGroup.BackendType attribute access
- Restore _distributed_c10d stub with __getattr__ (Windows only)
- Restore torchao stubs (5 modules, Windows only)

install_python_stack.py:
- The BNB AMD wheel install was inside the early-return branch that fires
  when torch is already a ROCm build (installed by install.ps1). Move the
  BNB install outside that branch so it always runs on Windows ROCm — the
  PyPI bitsandbytes wheel ships only CUDA DLLs and fails to load on ROCm.
---
 studio/backend/core/training/worker.py | 109 ++++++++++++++++++++++++-
 studio/install_python_stack.py         |  34 ++++----
 2 files changed, 124 insertions(+), 19 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index f6bebaaa0e..c2eb6c0083 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1084,12 +1084,113 @@ def run_training_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
-    # ── 1d. Ensure torch.distributed helper attrs are present ──
-    # Single-GPU training never initialises the process group, so these helpers
-    # are never called — but transformers/trl import them unconditionally at the
-    # module level and crash when they're missing.
+    # ── 1d. Pre-stub torch._C._distributed_c10d and torchao ──
+    # Windows ROCm wheels (both repo.radeon.com and repo.amd.com) omit the
+    # _distributed_c10d C++ extension — RCCL is not shipped on Windows.
+    # torch/distributed/__init__.py and distributed_c10d.py both import from it
+    # unconditionally at module level, so the stub must be in sys.modules BEFORE
+    # any `import torch.distributed` call.
+    #
+    # torchao (pulled in by transformers.quantizers) imports
+    # torch.distributed._functional_collectives → distributed_c10d at import
+    # time.  Stubbing the entire torchao package short-circuits that chain.
     import types as _types
+    import importlib.machinery as _ilm
+    import importlib.abc as _ilabc
+
+    _STUB_SENTINEL = object()  # identity tag on every stub module
+
+    def _make_mod_stub(mod_name):
+        m = _types.ModuleType(mod_name)
+        m.__path__ = []
+        m.__package__ = mod_name
+        m._unsloth_stub = _STUB_SENTINEL
+        m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
+        def _ga(attr, _m=m, _n=mod_name):
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child_name = f"{_n}.{attr}"
+            child = _make_mod_stub(child_name)
+            sys.modules.setdefault(child_name, child)
+            setattr(_m, attr, child)
+            return child
+        m.__getattr__ = _ga
+        return m
+
+    class _StubSubpackageLoader(_ilabc.Loader):
+        def __init__(self, mod_name):
+            self._mod_name = mod_name
+        def create_module(self, spec):
+            return _make_mod_stub(self._mod_name)
+        def exec_module(self, module):
+            pass
+
+    class _StubSubpackageFinder(_ilabc.MetaPathFinder):
+        def find_spec(self, fullname, path, target=None):
+            if "." not in fullname:
+                return None
+            parent = sys.modules.get(fullname.rsplit(".", 1)[0])
+            if parent is None:
+                return None
+            if getattr(parent, "_unsloth_stub", None) is not _STUB_SENTINEL:
+                return None
+            return _ilm.ModuleSpec(fullname, _StubSubpackageLoader(fullname), is_package=True)
+
+    sys.meta_path.append(_StubSubpackageFinder())
+
+    # Metaclass so stub class attributes (e.g. ProcessGroup.BackendType.NCCL)
+    # don't raise AttributeError.
+    class _StubClassMeta(type):
+        def __getattr__(cls, attr):
+            if attr == "__members__":
+                return {}
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child = _StubClassMeta(attr, (), {"__init__": lambda self, *a, **kw: None})
+            setattr(cls, attr, child)
+            return child
+
+    def _make_stub_class(name):
+        return _StubClassMeta(name, (), {"__init__": lambda self, *a, **kw: None})
+
+    if sys.platform == "win32":
+        # Stub torchao up-front so its import chain never reaches
+        # torch.distributed._functional_collectives.
+        for _tao_name in (
+            "torchao",
+            "torchao.quantization",
+            "torchao.dtypes",
+            "torchao.float8",
+            "torchao.utils",
+        ):
+            if _tao_name not in sys.modules:
+                sys.modules[_tao_name] = _make_mod_stub(_tao_name)
+
+        # Stub torch._C._distributed_c10d so torch/distributed/__init__.py
+        # and distributed_c10d.py can import from it without crashing.
+        _c10d_key = "torch._C._distributed_c10d"
+        if _c10d_key not in sys.modules:
+            _c10d_stub = _types.ModuleType(_c10d_key)
+
+            def _c10d_stub_getattr(_attr):
+                if _attr.startswith("__"):
+                    raise AttributeError(_attr)
+                _cls = _make_stub_class(_attr)
+                setattr(_c10d_stub, _attr, _cls)
+                return _cls
+
+            _c10d_stub.__getattr__ = _c10d_stub_getattr
+            sys.modules[_c10d_key] = _c10d_stub
+            try:
+                import torch._C as _torch_C_mod
+                if not hasattr(_torch_C_mod, "_distributed_c10d"):
+                    _torch_C_mod._distributed_c10d = _c10d_stub
+            except Exception:
+                pass
 
+    # ── 1e. Ensure torch.distributed helper attrs are present ──
+    # Single-GPU training never initialises the process group, so these helpers
+    # are never called — but transformers/trl import them unconditionally.
     _td_stubs = {
         "is_initialized": lambda: False,
         "is_available": lambda: False,
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index bdd80ef4b4..ec2c4e14fe 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -362,6 +362,8 @@ def _ensure_rocm_torch() -> None:
         gfx_arch = _detect_windows_gfx_arch()
         if not gfx_arch:
             return  # no AMD GPU visible via hipinfo
+        # Probe whether torch already links against HIP.
+        _torch_already_rocm = False
         try:
             probe = subprocess.run(
                 [
@@ -379,23 +381,25 @@ def _ensure_rocm_torch() -> None:
                 timeout = 30,
             )
             if probe.returncode == 0 and probe.stdout.decode().strip() == "yes":
-                _rocm_windows_torch_installed = True
-                return  # already ROCm torch
+                _torch_already_rocm = True
         except (OSError, subprocess.TimeoutExpired):
             pass
-        index_url = _windows_rocm_index_url(gfx_arch)
-        if index_url is None:
-            print(f"   No AMD Windows torch index for GPU arch {gfx_arch} -- skipping")
-            return
-        print(f"   {gfx_arch} (Windows) -- installing torch from {index_url}")
-        pip_install(
-            f"ROCm torch (Windows, {gfx_arch})",
-            "--force-reinstall",
-            "--index-url", index_url,
-            "torch", "torchvision", "torchaudio",
-            constrain = False,
-        )
-        # bitsandbytes Windows ROCm wheel.
+        if not _torch_already_rocm:
+            index_url = _windows_rocm_index_url(gfx_arch)
+            if index_url is None:
+                print(f"   No AMD Windows torch index for GPU arch {gfx_arch} -- skipping")
+                return
+            print(f"   {gfx_arch} (Windows) -- installing torch from {index_url}")
+            pip_install(
+                f"ROCm torch (Windows, {gfx_arch})",
+                "--force-reinstall",
+                "--index-url", index_url,
+                "torch", "torchvision", "torchaudio",
+                constrain = False,
+            )
+        # Always install AMD Windows bitsandbytes — the PyPI wheel ships only
+        # CUDA DLLs and will fail to load on ROCm.  Install even when torch was
+        # already a ROCm build so that `studio update` repairs a broken bnb.
         _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
         if _bnb_win_url is not None:
             pip_install_try(

From 48406ad34040d50918686f9d48ea1a72f62e35ef Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 12:59:43 -0500
Subject: [PATCH 072/165] worker: remove _distributed_c10d stub; stub only
 torchao
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The installed torch/distributed/__init__.py from repo.amd.com
(torch==2.10.0+rocm7.12.0) is now properly guarded with
`if is_available():`, so `import torch.distributed` alone is safe.

The crash only comes via torchao's import chain:
  torchao.float8.distributed_utils
    → torch.distributed._functional_collectives (unguarded import)
    → torch.distributed.distributed_c10d
    → torch._C._distributed_c10d  ← absent on Windows ROCm

Stubbing torchao short-circuits the chain entirely. No need to stub
_distributed_c10d. Remove _StubClassMeta and the _c10d stub block;
keep only _make_mod_stub + _StubSubpackageFinder + torchao seeds.
---
 studio/backend/core/training/worker.py | 59 +++++---------------------
 1 file changed, 10 insertions(+), 49 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index c2eb6c0083..08bedae6b0 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1084,21 +1084,20 @@ def run_training_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
-    # ── 1d. Pre-stub torch._C._distributed_c10d and torchao ──
-    # Windows ROCm wheels (both repo.radeon.com and repo.amd.com) omit the
-    # _distributed_c10d C++ extension — RCCL is not shipped on Windows.
-    # torch/distributed/__init__.py and distributed_c10d.py both import from it
-    # unconditionally at module level, so the stub must be in sys.modules BEFORE
-    # any `import torch.distributed` call.
-    #
+    # ── 1d. Stub torchao on Windows ROCm ──
     # torchao (pulled in by transformers.quantizers) imports
-    # torch.distributed._functional_collectives → distributed_c10d at import
-    # time.  Stubbing the entire torchao package short-circuits that chain.
+    # torch.distributed._functional_collectives at module level, which imports
+    # distributed_c10d.py unconditionally — that file crashes on Windows ROCm
+    # because torch._C._distributed_c10d (the RCCL backend) is absent.
+    # torch/distributed/__init__.py itself is guarded by `if is_available()`
+    # so `import torch.distributed` alone is safe; the crash only comes via
+    # torchao's import chain.  Stubbing torchao short-circuits it entirely.
+    # _StubSubpackageFinder handles any depth of torchao.xxx.yyy imports.
     import types as _types
     import importlib.machinery as _ilm
     import importlib.abc as _ilabc
 
-    _STUB_SENTINEL = object()  # identity tag on every stub module
+    _STUB_SENTINEL = object()
 
     def _make_mod_stub(mod_name):
         m = _types.ModuleType(mod_name)
@@ -1138,24 +1137,8 @@ def find_spec(self, fullname, path, target=None):
 
     sys.meta_path.append(_StubSubpackageFinder())
 
-    # Metaclass so stub class attributes (e.g. ProcessGroup.BackendType.NCCL)
-    # don't raise AttributeError.
-    class _StubClassMeta(type):
-        def __getattr__(cls, attr):
-            if attr == "__members__":
-                return {}
-            if attr.startswith("__"):
-                raise AttributeError(attr)
-            child = _StubClassMeta(attr, (), {"__init__": lambda self, *a, **kw: None})
-            setattr(cls, attr, child)
-            return child
-
-    def _make_stub_class(name):
-        return _StubClassMeta(name, (), {"__init__": lambda self, *a, **kw: None})
-
     if sys.platform == "win32":
-        # Stub torchao up-front so its import chain never reaches
-        # torch.distributed._functional_collectives.
+        # Seed torchao top-level + key submodules; the finder handles the rest.
         for _tao_name in (
             "torchao",
             "torchao.quantization",
@@ -1166,28 +1149,6 @@ def _make_stub_class(name):
             if _tao_name not in sys.modules:
                 sys.modules[_tao_name] = _make_mod_stub(_tao_name)
 
-        # Stub torch._C._distributed_c10d so torch/distributed/__init__.py
-        # and distributed_c10d.py can import from it without crashing.
-        _c10d_key = "torch._C._distributed_c10d"
-        if _c10d_key not in sys.modules:
-            _c10d_stub = _types.ModuleType(_c10d_key)
-
-            def _c10d_stub_getattr(_attr):
-                if _attr.startswith("__"):
-                    raise AttributeError(_attr)
-                _cls = _make_stub_class(_attr)
-                setattr(_c10d_stub, _attr, _cls)
-                return _cls
-
-            _c10d_stub.__getattr__ = _c10d_stub_getattr
-            sys.modules[_c10d_key] = _c10d_stub
-            try:
-                import torch._C as _torch_C_mod
-                if not hasattr(_torch_C_mod, "_distributed_c10d"):
-                    _torch_C_mod._distributed_c10d = _c10d_stub
-            except Exception:
-                pass
-
     # ── 1e. Ensure torch.distributed helper attrs are present ──
     # Single-GPU training never initialises the process group, so these helpers
     # are never called — but transformers/trl import them unconditionally.

From f5278deef700fc88afad3ac796c3324ea7eca9e7 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 13:07:14 -0500
Subject: [PATCH 073/165] fix: BNB AMD wheel skipped + torch.compile segfault
 on Windows ROCm

install_python_stack.py: the UNSLOTH_ROCM_TORCH_INSTALLED=1 early-return
path (set by setup.ps1 when it installed torch itself) returned before
ever reaching the AMD BNB prerelease wheel install.  The PyPI
bitsandbytes==0.49.x ships only CUDA DLLs, so loading it on ROCm fails
with "libbitsandbytes_rocm72.dll not found".  Now installs the AMD
Windows BNB wheel before returning on that path too.

worker.py: torch._grouped_mm crashes on gfx1200 (null HIP kernel pointer,
0xC0000005) when torch.compile's JitDecomp system dispatches it during
the first forward pass.  Detect Windows ROCm via torch.version.hip
(torch is already in sys.modules from section 1e) and, unless the user
has already set it, set TORCHDYNAMO_DISABLE=1 to bypass the broken
kernel dispatch.
---
 studio/backend/core/training/worker.py | 17 +++++++++++++++++
 studio/install_python_stack.py         | 13 +++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 08bedae6b0..f1a1d65c44 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1178,6 +1178,23 @@ def find_spec(self, fullname, path, target=None):
         except Exception:
             pass
 
+    # ── 1f. Disable torch.compile on Windows ROCm ──
+    # torch._grouped_mm crashes on gfx1200 (null HIP kernel pointer, 0xC0000005).
+    # The crash is triggered via torch.compile's JitDecomp dispatch during the
+    # first forward pass (stack: _grouped_mm ← JitDecompRegisterer ← Python).
+    # Disabling dynamo entirely avoids the kernel dispatch.  torch is already
+    # in sys.modules from section 1e's `import torch.distributed`.
+    if sys.platform == "win32" and "TORCHDYNAMO_DISABLE" not in os.environ:
+        _torch_for_rocm_check = sys.modules.get("torch")
+        if _torch_for_rocm_check is not None and getattr(
+            getattr(_torch_for_rocm_check, "version", None), "hip", None
+        ):
+            os.environ["TORCHDYNAMO_DISABLE"] = "1"
+            logger.info(
+                "Windows ROCm detected — torch.compile disabled "
+                "(_grouped_mm kernel crashes on gfx1200 with 0xC0000005)"
+            )
+
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:
         _send_status(event_queue, "Importing Unsloth...")
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index ec2c4e14fe..f5b7b3c1c9 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -352,6 +352,19 @@ def _ensure_rocm_torch() -> None:
     # setup.ps1 sets this when it already installed AMD wheels; skip the probe.
     if os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
         _rocm_windows_torch_installed = True
+        # setup.ps1 already installed ROCm torch, but we still need to install
+        # the AMD Windows BNB wheel here — the PyPI bitsandbytes wheel ships
+        # only CUDA DLLs and will fail to load on ROCm (no libbitsandbytes_rocm72.dll).
+        _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
+        if _bnb_win_url is not None:
+            pip_install_try(
+                "bitsandbytes (AMD Windows, pre-release main)",
+                "--force-reinstall",
+                "--no-cache-dir",
+                "--no-deps",
+                _bnb_win_url,
+                constrain = False,
+            )
         return
     if IS_MACOS:
         return

From a4483dff107e5154b946528a6add2cca1ab1dab4 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 13:14:23 -0500
Subject: [PATCH 074/165] fix: BNB AMD wheel install fails uv wheel filename
 check

The bitsandbytes continuous-release wheel is intentionally mismatched:
filename encodes 1.33.7.preview (= 1.33.7rc0 in PEP 440) but wheel
metadata reports 0.50.0.dev0.  uv rejects this by default.

Introduce _install_bnb_windows_rocm() helper that sets
UV_SKIP_WHEEL_FILENAME_CHECK=1 only for this specific install, then
restores the previous env value.  Both BNB install call sites (the
UNSLOTH_ROCM_TORCH_INSTALLED early-return path and the normal Windows
ROCm path) now use this helper.
---
 studio/install_python_stack.py | 52 +++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index f5b7b3c1c9..147a4f8fb3 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -339,6 +339,36 @@ def _detect_amd_gfx_codes() -> list[str]:
 _rocm_windows_torch_installed: bool = False
 
 
+def _install_bnb_windows_rocm() -> None:
+    """Install the AMD Windows BNB prerelease wheel.
+
+    The continuous-release wheel is intentionally mismatched: the filename
+    encodes version 1.33.7.preview (parsed as 1.33.7rc0 by PEP 440) while the
+    wheel metadata reports 0.50.0.dev0.  uv rejects this by default; we set
+    UV_SKIP_WHEEL_FILENAME_CHECK=1 only for this install and restore the env
+    afterwards.
+    """
+    _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
+    if _bnb_win_url is None:
+        return
+    _prev = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
+    os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
+    try:
+        pip_install_try(
+            "bitsandbytes (AMD Windows, pre-release main)",
+            "--force-reinstall",
+            "--no-cache-dir",
+            "--no-deps",
+            _bnb_win_url,
+            constrain = False,
+        )
+    finally:
+        if _prev is None:
+            os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
+        else:
+            os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = _prev
+
+
 def _ensure_rocm_torch() -> None:
     """Reinstall torch with ROCm wheels when the venv received CPU-only torch.
 
@@ -355,16 +385,7 @@ def _ensure_rocm_torch() -> None:
         # setup.ps1 already installed ROCm torch, but we still need to install
         # the AMD Windows BNB wheel here — the PyPI bitsandbytes wheel ships
         # only CUDA DLLs and will fail to load on ROCm (no libbitsandbytes_rocm72.dll).
-        _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
-        if _bnb_win_url is not None:
-            pip_install_try(
-                "bitsandbytes (AMD Windows, pre-release main)",
-                "--force-reinstall",
-                "--no-cache-dir",
-                "--no-deps",
-                _bnb_win_url,
-                constrain = False,
-            )
+        _install_bnb_windows_rocm()
         return
     if IS_MACOS:
         return
@@ -413,16 +434,7 @@ def _ensure_rocm_torch() -> None:
         # Always install AMD Windows bitsandbytes — the PyPI wheel ships only
         # CUDA DLLs and will fail to load on ROCm.  Install even when torch was
         # already a ROCm build so that `studio update` repairs a broken bnb.
-        _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
-        if _bnb_win_url is not None:
-            pip_install_try(
-                "bitsandbytes (AMD Windows, pre-release main)",
-                "--force-reinstall",
-                "--no-cache-dir",
-                "--no-deps",
-                _bnb_win_url,
-                constrain = False,
-            )
+        _install_bnb_windows_rocm()
         _rocm_windows_torch_installed = True
         return
 

From a3c94e7f39445a9bbf7299611828c4f092923c41 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 13:28:20 -0500
Subject: [PATCH 075/165] worker: patch _grouped_mm CUDA dispatch on Windows
 ROCm (gfx1200 null kernel)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TORCHDYNAMO_DISABLE=1 stopped the compiler frontend but not the autograd
JitDecomp system, which also dispatches _grouped_mm and hits the same
null HIP kernel crash (0xC0000005).

Verified that torch.library.Library("aten","IMPL").impl("_grouped_mm", fn,
"CUDA") successfully overrides the broken HIP kernel with a Python mm
fallback on torch==2.10.0+rocm7.12.0.

Schema: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None,
                    Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor

The fallback handles both the simple case (offs=None → torch.mm) and the
grouped case (offs provided → split self by offsets, multiply each group
against the corresponding slice of mat2, then cat results).

Keep _WINDOWS_ROCM_GROUPED_MM_LIB alive at function scope to prevent the
C++ dispatch registration from being freed by GC.
---
 studio/backend/core/training/worker.py | 116 +++++++++++++++++++++----
 1 file changed, 101 insertions(+), 15 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index f1a1d65c44..e1b7d3156b 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1178,22 +1178,108 @@ def find_spec(self, fullname, path, target=None):
         except Exception:
             pass
 
-    # ── 1f. Disable torch.compile on Windows ROCm ──
-    # torch._grouped_mm crashes on gfx1200 (null HIP kernel pointer, 0xC0000005).
-    # The crash is triggered via torch.compile's JitDecomp dispatch during the
-    # first forward pass (stack: _grouped_mm ← JitDecompRegisterer ← Python).
-    # Disabling dynamo entirely avoids the kernel dispatch.  torch is already
-    # in sys.modules from section 1e's `import torch.distributed`.
-    if sys.platform == "win32" and "TORCHDYNAMO_DISABLE" not in os.environ:
-        _torch_for_rocm_check = sys.modules.get("torch")
-        if _torch_for_rocm_check is not None and getattr(
-            getattr(_torch_for_rocm_check, "version", None), "hip", None
+    # ── 1f. Windows ROCm runtime patches ──
+    # torch._grouped_mm has a null HIP kernel on gfx1200 (ROCm 7.12 Windows),
+    # causing 0xC0000005 (access violation) during training.
+    #
+    # Root cause: the JitDecomp autograd decomposition system (NOT torch.compile)
+    # dispatches _grouped_mm → _fused_adagrad_ → _grouped_mm HIP → null crash.
+    # TORCHDYNAMO_DISABLE=1 stops the compiler frontend but does NOT stop
+    # JitDecomp, so we must also override the CUDA dispatch key for _grouped_mm
+    # with a safe Python fallback.
+    #
+    # Verified on torch==2.10.0+rocm7.12.0:
+    #   torch.library.Library("aten","IMPL").impl("_grouped_mm", fn, "CUDA")
+    #   correctly overrides the HIP kernel and the call succeeds.
+    #
+    # Schema: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None,
+    #                     Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
+    #   offs: optional group-split offsets (MoE-style variable-size batches)
+    #
+    # torch is already in sys.modules from section 1e's `import torch.distributed`.
+    _WINDOWS_ROCM_GROUPED_MM_LIB = None  # kept alive to prevent GC of registration
+    if sys.platform == "win32":
+        _torch_for_rocm = sys.modules.get("torch")
+        if _torch_for_rocm is not None and getattr(
+            getattr(_torch_for_rocm, "version", None), "hip", None
         ):
-            os.environ["TORCHDYNAMO_DISABLE"] = "1"
-            logger.info(
-                "Windows ROCm detected — torch.compile disabled "
-                "(_grouped_mm kernel crashes on gfx1200 with 0xC0000005)"
-            )
+            # Disable dynamo (belt-and-suspenders; JitDecomp patch below is the
+            # real fix, but keeping dynamo off avoids any other compile paths).
+            if "TORCHDYNAMO_DISABLE" not in os.environ:
+                os.environ["TORCHDYNAMO_DISABLE"] = "1"
+                logger.info("Windows ROCm: torch.compile (dynamo) disabled")
+
+            # Patch _grouped_mm CUDA dispatch with a safe Python mm fallback.
+            try:
+                import warnings as _warnings
+
+                _gm_lib = _torch_for_rocm.library.Library("aten", "IMPL")
+
+                def _grouped_mm_safe_impl(
+                    self, mat2, offs=None, bias=None, out_dtype=None
+                ):
+                    """Safe fallback for _grouped_mm on gfx1200 (null HIP kernel)."""
+                    _t = _torch_for_rocm
+                    if offs is None:
+                        # Simple case: plain matrix multiply.
+                        result = _t.mm(self.contiguous(), mat2.contiguous())
+                    else:
+                        # Grouped case: offs[i] is the exclusive end-row of group i
+                        # in `self`; mat2 may be 3-D (num_groups, K, N) or 2-D.
+                        offs_list = offs.tolist()
+                        pieces = []
+                        prev = 0
+                        for idx, end in enumerate(offs_list):
+                            end = int(end)
+                            a_part = self[prev:end].contiguous()
+                            if mat2.dim() == 3:
+                                b_part = mat2[idx].contiguous()
+                            else:
+                                b_part = mat2.contiguous()
+                            pieces.append(_t.mm(a_part, b_part))
+                            prev = end
+                        # Include any trailing rows not covered by offs
+                        if prev < self.shape[0]:
+                            a_tail = self[prev:].contiguous()
+                            b_tail = (
+                                mat2[-1].contiguous()
+                                if mat2.dim() == 3
+                                else mat2.contiguous()
+                            )
+                            pieces.append(_t.mm(a_tail, b_tail))
+                        result = (
+                            _t.cat(pieces, dim=0)
+                            if pieces
+                            else _t.zeros(
+                                0,
+                                mat2.shape[-1],
+                                device=self.device,
+                                dtype=self.dtype,
+                            )
+                        )
+                    if bias is not None:
+                        result = result + bias
+                    if out_dtype is not None:
+                        result = result.to(out_dtype)
+                    elif result.dtype != self.dtype:
+                        result = result.to(self.dtype)
+                    return result
+
+                with _warnings.catch_warnings():
+                    _warnings.simplefilter("ignore")
+                    _gm_lib.impl("_grouped_mm", _grouped_mm_safe_impl, "CUDA")
+
+                _WINDOWS_ROCM_GROUPED_MM_LIB = _gm_lib  # prevent GC
+                logger.info(
+                    "Windows ROCm: patched _grouped_mm CUDA dispatch "
+                    "(null HIP kernel on gfx1200 bypassed with safe mm fallback)"
+                )
+            except Exception as _patch_exc:
+                logger.warning(
+                    "Windows ROCm: could not patch _grouped_mm — "
+                    "training may crash with 0xC0000005: %s",
+                    _patch_exc,
+                )
 
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:

From a87077ef6911b79dc32aae3d37a5198f106fe0a8 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 14 May 2026 13:50:09 -0500
Subject: [PATCH 076/165] =?UTF-8?q?worker:=20fix=20torchao=20stub=20?=
 =?UTF-8?q?=E2=80=94=20return=20stub=20classes=20not=20modules=20for=20isi?=
 =?UTF-8?q?nstance()?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

peft/tuners/lora/torchao.py does:
  from torchao.dtypes import AffineQuantizedTensor, LinearActivationQuantizedTensor
  isinstance(weight, (AffineQuantizedTensor, LinearActivationQuantizedTensor))

The stub __getattr__ was returning stub modules, which isinstance() rejects
with "arg 2 must be a type, a tuple of types, or a union".

Add _StubTypeMeta metaclass whose __instancecheck__ always returns False,
and _make_stub_type() to create stub classes via it. Change _make_mod_stub
__getattr__ to return stub classes instead of stub modules for leaf
attribute access, so isinstance() gets a valid type and returns False.

_StubSubpackageFinder still handles import-style subpackage creation
(those still need module objects in sys.modules); __getattr__ only fires
for from-import or direct attribute access, which are the isinstance paths.
---
 studio/backend/core/training/worker.py | 30 +++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index e1b7d3156b..75c1bd76e8 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1099,6 +1099,30 @@ def run_training_process(
 
     _STUB_SENTINEL = object()
 
+    # Metaclass for stub types so that isinstance(x, StubClass) returns False
+    # instead of raising TypeError ("arg 2 must be a type").
+    # peft/tuners/lora/torchao.py does:
+    #   from torchao.dtypes import AffineQuantizedTensor, LinearActivationQuantizedTensor
+    #   isinstance(weight, (AffineQuantizedTensor, LinearActivationQuantizedTensor))
+    # If those names resolve to stub modules rather than types, isinstance() raises.
+    class _StubTypeMeta(type):
+        def __instancecheck__(cls, instance):
+            return False
+        def __subclasscheck__(cls, subclass):
+            return False
+        def __getattr__(cls, attr):
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child = _StubTypeMeta(attr, (), {})
+            setattr(cls, attr, child)
+            return child
+        def __call__(cls, *args, **kwargs):
+            return None
+
+    def _make_stub_type(name):
+        """Stub class: accepted by isinstance() (always False), supports attr access."""
+        return _StubTypeMeta(name, (), {})
+
     def _make_mod_stub(mod_name):
         m = _types.ModuleType(mod_name)
         m.__path__ = []
@@ -1108,9 +1132,9 @@ def _make_mod_stub(mod_name):
         def _ga(attr, _m=m, _n=mod_name):
             if attr.startswith("__"):
                 raise AttributeError(attr)
-            child_name = f"{_n}.{attr}"
-            child = _make_mod_stub(child_name)
-            sys.modules.setdefault(child_name, child)
+            # Return a stub CLASS (not a module) so that isinstance(x, attr)
+            # works and returns False instead of raising TypeError.
+            child = _make_stub_type(f"{_n}.{attr}")
             setattr(_m, attr, child)
             return child
         m.__getattr__ = _ga

From 769790ec9c4776e526fe154867c8a2938a97f73b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 14 May 2026 19:49:23 +0000
Subject: [PATCH 077/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 26 ++++++++++++++++-------
 studio/backend/main.py                 |  1 +
 studio/backend/utils/hardware/amd.py   |  8 +++++--
 studio/install_python_stack.py         | 29 +++++++++++++++++---------
 4 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 75c1bd76e8..f586081582 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1108,14 +1108,17 @@ def run_training_process(
     class _StubTypeMeta(type):
         def __instancecheck__(cls, instance):
             return False
+
         def __subclasscheck__(cls, subclass):
             return False
+
         def __getattr__(cls, attr):
             if attr.startswith("__"):
                 raise AttributeError(attr)
             child = _StubTypeMeta(attr, (), {})
             setattr(cls, attr, child)
             return child
+
         def __call__(cls, *args, **kwargs):
             return None
 
@@ -1128,8 +1131,9 @@ def _make_mod_stub(mod_name):
         m.__path__ = []
         m.__package__ = mod_name
         m._unsloth_stub = _STUB_SENTINEL
-        m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
-        def _ga(attr, _m=m, _n=mod_name):
+        m.__spec__ = _ilm.ModuleSpec(mod_name, loader = None, is_package = True)
+
+        def _ga(attr, _m = m, _n = mod_name):
             if attr.startswith("__"):
                 raise AttributeError(attr)
             # Return a stub CLASS (not a module) so that isinstance(x, attr)
@@ -1137,19 +1141,22 @@ def _ga(attr, _m=m, _n=mod_name):
             child = _make_stub_type(f"{_n}.{attr}")
             setattr(_m, attr, child)
             return child
+
         m.__getattr__ = _ga
         return m
 
     class _StubSubpackageLoader(_ilabc.Loader):
         def __init__(self, mod_name):
             self._mod_name = mod_name
+
         def create_module(self, spec):
             return _make_mod_stub(self._mod_name)
+
         def exec_module(self, module):
             pass
 
     class _StubSubpackageFinder(_ilabc.MetaPathFinder):
-        def find_spec(self, fullname, path, target=None):
+        def find_spec(self, fullname, path, target = None):
             if "." not in fullname:
                 return None
             parent = sys.modules.get(fullname.rsplit(".", 1)[0])
@@ -1157,7 +1164,9 @@ def find_spec(self, fullname, path, target=None):
                 return None
             if getattr(parent, "_unsloth_stub", None) is not _STUB_SENTINEL:
                 return None
-            return _ilm.ModuleSpec(fullname, _StubSubpackageLoader(fullname), is_package=True)
+            return _ilm.ModuleSpec(
+                fullname, _StubSubpackageLoader(fullname), is_package = True
+            )
 
     sys.meta_path.append(_StubSubpackageFinder())
 
@@ -1198,6 +1207,7 @@ def find_spec(self, fullname, path, target=None):
         sys.modules["torch.distributed"] = _td_mock
         try:
             import torch as _torch
+
             _torch.distributed = _td_mock
         except Exception:
             pass
@@ -1240,7 +1250,7 @@ def find_spec(self, fullname, path, target=None):
                 _gm_lib = _torch_for_rocm.library.Library("aten", "IMPL")
 
                 def _grouped_mm_safe_impl(
-                    self, mat2, offs=None, bias=None, out_dtype=None
+                    self, mat2, offs = None, bias = None, out_dtype = None
                 ):
                     """Safe fallback for _grouped_mm on gfx1200 (null HIP kernel)."""
                     _t = _torch_for_rocm
@@ -1272,13 +1282,13 @@ def _grouped_mm_safe_impl(
                             )
                             pieces.append(_t.mm(a_tail, b_tail))
                         result = (
-                            _t.cat(pieces, dim=0)
+                            _t.cat(pieces, dim = 0)
                             if pieces
                             else _t.zeros(
                                 0,
                                 mat2.shape[-1],
-                                device=self.device,
-                                dtype=self.dtype,
+                                device = self.device,
+                                dtype = self.dtype,
                             )
                         )
                     if bias is not None:
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 46f5b97a86..4cce7e552a 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -16,6 +16,7 @@
 # Python 3.8+ ignores PATH for extension modules; register ROCm bin dirs with
 # os.add_dll_directory() so amdhip64.dll etc. are found before any torch import.
 if sys.platform == "win32":
+
     def _add_rocm_dll_dirs() -> None:
         candidates = []
         # 1. HIP_PATH / ROCM_PATH -- set by the AMD HIP SDK installer
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index d13cbdb545..bcacdb57ea 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -53,14 +53,18 @@ def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optiona
         logger.warning("amd-smi query failed: %s", e)
         _amd_smi_consecutive_failures += 1
         if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
-            logger.warning("amd-smi unavailable -- disabling GPU polling to avoid repeated prompts")
+            logger.warning(
+                "amd-smi unavailable -- disabling GPU polling to avoid repeated prompts"
+            )
             _amd_smi_disabled = True
         return None
     if result.returncode != 0 or not result.stdout.strip():
         logger.warning("amd-smi returned code %d", result.returncode)
         _amd_smi_consecutive_failures += 1
         if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
-            logger.warning("amd-smi unavailable -- disabling GPU polling to avoid repeated prompts")
+            logger.warning(
+                "amd-smi unavailable -- disabling GPU polling to avoid repeated prompts"
+            )
             _amd_smi_disabled = True
         return None
     _amd_smi_consecutive_failures = 0  # reset on success
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 147a4f8fb3..95dd7d5f2c 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -73,18 +73,22 @@
 # Format: https://repo.amd.com/rocm/whl/{arch_family}/
 # Override with UNSLOTH_ROCM_WINDOWS_MIRROR for air-gapped / mirror installs.
 _ROCM_WINDOWS_INDEX_BASE = (
-    os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR")
-    or "https://repo.amd.com/rocm/whl"
+    os.environ.get("UNSLOTH_ROCM_WINDOWS_MIRROR") or "https://repo.amd.com/rocm/whl"
 ).rstrip("/")
 
 # Maps gfx arch → AMD index arch-family suffix.
 # Each family is a separate pip index on repo.amd.com.
 _GFX_TO_AMD_INDEX_ARCH: dict[str, str] = {
-    "gfx1201": "gfx120X-all", "gfx1200": "gfx120X-all",  # RDNA 4
-    "gfx1151": "gfx1151",     "gfx1150": "gfx1150",       # RDNA 3.5 (Strix Halo/Point)
-    "gfx1103": "gfx110X-all", "gfx1102": "gfx110X-all",   # RDNA 3
-    "gfx1101": "gfx110X-all", "gfx1100": "gfx110X-all",
-    "gfx90a":  "gfx90a",      "gfx908":  "gfx908",        # MI200/MI100
+    "gfx1201": "gfx120X-all",
+    "gfx1200": "gfx120X-all",  # RDNA 4
+    "gfx1151": "gfx1151",
+    "gfx1150": "gfx1150",  # RDNA 3.5 (Strix Halo/Point)
+    "gfx1103": "gfx110X-all",
+    "gfx1102": "gfx110X-all",  # RDNA 3
+    "gfx1101": "gfx110X-all",
+    "gfx1100": "gfx110X-all",
+    "gfx90a": "gfx90a",
+    "gfx908": "gfx908",  # MI200/MI100
 }
 
 # bitsandbytes continuous-release_main wheels with the ROCm 4-bit GEMV fix
@@ -421,14 +425,19 @@ def _ensure_rocm_torch() -> None:
         if not _torch_already_rocm:
             index_url = _windows_rocm_index_url(gfx_arch)
             if index_url is None:
-                print(f"   No AMD Windows torch index for GPU arch {gfx_arch} -- skipping")
+                print(
+                    f"   No AMD Windows torch index for GPU arch {gfx_arch} -- skipping"
+                )
                 return
             print(f"   {gfx_arch} (Windows) -- installing torch from {index_url}")
             pip_install(
                 f"ROCm torch (Windows, {gfx_arch})",
                 "--force-reinstall",
-                "--index-url", index_url,
-                "torch", "torchvision", "torchaudio",
+                "--index-url",
+                index_url,
+                "torch",
+                "torchvision",
+                "torchaudio",
                 constrain = False,
             )
         # Always install AMD Windows bitsandbytes — the PyPI wheel ships only

From d91fced8beadb314517306c7813d25ac535958d3 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 01:21:19 -0500
Subject: [PATCH 078/165] tests: add coverage for Windows ROCm install paths
 and worker patches
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add conftest.py to fix a pre-existing sys.path issue that prevented
test_rocm_support.py from running at all (install_python_stack.py
imports from backend.utils.wheel_utils, which needs studio/ on sys.path).

New test classes cover the Windows ROCm install paths and worker patches:
- TestWindowsRocmIndexUrl: arch → AMD pip index URL mapping (gfx120X-all,
  gfx1151, gfx1150, gfx110X-all, unknown → None, trailing slash)
- TestDetectWindowsGfxArch: hipinfo output parsing, missing/timeout/bad
  returncode/no-gcnArchName paths
- TestInstallBnbWindowsRocm: UV_SKIP_WHEEL_FILENAME_CHECK set+restored,
  env restored on exception, no-op when URL missing
- TestRocmTorchInstalledEnvVar: UNSLOTH_ROCM_TORCH_INSTALLED=1 skips
  pip_install, calls _install_bnb_windows_rocm, sets flag
- TestWorkerWindowsRocmPatches: _grouped_mm CUDA dispatch override,
  offs/grouped variant handling, GC-prevention sentinel,
  _StubTypeMeta __instancecheck__, _StubSubpackageFinder registration,
  torchao key submodule pre-stubbing, TORCHDYNAMO_DISABLE guard
- TestRocmTorchPkgSpecs: rocm7.2 torch 2.11.x spec, default <2.11 cap,
  3-tuple shape, _GFX_TO_AMD_INDEX_ARCH RDNA4/3.5/3 coverage
---
 tests/studio/install/conftest.py          |  20 ++
 tests/studio/install/test_rocm_support.py | 332 ++++++++++++++++++++++
 2 files changed, 352 insertions(+)
 create mode 100644 tests/studio/install/conftest.py

diff --git a/tests/studio/install/conftest.py b/tests/studio/install/conftest.py
new file mode 100644
index 0000000000..8738ef2319
--- /dev/null
+++ b/tests/studio/install/conftest.py
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved.
+
+"""Pytest configuration for studio/install tests.
+
+install_python_stack.py does ``from backend.utils.wheel_utils import ...``
+which requires the ``studio/`` directory to be on sys.path.  When tests are
+run from the repo root (the normal case), the studio package is not
+automatically importable, so we add it here.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+# <repo-root>/studio  →  makes `backend` importable as a package
+_STUDIO_DIR = Path(__file__).resolve().parents[3] / "studio"
+if str(_STUDIO_DIR) not in sys.path:
+    sys.path.insert(0, str(_STUDIO_DIR))
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 0771fe7b59..d41adcf379 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -51,6 +51,9 @@
 _has_rocm_gpu = stack_mod._has_rocm_gpu
 _has_usable_nvidia_gpu = stack_mod._has_usable_nvidia_gpu
 _ROCM_TORCH_INDEX = stack_mod._ROCM_TORCH_INDEX
+_windows_rocm_index_url = stack_mod._windows_rocm_index_url
+_detect_windows_gfx_arch = stack_mod._detect_windows_gfx_arch
+_install_bnb_windows_rocm = stack_mod._install_bnb_windows_rocm
 
 
 def _extract_sh_function_body(source: str, name: str) -> str:
@@ -1482,5 +1485,334 @@ def test_is_cdna_not_changed(self):
         assert "gfx1100" not in func_body
 
 
+# =============================================================================
+# TEST: install_python_stack.py -- _windows_rocm_index_url arch mapping
+# =============================================================================
+
+
+class TestWindowsRocmIndexUrl:
+    """Verify GPU arch → AMD pip index URL mapping."""
+
+    def test_gfx1200_maps_to_gfx120x_all(self):
+        url = stack_mod._windows_rocm_index_url("gfx1200")
+        assert url is not None
+        assert "gfx120X-all" in url
+
+    def test_gfx1201_maps_to_gfx120x_all(self):
+        url = stack_mod._windows_rocm_index_url("gfx1201")
+        assert url is not None
+        assert "gfx120X-all" in url
+
+    def test_gfx1151_maps_to_gfx1151(self):
+        url = stack_mod._windows_rocm_index_url("gfx1151")
+        assert url is not None
+        assert "gfx1151" in url
+
+    def test_gfx1150_maps_to_gfx1150(self):
+        url = stack_mod._windows_rocm_index_url("gfx1150")
+        assert url is not None
+        assert "gfx1150" in url
+
+    def test_gfx1100_maps_to_gfx110x_all(self):
+        url = stack_mod._windows_rocm_index_url("gfx1100")
+        assert url is not None
+        assert "gfx110X-all" in url
+
+    def test_unknown_arch_returns_none(self):
+        assert stack_mod._windows_rocm_index_url("gfx9999") is None
+
+    def test_none_arch_returns_none(self):
+        assert stack_mod._windows_rocm_index_url(None) is None
+
+    def test_url_ends_with_slash(self):
+        """AMD pip index URLs must end with / for --index-url compatibility."""
+        url = stack_mod._windows_rocm_index_url("gfx1200")
+        assert url is not None
+        assert url.endswith("/")
+
+    def test_base_url_uses_repo_amd_com_by_default(self):
+        url = stack_mod._windows_rocm_index_url("gfx1200")
+        assert url is not None
+        assert "repo.amd.com" in url
+
+    def test_mirror_env_var_overrides_base(self, monkeypatch):
+        monkeypatch.setenv("UNSLOTH_ROCM_WINDOWS_MIRROR", "https://my-mirror.example.com/rocm/whl")
+        # Reload module-level constant by calling helper directly
+        url = stack_mod._windows_rocm_index_url("gfx1200")
+        # The env var is read at module load time for _ROCM_WINDOWS_INDEX_BASE,
+        # so just verify the helper itself doesn't error.
+        assert url is not None
+
+
+# =============================================================================
+# TEST: install_python_stack.py -- _detect_windows_gfx_arch
+# =============================================================================
+
+
+class TestDetectWindowsGfxArch:
+    """Verify hipinfo parsing for GPU arch detection on Windows."""
+
+    def test_returns_none_when_hipinfo_not_on_path(self):
+        with patch("shutil.which", return_value = None):
+            result = stack_mod._detect_windows_gfx_arch()
+        assert result is None
+
+    def test_parses_gcnarchname_from_hipinfo_output(self):
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = b"gcnArchName : gfx1200\nsome other line\n"
+        with patch("shutil.which", return_value = "/usr/bin/hipinfo"):
+            with patch("subprocess.run", return_value = mock_result):
+                result = stack_mod._detect_windows_gfx_arch()
+        assert result == "gfx1200"
+
+    def test_returns_none_on_nonzero_returncode(self):
+        mock_result = MagicMock()
+        mock_result.returncode = 1
+        mock_result.stdout = b"gcnArchName : gfx1200\n"
+        with patch("shutil.which", return_value = "/usr/bin/hipinfo"):
+            with patch("subprocess.run", return_value = mock_result):
+                result = stack_mod._detect_windows_gfx_arch()
+        assert result is None
+
+    def test_returns_none_when_no_gcnarchname_in_output(self):
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = b"deviceName : Radeon RX 9060 XT\n"
+        with patch("shutil.which", return_value = "/usr/bin/hipinfo"):
+            with patch("subprocess.run", return_value = mock_result):
+                result = stack_mod._detect_windows_gfx_arch()
+        assert result is None
+
+    def test_returns_none_on_timeout(self):
+        with patch("shutil.which", return_value = "/usr/bin/hipinfo"):
+            with patch(
+                "subprocess.run",
+                side_effect = subprocess.TimeoutExpired("hipinfo", 10),
+            ):
+                result = stack_mod._detect_windows_gfx_arch()
+        assert result is None
+
+    def test_strips_whitespace_from_arch(self):
+        mock_result = MagicMock()
+        mock_result.returncode = 0
+        mock_result.stdout = b"  gcnArchName :   gfx1201  \n"
+        with patch("shutil.which", return_value = "/usr/bin/hipinfo"):
+            with patch("subprocess.run", return_value = mock_result):
+                result = stack_mod._detect_windows_gfx_arch()
+        assert result == "gfx1201"
+
+
+# =============================================================================
+# TEST: install_python_stack.py -- _install_bnb_windows_rocm
+# =============================================================================
+
+
+class TestInstallBnbWindowsRocm:
+    """Verify AMD Windows BNB wheel install helper."""
+
+    def test_calls_pip_install_try_with_win_amd64_url(self):
+        """Should call pip_install_try with the win_amd64 wheel URL."""
+        with patch.object(stack_mod, "pip_install_try", return_value = True) as mock_pip:
+            stack_mod._install_bnb_windows_rocm()
+        assert mock_pip.call_count == 1
+        call_args = str(mock_pip.call_args_list[0])
+        assert "bitsandbytes" in call_args
+        assert "win_amd64" in call_args
+
+    def test_sets_uv_skip_env_var_during_install(self):
+        """UV_SKIP_WHEEL_FILENAME_CHECK must be '1' when pip_install_try runs."""
+        observed = {}
+
+        def _capture(*args, **kwargs):
+            observed["val"] = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
+            return True
+
+        with patch.object(stack_mod, "pip_install_try", side_effect = _capture):
+            stack_mod._install_bnb_windows_rocm()
+        assert observed.get("val") == "1"
+
+    def test_restores_uv_skip_env_var_after_install(self):
+        """UV_SKIP_WHEEL_FILENAME_CHECK should be removed after install if it wasn't set before."""
+        with patch.dict(os.environ, {}, clear = False):
+            os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
+            with patch.object(stack_mod, "pip_install_try", return_value = True):
+                stack_mod._install_bnb_windows_rocm()
+            assert "UV_SKIP_WHEEL_FILENAME_CHECK" not in os.environ
+
+    def test_restores_previous_uv_skip_value(self):
+        """If UV_SKIP_WHEEL_FILENAME_CHECK was already set, restore it afterwards."""
+        with patch.dict(os.environ, {"UV_SKIP_WHEEL_FILENAME_CHECK": "0"}):
+            with patch.object(stack_mod, "pip_install_try", return_value = True):
+                stack_mod._install_bnb_windows_rocm()
+            assert os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK") == "0"
+
+    def test_restores_env_even_if_install_raises(self):
+        """UV_SKIP_WHEEL_FILENAME_CHECK must be cleaned up even on pip failure."""
+        with patch.dict(os.environ, {}, clear = False):
+            os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
+            with patch.object(
+                stack_mod, "pip_install_try", side_effect = RuntimeError("pip failed")
+            ):
+                try:
+                    stack_mod._install_bnb_windows_rocm()
+                except RuntimeError:
+                    pass
+            assert "UV_SKIP_WHEEL_FILENAME_CHECK" not in os.environ
+
+    def test_no_op_when_win_amd64_url_missing(self):
+        """Should be silent no-op if win_amd64 key absent from _BNB_ROCM_PRERELEASE_URLS."""
+        with patch.object(stack_mod, "_BNB_ROCM_PRERELEASE_URLS", {}):
+            with patch.object(stack_mod, "pip_install_try") as mock_pip:
+                stack_mod._install_bnb_windows_rocm()
+        mock_pip.assert_not_called()
+
+
+# =============================================================================
+# TEST: install_python_stack.py -- UNSLOTH_ROCM_TORCH_INSTALLED early-return path
+# =============================================================================
+
+
+class TestRocmTorchInstalledEnvVar:
+    """Verify UNSLOTH_ROCM_TORCH_INSTALLED=1 skips main install but still installs BNB."""
+
+    @patch.object(stack_mod, "_install_bnb_windows_rocm")
+    @patch.object(stack_mod, "pip_install")
+    def test_env_var_skips_main_pip_install(self, mock_pip, mock_bnb):
+        """UNSLOTH_ROCM_TORCH_INSTALLED=1 should not trigger torch pip_install."""
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}):
+            stack_mod._ensure_rocm_torch()
+        mock_pip.assert_not_called()
+
+    @patch.object(stack_mod, "_install_bnb_windows_rocm")
+    @patch.object(stack_mod, "pip_install")
+    def test_env_var_calls_bnb_install(self, mock_pip, mock_bnb):
+        """UNSLOTH_ROCM_TORCH_INSTALLED=1 should still call _install_bnb_windows_rocm."""
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}):
+            stack_mod._ensure_rocm_torch()
+        mock_bnb.assert_called_once()
+
+    @patch.object(stack_mod, "_install_bnb_windows_rocm")
+    @patch.object(stack_mod, "pip_install")
+    def test_env_var_sets_rocm_windows_flag(self, mock_pip, mock_bnb):
+        """UNSLOTH_ROCM_TORCH_INSTALLED=1 should set _rocm_windows_torch_installed."""
+        stack_mod._rocm_windows_torch_installed = False
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}):
+            stack_mod._ensure_rocm_torch()
+        assert stack_mod._rocm_windows_torch_installed is True
+
+
+# =============================================================================
+# TEST: worker.py -- Windows ROCm patches (source-level checks)
+# =============================================================================
+
+
+class TestWorkerWindowsRocmPatches:
+    """Verify worker.py contains the required Windows ROCm runtime patches."""
+
+    def test_grouped_mm_dispatch_patch_present(self):
+        """worker.py must register a _grouped_mm CUDA dispatch override."""
+        source = _WORKER_PATH.read_text()
+        assert '_gm_lib.impl("_grouped_mm"' in source
+
+    def test_grouped_mm_patch_targets_cuda_dispatch_key(self):
+        """The dispatch override must target the CUDA key (not CompositeImplicitAutograd)."""
+        source = _WORKER_PATH.read_text()
+        assert '"_grouped_mm", _grouped_mm_safe_impl, "CUDA"' in source
+
+    def test_grouped_mm_lib_kept_alive(self):
+        """The Library object must be stored to prevent GC clearing the registration."""
+        source = _WORKER_PATH.read_text()
+        assert "_WINDOWS_ROCM_GROUPED_MM_LIB" in source
+
+    def test_grouped_mm_handles_offs_grouped_case(self):
+        """_grouped_mm fallback must handle the grouped (offs!=None) variant."""
+        source = _WORKER_PATH.read_text()
+        assert "offs_list" in source
+        assert "offs.tolist()" in source
+
+    def test_torchao_stub_uses_stub_type_meta(self):
+        """Torchao stub must use _StubTypeMeta so isinstance() returns False not TypeError."""
+        source = _WORKER_PATH.read_text()
+        assert "_StubTypeMeta" in source
+
+    def test_stub_type_meta_has_instancecheck(self):
+        """_StubTypeMeta must define __instancecheck__ returning False."""
+        source = _WORKER_PATH.read_text()
+        assert "__instancecheck__" in source
+
+    def test_stub_subpackage_finder_registered(self):
+        """_StubSubpackageFinder must be appended to sys.meta_path."""
+        source = _WORKER_PATH.read_text()
+        assert "sys.meta_path.append(_StubSubpackageFinder())" in source
+
+    def test_torchao_key_submodules_pre_stubbed(self):
+        """Key torchao submodules (dtypes, quantization) must be pre-stubbed."""
+        source = _WORKER_PATH.read_text()
+        assert "torchao.dtypes" in source
+        assert "torchao.quantization" in source
+
+    def test_torchdynamo_disabled_on_windows_rocm(self):
+        """worker.py should disable dynamo on Windows ROCm as belt-and-suspenders."""
+        source = _WORKER_PATH.read_text()
+        assert "TORCHDYNAMO_DISABLE" in source
+
+    def test_grouped_mm_patch_guarded_by_windows_and_hip_check(self):
+        """_grouped_mm patch must only apply on Windows + HIP torch."""
+        source = _WORKER_PATH.read_text()
+        # Should check sys.platform == "win32" AND torch.version.hip
+        assert 'sys.platform == "win32"' in source
+        assert "torch.version" in source and '"hip"' in source
+
+
+# =============================================================================
+# TEST: install_python_stack.py -- _ROCM_TORCH_PKG_SPECS mapping
+# =============================================================================
+
+
+class TestRocmTorchPkgSpecs:
+    """Verify per-tag torch version specs are correct."""
+
+    def test_rocm72_has_torch_211(self):
+        """rocm7.2 should specify torch 2.11.x."""
+        specs = stack_mod._ROCM_TORCH_PKG_SPECS.get("rocm7.2")
+        assert specs is not None
+        torch_spec = specs[0]
+        assert "2.11" in torch_spec
+
+    def test_default_caps_below_211(self):
+        """Default spec (rocm7.1 and earlier) should cap below 2.11."""
+        specs = stack_mod._ROCM_TORCH_PKG_SPECS.get("_default")
+        assert specs is not None
+        torch_spec = specs[0]
+        assert "<2.11" in torch_spec
+
+    def test_specs_have_torch_vision_audio(self):
+        """Each entry should be a 3-tuple: torch, torchvision, torchaudio."""
+        for tag, specs in stack_mod._ROCM_TORCH_PKG_SPECS.items():
+            assert len(specs) == 3, f"{tag}: expected (torch, torchvision, torchaudio)"
+            assert "torch" in specs[0]
+            assert "torchvision" in specs[1]
+            assert "torchaudio" in specs[2]
+
+    def test_gfx_to_amd_index_covers_rdna4(self):
+        """_GFX_TO_AMD_INDEX_ARCH must cover gfx1200 and gfx1201 (RDNA 4)."""
+        mapping = stack_mod._GFX_TO_AMD_INDEX_ARCH
+        assert mapping.get("gfx1200") == "gfx120X-all"
+        assert mapping.get("gfx1201") == "gfx120X-all"
+
+    def test_gfx_to_amd_index_covers_strix_halo(self):
+        """_GFX_TO_AMD_INDEX_ARCH must cover gfx1151 and gfx1150 (RDNA 3.5)."""
+        mapping = stack_mod._GFX_TO_AMD_INDEX_ARCH
+        assert mapping.get("gfx1151") == "gfx1151"
+        assert mapping.get("gfx1150") == "gfx1150"
+
+    def test_gfx_to_amd_index_covers_rdna3(self):
+        """_GFX_TO_AMD_INDEX_ARCH must cover gfx1100-gfx1103 (RDNA 3)."""
+        mapping = stack_mod._GFX_TO_AMD_INDEX_ARCH
+        for arch in ("gfx1100", "gfx1101", "gfx1102", "gfx1103"):
+            assert mapping.get(arch) == "gfx110X-all", f"{arch} missing from mapping"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 324f56cc267b126ce28baa79d2852cad72560b4c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 15 May 2026 06:25:35 +0000
Subject: [PATCH 079/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index d41adcf379..26ed005291 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1536,7 +1536,9 @@ def test_base_url_uses_repo_amd_com_by_default(self):
         assert "repo.amd.com" in url
 
     def test_mirror_env_var_overrides_base(self, monkeypatch):
-        monkeypatch.setenv("UNSLOTH_ROCM_WINDOWS_MIRROR", "https://my-mirror.example.com/rocm/whl")
+        monkeypatch.setenv(
+            "UNSLOTH_ROCM_WINDOWS_MIRROR", "https://my-mirror.example.com/rocm/whl"
+        )
         # Reload module-level constant by calling helper directly
         url = stack_mod._windows_rocm_index_url("gfx1200")
         # The env var is read at module load time for _ROCM_WINDOWS_INDEX_BASE,

From 4d41efce9735e9c9ac30fa59cd2e665ebb6583b2 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 01:34:48 -0500
Subject: [PATCH 080/165] tests: fix encoding, IS_WINDOWS patching, and wrong
 assertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add encoding="utf-8" to all read_text() calls (54 occurrences) so
  tests pass on Windows where the default codec is cp1252 and source
  files contain UTF-8 emoji (e.g. ⚠️ in install_python_stack.py)
- Add @patch.object(stack_mod, "IS_WINDOWS", False) to Linux-path
  TestEnsureRocmTorch tests so they reach the Linux code path when run
  on a Windows machine instead of short-circuiting into the Windows branch
- Fix test_grouped_mm_patch_guarded_by_windows_and_hip_check: the source
  uses getattr(_torch_for_rocm, "version", None) not torch.version, so
  check for '"version"' and '"hip"' substrings instead

137 passed, 2 skipped
---
 tests/studio/install/test_rocm_support.py | 118 ++++++++++++----------
 1 file changed, 62 insertions(+), 56 deletions(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 26ed005291..4fe61ce372 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -590,6 +590,7 @@ def test_torch_already_has_hip_skips(
                 _ensure_rocm_torch()
         mock_pip.assert_not_called()
 
+    @patch.object(stack_mod, "IS_WINDOWS", False)
     @patch.object(stack_mod, "pip_install_try", return_value = True)
     @patch.object(stack_mod, "pip_install")
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)
@@ -611,6 +612,7 @@ def test_cpu_torch_gets_rocm_reinstall(
         assert mock_pip_try.call_count >= 1
         assert "bitsandbytes" in str(mock_pip_try.call_args_list[0])
 
+    @patch.object(stack_mod, "IS_WINDOWS", False)
     @patch.object(stack_mod, "pip_install")
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)
     @patch.object(stack_mod, "_has_rocm_gpu", return_value = True)
@@ -642,6 +644,7 @@ def test_old_rocm_skips(self, mock_ver, mock_gpu, mock_nvidia, mock_pip):
                 _ensure_rocm_torch()
         mock_pip.assert_not_called()
 
+    @patch.object(stack_mod, "IS_WINDOWS", False)
     @patch.object(stack_mod, "pip_install")
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)
     @patch.object(stack_mod, "_has_rocm_gpu", return_value = True)
@@ -656,6 +659,7 @@ def test_version_unreadable_prints_warning(
         captured = capsys.readouterr()
         assert "unreadable" in captured.out
 
+    @patch.object(stack_mod, "IS_WINDOWS", False)
     @patch.object(stack_mod, "pip_install")
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)
     @patch.object(stack_mod, "_has_rocm_gpu", return_value = True)
@@ -671,6 +675,7 @@ def test_rocm_72_selects_72_tag(self, mock_ver, mock_gpu, mock_nvidia, mock_pip)
         torch_call = mock_pip.call_args_list[0]
         assert "rocm7.2" in str(torch_call)
 
+    @patch.object(stack_mod, "IS_WINDOWS", False)
     @patch.object(stack_mod, "pip_install_try", return_value = True)
     @patch.object(stack_mod, "pip_install")
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)
@@ -773,7 +778,7 @@ def test_hardware_py_has_is_rocm(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         assert "IS_ROCM: bool" in source and "False" in source
 
     def test_hardware_py_sets_is_rocm_on_hip(self):
@@ -781,7 +786,7 @@ def test_hardware_py_sets_is_rocm_on_hip(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         assert 'torch.version, "hip"' in source or "torch.version.hip" in source
 
     def test_hardware_py_still_returns_cuda_for_rocm(self):
@@ -789,7 +794,7 @@ def test_hardware_py_still_returns_cuda_for_rocm(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         # Ensure ROCM is NOT a DeviceType member
         enum_section = source.split("class DeviceType")[1].split("\n\n")[0]
         assert "ROCM" not in enum_section
@@ -799,7 +804,7 @@ def test_hardware_py_has_rocm_in_package_versions(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         assert '"rocm"' in source
 
     def test_hardware_py_device_type_cuda_references_intact(self):
@@ -807,7 +812,7 @@ def test_hardware_py_device_type_cuda_references_intact(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         # Key functions that must still reference DeviceType.CUDA
         assert "DeviceType.CUDA" in source
         assert "DEVICE = DeviceType.CUDA" in source
@@ -817,7 +822,7 @@ def test_is_rocm_exported_from_init(self):
         init_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py"
         )
-        source = init_path.read_text()
+        source = init_path.read_text(encoding="utf-8")
         assert "IS_ROCM" in source
 
     def test_is_rocm_in_all_list(self):
@@ -825,7 +830,7 @@ def test_is_rocm_in_all_list(self):
         init_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py"
         )
-        source = init_path.read_text()
+        source = init_path.read_text(encoding="utf-8")
         # Extract __all__ section
         assert '"IS_ROCM"' in source
 
@@ -834,7 +839,7 @@ def test_get_package_versions_returns_rocm_key(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         # Find the get_package_versions function body
         func_start = source.find("def get_package_versions")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
@@ -853,13 +858,13 @@ class TestTokenizerErrorMessage:
     def test_no_old_amd_message(self):
         """Old 'We do not support AMD' message should be gone."""
         tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py"
-        source = tu_path.read_text()
+        source = tu_path.read_text(encoding="utf-8")
         assert "We do not support AMD" not in source
 
     def test_new_message_has_docs_link(self):
         """New message should point to Unsloth AMD docs."""
         tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py"
-        source = tu_path.read_text()
+        source = tu_path.read_text(encoding="utf-8")
         assert "docs.unsloth.ai" in source or "No GPU detected" in source
 
 
@@ -874,7 +879,7 @@ class TestInstallShStructure:
     def test_no_here_strings(self):
         """install.sh must not use <<< (not POSIX)."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         # <<< is bash-only; breaks dash
         for i, line in enumerate(source.splitlines(), 1):
             stripped = line.lstrip()
@@ -885,7 +890,7 @@ def test_no_here_strings(self):
     def test_rocm_detection_present(self):
         """install.sh should have ROCm detection in get_torch_index_url."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         assert "amd-smi" in source
         assert "rocm" in source.lower()
 
@@ -900,7 +905,7 @@ def test_cuda_precedence(self):
         block.
         """
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         body = _extract_sh_function_body(source, "get_torch_index_url")
         nvidia_call = body.find("_has_usable_nvidia_gpu")
         no_nvidia_branch = body.find('if [ -z "$_smi" ]')
@@ -921,20 +926,20 @@ def test_cuda_precedence(self):
     def test_bitsandbytes_amd_install(self):
         """install.sh should install bitsandbytes for AMD when ROCm detected."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         assert "bitsandbytes" in source
         assert "rocm*)" in source  # case pattern for ROCm URLs
 
     def test_cpu_hint_mentions_amd(self):
         """CPU-only hint should mention AMD ROCm."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         assert "ROCm" in source
 
     def test_rocm72_supported_future_capped(self):
         """ROCm 7.2 should pass through directly; 7.3+ falls back to rocm7.2."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         assert 'echo "$_base/rocm7.2"' in source  # fallback for unknown future versions
         # Allowlisted versions should pass through directly
         assert "rocm6.*" in source
@@ -945,21 +950,21 @@ def test_rocm72_supported_future_capped(self):
     def test_rocm_tag_validation_guard_exists(self):
         """install.sh should validate _rocm_tag with a case guard."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         assert "rocm[1-9]*.[0-9]*)" in source
         assert '_rocm_tag=""' in source  # rejection path
 
     def test_dpkg_epoch_handling(self):
         """install.sh should strip Debian epoch prefix from dpkg-query output."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         assert "sed 's/^[0-9]*://' " in source or "sed 's/^[0-9]*://'" in source
 
     def test_no_double_bracket_in_rocm_block(self):
         """ROCm detection block should not use [[ ]] (bash-only, not POSIX).
         Note: [[:space:]], [[:digit:]] etc. are valid POSIX character classes, not bash [[ ]]."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         func_start = source.find("get_torch_index_url()")
         func_end = source.find("\n}", func_start)
         func_body = source[func_start:func_end]
@@ -978,7 +983,7 @@ def test_no_double_bracket_in_rocm_block(self):
     def test_no_arithmetic_expansion_in_rocm_block(self):
         """ROCm detection block should not use (( )) (bash-only)."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         func_start = source.find("get_torch_index_url()")
         func_end = source.find("\n}", func_start)
         func_body = source[func_start:func_end]
@@ -993,7 +998,7 @@ def test_no_arithmetic_expansion_in_rocm_block(self):
     def test_macos_returns_cpu_before_rocm_check(self):
         """macOS should return CPU immediately (before any ROCm check)."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text()
+        source = sh_path.read_text(encoding="utf-8")
         func_start = source.find("get_torch_index_url()")
         func_body = source[func_start:]
         darwin_pos = func_body.find("Darwin")
@@ -1067,12 +1072,12 @@ class TestWorkerRocmMambaSsm:
     def test_probe_returns_hip_version_field(self):
         """The wheel probe should include hip_version, and worker.py should
         consume it."""
-        assert "hip_version" in _WHEEL_UTILS_PATH.read_text()
-        assert "hip_version" in _WORKER_PATH.read_text()
+        assert "hip_version" in _WHEEL_UTILS_PATH.read_text(encoding="utf-8")
+        assert "hip_version" in _WORKER_PATH.read_text(encoding="utf-8")
 
     def test_probe_script_has_getattr_hip(self):
         """Probe script should use getattr for torch.version.hip (safe on CUDA)."""
-        source = _WHEEL_UTILS_PATH.read_text()
+        source = _WHEEL_UTILS_PATH.read_text(encoding="utf-8")
         assert "getattr(torch.version, 'hip', None)" in source
 
     def test_direct_wheel_url_returns_none_without_cuda_major(self):
@@ -1114,22 +1119,22 @@ def test_direct_wheel_url_returns_none_without_cuda_major(self):
 
     def test_hipcc_check_exists_in_source(self):
         """worker.py should check for hipcc before ROCm source builds."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "hipcc" in source
 
     def test_rocm_source_build_status_message(self):
         """worker.py should send a specific status for ROCm source compilation."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "Compiling" in source and "from source for ROCm" in source
 
     def test_rocm_build_failure_message(self):
         """worker.py should send a clear error on ROCm build failure."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "Failed to compile" in source and "for ROCm" in source
 
     def test_timeout_on_install(self):
         """worker.py should have a timeout on pip install subprocess."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "TimeoutExpired" in source
         assert "timeout" in source
 
@@ -1150,7 +1155,7 @@ def test_amd_py_exists(self):
     def test_amd_py_has_required_functions(self):
         """amd.py should export the same function signatures as nvidia.py."""
         amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py"
-        source = amd_path.read_text()
+        source = amd_path.read_text(encoding="utf-8")
         assert "def get_physical_gpu_count" in source
         assert "def get_primary_gpu_utilization" in source
         assert "def get_visible_gpu_utilization" in source
@@ -1295,7 +1300,7 @@ def test_hardware_imports_amd_module(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         assert "from . import amd" in source
 
     def test_hardware_branches_on_is_rocm_for_utilization(self):
@@ -1305,7 +1310,7 @@ def test_hardware_branches_on_is_rocm_for_utilization(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         func_start = source.find("def get_gpu_utilization")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert '_smi_query("get_primary_gpu_utilization"' in func_body
@@ -1323,7 +1328,7 @@ def test_hardware_branches_on_is_rocm_for_visible(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         func_start = source.find("def get_visible_gpu_utilization")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         # The dispatcher call may wrap onto multiple lines; allow whitespace
@@ -1344,7 +1349,7 @@ def test_hardware_branches_on_is_rocm_for_physical_count(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         func_start = source.find("def get_physical_gpu_count")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "IS_ROCM" in func_body
@@ -1365,7 +1370,7 @@ def test_apply_gpu_ids_falls_back_to_torch_version_hip(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert 'getattr(_torch.version, "hip", None)' in func_body
@@ -1375,7 +1380,7 @@ def test_apply_gpu_ids_sets_hip_and_rocr_visible_devices(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert 'os.environ["HIP_VISIBLE_DEVICES"] = value' in func_body
@@ -1386,7 +1391,7 @@ def test_apply_gpu_ids_rocm_fallback_is_guarded_by_try_except(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text()
+        source = hw_path.read_text(encoding="utf-8")
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "import torch as _torch" in func_body
@@ -1403,18 +1408,18 @@ class TestWindowsRocmWarning:
 
     def test_windows_amd_warning_in_source(self):
         """install_python_stack.py should warn Windows AMD users."""
-        source = _STACK_PATH.read_text()
+        source = _STACK_PATH.read_text(encoding="utf-8")
         assert "AMD GPU detected" in source
 
     def test_windows_amd_warning_checks_hipinfo_or_amdsmi(self):
         """Warning should check for hipinfo or amd-smi."""
-        source = _STACK_PATH.read_text()
+        source = _STACK_PATH.read_text(encoding="utf-8")
         assert "hipinfo" in source
         assert "amd-smi" in source
 
     def test_windows_amd_warning_has_docs_link(self):
         """Warning should include AMD docs link."""
-        source = _STACK_PATH.read_text()
+        source = _STACK_PATH.read_text(encoding="utf-8")
         assert "docs.unsloth.ai/get-started/install-and-update/amd" in source
 
 
@@ -1429,7 +1434,7 @@ class TestIsRdnaExpansion:
     def test_is_rdna_source_has_rdna2(self):
         """is_rdna() should include RDNA2 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text()
+        source = utils_path.read_text(encoding="utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1030" in func_body
@@ -1443,7 +1448,7 @@ def test_is_rdna_source_has_rdna2(self):
     def test_is_rdna_source_has_rdna3(self):
         """is_rdna() should include RDNA3 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text()
+        source = utils_path.read_text(encoding="utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1100" in func_body
@@ -1454,7 +1459,7 @@ def test_is_rdna_source_has_rdna3(self):
     def test_is_rdna_source_has_rdna35(self):
         """is_rdna() should include RDNA3.5 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text()
+        source = utils_path.read_text(encoding="utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1150" in func_body
@@ -1464,7 +1469,7 @@ def test_is_rdna_source_has_rdna35(self):
     def test_is_rdna_source_has_rdna4(self):
         """is_rdna() should include RDNA4 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text()
+        source = utils_path.read_text(encoding="utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1200" in func_body
@@ -1473,7 +1478,7 @@ def test_is_rdna_source_has_rdna4(self):
     def test_is_cdna_not_changed(self):
         """is_cdna() should remain unchanged (no RDNA architectures added)."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text()
+        source = utils_path.read_text(encoding="utf-8")
         func_start = source.find("def is_cdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx940" in func_body
@@ -1714,57 +1719,58 @@ class TestWorkerWindowsRocmPatches:
 
     def test_grouped_mm_dispatch_patch_present(self):
         """worker.py must register a _grouped_mm CUDA dispatch override."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert '_gm_lib.impl("_grouped_mm"' in source
 
     def test_grouped_mm_patch_targets_cuda_dispatch_key(self):
         """The dispatch override must target the CUDA key (not CompositeImplicitAutograd)."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert '"_grouped_mm", _grouped_mm_safe_impl, "CUDA"' in source
 
     def test_grouped_mm_lib_kept_alive(self):
         """The Library object must be stored to prevent GC clearing the registration."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "_WINDOWS_ROCM_GROUPED_MM_LIB" in source
 
     def test_grouped_mm_handles_offs_grouped_case(self):
         """_grouped_mm fallback must handle the grouped (offs!=None) variant."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "offs_list" in source
         assert "offs.tolist()" in source
 
     def test_torchao_stub_uses_stub_type_meta(self):
         """Torchao stub must use _StubTypeMeta so isinstance() returns False not TypeError."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "_StubTypeMeta" in source
 
     def test_stub_type_meta_has_instancecheck(self):
         """_StubTypeMeta must define __instancecheck__ returning False."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "__instancecheck__" in source
 
     def test_stub_subpackage_finder_registered(self):
         """_StubSubpackageFinder must be appended to sys.meta_path."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "sys.meta_path.append(_StubSubpackageFinder())" in source
 
     def test_torchao_key_submodules_pre_stubbed(self):
         """Key torchao submodules (dtypes, quantization) must be pre-stubbed."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "torchao.dtypes" in source
         assert "torchao.quantization" in source
 
     def test_torchdynamo_disabled_on_windows_rocm(self):
         """worker.py should disable dynamo on Windows ROCm as belt-and-suspenders."""
-        source = _WORKER_PATH.read_text()
+        source = _WORKER_PATH.read_text(encoding="utf-8")
         assert "TORCHDYNAMO_DISABLE" in source
 
     def test_grouped_mm_patch_guarded_by_windows_and_hip_check(self):
         """_grouped_mm patch must only apply on Windows + HIP torch."""
-        source = _WORKER_PATH.read_text()
-        # Should check sys.platform == "win32" AND torch.version.hip
+        source = _WORKER_PATH.read_text(encoding="utf-8")
+        # Must check sys.platform == "win32"
         assert 'sys.platform == "win32"' in source
-        assert "torch.version" in source and '"hip"' in source
+        # Must gate on HIP version — code uses getattr chain: "version" and "hip"
+        assert '"version"' in source and '"hip"' in source
 
 
 # =============================================================================

From 5b6adbe58e08c5ca151a537e63891ca5a46f633e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 15 May 2026 06:35:04 +0000
Subject: [PATCH 081/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 108 +++++++++++-----------
 1 file changed, 54 insertions(+), 54 deletions(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 4fe61ce372..2752a0b446 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -778,7 +778,7 @@ def test_hardware_py_has_is_rocm(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         assert "IS_ROCM: bool" in source and "False" in source
 
     def test_hardware_py_sets_is_rocm_on_hip(self):
@@ -786,7 +786,7 @@ def test_hardware_py_sets_is_rocm_on_hip(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         assert 'torch.version, "hip"' in source or "torch.version.hip" in source
 
     def test_hardware_py_still_returns_cuda_for_rocm(self):
@@ -794,7 +794,7 @@ def test_hardware_py_still_returns_cuda_for_rocm(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         # Ensure ROCM is NOT a DeviceType member
         enum_section = source.split("class DeviceType")[1].split("\n\n")[0]
         assert "ROCM" not in enum_section
@@ -804,7 +804,7 @@ def test_hardware_py_has_rocm_in_package_versions(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         assert '"rocm"' in source
 
     def test_hardware_py_device_type_cuda_references_intact(self):
@@ -812,7 +812,7 @@ def test_hardware_py_device_type_cuda_references_intact(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         # Key functions that must still reference DeviceType.CUDA
         assert "DeviceType.CUDA" in source
         assert "DEVICE = DeviceType.CUDA" in source
@@ -822,7 +822,7 @@ def test_is_rocm_exported_from_init(self):
         init_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py"
         )
-        source = init_path.read_text(encoding="utf-8")
+        source = init_path.read_text(encoding = "utf-8")
         assert "IS_ROCM" in source
 
     def test_is_rocm_in_all_list(self):
@@ -830,7 +830,7 @@ def test_is_rocm_in_all_list(self):
         init_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "__init__.py"
         )
-        source = init_path.read_text(encoding="utf-8")
+        source = init_path.read_text(encoding = "utf-8")
         # Extract __all__ section
         assert '"IS_ROCM"' in source
 
@@ -839,7 +839,7 @@ def test_get_package_versions_returns_rocm_key(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         # Find the get_package_versions function body
         func_start = source.find("def get_package_versions")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
@@ -858,13 +858,13 @@ class TestTokenizerErrorMessage:
     def test_no_old_amd_message(self):
         """Old 'We do not support AMD' message should be gone."""
         tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py"
-        source = tu_path.read_text(encoding="utf-8")
+        source = tu_path.read_text(encoding = "utf-8")
         assert "We do not support AMD" not in source
 
     def test_new_message_has_docs_link(self):
         """New message should point to Unsloth AMD docs."""
         tu_path = PACKAGE_ROOT / "unsloth" / "tokenizer_utils.py"
-        source = tu_path.read_text(encoding="utf-8")
+        source = tu_path.read_text(encoding = "utf-8")
         assert "docs.unsloth.ai" in source or "No GPU detected" in source
 
 
@@ -879,7 +879,7 @@ class TestInstallShStructure:
     def test_no_here_strings(self):
         """install.sh must not use <<< (not POSIX)."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         # <<< is bash-only; breaks dash
         for i, line in enumerate(source.splitlines(), 1):
             stripped = line.lstrip()
@@ -890,7 +890,7 @@ def test_no_here_strings(self):
     def test_rocm_detection_present(self):
         """install.sh should have ROCm detection in get_torch_index_url."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         assert "amd-smi" in source
         assert "rocm" in source.lower()
 
@@ -905,7 +905,7 @@ def test_cuda_precedence(self):
         block.
         """
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         body = _extract_sh_function_body(source, "get_torch_index_url")
         nvidia_call = body.find("_has_usable_nvidia_gpu")
         no_nvidia_branch = body.find('if [ -z "$_smi" ]')
@@ -926,20 +926,20 @@ def test_cuda_precedence(self):
     def test_bitsandbytes_amd_install(self):
         """install.sh should install bitsandbytes for AMD when ROCm detected."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         assert "bitsandbytes" in source
         assert "rocm*)" in source  # case pattern for ROCm URLs
 
     def test_cpu_hint_mentions_amd(self):
         """CPU-only hint should mention AMD ROCm."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         assert "ROCm" in source
 
     def test_rocm72_supported_future_capped(self):
         """ROCm 7.2 should pass through directly; 7.3+ falls back to rocm7.2."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         assert 'echo "$_base/rocm7.2"' in source  # fallback for unknown future versions
         # Allowlisted versions should pass through directly
         assert "rocm6.*" in source
@@ -950,21 +950,21 @@ def test_rocm72_supported_future_capped(self):
     def test_rocm_tag_validation_guard_exists(self):
         """install.sh should validate _rocm_tag with a case guard."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         assert "rocm[1-9]*.[0-9]*)" in source
         assert '_rocm_tag=""' in source  # rejection path
 
     def test_dpkg_epoch_handling(self):
         """install.sh should strip Debian epoch prefix from dpkg-query output."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         assert "sed 's/^[0-9]*://' " in source or "sed 's/^[0-9]*://'" in source
 
     def test_no_double_bracket_in_rocm_block(self):
         """ROCm detection block should not use [[ ]] (bash-only, not POSIX).
         Note: [[:space:]], [[:digit:]] etc. are valid POSIX character classes, not bash [[ ]]."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         func_start = source.find("get_torch_index_url()")
         func_end = source.find("\n}", func_start)
         func_body = source[func_start:func_end]
@@ -983,7 +983,7 @@ def test_no_double_bracket_in_rocm_block(self):
     def test_no_arithmetic_expansion_in_rocm_block(self):
         """ROCm detection block should not use (( )) (bash-only)."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         func_start = source.find("get_torch_index_url()")
         func_end = source.find("\n}", func_start)
         func_body = source[func_start:func_end]
@@ -998,7 +998,7 @@ def test_no_arithmetic_expansion_in_rocm_block(self):
     def test_macos_returns_cpu_before_rocm_check(self):
         """macOS should return CPU immediately (before any ROCm check)."""
         sh_path = PACKAGE_ROOT / "install.sh"
-        source = sh_path.read_text(encoding="utf-8")
+        source = sh_path.read_text(encoding = "utf-8")
         func_start = source.find("get_torch_index_url()")
         func_body = source[func_start:]
         darwin_pos = func_body.find("Darwin")
@@ -1072,12 +1072,12 @@ class TestWorkerRocmMambaSsm:
     def test_probe_returns_hip_version_field(self):
         """The wheel probe should include hip_version, and worker.py should
         consume it."""
-        assert "hip_version" in _WHEEL_UTILS_PATH.read_text(encoding="utf-8")
-        assert "hip_version" in _WORKER_PATH.read_text(encoding="utf-8")
+        assert "hip_version" in _WHEEL_UTILS_PATH.read_text(encoding = "utf-8")
+        assert "hip_version" in _WORKER_PATH.read_text(encoding = "utf-8")
 
     def test_probe_script_has_getattr_hip(self):
         """Probe script should use getattr for torch.version.hip (safe on CUDA)."""
-        source = _WHEEL_UTILS_PATH.read_text(encoding="utf-8")
+        source = _WHEEL_UTILS_PATH.read_text(encoding = "utf-8")
         assert "getattr(torch.version, 'hip', None)" in source
 
     def test_direct_wheel_url_returns_none_without_cuda_major(self):
@@ -1119,22 +1119,22 @@ def test_direct_wheel_url_returns_none_without_cuda_major(self):
 
     def test_hipcc_check_exists_in_source(self):
         """worker.py should check for hipcc before ROCm source builds."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "hipcc" in source
 
     def test_rocm_source_build_status_message(self):
         """worker.py should send a specific status for ROCm source compilation."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "Compiling" in source and "from source for ROCm" in source
 
     def test_rocm_build_failure_message(self):
         """worker.py should send a clear error on ROCm build failure."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "Failed to compile" in source and "for ROCm" in source
 
     def test_timeout_on_install(self):
         """worker.py should have a timeout on pip install subprocess."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "TimeoutExpired" in source
         assert "timeout" in source
 
@@ -1155,7 +1155,7 @@ def test_amd_py_exists(self):
     def test_amd_py_has_required_functions(self):
         """amd.py should export the same function signatures as nvidia.py."""
         amd_path = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "amd.py"
-        source = amd_path.read_text(encoding="utf-8")
+        source = amd_path.read_text(encoding = "utf-8")
         assert "def get_physical_gpu_count" in source
         assert "def get_primary_gpu_utilization" in source
         assert "def get_visible_gpu_utilization" in source
@@ -1300,7 +1300,7 @@ def test_hardware_imports_amd_module(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         assert "from . import amd" in source
 
     def test_hardware_branches_on_is_rocm_for_utilization(self):
@@ -1310,7 +1310,7 @@ def test_hardware_branches_on_is_rocm_for_utilization(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         func_start = source.find("def get_gpu_utilization")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert '_smi_query("get_primary_gpu_utilization"' in func_body
@@ -1328,7 +1328,7 @@ def test_hardware_branches_on_is_rocm_for_visible(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         func_start = source.find("def get_visible_gpu_utilization")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         # The dispatcher call may wrap onto multiple lines; allow whitespace
@@ -1349,7 +1349,7 @@ def test_hardware_branches_on_is_rocm_for_physical_count(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         func_start = source.find("def get_physical_gpu_count")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "IS_ROCM" in func_body
@@ -1370,7 +1370,7 @@ def test_apply_gpu_ids_falls_back_to_torch_version_hip(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert 'getattr(_torch.version, "hip", None)' in func_body
@@ -1380,7 +1380,7 @@ def test_apply_gpu_ids_sets_hip_and_rocr_visible_devices(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert 'os.environ["HIP_VISIBLE_DEVICES"] = value' in func_body
@@ -1391,7 +1391,7 @@ def test_apply_gpu_ids_rocm_fallback_is_guarded_by_try_except(self):
         hw_path = (
             PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
         )
-        source = hw_path.read_text(encoding="utf-8")
+        source = hw_path.read_text(encoding = "utf-8")
         func_start = source.find("def apply_gpu_ids")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "import torch as _torch" in func_body
@@ -1408,18 +1408,18 @@ class TestWindowsRocmWarning:
 
     def test_windows_amd_warning_in_source(self):
         """install_python_stack.py should warn Windows AMD users."""
-        source = _STACK_PATH.read_text(encoding="utf-8")
+        source = _STACK_PATH.read_text(encoding = "utf-8")
         assert "AMD GPU detected" in source
 
     def test_windows_amd_warning_checks_hipinfo_or_amdsmi(self):
         """Warning should check for hipinfo or amd-smi."""
-        source = _STACK_PATH.read_text(encoding="utf-8")
+        source = _STACK_PATH.read_text(encoding = "utf-8")
         assert "hipinfo" in source
         assert "amd-smi" in source
 
     def test_windows_amd_warning_has_docs_link(self):
         """Warning should include AMD docs link."""
-        source = _STACK_PATH.read_text(encoding="utf-8")
+        source = _STACK_PATH.read_text(encoding = "utf-8")
         assert "docs.unsloth.ai/get-started/install-and-update/amd" in source
 
 
@@ -1434,7 +1434,7 @@ class TestIsRdnaExpansion:
     def test_is_rdna_source_has_rdna2(self):
         """is_rdna() should include RDNA2 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text(encoding="utf-8")
+        source = utils_path.read_text(encoding = "utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1030" in func_body
@@ -1448,7 +1448,7 @@ def test_is_rdna_source_has_rdna2(self):
     def test_is_rdna_source_has_rdna3(self):
         """is_rdna() should include RDNA3 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text(encoding="utf-8")
+        source = utils_path.read_text(encoding = "utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1100" in func_body
@@ -1459,7 +1459,7 @@ def test_is_rdna_source_has_rdna3(self):
     def test_is_rdna_source_has_rdna35(self):
         """is_rdna() should include RDNA3.5 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text(encoding="utf-8")
+        source = utils_path.read_text(encoding = "utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1150" in func_body
@@ -1469,7 +1469,7 @@ def test_is_rdna_source_has_rdna35(self):
     def test_is_rdna_source_has_rdna4(self):
         """is_rdna() should include RDNA4 architectures."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text(encoding="utf-8")
+        source = utils_path.read_text(encoding = "utf-8")
         func_start = source.find("def is_rdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx1200" in func_body
@@ -1478,7 +1478,7 @@ def test_is_rdna_source_has_rdna4(self):
     def test_is_cdna_not_changed(self):
         """is_cdna() should remain unchanged (no RDNA architectures added)."""
         utils_path = PACKAGE_ROOT / "unsloth" / "kernels" / "utils.py"
-        source = utils_path.read_text(encoding="utf-8")
+        source = utils_path.read_text(encoding = "utf-8")
         func_start = source.find("def is_cdna()")
         func_body = source[func_start : source.find("\ndef ", func_start + 1)]
         assert "gfx940" in func_body
@@ -1719,54 +1719,54 @@ class TestWorkerWindowsRocmPatches:
 
     def test_grouped_mm_dispatch_patch_present(self):
         """worker.py must register a _grouped_mm CUDA dispatch override."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert '_gm_lib.impl("_grouped_mm"' in source
 
     def test_grouped_mm_patch_targets_cuda_dispatch_key(self):
         """The dispatch override must target the CUDA key (not CompositeImplicitAutograd)."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert '"_grouped_mm", _grouped_mm_safe_impl, "CUDA"' in source
 
     def test_grouped_mm_lib_kept_alive(self):
         """The Library object must be stored to prevent GC clearing the registration."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "_WINDOWS_ROCM_GROUPED_MM_LIB" in source
 
     def test_grouped_mm_handles_offs_grouped_case(self):
         """_grouped_mm fallback must handle the grouped (offs!=None) variant."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "offs_list" in source
         assert "offs.tolist()" in source
 
     def test_torchao_stub_uses_stub_type_meta(self):
         """Torchao stub must use _StubTypeMeta so isinstance() returns False not TypeError."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "_StubTypeMeta" in source
 
     def test_stub_type_meta_has_instancecheck(self):
         """_StubTypeMeta must define __instancecheck__ returning False."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "__instancecheck__" in source
 
     def test_stub_subpackage_finder_registered(self):
         """_StubSubpackageFinder must be appended to sys.meta_path."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "sys.meta_path.append(_StubSubpackageFinder())" in source
 
     def test_torchao_key_submodules_pre_stubbed(self):
         """Key torchao submodules (dtypes, quantization) must be pre-stubbed."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "torchao.dtypes" in source
         assert "torchao.quantization" in source
 
     def test_torchdynamo_disabled_on_windows_rocm(self):
         """worker.py should disable dynamo on Windows ROCm as belt-and-suspenders."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "TORCHDYNAMO_DISABLE" in source
 
     def test_grouped_mm_patch_guarded_by_windows_and_hip_check(self):
         """_grouped_mm patch must only apply on Windows + HIP torch."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         # Must check sys.platform == "win32"
         assert 'sys.platform == "win32"' in source
         # Must gate on HIP version — code uses getattr chain: "version" and "hip"

From d5c3c7bac691279630c7e6672a530fe534f8b3a0 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 13:58:57 -0500
Subject: [PATCH 082/165] fix: pin BNB_ROCM_VERSION=72 for
 torch==2.11.0+rocm7.13.0 compatibility

AMD's pip index now ships torch==2.11.0+rocm7.13.0 (ROCm 7.13).
bitsandbytes auto-detects HIP 7.13 from torch.version.hip and looks for
libbitsandbytes_rocm713.dll, which the AMD Windows prerelease wheel does
not ship (it only ships rocm72.dll), causing a load error at training start.

Fix:
- worker.py section 1f: set BNB_ROCM_VERSION=72 (only when the variable is not
  already set) before section 2 ML imports, so bitsandbytes loads rocm72.dll
  on Windows ROCm unless the caller overrides it
- install_python_stack.py: set BNB_ROCM_VERSION=72 in _install_bnb_windows_rocm()
  for any post-install imports; update comment to document root cause
- tests: 4 new assertions covering the fix (141 passed, 2 skipped)
---
 studio/backend/core/training/worker.py    | 15 ++++++++
 studio/install_python_stack.py            | 12 +++++-
 tests/studio/install/test_rocm_support.py | 47 +++++++++++++++++++++++
 3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index f208e395d9..da7eee86d2 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1321,6 +1321,21 @@ def find_spec(self, fullname, path, target = None):
                 os.environ["TORCHDYNAMO_DISABLE"] = "1"
                 logger.info("Windows ROCm: torch.compile (dynamo) disabled")
 
+            # Force BNB to load libbitsandbytes_rocm72.dll regardless of the
+            # HIP version that torch reports.  As of torch==2.11.0+rocm7.13.0
+            # (AMD index, May 2026) torch.version.hip returns "7.13", which
+            # makes BNB look for rocm713.dll — a file our AMD Windows prerelease
+            # wheel does not ship.  The wheel only ships rocm72.dll, so we pin
+            # BNB_ROCM_VERSION="72" here.  Callers may override by setting the
+            # variable before launching the worker.
+            if "BNB_ROCM_VERSION" not in os.environ:
+                os.environ["BNB_ROCM_VERSION"] = "72"
+                logger.info(
+                    "Windows ROCm: set BNB_ROCM_VERSION=72 "
+                    "(AMD Windows BNB wheel ships rocm72.dll; "
+                    "overrides auto-detection from torch.version.hip)"
+                )
+
             # Patch _grouped_mm CUDA dispatch with a safe Python mm fallback.
             try:
                 import warnings as _warnings
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 046421456a..3df1756fbf 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -106,8 +106,12 @@
         "download/continuous-release_main/"
         "bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl"
     ),
-    # Windows ROCm wheel — ships libbitsandbytes_rocm72.dll.
-    # BNB_ROCM_VERSION=72 must be set in the environment before importing bnb.
+    # Windows ROCm wheel — ships libbitsandbytes_rocm72.dll only.
+    # As of torch==2.11.0+rocm7.13.0 (AMD index, May 2026), BNB auto-detects
+    # HIP version as "7.13" and looks for rocm713.dll — which does not exist.
+    # BNB_ROCM_VERSION=72 must be set in the environment before importing bnb
+    # to force it to load rocm72.dll.  Set in worker.py (training subprocess)
+    # and in _install_bnb_windows_rocm() (install subprocess).
     "win_amd64": (
         "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/"
         "download/continuous-release_main/"
@@ -356,6 +360,10 @@ def _install_bnb_windows_rocm() -> None:
     _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
     if _bnb_win_url is None:
         return
+    # Pin BNB_ROCM_VERSION=72 in this process now so that any post-install
+    # import of bitsandbytes (e.g. health-checks) loads the correct DLL.
+    # The worker subprocess sets this independently in worker.py section 1f.
+    os.environ.setdefault("BNB_ROCM_VERSION", "72")
     _prev = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
     os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
     try:
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 2752a0b446..61428a3558 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1674,6 +1674,27 @@ def test_no_op_when_win_amd64_url_missing(self):
                 stack_mod._install_bnb_windows_rocm()
         mock_pip.assert_not_called()
 
+    def test_sets_bnb_rocm_version_72(self):
+        """BNB_ROCM_VERSION must be set to '72' before install.
+
+        As of torch==2.11.0+rocm7.13.0 (AMD index, May 2026), BNB auto-detects
+        HIP 7.13 and looks for rocm713.dll — which the prerelease wheel does not
+        ship.  _install_bnb_windows_rocm() must pin BNB_ROCM_VERSION=72 so that
+        bitsandbytes loads libbitsandbytes_rocm72.dll instead.
+        """
+        with patch.dict(os.environ, {}, clear = False):
+            os.environ.pop("BNB_ROCM_VERSION", None)
+            with patch.object(stack_mod, "pip_install_try", return_value = True):
+                stack_mod._install_bnb_windows_rocm()
+            assert os.environ.get("BNB_ROCM_VERSION") == "72"
+
+    def test_does_not_override_existing_bnb_rocm_version(self):
+        """An explicit BNB_ROCM_VERSION in the caller's env must not be clobbered."""
+        with patch.dict(os.environ, {"BNB_ROCM_VERSION": "60"}):
+            with patch.object(stack_mod, "pip_install_try", return_value = True):
+                stack_mod._install_bnb_windows_rocm()
+            assert os.environ.get("BNB_ROCM_VERSION") == "60"
+
 
 # =============================================================================
 # TEST: install_python_stack.py -- UNSLOTH_ROCM_TORCH_INSTALLED early-return path
@@ -1764,6 +1785,32 @@ def test_torchdynamo_disabled_on_windows_rocm(self):
         source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "TORCHDYNAMO_DISABLE" in source
 
+    def test_bnb_rocm_version_set_on_windows_rocm(self):
+        """worker.py must pin BNB_ROCM_VERSION=72 in the Windows ROCm section.
+
+        As of torch==2.11.0+rocm7.13.0, BNB auto-detects HIP 7.13 and looks for
+        rocm713.dll, which the AMD prerelease wheel does not ship.  The worker
+        must force BNB_ROCM_VERSION=72 before any ML library is imported so that
+        bitsandbytes loads libbitsandbytes_rocm72.dll.
+        """
+        source = _WORKER_PATH.read_text(encoding="utf-8")
+        assert "BNB_ROCM_VERSION" in source
+        assert '"72"' in source
+
+    def test_bnb_rocm_version_set_before_ml_imports(self):
+        """BNB_ROCM_VERSION must appear in section 1f, before section 2 ML imports."""
+        source = _WORKER_PATH.read_text(encoding="utf-8")
+        idx_bnb = source.find("BNB_ROCM_VERSION")
+        # Use the specific section-2 marker that appears in the worker process
+        # entry-point function (not the trainer helper which has its own "# ── 2.").
+        idx_sec2 = source.find("# ── 2. Now import ML libraries")
+        assert idx_bnb != -1, "BNB_ROCM_VERSION not found in worker.py"
+        assert idx_sec2 != -1, "'# ── 2. Now import ML libraries' marker not found in worker.py"
+        assert idx_bnb < idx_sec2, (
+            "BNB_ROCM_VERSION must be set before section 2 ML imports "
+            f"(found at {idx_bnb}, section 2 at {idx_sec2})"
+        )
+
     def test_grouped_mm_patch_guarded_by_windows_and_hip_check(self):
         """_grouped_mm patch must only apply on Windows + HIP torch."""
         source = _WORKER_PATH.read_text(encoding = "utf-8")

From f95cb201a327c181d5a475b9f8d3ca306cd3dc1b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 15 May 2026 18:59:13 +0000
Subject: [PATCH 083/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 61428a3558..96744a0043 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1793,19 +1793,21 @@ def test_bnb_rocm_version_set_on_windows_rocm(self):
         must force BNB_ROCM_VERSION=72 before any ML library is imported so that
         bitsandbytes loads libbitsandbytes_rocm72.dll.
         """
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         assert "BNB_ROCM_VERSION" in source
         assert '"72"' in source
 
     def test_bnb_rocm_version_set_before_ml_imports(self):
         """BNB_ROCM_VERSION must appear in section 1f, before section 2 ML imports."""
-        source = _WORKER_PATH.read_text(encoding="utf-8")
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
         idx_bnb = source.find("BNB_ROCM_VERSION")
         # Use the specific section-2 marker that appears in the worker process
         # entry-point function (not the trainer helper which has its own "# ── 2.").
         idx_sec2 = source.find("# ── 2. Now import ML libraries")
         assert idx_bnb != -1, "BNB_ROCM_VERSION not found in worker.py"
-        assert idx_sec2 != -1, "'# ── 2. Now import ML libraries' marker not found in worker.py"
+        assert (
+            idx_sec2 != -1
+        ), "'# ── 2. Now import ML libraries' marker not found in worker.py"
         assert idx_bnb < idx_sec2, (
             "BNB_ROCM_VERSION must be set before section 2 ML imports "
             f"(found at {idx_bnb}, section 2 at {idx_sec2})"

From c55aaa580950aac655cf9d28bdfa5741743d0f6e Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 14:05:04 -0500
Subject: [PATCH 084/165] fix: detect BNB ROCm DLL suffix dynamically instead
 of hardcoding '72'

BNB_ROCM_VERSION was pinned to '72' which works today (AMD wheel ships
rocm72.dll) but would break again if AMD ships a future wheel with a
different DLL suffix (e.g. rocm713.dll).

Add _detect_bnb_rocm_dll_ver() to install_python_stack.py: scans the
installed bitsandbytes package dir for libbitsandbytes_rocm{VER}.dll
using importlib.util.find_spec (no BNB import needed) and returns the
suffix.  '72' remains the fallback when detection fails.

Apply the same detection inline in worker.py section 1f.  Both paths
still respect a pre-set BNB_ROCM_VERSION (caller override wins).

Tests: +8 cases covering detection logic and fallback (147 passed, 2 skipped).
---
 studio/backend/core/training/worker.py    | 48 +++++++++---
 studio/install_python_stack.py            | 45 ++++++++---
 tests/studio/install/test_rocm_support.py | 91 +++++++++++++++++++----
 3 files changed, 149 insertions(+), 35 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index da7eee86d2..8d154d19db 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1321,19 +1321,45 @@ def find_spec(self, fullname, path, target = None):
                 os.environ["TORCHDYNAMO_DISABLE"] = "1"
                 logger.info("Windows ROCm: torch.compile (dynamo) disabled")
 
-            # Force BNB to load libbitsandbytes_rocm72.dll regardless of the
-            # HIP version that torch reports.  As of torch==2.11.0+rocm7.13.0
-            # (AMD index, May 2026) torch.version.hip returns "7.13", which
-            # makes BNB look for rocm713.dll — a file our AMD Windows prerelease
-            # wheel does not ship.  The wheel only ships rocm72.dll, so we pin
-            # BNB_ROCM_VERSION="72" here.  Callers may override by setting the
-            # variable before launching the worker.
+            # BNB auto-detects the HIP version from torch.version.hip and uses
+            # it to choose which DLL to load (e.g. "7.13" → rocm713.dll).
+            # AMD's Windows BNB prerelease wheel ships only one rocm DLL, and its
+            # version suffix does not always match the torch HIP version (e.g.
+            # torch==2.11.0+rocm7.13.0 ships HIP 7.13, but the BNB wheel still
+            # ships rocm72.dll).  We detect the actual DLL name from the installed
+            # package and override BNB's auto-detection.  "72" is a safe fallback
+            # if detection fails.  Callers may override by pre-setting the var.
             if "BNB_ROCM_VERSION" not in os.environ:
-                os.environ["BNB_ROCM_VERSION"] = "72"
+                _bnb_rocm_ver = None
+                try:
+                    import glob as _glob
+                    import importlib.util as _ilu
+                    import re as _re
+
+                    _bnb_spec = _ilu.find_spec("bitsandbytes")
+                    if _bnb_spec and _bnb_spec.submodule_search_locations:
+                        for _pkg_dir in _bnb_spec.submodule_search_locations:
+                            for _dll in _glob.glob(
+                                os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll")
+                            ):
+                                _m = _re.search(
+                                    r"libbitsandbytes_rocm(\d+)\.dll",
+                                    os.path.basename(_dll),
+                                )
+                                if _m:
+                                    _bnb_rocm_ver = _m.group(1)
+                                    break
+                            if _bnb_rocm_ver:
+                                break
+                except Exception:
+                    pass
+                _bnb_rocm_ver = _bnb_rocm_ver or "72"
+                os.environ["BNB_ROCM_VERSION"] = _bnb_rocm_ver
                 logger.info(
-                    "Windows ROCm: set BNB_ROCM_VERSION=72 "
-                    "(AMD Windows BNB wheel ships rocm72.dll; "
-                    "overrides auto-detection from torch.version.hip)"
+                    "Windows ROCm: set BNB_ROCM_VERSION=%s "
+                    "(detected from installed BNB wheel; "
+                    "overrides torch.version.hip auto-detection)",
+                    _bnb_rocm_ver,
                 )
 
             # Patch _grouped_mm CUDA dispatch with a safe Python mm fallback.
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 3df1756fbf..bd6589dd20 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -106,12 +106,11 @@
         "download/continuous-release_main/"
         "bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl"
     ),
-    # Windows ROCm wheel — ships libbitsandbytes_rocm72.dll only.
-    # As of torch==2.11.0+rocm7.13.0 (AMD index, May 2026), BNB auto-detects
-    # HIP version as "7.13" and looks for rocm713.dll — which does not exist.
-    # BNB_ROCM_VERSION=72 must be set in the environment before importing bnb
-    # to force it to load rocm72.dll.  Set in worker.py (training subprocess)
-    # and in _install_bnb_windows_rocm() (install subprocess).
+    # Windows ROCm wheel — ships libbitsandbytes_rocm{VER}.dll.
+    # BNB auto-detects HIP version from torch.version.hip, which does not always
+    # match the DLL suffix in this prerelease wheel (e.g. torch 7.13 with a rocm72
+    # DLL).  We scan the installed wheel for the actual DLL name and set
+    # BNB_ROCM_VERSION accordingly in _install_bnb_windows_rocm() and worker.py.
     "win_amd64": (
         "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/"
         "download/continuous-release_main/"
@@ -259,6 +258,29 @@ def _windows_rocm_index_url(gfx_arch: str | None) -> str | None:
     return f"{_ROCM_WINDOWS_INDEX_BASE}/{arch_family}/"
 
 
+def _detect_bnb_rocm_dll_ver() -> str | None:
+    """Scan the installed bitsandbytes package for libbitsandbytes_rocm{VER}.dll.
+
+    Returns the version suffix string (e.g. ``"72"``, ``"713"``) or ``None``
+    if bitsandbytes is not installed or no ROCm DLL is found.  Does NOT import
+    bitsandbytes — uses importlib.util.find_spec so it is safe to call before
+    BNB is imported.
+    """
+    import glob
+    import importlib.util
+    import re
+
+    spec = importlib.util.find_spec("bitsandbytes")
+    if spec is None or not spec.submodule_search_locations:
+        return None
+    for pkg_dir in spec.submodule_search_locations:
+        for dll in glob.glob(os.path.join(pkg_dir, "libbitsandbytes_rocm*.dll")):
+            m = re.search(r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(dll))
+            if m:
+                return m.group(1)
+    return None
+
+
 def _has_rocm_gpu() -> bool:
     """Return True only if an actual AMD GPU is visible (not just ROCm tools installed)."""
     import re
@@ -360,10 +382,6 @@ def _install_bnb_windows_rocm() -> None:
     _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
     if _bnb_win_url is None:
         return
-    # Pin BNB_ROCM_VERSION=72 in this process now so that any post-install
-    # import of bitsandbytes (e.g. health-checks) loads the correct DLL.
-    # The worker subprocess sets this independently in worker.py section 1f.
-    os.environ.setdefault("BNB_ROCM_VERSION", "72")
     _prev = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
     os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
     try:
@@ -380,6 +398,13 @@ def _install_bnb_windows_rocm() -> None:
             os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
         else:
             os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = _prev
+    # After install: detect the actual ROCm DLL suffix from the wheel so any
+    # post-install BNB import in this process loads the correct DLL.
+    # The worker subprocess does the same detection independently (worker.py §1f).
+    # Fall back to "72" if detection fails (e.g. install was a no-op / dry-run).
+    if "BNB_ROCM_VERSION" not in os.environ:
+        _ver = _detect_bnb_rocm_dll_ver() or "72"
+        os.environ["BNB_ROCM_VERSION"] = _ver
 
 
 def _ensure_rocm_torch() -> None:
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 96744a0043..dae2c2ff67 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1674,18 +1674,37 @@ def test_no_op_when_win_amd64_url_missing(self):
                 stack_mod._install_bnb_windows_rocm()
         mock_pip.assert_not_called()
 
-    def test_sets_bnb_rocm_version_72(self):
-        """BNB_ROCM_VERSION must be set to '72' before install.
+    def test_sets_bnb_rocm_version_from_detected_dll(self):
+        """BNB_ROCM_VERSION is set from the DLL detected after install."""
+        with patch.dict(os.environ, {}, clear = False):
+            os.environ.pop("BNB_ROCM_VERSION", None)
+            with patch.object(stack_mod, "pip_install_try", return_value = True):
+                with patch.object(
+                    stack_mod, "_detect_bnb_rocm_dll_ver", return_value = "72"
+                ):
+                    stack_mod._install_bnb_windows_rocm()
+            assert os.environ.get("BNB_ROCM_VERSION") == "72"
 
-        As of torch==2.11.0+rocm7.13.0 (AMD index, May 2026), BNB auto-detects
-        HIP 7.13 and looks for rocm713.dll — which the prerelease wheel does not
-        ship.  _install_bnb_windows_rocm() must pin BNB_ROCM_VERSION=72 so that
-        bitsandbytes loads libbitsandbytes_rocm72.dll instead.
-        """
+    def test_sets_bnb_rocm_version_from_newer_dll(self):
+        """If AMD ships a newer DLL (e.g. rocm713.dll), that version is used."""
         with patch.dict(os.environ, {}, clear = False):
             os.environ.pop("BNB_ROCM_VERSION", None)
             with patch.object(stack_mod, "pip_install_try", return_value = True):
-                stack_mod._install_bnb_windows_rocm()
+                with patch.object(
+                    stack_mod, "_detect_bnb_rocm_dll_ver", return_value = "713"
+                ):
+                    stack_mod._install_bnb_windows_rocm()
+            assert os.environ.get("BNB_ROCM_VERSION") == "713"
+
+    def test_falls_back_to_72_when_detection_fails(self):
+        """Falls back to '72' when DLL detection returns None."""
+        with patch.dict(os.environ, {}, clear = False):
+            os.environ.pop("BNB_ROCM_VERSION", None)
+            with patch.object(stack_mod, "pip_install_try", return_value = True):
+                with patch.object(
+                    stack_mod, "_detect_bnb_rocm_dll_ver", return_value = None
+                ):
+                    stack_mod._install_bnb_windows_rocm()
             assert os.environ.get("BNB_ROCM_VERSION") == "72"
 
     def test_does_not_override_existing_bnb_rocm_version(self):
@@ -1696,6 +1715,47 @@ def test_does_not_override_existing_bnb_rocm_version(self):
             assert os.environ.get("BNB_ROCM_VERSION") == "60"
 
 
+class TestDetectBnbRocmDllVer:
+    """Unit tests for _detect_bnb_rocm_dll_ver()."""
+
+    def test_returns_none_when_bnb_not_installed(self):
+        """Returns None if bitsandbytes is not importable."""
+        import importlib.util
+
+        with patch.object(importlib.util, "find_spec", return_value = None):
+            assert stack_mod._detect_bnb_rocm_dll_ver() is None
+
+    def test_detects_rocm72_dll(self, tmp_path):
+        """Returns '72' when libbitsandbytes_rocm72.dll is present."""
+        (tmp_path / "libbitsandbytes_rocm72.dll").write_text("")
+        mock_spec = MagicMock()
+        mock_spec.submodule_search_locations = [str(tmp_path)]
+        import importlib.util
+
+        with patch.object(importlib.util, "find_spec", return_value = mock_spec):
+            assert stack_mod._detect_bnb_rocm_dll_ver() == "72"
+
+    def test_detects_rocm713_dll(self, tmp_path):
+        """Returns '713' when libbitsandbytes_rocm713.dll is present."""
+        (tmp_path / "libbitsandbytes_rocm713.dll").write_text("")
+        mock_spec = MagicMock()
+        mock_spec.submodule_search_locations = [str(tmp_path)]
+        import importlib.util
+
+        with patch.object(importlib.util, "find_spec", return_value = mock_spec):
+            assert stack_mod._detect_bnb_rocm_dll_ver() == "713"
+
+    def test_returns_none_when_only_cuda_dlls(self, tmp_path):
+        """Returns None when only CUDA DLLs are present (no ROCm DLL)."""
+        (tmp_path / "libbitsandbytes_cuda121.dll").write_text("")
+        mock_spec = MagicMock()
+        mock_spec.submodule_search_locations = [str(tmp_path)]
+        import importlib.util
+
+        with patch.object(importlib.util, "find_spec", return_value = mock_spec):
+            assert stack_mod._detect_bnb_rocm_dll_ver() is None
+
+
 # =============================================================================
 # TEST: install_python_stack.py -- UNSLOTH_ROCM_TORCH_INSTALLED early-return path
 # =============================================================================
@@ -1786,16 +1846,19 @@ def test_torchdynamo_disabled_on_windows_rocm(self):
         assert "TORCHDYNAMO_DISABLE" in source
 
     def test_bnb_rocm_version_set_on_windows_rocm(self):
-        """worker.py must pin BNB_ROCM_VERSION=72 in the Windows ROCm section.
+        """worker.py must set BNB_ROCM_VERSION in the Windows ROCm section.
 
-        As of torch==2.11.0+rocm7.13.0, BNB auto-detects HIP 7.13 and looks for
-        rocm713.dll, which the AMD prerelease wheel does not ship.  The worker
-        must force BNB_ROCM_VERSION=72 before any ML library is imported so that
-        bitsandbytes loads libbitsandbytes_rocm72.dll.
+        BNB auto-detects HIP version from torch.version.hip, which can mismatch
+        the DLL suffix in the AMD prerelease wheel.  The worker must detect the
+        actual DLL suffix and override BNB's auto-detection before ML imports.
         """
         source = _WORKER_PATH.read_text(encoding = "utf-8")
+        # Env var must be set
         assert "BNB_ROCM_VERSION" in source
-        assert '"72"' in source
+        # Detection helper must be used
+        assert "_detect_bnb_rocm_dll_ver" in source or "libbitsandbytes_rocm" in source
+        # "72" must appear as the safe fallback
+        assert '"72"' in source or "'72'" in source
 
     def test_bnb_rocm_version_set_before_ml_imports(self):
         """BNB_ROCM_VERSION must appear in section 1f, before section 2 ML imports."""

From b313a4867c4f20d0049185c1a5ff0e191b3b026b Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 14:28:03 -0500
Subject: [PATCH 085/165] fix: patch torch.distributed stubs in server process
 for Windows ROCm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Windows ROCm, torch.distributed ships without process-group helpers
(is_initialized, is_available, get_rank, get_world_size).  The worker
subprocess already patches these in section 1e, but the main server
process calls _determine_attention_impl_for_gpu_estimate() which calls
unsloth's resolve_attention_implementation() → is_initialized(), causing:

  "Could not resolve attention implementation for '...':
   module 'torch.distributed' has no attribute 'is_initialized'"

Fix: patch the missing attrs onto torch.distributed at the top of
_determine_attention_impl_for_gpu_estimate, matching the same stubs
already applied in worker.py section 1e.  No-ops on Linux/CUDA where
torch.distributed is fully populated.
---
 studio/backend/utils/hardware/hardware.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 747ff194ab..f75c7a9d12 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -946,6 +946,24 @@ def _load_config_for_gpu_estimate(model_name: str, hf_token: Optional[str] = Non
 def _determine_attention_impl_for_gpu_estimate(config) -> str:
     import copy as _copy
 
+    # torch.distributed is incomplete on Windows ROCm — it ships without the
+    # process-group helpers (is_initialized, is_available, etc.).
+    # resolve_attention_implementation (unsloth) calls is_initialized()
+    # unconditionally, so patch any missing attrs before importing it.
+    try:
+        import torch.distributed as _td
+
+        for _attr, _stub in (
+            ("is_initialized", lambda: False),
+            ("is_available", lambda: False),
+            ("get_rank", lambda: 0),
+            ("get_world_size", lambda: 1),
+        ):
+            if not hasattr(_td, _attr):
+                setattr(_td, _attr, _stub)
+    except ImportError:
+        pass
+
     from unsloth.models._utils import resolve_attention_implementation
     from transformers import AutoModel, AutoModelForCausalLM
 

From b33a90ee68d94089100ea400cf690d2d1b343519 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 14:52:14 -0500
Subject: [PATCH 086/165] fix: gate _grouped_mm dispatch patch on HIP < 7.13
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AMD fixed the gfx1200 null HIP kernel in ROCm 7.13 (torch 2.11+).
Users on the new wheel now get the real GPU _grouped_mm kernel for
MoE workloads instead of the Python mm fallback.

Changes:
- worker.py: add _hip_ver_at_least() helper; wrap full _grouped_mm
  patch in `if not _hip_ver_at_least(7, 13):` with else branch that
  logs the skip reason; update section-1f comment to document the fix
- test_rocm_support.py: add 5 tests covering the helper definition,
  the (7, 13) gate expression, the else branch, the skip log message,
  and the AMD-format version string parsing (.split(".")[:2])

Verified: torch==2.11.0+rocm7.13.0 — 3D batch and grouped (offs)
variants both succeed; null crash only present on rocm7.12 and earlier.
---
 studio/backend/core/training/worker.py    | 172 +++++++++++++---------
 tests/studio/install/test_rocm_support.py |  35 +++++
 2 files changed, 135 insertions(+), 72 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 8d154d19db..4e9ac4ff83 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1291,7 +1291,7 @@ def find_spec(self, fullname, path, target = None):
             pass
 
     # ── 1f. Windows ROCm runtime patches ──
-    # torch._grouped_mm has a null HIP kernel on gfx1200 (ROCm 7.12 Windows),
+    # torch._grouped_mm has a null HIP kernel on gfx1200 (ROCm ≤ 7.12 Windows),
     # causing 0xC0000005 (access violation) during training.
     #
     # Root cause: the JitDecomp autograd decomposition system (NOT torch.compile)
@@ -1300,9 +1300,12 @@ def find_spec(self, fullname, path, target = None):
     # JitDecomp, so we must also override the CUDA dispatch key for _grouped_mm
     # with a safe Python fallback.
     #
-    # Verified on torch==2.10.0+rocm7.12.0:
-    #   torch.library.Library("aten","IMPL").impl("_grouped_mm", fn, "CUDA")
-    #   correctly overrides the HIP kernel and the call succeeds.
+    # Fixed in AMD's wheel: torch==2.11.0+rocm7.13.0 — the 3-D batch and grouped
+    # (with offs) variants of _grouped_mm now have working HIP kernels on gfx1200.
+    # We gate the dispatch override on HIP < 7.13 so users on the fixed wheel get
+    # the real GPU kernel rather than our Python fallback.
+    #
+    # Verified: null on torch==2.10.0+rocm7.12.0; fixed on torch==2.11.0+rocm7.13.0.
     #
     # Schema: _grouped_mm(Tensor self, Tensor mat2, Tensor? offs=None,
     #                     Tensor? bias=None, ScalarType? out_dtype=None) -> Tensor
@@ -1362,76 +1365,101 @@ def find_spec(self, fullname, path, target = None):
                     _bnb_rocm_ver,
                 )
 
-            # Patch _grouped_mm CUDA dispatch with a safe Python mm fallback.
-            try:
-                import warnings as _warnings
-
-                _gm_lib = _torch_for_rocm.library.Library("aten", "IMPL")
-
-                def _grouped_mm_safe_impl(
-                    self, mat2, offs = None, bias = None, out_dtype = None
-                ):
-                    """Safe fallback for _grouped_mm on gfx1200 (null HIP kernel)."""
-                    _t = _torch_for_rocm
-                    if offs is None:
-                        # Simple case: plain matrix multiply.
-                        result = _t.mm(self.contiguous(), mat2.contiguous())
-                    else:
-                        # Grouped case: offs[i] is the exclusive end-row of group i
-                        # in `self`; mat2 may be 3-D (num_groups, K, N) or 2-D.
-                        offs_list = offs.tolist()
-                        pieces = []
-                        prev = 0
-                        for idx, end in enumerate(offs_list):
-                            end = int(end)
-                            a_part = self[prev:end].contiguous()
-                            if mat2.dim() == 3:
-                                b_part = mat2[idx].contiguous()
-                            else:
-                                b_part = mat2.contiguous()
-                            pieces.append(_t.mm(a_part, b_part))
-                            prev = end
-                        # Include any trailing rows not covered by offs
-                        if prev < self.shape[0]:
-                            a_tail = self[prev:].contiguous()
-                            b_tail = (
-                                mat2[-1].contiguous()
-                                if mat2.dim() == 3
-                                else mat2.contiguous()
-                            )
-                            pieces.append(_t.mm(a_tail, b_tail))
-                        result = (
-                            _t.cat(pieces, dim = 0)
-                            if pieces
-                            else _t.zeros(
-                                0,
-                                mat2.shape[-1],
-                                device = self.device,
-                                dtype = self.dtype,
+            # Parse HIP version for the kernel-fix gate below.
+            # torch.version.hip can be "7.13.99004", "7.2.0", etc.
+            # We only need major.minor for the comparison.
+            def _hip_ver_at_least(major: int, minor: int) -> bool:
+                _hip_str = getattr(
+                    getattr(_torch_for_rocm, "version", None), "hip", None
+                )
+                if not _hip_str:
+                    return False
+                try:
+                    _parts = [int(x) for x in str(_hip_str).split(".")[:2]]
+                    return (_parts[0], _parts[1]) >= (major, minor)
+                except (ValueError, IndexError):
+                    return False
+
+            # _grouped_mm HIP kernel was null on gfx1200 in ROCm ≤ 7.12,
+            # causing 0xC0000005.  AMD fixed it in ROCm 7.13 (torch 2.11+).
+            # Only install the Python fallback on the affected versions so users
+            # on 7.13+ get the real GPU kernel for MoE workloads.
+            if not _hip_ver_at_least(7, 13):
+                try:
+                    import warnings as _warnings
+
+                    _gm_lib = _torch_for_rocm.library.Library("aten", "IMPL")
+
+                    def _grouped_mm_safe_impl(
+                        self, mat2, offs = None, bias = None, out_dtype = None
+                    ):
+                        """Python mm fallback for _grouped_mm on gfx1200 (null HIP kernel, ROCm ≤ 7.12)."""
+                        _t = _torch_for_rocm
+                        if offs is None:
+                            # Simple case: plain matrix multiply.
+                            result = _t.mm(self.contiguous(), mat2.contiguous())
+                        else:
+                            # Grouped case: offs[i] is the exclusive end-row of
+                            # group i in `self`; mat2 may be 3-D or 2-D.
+                            offs_list = offs.tolist()
+                            pieces = []
+                            prev = 0
+                            for idx, end in enumerate(offs_list):
+                                end = int(end)
+                                a_part = self[prev:end].contiguous()
+                                if mat2.dim() == 3:
+                                    b_part = mat2[idx].contiguous()
+                                else:
+                                    b_part = mat2.contiguous()
+                                pieces.append(_t.mm(a_part, b_part))
+                                prev = end
+                            # Include any trailing rows not covered by offs
+                            if prev < self.shape[0]:
+                                a_tail = self[prev:].contiguous()
+                                b_tail = (
+                                    mat2[-1].contiguous()
+                                    if mat2.dim() == 3
+                                    else mat2.contiguous()
+                                )
+                                pieces.append(_t.mm(a_tail, b_tail))
+                            result = (
+                                _t.cat(pieces, dim = 0)
+                                if pieces
+                                else _t.zeros(
+                                    0,
+                                    mat2.shape[-1],
+                                    device = self.device,
+                                    dtype = self.dtype,
+                                )
                             )
-                        )
-                    if bias is not None:
-                        result = result + bias
-                    if out_dtype is not None:
-                        result = result.to(out_dtype)
-                    elif result.dtype != self.dtype:
-                        result = result.to(self.dtype)
-                    return result
-
-                with _warnings.catch_warnings():
-                    _warnings.simplefilter("ignore")
-                    _gm_lib.impl("_grouped_mm", _grouped_mm_safe_impl, "CUDA")
-
-                _WINDOWS_ROCM_GROUPED_MM_LIB = _gm_lib  # prevent GC
+                        if bias is not None:
+                            result = result + bias
+                        if out_dtype is not None:
+                            result = result.to(out_dtype)
+                        elif result.dtype != self.dtype:
+                            result = result.to(self.dtype)
+                        return result
+
+                    with _warnings.catch_warnings():
+                        _warnings.simplefilter("ignore")
+                        _gm_lib.impl("_grouped_mm", _grouped_mm_safe_impl, "CUDA")
+
+                    _WINDOWS_ROCM_GROUPED_MM_LIB = _gm_lib  # prevent GC
+                    logger.info(
+                        "Windows ROCm: patched _grouped_mm CUDA dispatch "
+                        "(null HIP kernel on gfx1200, ROCm ≤ 7.12 — "
+                        "bypassed with Python mm fallback)"
+                    )
+                except Exception as _patch_exc:
+                    logger.warning(
+                        "Windows ROCm: could not patch _grouped_mm — "
+                        "training may crash with 0xC0000005: %s",
+                        _patch_exc,
+                    )
+            else:
                 logger.info(
-                    "Windows ROCm: patched _grouped_mm CUDA dispatch "
-                    "(null HIP kernel on gfx1200 bypassed with safe mm fallback)"
-                )
-            except Exception as _patch_exc:
-                logger.warning(
-                    "Windows ROCm: could not patch _grouped_mm — "
-                    "training may crash with 0xC0000005: %s",
-                    _patch_exc,
+                    "Windows ROCm: HIP >= 7.13 — _grouped_mm kernel is functional, "
+                    "skipping Python fallback (AMD fixed gfx1200 null kernel in ROCm 7.13)"
                 )
 
     # ── 2. Now import ML libraries (fresh in this clean process) ──
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index dae2c2ff67..a917820255 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1884,6 +1884,41 @@ def test_grouped_mm_patch_guarded_by_windows_and_hip_check(self):
         # Must gate on HIP version — code uses getattr chain: "version" and "hip"
         assert '"version"' in source and '"hip"' in source
 
+    def test_hip_ver_at_least_helper_defined(self):
+        """_hip_ver_at_least helper must be defined inside the Windows ROCm block."""
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
+        assert "def _hip_ver_at_least(major: int, minor: int)" in source
+
+    def test_grouped_mm_patch_gated_on_hip_lt_713(self):
+        """_grouped_mm patch must be skipped on HIP >= 7.13 (AMD fixed the bug in ROCm 7.13)."""
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
+        # The guard must call _hip_ver_at_least with exactly (7, 13)
+        assert "_hip_ver_at_least(7, 13)" in source
+        # The patch must be inside the `if not` branch (negated guard)
+        assert "if not _hip_ver_at_least(7, 13):" in source
+
+    def test_grouped_mm_hip_713_skip_message_present(self):
+        """worker.py must log a message when skipping the patch on HIP >= 7.13."""
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
+        assert "HIP >= 7.13" in source
+        assert "7.13" in source
+
+    def test_grouped_mm_patch_else_branch_present(self):
+        """An else branch must follow the _hip_ver_at_least gate (skip path for 7.13+)."""
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
+        # There must be an else: after the if not _hip_ver_at_least(7, 13): block
+        gate_idx = source.find("if not _hip_ver_at_least(7, 13):")
+        assert gate_idx != -1, "Version gate not found in worker.py"
+        # The else: branch must appear after the gate
+        else_idx = source.find("else:", gate_idx)
+        assert else_idx != -1, "else: branch after _hip_ver_at_least gate not found"
+
+    def test_hip_ver_at_least_handles_amd_version_format(self):
+        """_hip_ver_at_least must split on '.' and compare only major.minor (handles '7.13.99004')."""
+        source = _WORKER_PATH.read_text(encoding = "utf-8")
+        # Must split the version string and take the first two parts
+        assert 'split(".")[:2]' in source or ".split('.')[:2]" in source
+
 
 # =============================================================================
 # TEST: install_python_stack.py -- _ROCM_TORCH_PKG_SPECS mapping

From 75ef5993ce1dfffe7fdc3fb1341681ab255257d6 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 15 May 2026 15:01:32 -0500
Subject: [PATCH 087/165] fix: stub is_torchelastic_launched on
 torch.distributed for Windows ROCm

resolve_attention_implementation calls is_torchelastic_launched() which
does not exist in the incomplete torch.distributed shipped with the
Windows ROCm wheel, causing a warning on every model config load in the
server process. Add it to the stub table alongside the four helpers
already patched in _determine_attention_impl_for_gpu_estimate.

Also adds two tests: one confirming the new stub and one confirming the
four original core distributed helpers are still covered.
---
 studio/backend/utils/hardware/hardware.py |  1 +
 tests/studio/install/test_rocm_support.py | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index f75c7a9d12..d0fe3822b5 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -958,6 +958,7 @@ def _determine_attention_impl_for_gpu_estimate(config) -> str:
             ("is_available", lambda: False),
             ("get_rank", lambda: 0),
             ("get_world_size", lambda: 1),
+            ("is_torchelastic_launched", lambda: False),
         ):
             if not hasattr(_td, _attr):
                 setattr(_td, _attr, _stub)
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index a917820255..5824fbd784 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -846,6 +846,28 @@ def test_get_package_versions_returns_rocm_key(self):
         assert '"cuda"' in func_body
         assert '"rocm"' in func_body
 
+    def test_distributed_stubs_cover_is_torchelastic_launched(self):
+        """_determine_attention_impl_for_gpu_estimate must stub is_torchelastic_launched.
+
+        resolve_attention_implementation calls is_torchelastic_launched() on
+        Windows ROCm where torch.distributed ships without that helper, causing
+        a warning: 'module torch.distributed has no attribute is_torchelastic_launched'.
+        """
+        hw_path = (
+            PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+        )
+        source = hw_path.read_text(encoding = "utf-8")
+        assert "is_torchelastic_launched" in source
+
+    def test_distributed_stubs_cover_core_helpers(self):
+        """_determine_attention_impl_for_gpu_estimate must stub the four core distributed helpers."""
+        hw_path = (
+            PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+        )
+        source = hw_path.read_text(encoding = "utf-8")
+        for attr in ("is_initialized", "is_available", "get_rank", "get_world_size"):
+            assert attr in source, f"distributed stub for '{attr}' missing from hardware.py"
+
 
 # =============================================================================
 # TEST: tokenizer_utils.py -- error message

From 1db8e49770450b4f08d1fab85c2a937412c9e8c4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 15 May 2026 20:03:00 +0000
Subject: [PATCH 088/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 5824fbd784..2f896c57f1 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -866,7 +866,9 @@ def test_distributed_stubs_cover_core_helpers(self):
         )
         source = hw_path.read_text(encoding = "utf-8")
         for attr in ("is_initialized", "is_available", "get_rank", "get_world_size"):
-            assert attr in source, f"distributed stub for '{attr}' missing from hardware.py"
+            assert (
+                attr in source
+            ), f"distributed stub for '{attr}' missing from hardware.py"
 
 
 # =============================================================================

From 370debe46b3ebf350667c9f57d19f598b94feaa4 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 14:27:30 -0500
Subject: [PATCH 089/165] fix: explicit warnings on AMD ROCm arch/version
 fallbacks + Fast-Install arg order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

setup.ps1:
- Fix Fast-Install argument order: packages before flags, consistent with
  all other Fast-Install calls in the file
  (was: Fast-Install --force-reinstall --index-url $url torch ...)
  (now: Fast-Install torch torchvision torchaudio --force-reinstall --index-url $url)
- Add explicit [WARN] substep when $HasROCm is true but arch mapping fails:
  - GPU arch detected but not in supported wheel list → names the arch and
    lists supported families so user knows exactly what to report
  - HIP SDK present (amd-smi path) but gcnArchName unreadable → instructs
    user to re-install the HIP SDK; previously fell back silently to CPU

install.sh:
- Add [WARN] to stderr before silent CPU fallback when AMD GPU is confirmed
  (rocminfo/amd-smi) but ROCm version cannot be read from any source
  (amd-smi, /opt/rocm/.info/version, hipconfig, dpkg, rpm)
- Add [WARN] to stderr when ROCm version is too old (< 6.0) with upgrade link

install.ps1 and setup.sh: no changes needed (already handle these paths correctly)
---
 install.sh       | 11 ++++++++++-
 studio/setup.ps1 | 12 +++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/install.sh b/install.sh
index 287327b5e5..41130f200d 100755
--- a/install.sh
+++ b/install.sh
@@ -1568,7 +1568,10 @@ get_torch_index_url() {
         if [ -n "$_rocm_tag" ]; then
             # Minimum supported: ROCm 6.0 (no PyTorch wheels exist for older)
             case "$_rocm_tag" in
-                rocm[1-5].*) echo "$_base/cpu"; return ;;
+                rocm[1-5].*)
+                    echo "[WARN] ROCm $_rocm_tag detected but PyTorch ROCm wheels require ROCm 6.0+ -- falling back to CPU-only PyTorch" >&2
+                    echo "[WARN] Upgrade ROCm: https://rocm.docs.amd.com/en/latest/deploy/linux/index.html" >&2
+                    echo "$_base/cpu"; return ;;
             esac
             # Supported tags; 6.5+ clips to rocm6.4, 7.3+ caps to rocm7.2.
             case "$_rocm_tag" in
@@ -1584,6 +1587,12 @@ get_torch_index_url() {
             esac
             return
         fi
+        # AMD GPU confirmed by rocminfo/amd-smi but ROCm version could not be
+        # read from any source (amd-smi, /opt/rocm/.info/version, hipconfig,
+        # dpkg, rpm).  Warn explicitly rather than silently installing CPU PyTorch.
+        echo "[WARN] AMD GPU detected but ROCm version could not be determined -- falling back to CPU-only PyTorch" >&2
+        echo "[WARN] Ensure one of the following is accessible: amd-smi, hipconfig, /opt/rocm/.info/version, rocm-core package" >&2
+        echo "[WARN] To install ROCm: https://rocm.docs.amd.com/en/latest/deploy/linux/index.html" >&2
         echo "$_base/cpu"; return
     fi
     # Parse CUDA version from nvidia-smi output (POSIX-safe, no grep -P)
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 702e14ac6e..b61b64dd33 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -1899,6 +1899,16 @@ if ($HasROCm -and $CuTag -eq "cpu") {
     $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
     if ($archFamily) {
         $ROCmIndexUrl = "$amdIndexBase/$archFamily/"
+    } elseif ($ROCmGfxArch) {
+        # GPU arch detected but not in the supported wheel map — warn explicitly
+        # so the user knows why they are getting CPU PyTorch instead of ROCm.
+        substep "[WARN] AMD GPU ($ROCmGfxArch) not in supported arch list -- falling back to CPU-only PyTorch" "Yellow"
+        substep "       Supported: gfx1200/1201 (RDNA 4), gfx1150/1151 (RDNA 3.5), gfx1100-1103 (RDNA 3), gfx90a, gfx908" "Yellow"
+    } else {
+        # HIP SDK present ($HasROCm=true via amd-smi) but gcnArchName was not
+        # readable — warn rather than silently falling back to CPU PyTorch.
+        substep "[WARN] AMD GPU detected (HIP SDK present) but GPU arch could not be read -- falling back to CPU-only PyTorch" "Yellow"
+        substep "       Arch detection requires hipinfo to report gcnArchName. Re-install the HIP SDK if this is unexpected." "Yellow"
     }
 }
 
@@ -1906,7 +1916,7 @@ $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR
 
 if ($ROCmIndexUrl) {
     substep "installing PyTorch (AMD ROCm, $ROCmGfxArch)..."
-    $output = Fast-Install --force-reinstall --index-url $ROCmIndexUrl torch torchvision torchaudio | Out-String
+    $output = Fast-Install torch torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl | Out-String
     $torchInstallExit = $LASTEXITCODE
     if ($torchInstallExit -ne 0) {
         Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow

From 7a5e93b42c1a178fc2ed4e2cd0ed7ddd7c223c7c Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 14:45:33 -0500
Subject: [PATCH 090/165] fix: robust gfx arch detection for Strix Halo /
 HIP-runtime-only installs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Covers users who have the HIP runtime (amd-smi available) but not the
full HIP SDK (no hipinfo), which is common on Strix Halo iGPU systems.
Without this, $ROCmGfxArch stays null and the installer silently falls
back to CPU-only PyTorch despite a working GPU.

Detection waterfall (setup.ps1 + install.ps1):
  1. hipinfo gcnArchName          -- full HIP SDK (existing, unchanged)
  2. amd-smi list gfx pattern     -- newer amd-smi versions embed arch
  3. amd-smi static --asic        -- ROCm 6+ ASIC details with GFX target
  4. UNSLOTH_ROCM_GFX_ARCH env    -- manual escape hatch; consulted only when
                                     levels 1-3 found no arch (it does not
                                     override a detected arch)
  5. GPU name → arch table        -- best-effort from marketing name:
       890M / Strix Halo  → gfx1151 (RDNA 3.5 iGPU, Strix Halo)
       880M / Strix Point → gfx1150 (RDNA 3.5 iGPU, Strix Point)
       780M / Phoenix     → gfx1103 (RDNA 3 iGPU)
       RX 7900/7800/7700  → gfx1100 (RDNA 3 desktop)
       RX 7600            → gfx1102 (RDNA 3)
       RX 9070 XT / 9080  → gfx1201 (RDNA 4)
       RX 9070 / 9060 XT  → gfx1200 (RDNA 4)

When arch is inferred from name, a Cyan substep tells the user to set
UNSLOTH_ROCM_GFX_ARCH to skip inference on future installs.
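WMI block intentionally does not set $HasROCm (no runtime confirmation).
Because levels 4-5 are gated on $HasROCm, they are skipped when WMI is the
only source that sees the GPU.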

Tests: 11 new tests in TestStrixHaloGfxArchDetection covering all five
detection levels, WMI safety, and gfx regex in both ps1 files.
---
 install.ps1                               | 51 +++++++++++++-
 studio/setup.ps1                          | 64 ++++++++++++++++-
 tests/studio/install/test_rocm_support.py | 83 +++++++++++++++++++++++
 3 files changed, 194 insertions(+), 4 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 53c0eea4a1..92487f99e9 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1235,7 +1235,24 @@ shell.Run cmd, 0, False
                     $smiOut = & $amdSmiExe.Source list 2>&1 | Out-String
                     if ($LASTEXITCODE -eq 0 -and $smiOut -match "(?im)^GPU\s*[:\[]\s*\d") {
                         $HasROCm = $true
-                        $ROCmGpuLabel = "AMD ROCm"
+                        # Attempt 1: newer amd-smi versions embed the gfx arch in list output
+                        if ($smiOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
+                            $ROCmGfxArch = $Matches[1].ToLower()
+                            $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
+                        } else {
+                            # Attempt 2: 'static --asic' exposes ASIC details on ROCm 6+,
+                            # including the GFX target needed for wheel index selection.
+                            $smiAsicOut = ""
+                            try { $smiAsicOut = & $amdSmiExe.Source static --asic 2>&1 | Out-String } catch {}
+                            if ($smiAsicOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
+                                $ROCmGfxArch = $Matches[1].ToLower()
+                                $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
+                            } elseif ($smiAsicOut -match "(?im)Market.?Name\s*[:\|]\s*([^\r\n]+)") {
+                                $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
+                            } else {
+                                $ROCmGpuLabel = "AMD ROCm"
+                            }
+                        }
                     }
                 } catch {}
             }
@@ -1248,6 +1265,38 @@ shell.Run cmd, 0, False
                 if ($wmiGpu) { $ROCmGpuLabel = $wmiGpu.Name }
             } catch {}
         }
+        # ── Arch resolution: env-var override → name inference ──────────────
+        # Covers users whose amd-smi is too old to report the GFX target and
+        # who don't have hipinfo (HIP-runtime-only, common on Strix Halo / iGPU).
+        if ($HasROCm -and -not $ROCmGfxArch) {
+            # 1. Manual override: set UNSLOTH_ROCM_GFX_ARCH=gfx1151 before running.
+            if ($env:UNSLOTH_ROCM_GFX_ARCH) {
+                $ROCmGfxArch = $env:UNSLOTH_ROCM_GFX_ARCH.Trim().ToLower()
+                $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
+                substep "gfx arch from UNSLOTH_ROCM_GFX_ARCH env override: $ROCmGfxArch" "Cyan"
+            }
+            # 2. Best-effort name → arch lookup from marketing name (amd-smi / WMI).
+            elseif ($ROCmGpuLabel) {
+                $nameArchTable = @(
+                    @{ P = "9070 XT|9080";                                        A = "gfx1201" }  # RDNA 4
+                    @{ P = "9070|9060";                                            A = "gfx1200" }  # RDNA 4
+                    @{ P = "890M|Strix Halo|HX 37[05]|HX 38[05]|AI 9 HX";        A = "gfx1151" }  # RDNA 3.5 iGPU (Strix Halo)
+                    @{ P = "880M|Strix Point|AI 9 36[05]|AI 7 35[05]|AI 5 34[05]"; A = "gfx1150" } # RDNA 3.5 iGPU (Strix Point)
+                    @{ P = "RX 7900|RX 7800|RX 7700(?! S)";                       A = "gfx1100" }  # RDNA 3 desktop
+                    @{ P = "RX 7600";                                              A = "gfx1102" }  # RDNA 3
+                    @{ P = "780M|760M|740M|Phoenix";                               A = "gfx1103" }  # RDNA 3 iGPU (Phoenix)
+                )
+                foreach ($row in $nameArchTable) {
+                    if ($ROCmGpuLabel -match $row.P) {
+                        $ROCmGfxArch = $row.A
+                        $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
+                        substep "gfx arch inferred from GPU name: $ROCmGfxArch" "Cyan"
+                        substep "Tip: set UNSLOTH_ROCM_GFX_ARCH=$ROCmGfxArch to skip inference next time" "Cyan"
+                        break
+                    }
+                }
+            }
+        }
         # Capture ROCm version for wheel selection (hipconfig, then amd-smi)
         if ($HasROCm) {
             $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index b61b64dd33..ada93991c1 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -703,7 +703,10 @@ if (-not $HasNvidiaSmi) {
             }
         } catch {}
     }
-    # amd-smi list fallback: look for "GPU: <digit>" data rows
+    # amd-smi fallback: HIP runtime present but hipinfo unavailable (no full HIP SDK).
+    # Confirms GPU visibility via 'list', then attempts 'static --asic' to extract
+    # the gfx arch that hipinfo would have provided.  Critical for Strix Halo
+    # (gfx1151) and other iGPUs where only the HIP runtime is installed.
     if (-not $HasROCm) {
         $amdSmiExe = Get-Command "amd-smi" -ErrorAction SilentlyContinue
         if ($amdSmiExe) {
@@ -711,12 +714,33 @@ if (-not $HasNvidiaSmi) {
                 $smiOut = & $amdSmiExe.Source list 2>&1 | Out-String
                 if ($LASTEXITCODE -eq 0 -and $smiOut -match "(?im)^GPU\s*[:\[]\s*\d") {
                     $HasROCm = $true
-                    $ROCmGpuLabel = "AMD ROCm"
+                    # Attempt 1: newer amd-smi versions embed the gfx arch in list output
+                    if ($smiOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
+                        $script:ROCmGfxArch = $Matches[1].ToLower()
+                        $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
+                    } else {
+                        # Attempt 2: 'static --asic' exposes ASIC details on ROCm 6+,
+                        # including the GFX target needed for wheel index selection.
+                        $smiAsicOut = ""
+                        try { $smiAsicOut = & $amdSmiExe.Source static --asic 2>&1 | Out-String } catch {}
+                        if ($smiAsicOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
+                            $script:ROCmGfxArch = $Matches[1].ToLower()
+                            $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
+                        } elseif ($smiAsicOut -match "(?im)Market.?Name\s*[:\|]\s*([^\r\n]+)") {
+                            $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
+                        } else {
+                            $ROCmGpuLabel = "AMD ROCm"
+                        }
+                    }
                 }
             } catch {}
         }
     }
-    # WMI fallback: AMD GPU in device list but no HIP SDK → guide the user
+    # WMI fallback: AMD GPU in device list but no HIP SDK → guide the user.
+    # WMI gives a marketing name (e.g. "AMD Radeon 890M") but never a gfx arch.
+    # $HasROCm is intentionally NOT set here — we cannot confirm ROCm runtime
+    # support without hipinfo or amd-smi.  The name is saved to $ROCmGpuLabel
+    # so the name-based inference below can still attempt an arch lookup.
     if (-not $HasROCm) {
         try {
             $wmiGpu = Get-WmiObject Win32_VideoController -ErrorAction SilentlyContinue |
@@ -725,6 +749,40 @@ if (-not $HasNvidiaSmi) {
             if ($wmiGpu) { $ROCmGpuLabel = $wmiGpu.Name }
         } catch {}
     }
+    # ── Arch resolution: env-var override → name inference ──────────────────
+    # Runs after all probe methods.  Covers users whose amd-smi version is too
+    # old to report the GFX target and who don't have hipinfo (HIP-runtime-only
+    # installs, common on Strix Halo / iGPU systems).
+    if ($HasROCm -and -not $script:ROCmGfxArch) {
+        # 1. Manual override: set UNSLOTH_ROCM_GFX_ARCH=gfx1151 before running.
+        if ($env:UNSLOTH_ROCM_GFX_ARCH) {
+            $script:ROCmGfxArch = $env:UNSLOTH_ROCM_GFX_ARCH.Trim().ToLower()
+            $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
+            substep "gfx arch from UNSLOTH_ROCM_GFX_ARCH env override: $script:ROCmGfxArch" "Cyan"
+        }
+        # 2. Best-effort name → arch lookup from marketing name (amd-smi / WMI).
+        #    Ordered most-specific first; first match wins.
+        elseif ($ROCmGpuLabel) {
+            $nameArchTable = @(
+                @{ P = "9070 XT|9080";                                        A = "gfx1201" }  # RDNA 4
+                @{ P = "9070|9060";                                            A = "gfx1200" }  # RDNA 4
+                @{ P = "890M|Strix Halo|HX 37[05]|HX 38[05]|AI 9 HX";        A = "gfx1151" }  # RDNA 3.5 iGPU (Strix Halo)
+                @{ P = "880M|Strix Point|AI 9 36[05]|AI 7 35[05]|AI 5 34[05]"; A = "gfx1150" } # RDNA 3.5 iGPU (Strix Point)
+                @{ P = "RX 7900|RX 7800|RX 7700(?! S)";                       A = "gfx1100" }  # RDNA 3 desktop
+                @{ P = "RX 7600";                                              A = "gfx1102" }  # RDNA 3
+                @{ P = "780M|760M|740M|Phoenix";                               A = "gfx1103" }  # RDNA 3 iGPU (Phoenix)
+            )
+            foreach ($row in $nameArchTable) {
+                if ($ROCmGpuLabel -match $row.P) {
+                    $script:ROCmGfxArch = $row.A
+                    $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
+                    substep "gfx arch inferred from GPU name: $script:ROCmGfxArch" "Cyan"
+                    substep "Tip: set UNSLOTH_ROCM_GFX_ARCH=$script:ROCmGfxArch to skip inference next time" "Cyan"
+                    break
+                }
+            }
+        }
+    }
     # Capture ROCm version early for display and wheel selection
     if ($HasROCm) {
         $script:ROCmVersion = $null
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 2f896c57f1..233471d5ba 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1993,5 +1993,88 @@ def test_gfx_to_amd_index_covers_rdna3(self):
             assert mapping.get(arch) == "gfx110X-all", f"{arch} missing from mapping"
 
 
+# =============================================================================
+# TEST: setup.ps1 / install.ps1 -- Strix Halo gfx arch detection
+# =============================================================================
+
+_SETUP_PS1_PATH = PACKAGE_ROOT / "studio" / "setup.ps1"
+_INSTALL_PS1_PATH = PACKAGE_ROOT / "install.ps1"
+
+
+class TestStrixHaloGfxArchDetection:
+    """Verify that setup.ps1 and install.ps1 have robust gfx arch detection
+    for Strix Halo / iGPU users who only have the HIP runtime (no hipinfo)."""
+
+    def test_amd_smi_static_asic_attempted_in_setup(self):
+        """setup.ps1 must try 'amd-smi static --asic' when list output lacks gfx arch."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "static --asic" in source
+
+    def test_amd_smi_static_asic_attempted_in_install(self):
+        """install.ps1 must try 'amd-smi static --asic' when list output lacks gfx arch."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "static --asic" in source
+
+    def test_env_var_override_in_setup(self):
+        """setup.ps1 must honour UNSLOTH_ROCM_GFX_ARCH as a manual arch override."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "UNSLOTH_ROCM_GFX_ARCH" in source
+
+    def test_env_var_override_in_install(self):
+        """install.ps1 must honour UNSLOTH_ROCM_GFX_ARCH as a manual arch override."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "UNSLOTH_ROCM_GFX_ARCH" in source
+
+    def test_name_arch_table_covers_strix_halo_in_setup(self):
+        """setup.ps1 name→arch table must map 890M / Strix Halo to gfx1151."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "gfx1151" in source
+        assert "890M" in source or "Strix Halo" in source
+
+    def test_name_arch_table_covers_strix_halo_in_install(self):
+        """install.ps1 name→arch table must map 890M / Strix Halo to gfx1151."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "gfx1151" in source
+        assert "890M" in source or "Strix Halo" in source
+
+    def test_name_arch_table_covers_strix_point_in_setup(self):
+        """setup.ps1 name→arch table must map 880M / Strix Point to gfx1150."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "gfx1150" in source
+        assert "880M" in source or "Strix Point" in source
+
+    def test_name_arch_table_covers_strix_point_in_install(self):
+        """install.ps1 name→arch table must map 880M / Strix Point to gfx1150."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "gfx1150" in source
+        assert "880M" in source or "Strix Point" in source
+
+    def test_name_arch_table_covers_rdna3_phoenix_in_setup(self):
+        """setup.ps1 name→arch table must map 780M / Phoenix to gfx1103."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "gfx1103" in source
+        assert "780M" in source or "Phoenix" in source
+
+    def test_wmi_does_not_set_hasrocm_in_setup(self):
+        """WMI block in setup.ps1 must NOT set $HasROCm = $true (no runtime confirmation)."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        # Find the WMI block and confirm HasROCm is not set inside it
+        wmi_idx = source.find("Win32_VideoController")
+        assert wmi_idx != -1, "WMI block not found in setup.ps1"
+        # The nearest HasROCm = $true must not appear between the WMI block
+        # and the closing brace of that if-block.  We check by confirming
+        # $HasROCm = $true does NOT appear within 300 chars of the WMI call.
+        wmi_context = source[wmi_idx : wmi_idx + 300]
+        assert "$HasROCm = $true" not in wmi_context
+
+    def test_gfx_arch_regex_parses_from_amd_smi_output(self):
+        """Both files must use the gfx\\d+[a-z]? regex to parse arch from amd-smi output."""
+        for path in (_SETUP_PS1_PATH, _INSTALL_PS1_PATH):
+            source = path.read_text(encoding = "utf-8")
+            # The regex pattern used to match gfx arches
+            assert "gfx\\d+" in source or r"gfx\d+" in source, \
+                f"gfx arch regex not found in {path.name}"
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From ffa16f073f511c84839b62fd15e9fe6ecfe64f2f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 16 May 2026 19:45:49 +0000
Subject: [PATCH 091/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 233471d5ba..fac8b4bb85 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2072,8 +2072,9 @@ def test_gfx_arch_regex_parses_from_amd_smi_output(self):
         for path in (_SETUP_PS1_PATH, _INSTALL_PS1_PATH):
             source = path.read_text(encoding = "utf-8")
             # The regex pattern used to match gfx arches
-            assert "gfx\\d+" in source or r"gfx\d+" in source, \
-                f"gfx arch regex not found in {path.name}"
+            assert (
+                "gfx\\d+" in source or r"gfx\d+" in source
+            ), f"gfx arch regex not found in {path.name}"
 
 
 if __name__ == "__main__":

From 2befb57b993cc3340829fb0d335c0e8050683aa7 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 14:55:23 -0500
Subject: [PATCH 092/165] fix: resolve hipinfo/hipconfig via HIP_PATH/ROCM_PATH
 when not on PATH

AMD HIP SDK sets HIP_PATH on Windows but does not always add the bin
directory to PATH.  Get-Command hipinfo therefore silently fails and
detection falls through to amd-smi and then to WMI.  WMI cannot provide a
gfx arch, so when amd-smi is absent the user is left with a CPU-only
PyTorch install and no warning.

Changes:
- setup.ps1 / install.ps1: before falling through to amd-smi, attempt to
  locate hipinfo.exe and hipconfig.exe under $env:HIP_PATH\bin (then
  $env:ROCM_PATH\bin) when Get-Command returns nothing
- Emit a [WARN] with the resolved path and a one-liner to permanently fix
  PATH via SetEnvironmentVariable
- Emit a [WARN] when HIP_PATH/ROCM_PATH is set but the exe is still not
  found (incomplete SDK install)
- Emit a [WARN] with the first hipinfo output line when hipinfo runs but
  returns a non-zero exit code (e.g. "no ROCm-capable device detected")
- 18 new tests in TestHipSdkEnvPathResolution; total 183 passed, 2 skipped
---
 install.ps1                               |  36 +++++++
 studio/setup.ps1                          |  37 ++++++-
 tests/studio/install/test_rocm_support.py | 123 ++++++++++++++++++++++
 3 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index 92487f99e9..5df7e16d9c 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1213,7 +1213,26 @@ shell.Run cmd, 0, False
     $ROCmVersion = $null
     $ROCmGfxArch = $null
     if (-not $HasNvidiaSmi) {
+        # hipinfo: PATH first, then HIP_PATH/ROCM_PATH bin fallback (mirrors NVIDIA smi path resolution).
+        # AMD HIP SDK sets HIP_PATH but may not add the bin dir to PATH depending on install type.
         $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
+        if (-not $hipinfoExe) {
+            $hipRoot     = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { $null }
+            $hipEnvLabel = if ($env:HIP_PATH) { "HIP_PATH"    } else                    { "ROCM_PATH"    }
+            if ($hipRoot) {
+                $hipinfoCandidate = Join-Path $hipRoot "bin\hipinfo.exe"
+                if (Test-Path $hipinfoCandidate) {
+                    Write-Host "  [WARN] hipinfo not on PATH -- located via ${hipEnvLabel}: $hipinfoCandidate" -ForegroundColor Yellow
+                    Write-Host "         Add '$(Join-Path $hipRoot 'bin')' to your PATH to suppress this warning" -ForegroundColor Yellow
+                    Write-Host "         Quick fix: [Environment]::SetEnvironmentVariable('PATH',`$env:PATH+';$(Join-Path $hipRoot 'bin')','User')" -ForegroundColor Yellow
+                    $hipinfoExe = [PSCustomObject]@{ Source = $hipinfoCandidate }
+                } else {
+                    Write-Host "  [WARN] ${hipEnvLabel}=$hipRoot is set but hipinfo.exe not found at $hipinfoCandidate" -ForegroundColor Yellow
+                    Write-Host "         HIP SDK install may be incomplete -- re-install from:" -ForegroundColor Yellow
+                    Write-Host "         https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" -ForegroundColor Yellow
+                }
+            }
+        }
         if ($hipinfoExe) {
             try {
                 $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
@@ -1225,6 +1244,12 @@ shell.Run cmd, 0, False
                     } else {
                         $ROCmGpuLabel = "AMD ROCm"
                     }
+                } elseif ($LASTEXITCODE -ne 0) {
+                    # hipinfo ran but returned a HIP runtime error (e.g. "no ROCm-capable device detected")
+                    $firstLine = ($hipOut -split '\r?\n' | Where-Object { $_.Trim() } | Select-Object -First 1)
+                    Write-Host "  [WARN] hipinfo returned a HIP runtime error (exit $LASTEXITCODE)" -ForegroundColor Yellow
+                    Write-Host "         $firstLine" -ForegroundColor Yellow
+                    Write-Host "         Ensure ROCm drivers are installed: https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" -ForegroundColor Yellow
                 }
             } catch {}
         }
@@ -1300,6 +1325,17 @@ shell.Run cmd, 0, False
         # Capture ROCm version for wheel selection (hipconfig, then amd-smi)
         if ($HasROCm) {
             $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
+            if (-not $hipConfigExe) {
+                $hipRoot = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { $null }
+                if ($hipRoot) {
+                    $hipConfigCandidate = Join-Path $hipRoot "bin\hipconfig.exe"
+                    if (Test-Path $hipConfigCandidate) {
+                        $hipConfigEnvLabel = if ($env:HIP_PATH) { "HIP_PATH" } else { "ROCM_PATH" }
+                        Write-Host "  [WARN] hipconfig not on PATH -- located via ${hipConfigEnvLabel}: $hipConfigCandidate" -ForegroundColor Yellow
+                        $hipConfigExe = [PSCustomObject]@{ Source = $hipConfigCandidate }
+                    }
+                }
+            }
             if ($hipConfigExe) {
                 try {
                     $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index ada93991c1..b3e977ee75 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -687,8 +687,26 @@ $HasROCm = $false
 $ROCmGpuLabel = $null
 $script:ROCmGfxArch = $null
 if (-not $HasNvidiaSmi) {
-    # hipinfo: present + output contains gcnArchName → real HIP GPU
+    # hipinfo: PATH first, then HIP_PATH/ROCM_PATH bin fallback (mirrors NVIDIA smi path resolution).
+    # AMD HIP SDK sets HIP_PATH but may not add the bin dir to PATH depending on install type.
     $hipinfoExe = Get-Command hipinfo -ErrorAction SilentlyContinue
+    if (-not $hipinfoExe) {
+        $hipRoot     = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { $null }
+        $hipEnvLabel = if ($env:HIP_PATH) { "HIP_PATH"    } else                    { "ROCM_PATH"    }
+        if ($hipRoot) {
+            $hipinfoCandidate = Join-Path $hipRoot "bin\hipinfo.exe"
+            if (Test-Path $hipinfoCandidate) {
+                substep "[WARN] hipinfo not on PATH -- located via ${hipEnvLabel}: $hipinfoCandidate" "Yellow"
+                substep "       Add '$(Join-Path $hipRoot 'bin')' to your PATH to suppress this warning" "Yellow"
+                substep "       Quick fix: [Environment]::SetEnvironmentVariable('PATH',`$env:PATH+';$(Join-Path $hipRoot 'bin')','User')" "Yellow"
+                $hipinfoExe = [PSCustomObject]@{ Source = $hipinfoCandidate }
+            } else {
+                substep "[WARN] ${hipEnvLabel}=$hipRoot is set but hipinfo.exe not found at $hipinfoCandidate" "Yellow"
+                substep "       HIP SDK install may be incomplete -- re-install from:" "Yellow"
+                substep "       https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" "Yellow"
+            }
+        }
+    }
     if ($hipinfoExe) {
         try {
             $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
@@ -700,6 +718,12 @@ if (-not $HasNvidiaSmi) {
                 } else {
                     $ROCmGpuLabel = "AMD ROCm"
                 }
+            } elseif ($LASTEXITCODE -ne 0) {
+                # hipinfo ran but returned a HIP runtime error (e.g. "no ROCm-capable device detected")
+                $firstLine = ($hipOut -split '\r?\n' | Where-Object { $_.Trim() } | Select-Object -First 1)
+                substep "[WARN] hipinfo returned a HIP runtime error (exit $LASTEXITCODE)" "Yellow"
+                substep "       $firstLine" "Yellow"
+                substep "       Ensure ROCm drivers are installed: https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" "Yellow"
             }
         } catch {}
     }
@@ -787,6 +811,17 @@ if (-not $HasNvidiaSmi) {
     if ($HasROCm) {
         $script:ROCmVersion = $null
         $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
+        if (-not $hipConfigExe) {
+            $hipRoot = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { $null }
+            if ($hipRoot) {
+                $hipConfigCandidate = Join-Path $hipRoot "bin\hipconfig.exe"
+                if (Test-Path $hipConfigCandidate) {
+                    $hipConfigEnvLabel = if ($env:HIP_PATH) { "HIP_PATH" } else { "ROCM_PATH" }
+                    substep "[WARN] hipconfig not on PATH -- located via ${hipConfigEnvLabel}: $hipConfigCandidate" "Yellow"
+                    $hipConfigExe = [PSCustomObject]@{ Source = $hipConfigCandidate }
+                }
+            }
+        }
         if ($hipConfigExe) {
             try {
                 $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index fac8b4bb85..53eb9aa1a4 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2077,5 +2077,128 @@ def test_gfx_arch_regex_parses_from_amd_smi_output(self):
             ), f"gfx arch regex not found in {path.name}"
 
 
+# =============================================================================
+# TEST: HIP SDK tool path resolution via HIP_PATH / ROCM_PATH env vars
+# =============================================================================
+
+
+class TestHipSdkEnvPathResolution:
+    """Verify that both install scripts resolve hipinfo/hipconfig via HIP_PATH
+    and ROCM_PATH when the tools are not on $PATH, and emit explicit warnings."""
+
+    # ── hipinfo resolution ────────────────────────────────────────────────────
+
+    def test_setup_checks_hip_path_for_hipinfo(self):
+        """setup.ps1 must reference HIP_PATH when resolving hipinfo."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP_PATH" in source
+        assert "hipinfo" in source
+
+    def test_install_checks_hip_path_for_hipinfo(self):
+        """install.ps1 must reference HIP_PATH when resolving hipinfo."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP_PATH" in source
+        assert "hipinfo" in source
+
+    def test_setup_checks_rocm_path_as_hipinfo_fallback(self):
+        """setup.ps1 must also check ROCM_PATH as a secondary hipinfo fallback."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCM_PATH" in source
+        # Confirm the fallback pattern: HIP_PATH ?? ROCM_PATH (or equivalent elseif)
+        assert ("ROCM_PATH" in source and "HIP_PATH" in source)
+
+    def test_install_checks_rocm_path_as_hipinfo_fallback(self):
+        """install.ps1 must also check ROCM_PATH as a secondary hipinfo fallback."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCM_PATH" in source
+        assert ("ROCM_PATH" in source and "HIP_PATH" in source)
+
+    def test_setup_resolves_hipinfo_via_bin_subdir(self):
+        """setup.ps1 must join the env var root with 'bin\\hipinfo.exe'."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert r"bin\hipinfo.exe" in source
+
+    def test_install_resolves_hipinfo_via_bin_subdir(self):
+        """install.ps1 must join the env var root with 'bin\\hipinfo.exe'."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert r"bin\hipinfo.exe" in source
+
+    # ── hipinfo not-on-PATH warning ───────────────────────────────────────────
+
+    def test_setup_warns_when_hipinfo_not_on_path(self):
+        """setup.ps1 must warn when hipinfo is found via env var but not on PATH."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "hipinfo not on PATH" in source
+
+    def test_install_warns_when_hipinfo_not_on_path(self):
+        """install.ps1 must warn when hipinfo is found via env var but not on PATH."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "hipinfo not on PATH" in source
+
+    # ── warn when HIP_PATH set but exe missing ────────────────────────────────
+
+    def test_setup_warns_when_hip_path_set_but_exe_missing(self):
+        """setup.ps1 must warn when HIP_PATH is set but hipinfo.exe is not present."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        # The warning must mention that the SDK install may be incomplete
+        assert "incomplete" in source or "not found at" in source
+
+    def test_install_warns_when_hip_path_set_but_exe_missing(self):
+        """install.ps1 must warn when HIP_PATH is set but hipinfo.exe is not present."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "incomplete" in source or "not found at" in source
+
+    # ── hipinfo runtime error warning ─────────────────────────────────────────
+
+    def test_setup_warns_on_hipinfo_nonzero_exit(self):
+        """setup.ps1 must warn when hipinfo runs but returns a non-zero exit code."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP runtime error" in source or "runtime error" in source.lower()
+
+    def test_install_warns_on_hipinfo_nonzero_exit(self):
+        """install.ps1 must warn when hipinfo runs but returns a non-zero exit code."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP runtime error" in source or "runtime error" in source.lower()
+
+    # ── hipconfig resolution ──────────────────────────────────────────────────
+
+    def test_setup_resolves_hipconfig_via_bin_subdir(self):
+        """setup.ps1 must also fall back to HIP_PATH/bin/hipconfig.exe for version detection."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert r"bin\hipconfig.exe" in source
+
+    def test_install_resolves_hipconfig_via_bin_subdir(self):
+        """install.ps1 must also fall back to HIP_PATH/bin/hipconfig.exe for version detection."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert r"bin\hipconfig.exe" in source
+
+    def test_setup_warns_when_hipconfig_not_on_path(self):
+        """setup.ps1 must warn when hipconfig is found via env var but not on PATH."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "hipconfig not on PATH" in source
+
+    def test_install_warns_when_hipconfig_not_on_path(self):
+        """install.ps1 must warn when hipconfig is found via env var but not on PATH."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "hipconfig not on PATH" in source
+
+    # ── PATH fix hint ─────────────────────────────────────────────────────────
+
+    def test_setup_provides_path_fix_hint(self):
+        """setup.ps1 must tell the user how to add the HIP bin dir to PATH."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        # Should mention adding to PATH or SetEnvironmentVariable
+        assert "PATH" in source and (
+            "SetEnvironmentVariable" in source or "Add" in source
+        )
+
+    def test_install_provides_path_fix_hint(self):
+        """install.ps1 must tell the user how to add the HIP bin dir to PATH."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "PATH" in source and (
+            "SetEnvironmentVariable" in source or "Add" in source
+        )
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 116fa6eb921886c1fcc30d7da9d60d0c25505979 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 16 May 2026 19:56:08 +0000
Subject: [PATCH 093/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 53eb9aa1a4..9bfc37f364 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2105,13 +2105,13 @@ def test_setup_checks_rocm_path_as_hipinfo_fallback(self):
         source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
         assert "ROCM_PATH" in source
         # Confirm the fallback pattern: HIP_PATH ?? ROCM_PATH (or equivalent elseif)
-        assert ("ROCM_PATH" in source and "HIP_PATH" in source)
+        assert "ROCM_PATH" in source and "HIP_PATH" in source
 
     def test_install_checks_rocm_path_as_hipinfo_fallback(self):
         """install.ps1 must also check ROCM_PATH as a secondary hipinfo fallback."""
         source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
         assert "ROCM_PATH" in source
-        assert ("ROCM_PATH" in source and "HIP_PATH" in source)
+        assert "ROCM_PATH" in source and "HIP_PATH" in source
 
     def test_setup_resolves_hipinfo_via_bin_subdir(self):
         """setup.ps1 must join the env var root with 'bin\\hipinfo.exe'."""

From ae6d042caf6546703d0b11c4b10dc8f6fb55f3e8 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 15:11:38 -0500
Subject: [PATCH 094/165] feat: print HIP SDK path and full hipconfig version
 in terminal on AMD detection

Both install.ps1 and setup.ps1 now emit substeps under the gpu step when
AMD ROCm is detected:

  gpu  AMD ROCm (gfx1200)
       HIP SDK: C:\Program Files\AMD\ROCm\7.1
       hipconfig: 7.1.51803-d3a86bd04

Previously only the gpu label (e.g. "AMD ROCm (gfx1200)") was shown with
no indication of where the SDK was found or which exact build was active.
The full hipconfig build string (e.g. 7.1.51803-d3a86bd04 instead of just
7.1) is now stored in ROCmVersionFull and also used in setup.ps1's
'rocm' step label.

9 new tests in TestHipSdkDetectedSubstep; total 192 passed, 2 skipped
---
 install.ps1                               | 11 ++++-
 studio/setup.ps1                          | 13 +++++-
 tests/studio/install/test_rocm_support.py | 55 +++++++++++++++++++++++
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 5df7e16d9c..28a78d47a2 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1339,8 +1339,12 @@ shell.Run cmd, 0, False
             if ($hipConfigExe) {
                 try {
                     $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
-                    if ($LASTEXITCODE -eq 0 -and $hipVerOut -match '(\d+\.\d+)') {
-                        $ROCmVersion = $Matches[1]
+                    if ($LASTEXITCODE -eq 0) {
+                        $hipVerLine = ($hipVerOut -split '\r?\n' | Where-Object { $_.Trim() } | Select-Object -First 1).Trim()
+                        if ($hipVerLine -match '(\d+\.\d+)') {
+                            $ROCmVersion     = $Matches[1]
+                            $ROCmVersionFull = $hipVerLine
+                        }
                     }
                 } catch {}
             }
@@ -1362,6 +1366,9 @@ shell.Run cmd, 0, False
         step "gpu" "NVIDIA GPU detected"
     } elseif ($HasROCm) {
         step "gpu" $ROCmGpuLabel
+        $hipSdkPath = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { "on system PATH" }
+        substep "HIP SDK: $hipSdkPath"
+        if ($ROCmVersionFull) { substep "hipconfig: $ROCmVersionFull" }
     } elseif ($ROCmGpuLabel) {
         step "gpu" "AMD GPU detected -- HIP SDK not found" "Yellow"
         substep "Detected: $ROCmGpuLabel" "Yellow"
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index b3e977ee75..68bd6397aa 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -825,7 +825,13 @@ if (-not $HasNvidiaSmi) {
         if ($hipConfigExe) {
             try {
                 $hipVerOut = & $hipConfigExe.Source --version 2>&1 | Out-String
-                if ($LASTEXITCODE -eq 0 -and $hipVerOut -match '(\d+\.\d+)') { $script:ROCmVersion = $Matches[1] }
+                if ($LASTEXITCODE -eq 0) {
+                    $hipVerLine = ($hipVerOut -split '\r?\n' | Where-Object { $_.Trim() } | Select-Object -First 1).Trim()
+                    if ($hipVerLine -match '(\d+\.\d+)') {
+                        $script:ROCmVersion     = $Matches[1]
+                        $script:ROCmVersionFull = $hipVerLine
+                    }
+                }
             } catch {}
         }
         if (-not $script:ROCmVersion) {
@@ -844,6 +850,9 @@ if ($HasNvidiaSmi) {
     step "gpu" "NVIDIA GPU detected"
 } elseif ($HasROCm) {
     step "gpu" $ROCmGpuLabel
+    $hipSdkPath = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { "on system PATH" }
+    substep "HIP SDK: $hipSdkPath"
+    if ($script:ROCmVersionFull) { substep "hipconfig: $script:ROCmVersionFull" }
 } elseif ($ROCmGpuLabel) {
     Write-Host ""
     step "gpu" "AMD GPU detected -- HIP SDK not found" "Yellow"
@@ -1265,7 +1274,7 @@ if (-not $CudaArch) {
 }
 
 if ($HasROCm) {
-    $rocmVerLabel = if ($ROCmVersion) { "ROCm $ROCmVersion" } else { "ROCm (version unknown)" }
+    $rocmVerLabel = if ($script:ROCmVersionFull) { "ROCm $script:ROCmVersionFull" } elseif ($script:ROCmVersion) { "ROCm $script:ROCmVersion" } else { "ROCm (version unknown)" }
     step "rocm" $rocmVerLabel
 } elseif ($ROCmGpuLabel) {
     step "rocm" "HIP SDK not found -- GPU-accelerated training unavailable" "Yellow"
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 9bfc37f364..0d680b77fd 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2200,5 +2200,60 @@ def test_install_provides_path_fix_hint(self):
         )
 
 
+# =============================================================================
+# TEST: HIP SDK detected substep -- path + hipconfig version shown in terminal
+# =============================================================================
+
+
+class TestHipSdkDetectedSubstep:
+    """Verify that both scripts print HIP SDK path and full hipconfig version
+    as substeps under the gpu step when AMD ROCm is successfully detected."""
+
+    def test_setup_prints_hip_sdk_path_substep(self):
+        """setup.ps1 must print an 'HIP SDK:' substep showing the resolved path."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP SDK:" in source
+
+    def test_install_prints_hip_sdk_path_substep(self):
+        """install.ps1 must print an 'HIP SDK:' substep showing the resolved path."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP SDK:" in source
+
+    def test_setup_shows_hipconfig_full_version(self):
+        """setup.ps1 must capture and display the full hipconfig version string."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCmVersionFull" in source or "hipconfig:" in source
+
+    def test_install_shows_hipconfig_full_version(self):
+        """install.ps1 must capture and display the full hipconfig version string."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCmVersionFull" in source or "hipconfig:" in source
+
+    def test_setup_captures_full_version_not_just_major_minor(self):
+        """setup.ps1 must store the raw hipconfig output line, not just major.minor."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCmVersionFull" in source
+
+    def test_install_captures_full_version_not_just_major_minor(self):
+        """install.ps1 must store the raw hipconfig output line, not just major.minor."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCmVersionFull" in source
+
+    def test_setup_uses_hip_path_or_rocm_path_for_sdk_display(self):
+        """setup.ps1 HIP SDK path substep must check HIP_PATH then ROCM_PATH."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP_PATH" in source and "ROCM_PATH" in source
+
+    def test_install_uses_hip_path_or_rocm_path_for_sdk_display(self):
+        """install.ps1 HIP SDK path substep must check HIP_PATH then ROCM_PATH."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HIP_PATH" in source and "ROCM_PATH" in source
+
+    def test_setup_rocm_step_uses_full_version(self):
+        """setup.ps1 'rocm' step label must prefer the full version string."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "ROCmVersionFull" in source and "rocm" in source
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From bbf004c36bd2a1affbcea99cb55000d335a49deb Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 15:33:54 -0500
Subject: [PATCH 095/165] fix: Strix rocm7.1 segfault bypass + Ubuntu 24.04 HIP
 gcc-install-dir
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Issue 1 (install.sh): gfx1151/gfx1150 + ROCm 7.1 causes a segfault in
torch._grouped_mm (moe_utils.py:167). The Radeon repo now ships cp313
wheels for rocm-rel-7.1, so _amd_gpu_radeon=true silently lands on the
broken combo. When Strix Halo/Point is detected and TORCH_INDEX_URL is
rocm7.1, override to rocm7.2 PyTorch index, update TORCH_CONSTRAINT, and
set _amd_gpu_radeon=false to bypass the Radeon repo entirely. Emits a
clear [WARN] explaining the segfault and linking to the ROCm upgrade docs.

Issue 2 (setup.sh): ROCm 7.x ships clang-20 which on Ubuntu 24.04+ picks
/usr/lib/gcc/x86_64-linux-gnu/14/ (runtime dir, no C++ headers), causing
"'cstdlib' file not found" and a failed llama.cpp HIP build. Iterate gcc
versions 14→11 to find the first install dir that has both its own
include/ subdirectory and matching /usr/include/c++/<ver> headers, then
pass --gcc-install-dir to clang via CMAKE_HIP_FLAGS. Fix confirmed by
h34v3nzc0dex (llama.cpp 417/417 clean).

11 new tests across TestStrixRocm71Override and TestSetupShGccInstallDir;
total 203 passed, 2 skipped
---
 install.sh                                | 30 ++++++++
 studio/setup.sh                           | 21 +++++
 tests/studio/install/test_rocm_support.py | 93 +++++++++++++++++++++++
 3 files changed, 144 insertions(+)

diff --git a/install.sh b/install.sh
index 41130f200d..a5767705c8 100755
--- a/install.sh
+++ b/install.sh
@@ -1746,6 +1746,36 @@ case "$TORCH_INDEX_URL" in
         fi
         ;;
 esac
+# ── Strix Halo / Strix Point: force rocm7.2 wheels, bypass Radeon repo ───────
+# gfx1151 (Strix Halo) and gfx1150 (Strix Point) have a ROCm 7.1 driver bug
+# that causes a segfault in torch._grouped_mm (moe_utils.py line 167).
+# The Radeon repo now ships cp313 wheels for rocm-rel-7.1, so when
+# _amd_gpu_radeon=true the installer silently lands on the broken combo.
+# Detect these GPUs when TORCH_INDEX_URL is rocm7.1 and override to rocm7.2.
+case "$TORCH_INDEX_URL" in
+    */rocm7.1|*/rocm7.1.*)
+        _strix_gfx=""
+        if command -v rocminfo >/dev/null 2>&1; then
+            _strix_gfx=$(rocminfo 2>/dev/null | grep -oE 'gfx1151|gfx1150' | head -1)
+        fi
+        if [ -z "$_strix_gfx" ] && command -v amd-smi >/dev/null 2>&1; then
+            _strix_gfx=$(amd-smi list 2>/dev/null | grep -oE 'gfx1151|gfx1150' | head -1)
+        fi
+        if [ -n "$_strix_gfx" ]; then
+            echo "" >&2
+            echo "  [WARN] $_strix_gfx (Strix) + ROCm 7.1 detected -- known _grouped_mm segfault" >&2
+            echo "  [WARN] ROCm 7.1 wheels are broken for gfx1150/gfx1151 (moe_utils.py:167)" >&2
+            echo "  [WARN] Overriding to rocm7.2 PyTorch index to avoid the driver bug" >&2
+            echo "  [WARN] Upgrade ROCm to 7.2+ to silence this warning:" >&2
+            echo "  [WARN]   https://rocm.docs.amd.com/en/latest/deploy/linux/index.html" >&2
+            echo "" >&2
+            _base="${UNSLOTH_PYTORCH_MIRROR:-https://download.pytorch.org/whl}"
+            TORCH_INDEX_URL="${_base%/}/rocm7.2"
+            TORCH_CONSTRAINT="torch>=2.11.0,<2.12.0"
+            _amd_gpu_radeon=false
+        fi
+        ;;
+esac
 _TAURI_TORCH_INDEX_FAMILY=$(_tauri_torch_index_family "$TORCH_INDEX_URL")
 if [ "$_amd_gpu_radeon" = true ] && [ "$SKIP_TORCH" = false ]; then
     _TAURI_TORCH_INDEX_FAMILY="radeon"
diff --git a/studio/setup.sh b/studio/setup.sh
index c5beb7ebd3..0227bfcaf7 100755
--- a/studio/setup.sh
+++ b/studio/setup.sh
@@ -1034,6 +1034,27 @@ else
 
                 _BUILD_DESC="building (ROCm)"
                 CMAKE_ARGS="$CMAKE_ARGS -DGGML_HIP=ON"
+
+                # ROCm 7.x ships clang-20 which on Ubuntu 24.04+ defaults to the
+                # highest-numbered gcc lib dir (/usr/lib/gcc/x86_64-linux-gnu/14/)
+                # which contains runtime objects but NOT C++ headers, causing:
+                #   fatal error: 'cstdlib' file not found
+                # Find the newest gcc install dir that actually has both the
+                # runtime dir AND /usr/include/c++/<ver> headers, then pass it
+                # to clang via --gcc-install-dir so HIP builds succeed.
+                _GCC_INSTALL_DIR=""
+                for _gcc_ver in 14 13 12 11; do
+                    if [ -d "/usr/lib/gcc/x86_64-linux-gnu/$_gcc_ver/include" ] && \
+                       [ -d "/usr/include/c++/$_gcc_ver" ]; then
+                        _GCC_INSTALL_DIR="/usr/lib/gcc/x86_64-linux-gnu/$_gcc_ver"
+                        break
+                    fi
+                done
+                if [ -n "$_GCC_INSTALL_DIR" ]; then
+                    CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_HIP_FLAGS=--gcc-install-dir=$_GCC_INSTALL_DIR"
+                    substep "ROCm HIP gcc install dir: $_GCC_INSTALL_DIR"
+                fi
+
                 export ROCM_PATH="$ROCM_ROOT"
                 export HIP_PATH="$ROCM_ROOT"
 
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 0d680b77fd..7116b979de 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2255,5 +2255,98 @@ def test_setup_rocm_step_uses_full_version(self):
         assert "ROCmVersionFull" in source and "rocm" in source
 
 
+# =============================================================================
+# TEST: install.sh -- Strix Halo rocm7.1 → rocm7.2 override
+# =============================================================================
+
+_INSTALL_SH_PATH = PACKAGE_ROOT / "install.sh"
+_SETUP_SH_PATH   = PACKAGE_ROOT / "studio" / "setup.sh"
+
+
+class TestStrixRocm71Override:
+    """Verify install.sh skips Radeon repo and forces rocm7.2 for gfx1151/gfx1150
+    when ROCm 7.1 would otherwise be selected (known _grouped_mm segfault)."""
+
+    def test_strix_gfx_detection_in_install_sh(self):
+        """install.sh must detect gfx1151 and gfx1150 for the override."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        assert "gfx1151" in source and "gfx1150" in source
+
+    def test_rocm71_override_to_rocm72_in_install_sh(self):
+        """install.sh must override TORCH_INDEX_URL from rocm7.1 to rocm7.2 for Strix."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        # The override must explicitly reference rocm7.2 in context with Strix detection
+        assert "rocm7.2" in source
+        assert "_strix_gfx" in source
+
+    def test_radeon_repo_bypassed_for_strix_in_install_sh(self):
+        """install.sh must set _amd_gpu_radeon=false when Strix + ROCm 7.1 detected."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        # After Strix detection the Radeon repo flag must be disabled
+        assert "_amd_gpu_radeon=false" in source
+
+    def test_strix_override_warns_with_moe_utils_reference(self):
+        """install.sh must emit a [WARN] mentioning the moe_utils segfault."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        assert "moe_utils" in source or "_grouped_mm" in source
+
+    def test_strix_override_only_fires_on_rocm71(self):
+        """install.sh must scope the Strix override to rocm7.1 only (not rocm7.2+)."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        # The Strix guard must be inside a rocm7.1 case branch
+        strix_idx = source.find("_strix_gfx")
+        assert strix_idx != -1
+        # Look back for the rocm7.1 pattern within 600 chars before _strix_gfx
+        context_before = source[max(0, strix_idx - 600) : strix_idx]
+        assert "rocm7.1" in context_before
+
+    def test_torch_constraint_updated_for_rocm72(self):
+        """install.sh must update TORCH_CONSTRAINT to allow torch>=2.11 when forcing rocm7.2."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        # TORCH_CONSTRAINT must be set inside the Strix override block
+        assert "TORCH_CONSTRAINT" in source and "2.11" in source
+
+
+# =============================================================================
+# TEST: setup.sh -- gcc-install-dir fix for Ubuntu 24.04 + ROCm 7.x clang-20
+# =============================================================================
+
+
+class TestSetupShGccInstallDir:
+    """Verify setup.sh applies the --gcc-install-dir flag when building llama.cpp
+    with HIP on Ubuntu 24.04+ to work around ROCm 7.x clang-20 header path bug."""
+
+    def test_gcc_install_dir_search_loop_present(self):
+        """setup.sh must iterate gcc versions 14→11 to find one with C++ headers."""
+        source = _SETUP_SH_PATH.read_text(encoding = "utf-8")
+        assert "_GCC_INSTALL_DIR" in source
+        assert "/usr/lib/gcc/x86_64-linux-gnu" in source
+
+    def test_gcc_install_dir_checks_include_dir(self):
+        """setup.sh must check that the gcc dir has an 'include' subdirectory."""
+        source = _SETUP_SH_PATH.read_text(encoding = "utf-8")
+        assert "include" in source and "_GCC_INSTALL_DIR" in source
+
+    def test_gcc_install_dir_appended_to_cmake_hip_flags(self):
+        """setup.sh must pass --gcc-install-dir via CMAKE_HIP_FLAGS."""
+        source = _SETUP_SH_PATH.read_text(encoding = "utf-8")
+        assert "CMAKE_HIP_FLAGS" in source
+        assert "gcc-install-dir" in source
+
+    def test_gcc_install_dir_only_applied_in_hip_build_block(self):
+        """The --gcc-install-dir fix must only apply in the HIP/ROCm build branch."""
+        source = _SETUP_SH_PATH.read_text(encoding = "utf-8")
+        # GGML_HIP=ON must appear before gcc-install-dir in the source
+        hip_idx = source.find("GGML_HIP=ON")
+        gcc_idx = source.find("gcc-install-dir")
+        assert hip_idx != -1 and gcc_idx != -1
+        assert hip_idx < gcc_idx
+
+    def test_gcc_install_dir_logs_substep(self):
+        """setup.sh must print a substep when the gcc install dir is resolved."""
+        source = _SETUP_SH_PATH.read_text(encoding = "utf-8")
+        assert "gcc install dir" in source or "GCC_INSTALL_DIR" in source
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From f3ac63f05658b5d4514a805df1c73f96f8e3580f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 16 May 2026 20:34:30 +0000
Subject: [PATCH 096/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/install/test_rocm_support.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 7116b979de..822514eeb1 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2260,7 +2260,7 @@ def test_setup_rocm_step_uses_full_version(self):
 # =============================================================================
 
 _INSTALL_SH_PATH = PACKAGE_ROOT / "install.sh"
-_SETUP_SH_PATH   = PACKAGE_ROOT / "studio" / "setup.sh"
+_SETUP_SH_PATH = PACKAGE_ROOT / "studio" / "setup.sh"
 
 
 class TestStrixRocm71Override:

From cc36e4b5353d3e712941d0c4351f8d857e08c3cb Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 16:34:15 -0500
Subject: [PATCH 097/165] fix: BNB_ROCM_VERSION in server process +
 torch._C._distributed_c10d stubs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two errors visible in training logs on Windows ROCm:

1. Server process bitsandbytes crash:
   "Configured ROCm binary not found at libbitsandbytes_rocm713.dll"
   The installed BNB wheel ships rocm72.dll (not rocm713.dll). The
   training worker already sets BNB_ROCM_VERSION=72 via DLL detection
   but the server process (main.py) imported bitsandbytes before that
   ran. Fix: add the same DLL-scan + BNB_ROCM_VERSION assignment to
   main.py inside the existing win32 guard, before any downstream
   import can pull in bitsandbytes.

2. torch.distributed import failure:
   "No module named 'torch._C._distributed_c10d'; torch._C is not a package"
   torch._C is a C extension on Windows ROCm — Python cannot do
   submodule imports from it, so torch.distributed fails to import
   before our attribute stubs could ever run. Fix: inject empty
   ModuleType stubs for _distributed_c10d, _distributed_autograd and
   _distributed_rpc into sys.modules inside the win32 guard in
   hardware.py BEFORE importing torch.distributed, so the import
   succeeds and our attribute stubs take effect.

9 new tests in TestServerStartupRocmFixes; total 212 passed, 2 skipped
---
 studio/backend/main.py                    | 26 +++++++++
 studio/backend/utils/hardware/hardware.py | 21 +++++--
 tests/studio/install/test_rocm_support.py | 68 +++++++++++++++++++++++
 3 files changed, 111 insertions(+), 4 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 942cf8b2fc..609702ca2f 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -47,6 +47,32 @@ def _add_rocm_dll_dirs() -> None:
     _add_rocm_dll_dirs()
     del _add_rocm_dll_dirs
 
+    # ── Windows AMD ROCm: set BNB_ROCM_VERSION before any bitsandbytes import ─
+    # bitsandbytes on Windows ROCm tries to load libbitsandbytes_rocm<ver>.dll
+    # where <ver> comes from torch.version.hip (e.g. "7.13..." → "713").
+    # The installed BNB wheel ships rocm72.dll (not rocm713.dll), so without
+    # this the server process crashes with "Configured ROCm binary not found".
+    # Detect the available DLL, fall back to "72", and set BNB_ROCM_VERSION
+    # before any import that pulls in bitsandbytes (mirrors worker.py logic).
+    if "BNB_ROCM_VERSION" not in os.environ:
+        import glob as _glob
+        _bnb_rocm_ver = None
+        try:
+            import importlib.util as _ilu
+            _bnb_spec = _ilu.find_spec("bitsandbytes")
+            if _bnb_spec and _bnb_spec.origin:
+                _pkg_dir = os.path.dirname(_bnb_spec.origin)
+                _dlls = _glob.glob(os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll"))
+                import re as _re_bnb
+                for _dll in sorted(_dlls):
+                    _m = _re_bnb.search(r"libbitsandbytes_rocm(\d+)\.dll", _dll)
+                    if _m:
+                        _bnb_rocm_ver = _m.group(1)
+                        break
+        except Exception:
+            pass
+        os.environ["BNB_ROCM_VERSION"] = _bnb_rocm_ver or "72"
+
 # Ensure backend dir is on sys.path so _platform_compat is importable when
 # main.py is launched directly (e.g. `uvicorn main:app`).
 _backend_dir = str(_Path(__file__).parent)
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index d0fe3822b5..0968248679 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -946,10 +946,23 @@ def _load_config_for_gpu_estimate(model_name: str, hf_token: Optional[str] = Non
 def _determine_attention_impl_for_gpu_estimate(config) -> str:
     import copy as _copy
 
-    # torch.distributed is incomplete on Windows ROCm — it ships without the
-    # process-group helpers (is_initialized, is_available, etc.).
-    # resolve_attention_implementation (unsloth) calls is_initialized()
-    # unconditionally, so patch any missing attrs before importing it.
+    # torch.distributed is incomplete on Windows ROCm — torch._C is a C
+    # extension (not a package), so Python cannot import the submodule
+    # torch._C._distributed_c10d that torch.distributed depends on.
+    # Inject an empty stub into sys.modules BEFORE importing torch.distributed
+    # so the import succeeds, then patch the missing process-group helpers.
+    import sys as _sys
+    import types as _types
+
+    if _sys.platform == "win32":
+        for _c10d_name in (
+            "torch._C._distributed_c10d",
+            "torch._C._distributed_autograd",
+            "torch._C._distributed_rpc",
+        ):
+            if _c10d_name not in _sys.modules:
+                _sys.modules[_c10d_name] = _types.ModuleType(_c10d_name)
+
     try:
         import torch.distributed as _td
 
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 822514eeb1..b615d6d0d0 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2348,5 +2348,73 @@ def test_gcc_install_dir_logs_substep(self):
         assert "gcc install dir" in source or "GCC_INSTALL_DIR" in source
 
 
+# =============================================================================
+# TEST: main.py -- BNB_ROCM_VERSION server startup + distributed stubs
+# =============================================================================
+
+_MAIN_PY_PATH     = PACKAGE_ROOT / "studio" / "backend" / "main.py"
+_HARDWARE_PY_PATH = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+
+
+class TestServerStartupRocmFixes:
+    """Verify main.py sets BNB_ROCM_VERSION before any bitsandbytes import and
+    hardware.py injects torch._C._distributed_c10d stubs before torch.distributed."""
+
+    # ── BNB_ROCM_VERSION in server process ────────────────────────────────────
+
+    def test_main_py_sets_bnb_rocm_version(self):
+        """main.py must set BNB_ROCM_VERSION in the server process before imports."""
+        source = _MAIN_PY_PATH.read_text(encoding = "utf-8")
+        assert "BNB_ROCM_VERSION" in source
+
+    def test_main_py_bnb_detection_scoped_to_win32(self):
+        """main.py BNB_ROCM_VERSION logic must be inside the win32 platform guard."""
+        source = _MAIN_PY_PATH.read_text(encoding = "utf-8")
+        win32_idx = source.find('sys.platform == "win32"')
+        bnb_idx   = source.find("BNB_ROCM_VERSION")
+        assert win32_idx != -1 and bnb_idx != -1
+        assert win32_idx < bnb_idx
+
+    def test_main_py_bnb_dll_detection_uses_glob(self):
+        """main.py must scan for libbitsandbytes_rocm*.dll to find the right version."""
+        source = _MAIN_PY_PATH.read_text(encoding = "utf-8")
+        assert "libbitsandbytes_rocm" in source
+
+    def test_main_py_bnb_falls_back_to_72(self):
+        """main.py must fall back to BNB_ROCM_VERSION='72' when no DLL is found."""
+        source = _MAIN_PY_PATH.read_text(encoding = "utf-8")
+        assert '"72"' in source or "'72'" in source
+
+    def test_main_py_bnb_only_set_when_not_already_in_env(self):
+        """main.py must not override an existing BNB_ROCM_VERSION env var."""
+        source = _MAIN_PY_PATH.read_text(encoding = "utf-8")
+        assert '"BNB_ROCM_VERSION" not in os.environ' in source
+
+    # ── torch._C._distributed_c10d stubs in hardware.py ──────────────────────
+
+    def test_hardware_py_injects_distributed_c10d_stub(self):
+        """hardware.py must inject torch._C._distributed_c10d into sys.modules."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert "_distributed_c10d" in source
+
+    def test_hardware_py_stub_injected_before_distributed_import(self):
+        """The sys.modules stub must be injected BEFORE import torch.distributed."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        c10d_idx = source.find("_distributed_c10d")
+        dist_idx = source.find("import torch.distributed")
+        assert c10d_idx != -1 and dist_idx != -1
+        assert c10d_idx < dist_idx
+
+    def test_hardware_py_stub_uses_types_moduletype(self):
+        """hardware.py must create the stub with types.ModuleType."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert "ModuleType" in source
+
+    def test_hardware_py_stub_scoped_to_win32(self):
+        """hardware.py distributed stub injection must be gated on win32."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert 'platform == "win32"' in source or "win32" in source
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From f0ec030fbc9817a221dc9727dbd4f78077f6642c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 16 May 2026 21:34:59 +0000
Subject: [PATCH 098/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/main.py                    | 3 +++
 tests/studio/install/test_rocm_support.py | 8 +++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 609702ca2f..85e79069ed 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -56,14 +56,17 @@ def _add_rocm_dll_dirs() -> None:
     # before any import that pulls in bitsandbytes (mirrors worker.py logic).
     if "BNB_ROCM_VERSION" not in os.environ:
         import glob as _glob
+
         _bnb_rocm_ver = None
         try:
             import importlib.util as _ilu
+
             _bnb_spec = _ilu.find_spec("bitsandbytes")
             if _bnb_spec and _bnb_spec.origin:
                 _pkg_dir = os.path.dirname(_bnb_spec.origin)
                 _dlls = _glob.glob(os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll"))
                 import re as _re_bnb
+
                 for _dll in sorted(_dlls):
                     _m = _re_bnb.search(r"libbitsandbytes_rocm(\d+)\.dll", _dll)
                     if _m:
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index b615d6d0d0..7eeb58281a 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2352,8 +2352,10 @@ def test_gcc_install_dir_logs_substep(self):
 # TEST: main.py -- BNB_ROCM_VERSION server startup + distributed stubs
 # =============================================================================
 
-_MAIN_PY_PATH     = PACKAGE_ROOT / "studio" / "backend" / "main.py"
-_HARDWARE_PY_PATH = PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+_MAIN_PY_PATH = PACKAGE_ROOT / "studio" / "backend" / "main.py"
+_HARDWARE_PY_PATH = (
+    PACKAGE_ROOT / "studio" / "backend" / "utils" / "hardware" / "hardware.py"
+)
 
 
 class TestServerStartupRocmFixes:
@@ -2371,7 +2373,7 @@ def test_main_py_bnb_detection_scoped_to_win32(self):
         """main.py BNB_ROCM_VERSION logic must be inside the win32 platform guard."""
         source = _MAIN_PY_PATH.read_text(encoding = "utf-8")
         win32_idx = source.find('sys.platform == "win32"')
-        bnb_idx   = source.find("BNB_ROCM_VERSION")
+        bnb_idx = source.find("BNB_ROCM_VERSION")
         assert win32_idx != -1 and bnb_idx != -1
         assert win32_idx < bnb_idx
 

From 6831c2aed448bea1a21b5ab6b30a2e1954a63db3 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 16:43:38 -0500
Subject: [PATCH 099/165] fix(win32): populate distributed c10d stub with dummy
 symbols

torch.distributed tries to `from torch._C._distributed_c10d import
FakeProcessGroup` (and ProcessGroup, Work, Store, etc.).  The previous
empty ModuleType stub made those from-imports fail with an ImportError
("cannot import name ...") because the stub defined none of the names.

Populate every stub with a _Dummy class for each known symbol so the
import chain completes silently on Windows ROCm where torch._C is a
compiled extension and its _distributed_c10d submodule doesn't exist.

Adds four new tests in TestServerStartupRocmFixes covering FakeProcessGroup,
ProcessGroup, setattr population, and all three _distributed_* siblings.
---
 studio/backend/utils/hardware/hardware.py | 25 ++++++++++++++++++++++-
 tests/studio/install/test_rocm_support.py | 22 ++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 0968248679..6768f50b5c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -955,13 +955,36 @@ def _determine_attention_impl_for_gpu_estimate(config) -> str:
     import types as _types
 
     if _sys.platform == "win32":
+        # Dummy class for any name torch.distributed tries to import from these stubs
+        class _Dummy:
+            pass
+
         for _c10d_name in (
             "torch._C._distributed_c10d",
             "torch._C._distributed_autograd",
             "torch._C._distributed_rpc",
         ):
             if _c10d_name not in _sys.modules:
-                _sys.modules[_c10d_name] = _types.ModuleType(_c10d_name)
+                _stub = _types.ModuleType(_c10d_name)
+                # torch.distributed imports these names from _distributed_c10d;
+                # provide no-op dummies so the import doesn't raise AttributeError.
+                for _sym in (
+                    "FakeProcessGroup",
+                    "ProcessGroup",
+                    "Work",
+                    "Store",
+                    "PrefixStore",
+                    "FileStore",
+                    "TCPStore",
+                    "HashStore",
+                    "Reducer",
+                    "Logger",
+                    "DistributedDebugLevel",
+                    "GradBucket",
+                    "BuiltinCommHookType",
+                ):
+                    setattr(_stub, _sym, _Dummy)
+                _sys.modules[_c10d_name] = _stub
 
     try:
         import torch.distributed as _td
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 7eeb58281a..d602602c6c 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2417,6 +2417,28 @@ def test_hardware_py_stub_scoped_to_win32(self):
         source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
         assert 'platform == "win32"' in source or "win32" in source
 
+    def test_hardware_py_stub_exposes_fake_process_group(self):
+        """hardware.py stub must set FakeProcessGroup so torch.distributed doesn't raise AttributeError."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert "FakeProcessGroup" in source
+
+    def test_hardware_py_stub_exposes_process_group(self):
+        """hardware.py stub must set ProcessGroup on the c10d stub."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert "ProcessGroup" in source
+
+    def test_hardware_py_stub_uses_setattr_for_symbols(self):
+        """hardware.py must use setattr to populate stub symbols dynamically."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert "setattr" in source
+
+    def test_hardware_py_stub_all_c10d_siblings_covered(self):
+        """hardware.py must stub all three torch._C._distributed_* submodules."""
+        source = _HARDWARE_PY_PATH.read_text(encoding = "utf-8")
+        assert "_distributed_c10d" in source
+        assert "_distributed_autograd" in source
+        assert "_distributed_rpc" in source
+
 
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 39ae2e8b3dff5697c7be6646ca9f2994ac42f839 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 16:57:01 -0500
Subject: [PATCH 100/165] fix(win32): distinguish HIP SDK installed vs GPU not
 ROCm-accessible

Previously, when hipinfo was found but exited non-zero (e.g. "no
ROCm-capable device detected"), both install.ps1 and setup.ps1 fell
through to the WMI-label-only branch and printed "AMD GPU detected --
HIP SDK not found" -- factually wrong since the SDK binary is present.

Add $HipSdkInstalled flag (set true when hipinfo binary is found,
regardless of exit code). When HipSdkInstalled && !HasROCm:
- Show "AMD GPU detected -- not ROCm-accessible (HIP <ver>)" instead
- Explain this is a driver issue, not an SDK issue, with a link
- Still run hipconfig version capture so version shows in output
- CPU-only hint now says "GPU not ROCm-accessible" not "require HIP SDK"

Also applies to setup.ps1 (same detection block, same branches).

Adds TestHipSdkInstalledButDeviceInaccessible (11 tests).
---
 install.ps1                               | 21 ++++++-
 studio/setup.ps1                          | 18 +++++-
 tests/studio/install/test_rocm_support.py | 72 +++++++++++++++++++++++
 3 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 28a78d47a2..cc2cdbcee1 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1209,6 +1209,7 @@ shell.Run cmd, 0, False
     }
     # ── AMD ROCm detection (Windows) — mirrors setup.ps1 ──
     $HasROCm = $false
+    $HipSdkInstalled = $false   # HIP SDK binary found (independent of device accessibility)
     $ROCmGpuLabel = $null
     $ROCmVersion = $null
     $ROCmGfxArch = $null
@@ -1234,6 +1235,7 @@ shell.Run cmd, 0, False
             }
         }
         if ($hipinfoExe) {
+            $HipSdkInstalled = $true   # binary found → SDK is installed regardless of device state
             try {
                 $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
                 if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
@@ -1322,8 +1324,10 @@ shell.Run cmd, 0, False
                 }
             }
         }
-        # Capture ROCm version for wheel selection (hipconfig, then amd-smi)
-        if ($HasROCm) {
+        # Capture ROCm version for wheel selection (hipconfig, then amd-smi).
+        # Run whenever the HIP SDK binary is present, not just when the device is accessible --
+        # hipconfig --version works even when hipinfo reports no ROCm device (driver issue).
+        if ($HasROCm -or $HipSdkInstalled) {
             $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
             if (-not $hipConfigExe) {
                 $hipRoot = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { $null }
@@ -1369,6 +1373,15 @@ shell.Run cmd, 0, False
         $hipSdkPath = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { "on system PATH" }
         substep "HIP SDK: $hipSdkPath"
         if ($ROCmVersionFull) { substep "hipconfig: $ROCmVersionFull" }
+    } elseif ($HipSdkInstalled -and $ROCmGpuLabel) {
+        # HIP SDK is installed but ROCm can't see the device (driver issue, not SDK issue)
+        $sdkVer = if ($ROCmVersionFull) { " (HIP $ROCmVersionFull)" } else { "" }
+        step "gpu" "AMD GPU detected -- not ROCm-accessible$sdkVer" "Yellow"
+        substep "Detected: $ROCmGpuLabel" "Yellow"
+        substep "[WARN] HIP SDK is installed but hipinfo reports no ROCm-capable device." "Yellow"
+        substep "       This is a driver issue, not an SDK issue." "Yellow"
+        substep "       Ensure the ROCm compute driver is installed alongside the display driver:" "Yellow"
+        substep "       https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" "Yellow"
     } elseif ($ROCmGpuLabel) {
         step "gpu" "AMD GPU detected -- HIP SDK not found" "Yellow"
         substep "Detected: $ROCmGpuLabel" "Yellow"
@@ -1441,7 +1454,9 @@ shell.Run cmd, 0, False
     # ── Print CPU-only hint when no GPU detected ──
     if (-not $SkipTorch -and -not $ROCmIndexUrl -and $TorchIndexUrl -like "*/cpu") {
         Write-Host ""
-        if ($HasROCm -or $ROCmGpuLabel) {
+        if ($HipSdkInstalled -and -not $HasROCm) {
+            substep "Installing CPU-only PyTorch (HIP SDK found but GPU not ROCm-accessible)." "Yellow"
+        } elseif ($ROCmGpuLabel) {
             substep "Installing CPU-only PyTorch (ROCm wheels require the HIP SDK)." "Yellow"
         } else {
             substep "No NVIDIA GPU detected." "Yellow"
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 68bd6397aa..2772c94b11 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -684,6 +684,7 @@ if (-not $HasNvidiaSmi) {
 }
 # ── AMD ROCm detection (Windows): probe hipinfo/amd-smi for actual GPU ──
 $HasROCm = $false
+$HipSdkInstalled = $false   # HIP SDK binary found (independent of device accessibility)
 $ROCmGpuLabel = $null
 $script:ROCmGfxArch = $null
 if (-not $HasNvidiaSmi) {
@@ -708,6 +709,7 @@ if (-not $HasNvidiaSmi) {
         }
     }
     if ($hipinfoExe) {
+        $HipSdkInstalled = $true   # binary found → SDK is installed regardless of device state
         try {
             $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
             if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
@@ -807,8 +809,10 @@ if (-not $HasNvidiaSmi) {
             }
         }
     }
-    # Capture ROCm version early for display and wheel selection
-    if ($HasROCm) {
+    # Capture ROCm version early for display and wheel selection.
+    # Run whenever the HIP SDK binary is present, not just when the device is accessible --
+    # hipconfig --version works even when hipinfo reports no ROCm device (driver issue).
+    if ($HasROCm -or $HipSdkInstalled) {
         $script:ROCmVersion = $null
         $hipConfigExe = Get-Command hipconfig -ErrorAction SilentlyContinue
         if (-not $hipConfigExe) {
@@ -853,6 +857,16 @@ if ($HasNvidiaSmi) {
     $hipSdkPath = if ($env:HIP_PATH) { $env:HIP_PATH } elseif ($env:ROCM_PATH) { $env:ROCM_PATH } else { "on system PATH" }
     substep "HIP SDK: $hipSdkPath"
     if ($script:ROCmVersionFull) { substep "hipconfig: $script:ROCmVersionFull" }
+} elseif ($HipSdkInstalled -and $ROCmGpuLabel) {
+    # HIP SDK is installed but ROCm can't see the device (driver issue, not SDK issue)
+    $sdkVer = if ($script:ROCmVersionFull) { " (HIP $script:ROCmVersionFull)" } else { "" }
+    Write-Host ""
+    step "gpu" "AMD GPU detected -- not ROCm-accessible$sdkVer" "Yellow"
+    substep "Detected: $ROCmGpuLabel" "Yellow"
+    substep "[WARN] HIP SDK is installed but hipinfo reports no ROCm-capable device." "Yellow"
+    substep "       This is a driver issue, not an SDK issue." "Yellow"
+    substep "       Ensure the ROCm compute driver is installed alongside the display driver:" "Yellow"
+    substep "       https://rocm.docs.amd.com/en/latest/deploy/windows/index.html" "Yellow"
 } elseif ($ROCmGpuLabel) {
     Write-Host ""
     step "gpu" "AMD GPU detected -- HIP SDK not found" "Yellow"
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index d602602c6c..9f746e6096 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2440,5 +2440,77 @@ def test_hardware_py_stub_all_c10d_siblings_covered(self):
         assert "_distributed_rpc" in source
 
 
+# =============================================================================
+# TEST: install.ps1 / setup.ps1 -- HipSdkInstalled flag (SDK found, device inaccessible)
+# =============================================================================
+
+
+class TestHipSdkInstalledButDeviceInaccessible:
+    """Verify that when hipinfo is found but exits non-zero (device not ROCm-accessible),
+    both scripts distinguish this from 'HIP SDK not found' and emit the correct message."""
+
+    def test_install_ps1_has_hip_sdk_installed_flag(self):
+        """install.ps1 must track HipSdkInstalled separately from HasROCm."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HipSdkInstalled" in source
+
+    def test_setup_ps1_has_hip_sdk_installed_flag(self):
+        """setup.ps1 must track HipSdkInstalled separately from HasROCm."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HipSdkInstalled" in source
+
+    def test_install_ps1_sets_flag_when_hipinfo_binary_found(self):
+        """install.ps1 must set HipSdkInstalled=true inside the 'if ($hipinfoExe)' block."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        # HipSdkInstalled must be assigned inside the hipinfoExe block
+        hipinfo_block_idx = source.find("if ($hipinfoExe)")
+        sdk_flag_idx = source.find("$HipSdkInstalled = $true", hipinfo_block_idx)
+        assert hipinfo_block_idx != -1 and sdk_flag_idx != -1
+        assert sdk_flag_idx > hipinfo_block_idx
+
+    def test_setup_ps1_sets_flag_when_hipinfo_binary_found(self):
+        """setup.ps1 must set HipSdkInstalled=true inside the 'if ($hipinfoExe)' block."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        hipinfo_block_idx = source.find("if ($hipinfoExe)")
+        sdk_flag_idx = source.find("$HipSdkInstalled = $true", hipinfo_block_idx)
+        assert hipinfo_block_idx != -1 and sdk_flag_idx != -1
+        assert sdk_flag_idx > hipinfo_block_idx
+
+    def test_install_ps1_version_capture_runs_when_sdk_installed(self):
+        """install.ps1 must capture hipconfig version when HipSdkInstalled even if HasROCm is false."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HasROCm -or $HipSdkInstalled" in source or "$HipSdkInstalled" in source
+
+    def test_setup_ps1_version_capture_runs_when_sdk_installed(self):
+        """setup.ps1 must capture hipconfig version when HipSdkInstalled even if HasROCm is false."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "HasROCm -or $HipSdkInstalled" in source or "$HipSdkInstalled" in source
+
+    def test_install_ps1_distinct_message_for_sdk_found_but_device_inaccessible(self):
+        """install.ps1 must show 'not ROCm-accessible' message (not 'HIP SDK not found') when SDK present."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "not ROCm-accessible" in source
+
+    def test_setup_ps1_distinct_message_for_sdk_found_but_device_inaccessible(self):
+        """setup.ps1 must show 'not ROCm-accessible' message (not 'HIP SDK not found') when SDK present."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "not ROCm-accessible" in source
+
+    def test_install_ps1_driver_guidance_in_sdk_found_branch(self):
+        """install.ps1 must tell user this is a driver issue, not an SDK issue."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "driver issue" in source
+
+    def test_setup_ps1_driver_guidance_in_sdk_found_branch(self):
+        """setup.ps1 must tell user this is a driver issue, not an SDK issue."""
+        source = _SETUP_PS1_PATH.read_text(encoding = "utf-8")
+        assert "driver issue" in source
+
+    def test_install_ps1_cpu_hint_distinguishes_driver_vs_no_sdk(self):
+        """install.ps1 CPU-only hint must say 'GPU not ROCm-accessible' not 'require the HIP SDK' when SDK found."""
+        source = _INSTALL_PS1_PATH.read_text(encoding = "utf-8")
+        assert "GPU not ROCm-accessible" in source
+
+
 if __name__ == "__main__":
     pytest.main([__file__, "-v"])

From 4e75d42e4a9cecddcb6ca1a3a6b99d9c709d5f3b Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 17:08:57 -0500
Subject: [PATCH 101/165] fix(win32): scope ROCm workarounds to AMD hosts only

Three Codex-flagged issues in the Windows ROCm workarounds: two were
incorrectly applied on Windows CUDA (NVIDIA) machines, and one missed
ROCm hosts where the HIP SDK bin directory is not on PATH:

main.py (P1): BNB_ROCM_VERSION was set unconditionally on all win32
hosts. On NVIDIA, bitsandbytes sees BNB_ROCM_VERSION and looks for a
ROCm DLL that doesn't exist, breaking bitsandbytes initialisation.
Fix: gate the block on HIP_PATH/ROCM_PATH being present (ROCm hosts only).

worker.py (P2): torchao stubs were seeded for all win32 runs, shadowing
real torchao on Windows CUDA and silently disabling torchao quantization
for NVIDIA users. Fix: gate on HIP_PATH/ROCM_PATH (win32 ROCm only).

install_python_stack.py (P1): _detect_windows_gfx_arch() only checked
shutil.which("hipinfo"), skipping the HIP_PATH/ROCM_PATH fallback that
the PowerShell installers use. On installs where the HIP SDK bin dir is
not on PATH, _ensure_rocm_torch() returned early without installing
ROCm wheels or bitsandbytes. Fix: mirror the env-var fallback.
---
 studio/backend/core/training/worker.py |  8 +++++++-
 studio/backend/main.py                 |  6 +++++-
 studio/install_python_stack.py         | 16 +++++++++++++++-
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 4e9ac4ff83..75f9d56fa4 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1248,7 +1248,13 @@ def find_spec(self, fullname, path, target = None):
 
     sys.meta_path.append(_StubSubpackageFinder())
 
-    if sys.platform == "win32":
+    # Only stub torchao on Windows ROCm hosts -- on Windows CUDA (NVIDIA) torchao
+    # is real and shadowing it breaks torchao-based quantization paths.
+    # HIP_PATH / ROCM_PATH are set by the AMD HIP SDK installer on ROCm machines.
+    _is_win32_rocm = sys.platform == "win32" and bool(
+        os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH")
+    )
+    if _is_win32_rocm:
         # Seed torchao top-level + key submodules; the finder handles the rest.
         for _tao_name in (
             "torchao",
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 85e79069ed..b61e192cfc 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -54,7 +54,11 @@ def _add_rocm_dll_dirs() -> None:
     # this the server process crashes with "Configured ROCm binary not found".
     # Detect the available DLL, fall back to "72", and set BNB_ROCM_VERSION
     # before any import that pulls in bitsandbytes (mirrors worker.py logic).
-    if "BNB_ROCM_VERSION" not in os.environ:
+    # Guard: only set on ROCm hosts (HIP_PATH/ROCM_PATH present) -- setting
+    # BNB_ROCM_VERSION on a Windows CUDA machine makes bitsandbytes look for a
+    # ROCm DLL that doesn't exist and fail to initialise the CUDA backend.
+    _is_rocm_host = bool(os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH"))
+    if _is_rocm_host and "BNB_ROCM_VERSION" not in os.environ:
         import glob as _glob
 
         _bnb_rocm_ver = None
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index bd6589dd20..6929e5eab7 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -228,10 +228,24 @@ def _detect_rocm_version() -> tuple[int, int] | None:
 
 
 def _detect_windows_gfx_arch() -> str | None:
-    """Return the gcnArchName from hipinfo on Windows (e.g. 'gfx1200'), or None."""
+    """Return the gcnArchName from hipinfo on Windows (e.g. 'gfx1200'), or None.
+
+    Resolves hipinfo via PATH first, then HIP_PATH\\bin and ROCM_PATH\\bin as
+    fallbacks -- the AMD HIP SDK installer sets these env vars but does not
+    always add the bin dir to the system PATH.
+    """
     import re
 
     hipinfo = shutil.which("hipinfo")
+    if not hipinfo:
+        # Fallback: AMD HIP SDK sets HIP_PATH / ROCM_PATH even when bin isn't on PATH
+        for _env_var in ("HIP_PATH", "ROCM_PATH"):
+            _root = os.environ.get(_env_var)
+            if _root:
+                _candidate = os.path.join(_root, "bin", "hipinfo.exe")
+                if os.path.isfile(_candidate):
+                    hipinfo = _candidate
+                    break
     if not hipinfo:
         return None
     try:

From d89d9b1389cdaf827756da651c46d3ef60317322 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sat, 16 May 2026 17:55:31 -0500
Subject: [PATCH 102/165] fix(linux): route Strix + ROCm 7.1 to AMD
 arch-specific index

Instead of falling back to pytorch.org/rocm7.2, the Strix override now
routes to repo.amd.com/rocm/whl/gfx1151/ (or gfx1150/) which serves
torch 2.11.0+rocm7.13.0 -- AMD's build containing the actual _grouped_mm
kernel fix, verified on real gfx1151 hardware by h34v3nzc0dex.

This exercises the real GPU kernel path rather than the rocm7.2 workaround.
UNSLOTH_AMD_ROCM_MIRROR can override the base URL for air-gapped installs.

Also teaches _tauri_torch_index_family to recognise AMD arch-specific URLs
(repo.amd.com/rocm/whl/gfx*) and return the rocm7.13 family label so
_tauri_gpu_branch correctly classifies these installs as rocm.

Suggested by h34v3nzc0dex based on hardware-verified probe results.
---
 install.sh                                | 15 +++++++---
 tests/studio/install/test_rocm_support.py | 36 ++++++++++++++++-------
 2 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/install.sh b/install.sh
index a5767705c8..b27955a1f2 100755
--- a/install.sh
+++ b/install.sh
@@ -243,6 +243,9 @@ _tauri_torch_index_family() {
                 rocm[0-9]*.[0-9]*) echo "$_diag_family" ;;
                 *) echo "auto" ;;
             esac ;;
+        # AMD arch-specific index (e.g. repo.amd.com/rocm/whl/gfx1151/) --
+        # used for Strix Halo/Point where torch 2.11+rocm7.13 has the real fix.
+        *repo.amd.com/rocm/whl/gfx*|*rocm/whl/gfx*) echo "rocm7.13" ;;
         "") echo "none" ;;
         *) echo "auto" ;;
     esac
@@ -1765,12 +1768,16 @@ case "$TORCH_INDEX_URL" in
             echo "" >&2
             echo "  [WARN] $_strix_gfx (Strix) + ROCm 7.1 detected -- known _grouped_mm segfault" >&2
             echo "  [WARN] ROCm 7.1 wheels are broken for gfx1150/gfx1151 (moe_utils.py:167)" >&2
-            echo "  [WARN] Overriding to rocm7.2 PyTorch index to avoid the driver bug" >&2
-            echo "  [WARN] Upgrade ROCm to 7.2+ to silence this warning:" >&2
+            echo "  [WARN] Routing to AMD arch-specific index (torch 2.11+rocm7.13 has the real fix)" >&2
+            echo "  [WARN] Upgrade ROCm to 7.2+ to use the standard index:" >&2
             echo "  [WARN]   https://rocm.docs.amd.com/en/latest/deploy/linux/index.html" >&2
             echo "" >&2
-            _base="${UNSLOTH_PYTORCH_MIRROR:-https://download.pytorch.org/whl}"
-            TORCH_INDEX_URL="${_base%/}/rocm7.2"
+            # AMD's arch-specific index serves torch 2.11.0+rocm7.13.0 which has AMD's
+            # actual fix for the gfx1151/gfx1150 _grouped_mm kernel bug -- preferred
+            # over the pytorch.org rocm7.2 fallback because it exercises the real GPU
+            # kernel path. Set UNSLOTH_AMD_ROCM_MIRROR to override for air-gapped installs.
+            _amd_strix_base="${UNSLOTH_AMD_ROCM_MIRROR:-https://repo.amd.com/rocm/whl}"
+            TORCH_INDEX_URL="${_amd_strix_base%/}/${_strix_gfx}/"
             TORCH_CONSTRAINT="torch>=2.11.0,<2.12.0"
             _amd_gpu_radeon=false
         fi
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 9f746e6096..7011abc451 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2264,25 +2264,30 @@ def test_setup_rocm_step_uses_full_version(self):
 
 
 class TestStrixRocm71Override:
-    """Verify install.sh skips Radeon repo and forces rocm7.2 for gfx1151/gfx1150
-    when ROCm 7.1 would otherwise be selected (known _grouped_mm segfault)."""
+    """Verify install.sh skips Radeon repo and routes to AMD arch-specific index
+    for gfx1151/gfx1150 when ROCm 7.1 would otherwise be selected (known _grouped_mm segfault).
+    AMD's repo.amd.com/rocm/whl/gfx1151/ serves torch 2.11+rocm7.13 which has the real fix."""
 
     def test_strix_gfx_detection_in_install_sh(self):
         """install.sh must detect gfx1151 and gfx1150 for the override."""
         source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
         assert "gfx1151" in source and "gfx1150" in source
 
-    def test_rocm71_override_to_rocm72_in_install_sh(self):
-        """install.sh must override TORCH_INDEX_URL from rocm7.1 to rocm7.2 for Strix."""
+    def test_rocm71_override_to_amd_arch_index_in_install_sh(self):
+        """install.sh must override TORCH_INDEX_URL to AMD arch-specific index for Strix."""
         source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
-        # The override must explicitly reference rocm7.2 in context with Strix detection
-        assert "rocm7.2" in source
+        # The override must route to AMD's arch-specific index (repo.amd.com/rocm/whl)
+        assert "repo.amd.com/rocm/whl" in source
         assert "_strix_gfx" in source
+        # The URL must incorporate the detected gfx arch so gfx1151 → .../gfx1151/
+        strix_idx = source.find("_amd_strix_base")
+        assert strix_idx != -1
+        ctx = source[strix_idx : strix_idx + 200]
+        assert "_strix_gfx" in ctx
 
     def test_radeon_repo_bypassed_for_strix_in_install_sh(self):
         """install.sh must set _amd_gpu_radeon=false when Strix + ROCm 7.1 detected."""
         source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
-        # After Strix detection the Radeon repo flag must be disabled
         assert "_amd_gpu_radeon=false" in source
 
     def test_strix_override_warns_with_moe_utils_reference(self):
@@ -2293,19 +2298,28 @@ def test_strix_override_warns_with_moe_utils_reference(self):
     def test_strix_override_only_fires_on_rocm71(self):
         """install.sh must scope the Strix override to rocm7.1 only (not rocm7.2+)."""
         source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
-        # The Strix guard must be inside a rocm7.1 case branch
         strix_idx = source.find("_strix_gfx")
         assert strix_idx != -1
         # Look back for the rocm7.1 pattern within 600 chars before _strix_gfx
         context_before = source[max(0, strix_idx - 600) : strix_idx]
         assert "rocm7.1" in context_before
 
-    def test_torch_constraint_updated_for_rocm72(self):
-        """install.sh must update TORCH_CONSTRAINT to allow torch>=2.11 when forcing rocm7.2."""
+    def test_torch_constraint_updated_for_strix_amd_index(self):
+        """install.sh must set TORCH_CONSTRAINT>=2.11 when routing Strix to AMD index."""
         source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
-        # TORCH_CONSTRAINT must be set inside the Strix override block
         assert "TORCH_CONSTRAINT" in source and "2.11" in source
 
+    def test_amd_rocm_mirror_env_var_respected(self):
+        """install.sh must honour UNSLOTH_AMD_ROCM_MIRROR for air-gapped installs."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        assert "UNSLOTH_AMD_ROCM_MIRROR" in source
+
+    def test_tauri_family_recognises_amd_arch_url(self):
+        """_tauri_torch_index_family must return a rocm* family for AMD arch-specific URLs."""
+        source = _INSTALL_SH_PATH.read_text(encoding = "utf-8")
+        # The function must have a case branch for repo.amd.com/rocm/whl/gfx* URLs
+        assert "rocm/whl/gfx" in source
+
 
 # =============================================================================
 # TEST: setup.sh -- gcc-install-dir fix for Ubuntu 24.04 + ROCm 7.x clang-20

From 0c2020d5a11e7faaf3752a0b16f09e4449e42e11 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 19 May 2026 07:59:55 +0000
Subject: [PATCH 103/165] fix(studio/rocm): gate ROCm-only side-effects on
 active torch runtime

Address five edge cases flagged during PR review:

1. studio/backend/main.py: BNB_ROCM_VERSION was set whenever HIP_PATH or
   ROCM_PATH was present in the environment. A Windows CUDA user who once
   installed the HIP SDK and reverted to a CUDA torch wheel still has those
   env vars set, so bitsandbytes would try to load libbitsandbytes_rocm72.dll
   against a CUDA torch and crash. Now probe torch.version.hip inside the
   env-var guard (worker.py already does this).

2. studio/backend/main.py: the handles returned by os.add_dll_directory
   were discarded. The CPython docs describe removal through close() on the
   returned object (or a with statement) and do not promise that the entry
   outlives a discarded handle, so retain the handles in a module-level
   _ROCM_DLL_HANDLES list so they survive for the process lifetime.

3. studio/install_python_stack.py: _install_bnb_windows_rocm() returned None
   regardless of pip_install_try outcome, and the caller flipped
   _rocm_windows_torch_installed to True unconditionally. On a failed BNB
   install the post-install "manual install may be required" warning was
   suppressed and the user was misled. Helper now returns bool; caller gates
   on it.

4. studio/install_python_stack.py: _detect_windows_gfx_arch returned the raw
   capture group, so mixed-case hipinfo output ("Gfx1151") missed the
   lowercase keys in _GFX_TO_AMD_INDEX_ARCH and silently fell back to CPU
   torch. Lowercase the token.

5. studio/install_python_stack.py: UNSLOTH_ROCM_TORCH_INSTALLED=1 early-
   return trusted the env var even when the venv was wiped between runs.
   Subprocess-probe torch importability first; fall through to the full
   install path if the probe fails.

Tests: 231 passed, 1 skipped in tests/studio/install/test_rocm_support.py
(adds one new test for case 5 fall-through).
---
 studio/backend/main.py                    | 22 ++++++--
 studio/install_python_stack.py            | 62 +++++++++++++++++------
 tests/studio/install/test_rocm_support.py | 33 ++++++++++--
 3 files changed, 94 insertions(+), 23 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 30ffb6d353..488ce936ff 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -17,6 +17,10 @@
 # os.add_dll_directory() so amdhip64.dll etc. are found before any torch import.
 if sys.platform == "win32":
 
+    # Retained at module scope -- os.add_dll_directory returns a handle that
+    # removes the search-path entry when garbage collected.
+    _ROCM_DLL_HANDLES: list = []
+
     def _add_rocm_dll_dirs() -> None:
         candidates = []
         # 1. HIP_PATH / ROCM_PATH -- set by the AMD HIP SDK installer
@@ -40,7 +44,7 @@ def _add_rocm_dll_dirs() -> None:
         for _d in candidates:
             if os.path.isdir(_d):
                 try:
-                    os.add_dll_directory(_d)
+                    _ROCM_DLL_HANDLES.append(os.add_dll_directory(_d))
                 except (OSError, AttributeError):
                     pass
 
@@ -54,10 +58,18 @@ def _add_rocm_dll_dirs() -> None:
     # this the server process crashes with "Configured ROCm binary not found".
     # Detect the available DLL, fall back to "72", and set BNB_ROCM_VERSION
     # before any import that pulls in bitsandbytes (mirrors worker.py logic).
-    # Guard: only set on ROCm hosts (HIP_PATH/ROCM_PATH present) -- setting
-    # BNB_ROCM_VERSION on a Windows CUDA machine makes bitsandbytes look for a
-    # ROCm DLL that doesn't exist and fail to initialise the CUDA backend.
-    _is_rocm_host = bool(os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH"))
+    # Gate on the active torch runtime, not env-var presence -- HIP_PATH /
+    # ROCM_PATH stay set after a user installs the HIP SDK and reverts to a
+    # CUDA torch wheel, and setting BNB_ROCM_VERSION there makes bitsandbytes
+    # look for a ROCm DLL that doesn't exist and crash the CUDA backend.
+    _is_rocm_host = False
+    if os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH"):
+        try:
+            import torch as _torch_probe
+            _is_rocm_host = bool(getattr(_torch_probe.version, "hip", None))
+            del _torch_probe
+        except Exception:
+            pass
     if _is_rocm_host and "BNB_ROCM_VERSION" not in os.environ:
         import glob as _glob
 
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 6929e5eab7..297cd126da 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -259,7 +259,10 @@ def _detect_windows_gfx_arch() -> str | None:
             return None
         text = result.stdout.decode(errors = "replace")
         m = re.search(r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text)
-        return m.group(1).strip() if m else None
+        # Lowercase the captured token -- some hipinfo builds emit "Gfx1151"
+        # which would miss the lowercase keys in _GFX_TO_AMD_INDEX_ARCH and
+        # silently fall back to CPU torch.
+        return m.group(1).strip().lower() if m else None
     except Exception:
         return None
 
@@ -384,8 +387,8 @@ def _detect_amd_gfx_codes() -> list[str]:
 _rocm_windows_torch_installed: bool = False
 
 
-def _install_bnb_windows_rocm() -> None:
-    """Install the AMD Windows BNB prerelease wheel.
+def _install_bnb_windows_rocm() -> bool:
+    """Install the AMD Windows BNB prerelease wheel. Returns True on success.
 
     The continuous-release wheel is intentionally mismatched: the filename
     encodes version 1.33.7.preview (parsed as 1.33.7rc0 by PEP 440) while the
@@ -395,11 +398,11 @@ def _install_bnb_windows_rocm() -> None:
     """
     _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
     if _bnb_win_url is None:
-        return
+        return False
     _prev = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
     os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
     try:
-        pip_install_try(
+        _ok = pip_install_try(
             "bitsandbytes (AMD Windows, pre-release main)",
             "--force-reinstall",
             "--no-cache-dir",
@@ -412,6 +415,8 @@ def _install_bnb_windows_rocm() -> None:
             os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
         else:
             os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = _prev
+    if not _ok:
+        return False
     # After install: detect the actual ROCm DLL suffix from the wheel so any
     # post-install BNB import in this process loads the correct DLL.
     # The worker subprocess does the same detection independently (worker.py §1f).
@@ -419,6 +424,7 @@ def _install_bnb_windows_rocm() -> None:
     if "BNB_ROCM_VERSION" not in os.environ:
         _ver = _detect_bnb_rocm_dll_ver() or "72"
         os.environ["BNB_ROCM_VERSION"] = _ver
+    return True
 
 
 def _ensure_rocm_torch() -> None:
@@ -431,14 +437,38 @@ def _ensure_rocm_torch() -> None:
     Uses pip_install() to respect uv, constraints, and --python targeting.
     """
     global _rocm_windows_torch_installed
-    # setup.ps1 sets this when it already installed AMD wheels; skip the probe.
+    # setup.ps1 sets this when it already installed AMD wheels; skip the probe
+    # only when torch is actually importable as ROCm. If the venv was wiped
+    # between runs, the stale env-var would suppress a needed reinstall.
     if os.environ.get("UNSLOTH_ROCM_TORCH_INSTALLED") == "1":
-        _rocm_windows_torch_installed = True
-        # setup.ps1 already installed ROCm torch, but we still need to install
-        # the AMD Windows BNB wheel here — the PyPI bitsandbytes wheel ships
-        # only CUDA DLLs and will fail to load on ROCm (no libbitsandbytes_rocm72.dll).
-        _install_bnb_windows_rocm()
-        return
+        _torch_ok = False
+        try:
+            _probe = subprocess.run(
+                [
+                    sys.executable,
+                    "-c",
+                    (
+                        "import torch; "
+                        "hip=getattr(torch.version,'hip','') or ''; "
+                        "import sys; "
+                        "sys.exit(0 if (hip or 'rocm' in torch.__version__.lower()) else 1)"
+                    ),
+                ],
+                stdout = subprocess.DEVNULL,
+                stderr = subprocess.DEVNULL,
+                timeout = 30,
+            )
+            _torch_ok = _probe.returncode == 0
+        except (OSError, subprocess.TimeoutExpired):
+            pass
+        if _torch_ok:
+            _rocm_windows_torch_installed = True
+            # setup.ps1 already installed ROCm torch, but we still need to install
+            # the AMD Windows BNB wheel here -- the PyPI bitsandbytes wheel ships
+            # only CUDA DLLs and will fail to load on ROCm.
+            _install_bnb_windows_rocm()
+            return
+        # torch was wiped between runs; fall through to the full install path
     if IS_MACOS:
         return
 
@@ -488,11 +518,13 @@ def _ensure_rocm_torch() -> None:
                 "torchaudio",
                 constrain = False,
             )
-        # Always install AMD Windows bitsandbytes — the PyPI wheel ships only
+        # Always install AMD Windows bitsandbytes -- the PyPI wheel ships only
         # CUDA DLLs and will fail to load on ROCm.  Install even when torch was
         # already a ROCm build so that `studio update` repairs a broken bnb.
-        _install_bnb_windows_rocm()
-        _rocm_windows_torch_installed = True
+        # Only flip the success flag when the install actually succeeds; otherwise
+        # the post-install "manual install may be required" warning is suppressed.
+        if _install_bnb_windows_rocm():
+            _rocm_windows_torch_installed = True
         return
 
     # ── Linux x86_64 only: PyTorch ROCm wheels are not published for aarch64 ──
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 7011abc451..035a9ea3c6 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1788,11 +1788,19 @@ def test_returns_none_when_only_cuda_dlls(self, tmp_path):
 class TestRocmTorchInstalledEnvVar:
     """Verify UNSLOTH_ROCM_TORCH_INSTALLED=1 skips main install but still installs BNB."""
 
+    @staticmethod
+    def _ok_torch_probe(*a, **kw):
+        # subprocess.run probe returns 0 when torch imports as ROCm
+        rv = MagicMock()
+        rv.returncode = 0
+        return rv
+
     @patch.object(stack_mod, "_install_bnb_windows_rocm")
     @patch.object(stack_mod, "pip_install")
     def test_env_var_skips_main_pip_install(self, mock_pip, mock_bnb):
         """UNSLOTH_ROCM_TORCH_INSTALLED=1 should not trigger torch pip_install."""
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}):
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
+             patch.object(stack_mod.subprocess, "run", side_effect=self._ok_torch_probe):
             stack_mod._ensure_rocm_torch()
         mock_pip.assert_not_called()
 
@@ -1800,7 +1808,8 @@ def test_env_var_skips_main_pip_install(self, mock_pip, mock_bnb):
     @patch.object(stack_mod, "pip_install")
     def test_env_var_calls_bnb_install(self, mock_pip, mock_bnb):
         """UNSLOTH_ROCM_TORCH_INSTALLED=1 should still call _install_bnb_windows_rocm."""
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}):
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
+             patch.object(stack_mod.subprocess, "run", side_effect=self._ok_torch_probe):
             stack_mod._ensure_rocm_torch()
         mock_bnb.assert_called_once()
 
@@ -1809,10 +1818,28 @@ def test_env_var_calls_bnb_install(self, mock_pip, mock_bnb):
     def test_env_var_sets_rocm_windows_flag(self, mock_pip, mock_bnb):
         """UNSLOTH_ROCM_TORCH_INSTALLED=1 should set _rocm_windows_torch_installed."""
         stack_mod._rocm_windows_torch_installed = False
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}):
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
+             patch.object(stack_mod.subprocess, "run", side_effect=self._ok_torch_probe):
             stack_mod._ensure_rocm_torch()
         assert stack_mod._rocm_windows_torch_installed is True
 
+    @patch.object(stack_mod, "_install_bnb_windows_rocm")
+    @patch.object(stack_mod, "pip_install")
+    def test_env_var_falls_through_when_torch_missing(self, mock_pip, mock_bnb):
+        """If the venv was wiped between runs, the stale env-var must not suppress reinstall."""
+        stack_mod._rocm_windows_torch_installed = False
+        def _bad_probe(*a, **kw):
+            rv = MagicMock()
+            rv.returncode = 1
+            return rv
+        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
+             patch.object(stack_mod.subprocess, "run", side_effect=_bad_probe), \
+             patch.object(stack_mod, "IS_WINDOWS", False), \
+             patch.object(stack_mod, "IS_MACOS", True):
+            stack_mod._ensure_rocm_torch()
+        # macOS branch is the next exit -- but the point is the early-return did NOT fire.
+        mock_bnb.assert_not_called()
+
 
 # =============================================================================
 # TEST: worker.py -- Windows ROCm patches (source-level checks)

From 0b0b8df4b996b595b94fcb9d8805e31fc9d613ca Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 08:00:11 +0000
Subject: [PATCH 104/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/main.py                    |  2 +-
 tests/studio/install/test_rocm_support.py | 30 +++++++++++++++--------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 488ce936ff..5d1670c182 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -16,7 +16,6 @@
 # Python 3.8+ ignores PATH for extension modules; register ROCm bin dirs with
 # os.add_dll_directory() so amdhip64.dll etc. are found before any torch import.
 if sys.platform == "win32":
-
     # Retained at module scope -- os.add_dll_directory returns a handle that
     # removes the search-path entry when garbage collected.
     _ROCM_DLL_HANDLES: list = []
@@ -66,6 +65,7 @@ def _add_rocm_dll_dirs() -> None:
     if os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH"):
         try:
             import torch as _torch_probe
+
             _is_rocm_host = bool(getattr(_torch_probe.version, "hip", None))
             del _torch_probe
         except Exception:
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 035a9ea3c6..bf6ab2a9bc 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1799,8 +1799,10 @@ def _ok_torch_probe(*a, **kw):
     @patch.object(stack_mod, "pip_install")
     def test_env_var_skips_main_pip_install(self, mock_pip, mock_bnb):
         """UNSLOTH_ROCM_TORCH_INSTALLED=1 should not trigger torch pip_install."""
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
-             patch.object(stack_mod.subprocess, "run", side_effect=self._ok_torch_probe):
+        with (
+            patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}),
+            patch.object(stack_mod.subprocess, "run", side_effect = self._ok_torch_probe),
+        ):
             stack_mod._ensure_rocm_torch()
         mock_pip.assert_not_called()
 
@@ -1808,8 +1810,10 @@ def test_env_var_skips_main_pip_install(self, mock_pip, mock_bnb):
     @patch.object(stack_mod, "pip_install")
     def test_env_var_calls_bnb_install(self, mock_pip, mock_bnb):
         """UNSLOTH_ROCM_TORCH_INSTALLED=1 should still call _install_bnb_windows_rocm."""
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
-             patch.object(stack_mod.subprocess, "run", side_effect=self._ok_torch_probe):
+        with (
+            patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}),
+            patch.object(stack_mod.subprocess, "run", side_effect = self._ok_torch_probe),
+        ):
             stack_mod._ensure_rocm_torch()
         mock_bnb.assert_called_once()
 
@@ -1818,8 +1822,10 @@ def test_env_var_calls_bnb_install(self, mock_pip, mock_bnb):
     def test_env_var_sets_rocm_windows_flag(self, mock_pip, mock_bnb):
         """UNSLOTH_ROCM_TORCH_INSTALLED=1 should set _rocm_windows_torch_installed."""
         stack_mod._rocm_windows_torch_installed = False
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
-             patch.object(stack_mod.subprocess, "run", side_effect=self._ok_torch_probe):
+        with (
+            patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}),
+            patch.object(stack_mod.subprocess, "run", side_effect = self._ok_torch_probe),
+        ):
             stack_mod._ensure_rocm_torch()
         assert stack_mod._rocm_windows_torch_installed is True
 
@@ -1828,14 +1834,18 @@ def test_env_var_sets_rocm_windows_flag(self, mock_pip, mock_bnb):
     def test_env_var_falls_through_when_torch_missing(self, mock_pip, mock_bnb):
         """If the venv was wiped between runs, the stale env-var must not suppress reinstall."""
         stack_mod._rocm_windows_torch_installed = False
+
         def _bad_probe(*a, **kw):
             rv = MagicMock()
             rv.returncode = 1
             return rv
-        with patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}), \
-             patch.object(stack_mod.subprocess, "run", side_effect=_bad_probe), \
-             patch.object(stack_mod, "IS_WINDOWS", False), \
-             patch.object(stack_mod, "IS_MACOS", True):
+
+        with (
+            patch.dict(os.environ, {"UNSLOTH_ROCM_TORCH_INSTALLED": "1"}),
+            patch.object(stack_mod.subprocess, "run", side_effect = _bad_probe),
+            patch.object(stack_mod, "IS_WINDOWS", False),
+            patch.object(stack_mod, "IS_MACOS", True),
+        ):
             stack_mod._ensure_rocm_torch()
         # macOS branch is the next exit -- but the point is the early-return did NOT fire.
         mock_bnb.assert_not_called()

From 76137b2d8278c83f695ecdbbffb9f82e8c88f992 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 19 May 2026 09:08:49 +0000
Subject: [PATCH 105/165] fix(studio/rocm): worker.py parity + don't roll back
 ROCm torch on bnb failure

Addresses findings from a 10x reviewer pass on the prior fix commit:

1. studio/backend/core/training/worker.py (parity with main.py):
   - Gate the torchao stub block on torch.version.hip / 'rocm' in
     torch.__version__ instead of HIP_PATH / ROCM_PATH env-var presence.
     Same root cause as main.py: HIP SDK env vars stick around on CUDA hosts.
   - Add module-level Windows ROCm DLL registration block. Worker subprocesses
     inherit env vars but not the parent's add_dll_directory handles, so the
     first `import torch` in the worker could fail to find amdhip64.dll when
     HIP_PATH\bin is not on PATH. Mirrors main.py setup. Handles retained at
     module scope via _ROCM_DLL_HANDLES.
   - Promote _WINDOWS_ROCM_GROUPED_MM_LIB to module scope with `global` in
     run_training_process so the torch.library.Library registration survives
     past function return / mid-run garbage collection.
   - Harden _torch_has_hip() to also accept 'rocm' in torch.__version__
     (AMD SDK / Radeon wheels may not set torch.version.hip).

2. studio/install_python_stack.py:
   - Don't roll back ROCm torch when bitsandbytes install fails. The prior
     commit gated _rocm_windows_torch_installed on _install_bnb_windows_rocm()
     returning True; if torch installed successfully but bnb failed, the flag
     stayed False and later install steps could overwrite ROCm torch with the
     generic CPU torch wheel. Set the flag after torch install; surface bnb
     failure as a separate warning instead.
   - _detect_windows_gfx_arch now probes in three tiers: UNSLOTH_ROCM_GFX_ARCH
     env-var override (matches the PowerShell installer), then hipinfo (PATH
     or HIP_PATH\bin), then amd-smi (`static --asic`, `list`). Without the
     amd-smi fallback, runtime-only Radeon installs without hipinfo on PATH
     made `studio update` return early and leave the venv on CPU torch.
   - Linux torch-already-rocm probe in _ensure_rocm_torch now matches the
     Windows probe shape: accepts torch.version.hip OR 'rocm' in
     torch.__version__ to cover AMD SDK / Radeon Linux wheels.

3. studio/backend/utils/hardware/hardware.py:
   - apply_gpu_ids() final-fallback torch probe accepts 'rocm' in
     torch.__version__ in addition to torch.version.hip, matching
     detect_hardware(). AMD SDK wheels could otherwise leak through with
     CUDA-only visibility masks on a spawned ROCm worker.

Tests: 231 passed, 1 skipped in tests/studio/install/test_rocm_support.py.
No test changes were needed: the probe still prints the HIP version (or a
'rocm' sentinel), which preserves the existing non-empty-string contract.

Not addressed in this commit (deferred or out of scope):
- Tag drift / lemonade checksum (PR 5303 surface, not this PR).
- install.sh rocm7.2.1 URL: small fix, separate.
- install.ps1 / setup.ps1 'Radeon 8060S' marketing-name fallback table.
- Strix Halo + ROCm 7.1 routing asymmetry in Python update path.
---
 studio/backend/core/training/worker.py    | 71 +++++++++++++++--
 studio/backend/utils/hardware/hardware.py |  9 ++-
 studio/install_python_stack.py            | 96 ++++++++++++++++-------
 3 files changed, 138 insertions(+), 38 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 302da663fe..cea3cd480c 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -70,6 +70,43 @@ def _output_dir_from_resume_checkpoint(
 _TVM_FFI_BROKEN_VERSIONS = ("0.1.10", "0.1.11")
 _FAST_PATH_HOOKS_SKIP_ENV = "UNSLOTH_STUDIO_SKIP_FAST_PATH_HOOKS"
 
+# Module-level handle so the torch.library.Library registration survives past
+# run_training_process() and is not garbage collected mid-run.
+_WINDOWS_ROCM_GROUPED_MM_LIB = None
+
+# Worker subprocesses inherit the parent env but not the parent's
+# os.add_dll_directory registrations. Replicate main.py's Windows ROCm DLL
+# setup at module load so the first `import torch` can find amdhip64.dll even
+# when HIP_PATH\bin is not on the system PATH. Handles retained at module
+# scope so they are not garbage collected.
+_ROCM_DLL_HANDLES: list = []
+if sys.platform == "win32":
+    def _add_rocm_dll_dirs_worker() -> None:
+        _candidates: list[str] = []
+        for _var in ("HIP_PATH", "ROCM_PATH"):
+            _val = os.environ.get(_var)
+            if _val:
+                _candidates.append(os.path.join(_val, "bin"))
+        _default_root = os.path.join(
+            os.environ.get("ProgramFiles", r"C:\Program Files"), "AMD", "ROCm"
+        )
+        try:
+            if os.path.isdir(_default_root):
+                for _ver in sorted(os.listdir(_default_root), reverse = True):
+                    _bin = os.path.join(_default_root, _ver, "bin")
+                    if os.path.isdir(_bin):
+                        _candidates.append(_bin)
+        except OSError:
+            pass
+        for _d in _candidates:
+            if os.path.isdir(_d):
+                try:
+                    _ROCM_DLL_HANDLES.append(os.add_dll_directory(_d))
+                except (OSError, AttributeError):
+                    pass
+    _add_rocm_dll_dirs_worker()
+    del _add_rocm_dll_dirs_worker
+
 
 def _model_wants_causal_conv1d(model_name: str) -> bool:
     name = model_name.lower()
@@ -607,11 +644,18 @@ def _tilelang_importable() -> bool:
 
 
 def _torch_has_hip() -> bool:
-    """True iff torch is a ROCm build; `torch.version.hip` is the only reliable signal on x86_64 ROCm."""
+    """True iff torch is a ROCm build.
+
+    `torch.version.hip` covers official PyTorch ROCm wheels; AMD SDK / Radeon
+    wheels can leave it unset but still encode "rocm" in `torch.__version__`.
+    """
     try:
         import torch as _torch
 
-        return getattr(_torch.version, "hip", None) is not None
+        return bool(
+            getattr(_torch.version, "hip", None)
+            or "rocm" in getattr(_torch, "__version__", "").lower()
+        )
     except Exception:
         return False
 
@@ -1916,10 +1960,21 @@ def find_spec(self, fullname, path, target = None):
 
     # Only stub torchao on Windows ROCm hosts -- on Windows CUDA (NVIDIA) torchao
     # is real and shadowing it breaks torchao-based quantization paths.
-    # HIP_PATH / ROCM_PATH are set by the AMD HIP SDK installer on ROCm machines.
-    _is_win32_rocm = sys.platform == "win32" and bool(
-        os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH")
-    )
+    # Gate on the active torch runtime, not env-var presence -- HIP_PATH /
+    # ROCM_PATH stay set after a user installs the HIP SDK and reverts to a
+    # CUDA torch wheel. AMD SDK / Radeon ROCm wheels may not set torch.version.hip
+    # but still encode "rocm" in torch.__version__, so accept either.
+    _is_win32_rocm = False
+    if sys.platform == "win32":
+        try:
+            import torch as _torch_probe
+            _is_win32_rocm = bool(
+                getattr(getattr(_torch_probe, "version", None), "hip", None)
+                or "rocm" in getattr(_torch_probe, "__version__", "").lower()
+            )
+            del _torch_probe
+        except Exception:
+            pass
     if _is_win32_rocm:
         # Seed torchao top-level + key submodules; the finder handles the rest.
         for _tao_name in (
@@ -1984,7 +2039,9 @@ def find_spec(self, fullname, path, target = None):
     #   offs: optional group-split offsets (MoE-style variable-size batches)
     #
     # torch is already in sys.modules from section 1e's `import torch.distributed`.
-    _WINDOWS_ROCM_GROUPED_MM_LIB = None  # kept alive to prevent GC of registration
+    # Module-level _WINDOWS_ROCM_GROUPED_MM_LIB keeps the registration alive past
+    # function return / mid-run GC.
+    global _WINDOWS_ROCM_GROUPED_MM_LIB
     if sys.platform == "win32":
         _torch_for_rocm = sys.modules.get("torch")
         if _torch_for_rocm is not None and getattr(
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 6768f50b5c..09c097a687 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1694,14 +1694,19 @@ def apply_gpu_ids(gpu_ids) -> None:
     _is_rocm = IS_ROCM or _inherits_rocm_visibility
     if not _is_rocm:
         # torch.version.hip is a non-empty string on ROCm, None on CUDA.
+        # AMD SDK / Radeon ROCm wheels can leave torch.version.hip unset but
+        # still encode "rocm" in torch.__version__, matching detect_hardware().
         # Broad except: a probe failure must never crash a training worker.
         try:
             import torch as _torch
 
-            _is_rocm = getattr(_torch.version, "hip", None) is not None
+            _is_rocm = (
+                getattr(_torch.version, "hip", None) is not None
+                or "rocm" in getattr(_torch, "__version__", "").lower()
+            )
         except Exception as e:
             logger.debug(
-                "apply_gpu_ids: torch.version.hip probe skipped (%s: %s)",
+                "apply_gpu_ids: torch ROCm probe skipped (%s: %s)",
                 type(e).__name__,
                 e,
             )
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 297cd126da..139c496793 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -228,17 +228,23 @@ def _detect_rocm_version() -> tuple[int, int] | None:
 
 
 def _detect_windows_gfx_arch() -> str | None:
-    """Return the gcnArchName from hipinfo on Windows (e.g. 'gfx1200'), or None.
+    """Return the gcnArchName on Windows (e.g. 'gfx1200'), or None.
 
-    Resolves hipinfo via PATH first, then HIP_PATH\\bin and ROCM_PATH\\bin as
-    fallbacks -- the AMD HIP SDK installer sets these env vars but does not
-    always add the bin dir to the system PATH.
+    Probe order matches the PowerShell installer: env-var override first,
+    then hipinfo (PATH or HIP_PATH / ROCM_PATH bin), then amd-smi. Without
+    the amd-smi fallback, runtime-only AMD installs without hipinfo on PATH
+    return early and `studio update` cannot repair a CPU-only venv.
     """
     import re
 
+    # 1. Explicit override (matches PowerShell installer's env-var path).
+    _override = os.environ.get("UNSLOTH_ROCM_GFX_ARCH")
+    if _override and _override.strip():
+        return _override.strip().lower()
+
+    # 2. hipinfo via PATH, then HIP_PATH\bin / ROCM_PATH\bin.
     hipinfo = shutil.which("hipinfo")
     if not hipinfo:
-        # Fallback: AMD HIP SDK sets HIP_PATH / ROCM_PATH even when bin isn't on PATH
         for _env_var in ("HIP_PATH", "ROCM_PATH"):
             _root = os.environ.get(_env_var)
             if _root:
@@ -246,25 +252,43 @@ def _detect_windows_gfx_arch() -> str | None:
                 if os.path.isfile(_candidate):
                     hipinfo = _candidate
                     break
-    if not hipinfo:
-        return None
-    try:
-        result = subprocess.run(
-            [hipinfo],
-            stdout = subprocess.PIPE,
-            stderr = subprocess.DEVNULL,
-            timeout = 10,
-        )
-        if result.returncode != 0:
-            return None
-        text = result.stdout.decode(errors = "replace")
-        m = re.search(r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text)
-        # Lowercase the captured token -- some hipinfo builds emit "Gfx1151"
-        # which would miss the lowercase keys in _GFX_TO_AMD_INDEX_ARCH and
-        # silently fall back to CPU torch.
-        return m.group(1).strip().lower() if m else None
-    except Exception:
-        return None
+    if hipinfo:
+        try:
+            result = subprocess.run(
+                [hipinfo],
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                timeout = 10,
+            )
+            if result.returncode == 0:
+                text = result.stdout.decode(errors = "replace")
+                m = re.search(r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text)
+                if m:
+                    # Lowercase -- hipinfo sometimes emits "Gfx1151".
+                    return m.group(1).strip().lower()
+        except Exception:
+            pass
+
+    # 3. amd-smi fallback -- runtime-only Radeon installs ship amd-smi but no hipinfo.
+    amd_smi = shutil.which("amd-smi")
+    if amd_smi:
+        for _args in (("static", "--asic"), ("list",)):
+            try:
+                result = subprocess.run(
+                    [amd_smi, *_args],
+                    stdout = subprocess.PIPE,
+                    stderr = subprocess.DEVNULL,
+                    timeout = 10,
+                )
+                if result.returncode != 0:
+                    continue
+                text = result.stdout.decode(errors = "replace").lower()
+                m = re.search(r"\bgfx[1-9][0-9a-z]{2,3}\b", text)
+                if m:
+                    return m.group(0)
+            except Exception:
+                continue
+    return None
 
 
 def _windows_rocm_index_url(gfx_arch: str | None) -> str | None:
@@ -518,13 +542,19 @@ def _ensure_rocm_torch() -> None:
                 "torchaudio",
                 constrain = False,
             )
+        # ROCm torch is installed (or already was); flag it so later install
+        # phases do not overwrite it with the generic CPU torch wheel. BNB is
+        # a separate dependency -- a BNB install failure must NOT roll the
+        # torch ROCm install back.
+        _rocm_windows_torch_installed = True
         # Always install AMD Windows bitsandbytes -- the PyPI wheel ships only
         # CUDA DLLs and will fail to load on ROCm.  Install even when torch was
         # already a ROCm build so that `studio update` repairs a broken bnb.
-        # Only flip the success flag when the install actually succeeds; otherwise
-        # the post-install "manual install may be required" warning is suppressed.
-        if _install_bnb_windows_rocm():
-            _rocm_windows_torch_installed = True
+        if not _install_bnb_windows_rocm():
+            print(
+                "   Warning: AMD Windows bitsandbytes install failed; "
+                "ROCm torch is installed but bitsandbytes may need manual install"
+            )
         return
 
     # ── Linux x86_64 only: PyTorch ROCm wheels are not published for aarch64 ──
@@ -556,7 +586,15 @@ def _ensure_rocm_torch() -> None:
             [
                 sys.executable,
                 "-c",
-                "import torch; print(getattr(torch.version,'hip','') or '')",
+                (
+                    "import torch; "
+                    "hip=getattr(torch.version,'hip','') or ''; "
+                    "ver=getattr(torch,'__version__','').lower(); "
+                    # Print the HIP version when present (back-compat), else
+                    # "rocm" sentinel when only torch.__version__ flags ROCm
+                    # (AMD SDK / Radeon wheels). Empty string = CPU/CUDA.
+                    "print(hip if hip else ('rocm' if 'rocm' in ver else ''))"
+                ),
             ],
             stdout = subprocess.PIPE,
             stderr = subprocess.DEVNULL,

From 0be974972b5719e3670269e7a15bf5f2e329b3b3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 09:09:02 +0000
Subject: [PATCH 106/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index cea3cd480c..aa98a20366 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -81,6 +81,7 @@ def _output_dir_from_resume_checkpoint(
 # scope so they are not garbage collected.
 _ROCM_DLL_HANDLES: list = []
 if sys.platform == "win32":
+
     def _add_rocm_dll_dirs_worker() -> None:
         _candidates: list[str] = []
         for _var in ("HIP_PATH", "ROCM_PATH"):
@@ -104,6 +105,7 @@ def _add_rocm_dll_dirs_worker() -> None:
                     _ROCM_DLL_HANDLES.append(os.add_dll_directory(_d))
                 except (OSError, AttributeError):
                     pass
+
     _add_rocm_dll_dirs_worker()
     del _add_rocm_dll_dirs_worker
 
@@ -1968,6 +1970,7 @@ def find_spec(self, fullname, path, target = None):
     if sys.platform == "win32":
         try:
             import torch as _torch_probe
+
             _is_win32_rocm = bool(
                 getattr(getattr(_torch_probe, "version", None), "hip", None)
                 or "rocm" in getattr(_torch_probe, "__version__", "").lower()

From 21773215d93ce107ae9b01dd40c2fb0cfb2e915b Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 19 May 2026 10:22:23 +0000
Subject: [PATCH 107/165] fix(studio/rocm): robustness pass - rocm tag
 normalisation, Strix routing parity, hardened detection

Robustness pass on top of 76137b2d. Four targeted fixes:

1. install.sh ROCm-tag routing normalisation.
   `rocm7.2.1` would route to https://download.pytorch.org/whl/rocm7.2.1
   which does not exist (PyTorch publishes major.minor URLs only). Same
   for any future patch-level tag. Normalise every rocm{maj.min}* pattern
   to the bare rocm{maj.min} index URL.

2. install.ps1 + studio/setup.ps1 marketing-name fallback.
   The gfx1151 row matched 890M / Strix Halo / HX 37x / HX 38x / AI 9 HX
   but not the actual retail name 'AMD Radeon 8060S Graphics' shipped by
   OEMs (Ryzen AI MAX+ 395). Add '8060S' to the regex.

3. install_python_stack.py Strix + ROCm 7.1 routing parity with install.sh.
   The shell installer reroutes Strix Halo / Point + ROCm 7.1 to
   repo.amd.com/rocm/whl/{gfx}/ (which serves torch 2.11.0+rocm7.13.0
   with the upstream _grouped_mm fix). The Python `studio update` path
   only warned and still installed the broken generic rocm7.1 wheel.
   Mirror the override: detect gfx1151/gfx1150 on ROCm 7.1, route to
   the AMD per-gfx index, honour UNSLOTH_AMD_ROCM_MIRROR override.

4. _detect_windows_gfx_arch amd-smi parsing tightened.
   The amd-smi fallback added in the prior commit used a bare
   `\bgfx[1-9][0-9a-z]{2,3}\b` match against the lowercased stdout,
   which could pick up stray gfx references in warnings / device-name
   strings. Anchor on labelled lines first (Target_Graphics_Version,
   ASIC, Arch, gfx) and fall back to the bare match only when no
   labelled line is present.

Tests: 231 passed, 1 skipped in tests/studio/install/test_rocm_support.py;
sim_5301 23 cases pass (6 new sims for the Strix override + amd-smi parsing).
---
 install.ps1                    |   2 +-
 install.sh                     |  12 +++-
 studio/install_python_stack.py | 110 +++++++++++++++++++++++----------
 studio/setup.ps1               |   2 +-
 4 files changed, 90 insertions(+), 36 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 361813f862..a3309485f3 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1307,7 +1307,7 @@ shell.Run cmd, 0, False
                 $nameArchTable = @(
                     @{ P = "9070 XT|9080";                                        A = "gfx1201" }  # RDNA 4
                     @{ P = "9070|9060";                                            A = "gfx1200" }  # RDNA 4
-                    @{ P = "890M|Strix Halo|HX 37[05]|HX 38[05]|AI 9 HX";        A = "gfx1151" }  # RDNA 3.5 iGPU (Strix Halo)
+                    @{ P = "8060S|890M|Strix Halo|HX 37[05]|HX 38[05]|AI 9 HX";  A = "gfx1151" }  # RDNA 3.5 iGPU (Strix Halo / Radeon 8060S retail)
                     @{ P = "880M|Strix Point|AI 9 36[05]|AI 7 35[05]|AI 5 34[05]"; A = "gfx1150" } # RDNA 3.5 iGPU (Strix Point)
                     @{ P = "RX 7900|RX 7800|RX 7700(?! S)";                       A = "gfx1100" }  # RDNA 3 desktop
                     @{ P = "RX 7600";                                              A = "gfx1102" }  # RDNA 3
diff --git a/install.sh b/install.sh
index 72a702dc37..a13ed0a3d0 100755
--- a/install.sh
+++ b/install.sh
@@ -1644,9 +1644,17 @@ get_torch_index_url() {
                     echo "$_base/cpu"; return ;;
             esac
             # Supported tags; 6.5+ clips to rocm6.4, 7.3+ caps to rocm7.2.
+            # PyTorch publishes major.minor URLs only (no patch level), so
+            # rocm7.2.1 / rocm6.0.2 / etc. must normalise to rocm7.2 / rocm6.0.
             case "$_rocm_tag" in
-                rocm6.0|rocm6.0.*|rocm6.1|rocm6.1.*|rocm6.2|rocm6.2.*|rocm6.3|rocm6.3.*|rocm6.4|rocm6.4.*|rocm7.0|rocm7.0.*|rocm7.1|rocm7.1.*|rocm7.2|rocm7.2.*)
-                    echo "$_base/$_rocm_tag" ;;
+                rocm6.0*) echo "$_base/rocm6.0" ;;
+                rocm6.1*) echo "$_base/rocm6.1" ;;
+                rocm6.2*) echo "$_base/rocm6.2" ;;
+                rocm6.3*) echo "$_base/rocm6.3" ;;
+                rocm6.4*) echo "$_base/rocm6.4" ;;
+                rocm7.0*) echo "$_base/rocm7.0" ;;
+                rocm7.1*) echo "$_base/rocm7.1" ;;
+                rocm7.2*) echo "$_base/rocm7.2" ;;
                 rocm6.*)
                     # ROCm 6.5+ (no published PyTorch wheels): clip down
                     # to the last supported 6.x wheel set.
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 139c496793..c2bdd30370 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -282,10 +282,21 @@ def _detect_windows_gfx_arch() -> str | None:
                 )
                 if result.returncode != 0:
                     continue
-                text = result.stdout.decode(errors = "replace").lower()
-                m = re.search(r"\bgfx[1-9][0-9a-z]{2,3}\b", text)
-                if m:
-                    return m.group(0)
+                text = result.stdout.decode(errors = "replace")
+                # Anchor on a labelled gfx line (e.g. "TARGET_GRAPHICS_VERSION: gfx1151"
+                # or "Arch:  gfx1151") to avoid catching stray gfx mentions in
+                # warnings or device-name strings. Fall back to a bare token match
+                # only if no labelled line is found.
+                m = re.search(
+                    r"(?im)^\s*(?:target_graphics_version|gfx|arch|asic)\b[^:\r\n]*:\s*(gfx[1-9][0-9a-z]{2,3})\b",
+                    text,
+                )
+                if not m:
+                    m = re.search(r"\bgfx[1-9][0-9a-z]{2,3}\b", text.lower())
+                    if m:
+                        return m.group(0)
+                    continue
+                return m.group(1).lower()
             except Exception:
                 continue
     return None
@@ -610,42 +621,44 @@ def _ensure_rocm_torch() -> None:
 
     rocm_torch_ready = has_hip_torch
 
-    # Strix Halo (gfx1151) segfaults under ROCm 7.1 due to a ROCm driver bug
-    # fixed in ROCm 7.2.  Warn early so users know why training may crash.
+    # Strix Halo / Strix Point (gfx1151 / gfx1150) segfault under ROCm 7.1
+    # in torch._grouped_mm. AMD's per-gfx repo ships torch 2.11.0+rocm7.13.0
+    # with the real fix, so route those hosts there instead of the generic
+    # pytorch.org rocm7.1 wheel. Mirrors install.sh's Strix override.
+    _strix_override_url: "str | None" = None
+    _strix_override_pkgs: "tuple[str, str, str] | None" = None
     if ver < (7, 2):
         gfx_codes = _detect_amd_gfx_codes()
         _strix_gfx = {"gfx1151", "gfx1150"}
-        if _strix_gfx.intersection(gfx_codes):
-            _gfx_str = ", ".join(sorted(_strix_gfx.intersection(gfx_codes)))
+        _detected_strix = _strix_gfx.intersection(gfx_codes)
+        if _detected_strix:
+            _gfx_str = ", ".join(sorted(_detected_strix))
+            _selected_gfx = sorted(_detected_strix)[0]
+            _amd_mirror = (
+                os.environ.get("UNSLOTH_AMD_ROCM_MIRROR")
+                or "https://repo.amd.com/rocm/whl"
+            ).rstrip("/")
+            _strix_override_url = f"{_amd_mirror}/{_selected_gfx}/"
+            _strix_override_pkgs = (
+                "torch>=2.11.0,<2.12.0",
+                "torchvision",
+                "torchaudio",
+            )
             print(
-                f"\n   ⚠️  {_gfx_str} (AMD Strix Halo) detected with ROCm {ver[0]}.{ver[1]}.\n"
-                f"   ROCm 7.1 has a known segfault on this GPU when tensors are\n"
-                f"   moved to the GPU.  Upgrade to ROCm 7.2+ to enable training.\n"
+                f"\n   ⚠️  {_gfx_str} (AMD Strix) detected with ROCm {ver[0]}.{ver[1]}.\n"
+                f"   ROCm 7.1 has a known _grouped_mm segfault on this GPU;\n"
+                f"   routing torch install to AMD's arch-specific index\n"
+                f"   ({_strix_override_url}) which serves torch 2.11.0+rocm7.13.0\n"
+                f"   with the upstream fix.\n"
             )
 
     if not has_hip_torch:
-        # Select best matching wheel tag (newest ROCm version <= installed)
-        tag = next(
-            (
-                t
-                for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
-                if ver >= (maj, mn)
-            ),
-            None,
-        )
-        if tag is None:
-            print(
-                f"   No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- "
-                f"skipping torch reinstall"
-            )
-        else:
-            index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
-            print(f"   ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
-            _torch_pkg, _vision_pkg, _audio_pkg = _ROCM_TORCH_PKG_SPECS.get(
-                tag, _ROCM_TORCH_PKG_SPECS["_default"]
-            )
+        if _strix_override_url is not None and _strix_override_pkgs is not None:
+            index_url = _strix_override_url
+            _torch_pkg, _vision_pkg, _audio_pkg = _strix_override_pkgs
+            print(f"   Strix ROCm 7.1 override -- installing torch from {index_url}")
             pip_install(
-                f"ROCm torch ({tag})",
+                "ROCm torch (Strix arch-specific)",
                 "--force-reinstall",
                 "--no-cache-dir",
                 _torch_pkg,
@@ -656,6 +669,39 @@ def _ensure_rocm_torch() -> None:
                 constrain = False,
             )
             rocm_torch_ready = True
+        else:
+            # Select best matching wheel tag (newest ROCm version <= installed)
+            tag = next(
+                (
+                    t
+                    for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
+                    if ver >= (maj, mn)
+                ),
+                None,
+            )
+            if tag is None:
+                print(
+                    f"   No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- "
+                    f"skipping torch reinstall"
+                )
+            else:
+                index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
+                print(f"   ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
+                _torch_pkg, _vision_pkg, _audio_pkg = _ROCM_TORCH_PKG_SPECS.get(
+                    tag, _ROCM_TORCH_PKG_SPECS["_default"]
+                )
+                pip_install(
+                    f"ROCm torch ({tag})",
+                    "--force-reinstall",
+                    "--no-cache-dir",
+                    _torch_pkg,
+                    _vision_pkg,
+                    _audio_pkg,
+                    "--index-url",
+                    index_url,
+                    constrain = False,
+                )
+                rocm_torch_ready = True
 
     # Install bitsandbytes only when torch links against ROCm. Prefers the
     # continuous-release_main wheel (bnb PR #1887 4-bit GEMV fix) and falls
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 2772c94b11..6ee20143ec 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -792,7 +792,7 @@ if (-not $HasNvidiaSmi) {
             $nameArchTable = @(
                 @{ P = "9070 XT|9080";                                        A = "gfx1201" }  # RDNA 4
                 @{ P = "9070|9060";                                            A = "gfx1200" }  # RDNA 4
-                @{ P = "890M|Strix Halo|HX 37[05]|HX 38[05]|AI 9 HX";        A = "gfx1151" }  # RDNA 3.5 iGPU (Strix Halo)
+                @{ P = "8060S|890M|Strix Halo|HX 37[05]|HX 38[05]|AI 9 HX";  A = "gfx1151" }  # RDNA 3.5 iGPU (Strix Halo / Radeon 8060S retail)
                 @{ P = "880M|Strix Point|AI 9 36[05]|AI 7 35[05]|AI 5 34[05]"; A = "gfx1150" } # RDNA 3.5 iGPU (Strix Point)
                 @{ P = "RX 7900|RX 7800|RX 7700(?! S)";                       A = "gfx1100" }  # RDNA 3 desktop
                 @{ P = "RX 7600";                                              A = "gfx1102" }  # RDNA 3

From 96b9e4659b40a95e668d4936a61055d66fe81691 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 19 May 2026 10:54:19 +0000
Subject: [PATCH 108/165] fix(studio/rocm): multi-GPU selection, Strix sibling
 handling, defensive cleanups

Round 4 robustness pass based on 5 parallel Opus reviewers of head 21773215.
Seven items from across regression / edge-case / error-paths / architecture
reviews:

1. studio/backend/main.py BNB gate: aligned with the broad ROCm check used
   everywhere else in this PR (torch.version.hip OR 'rocm' in __version__).
   AMD SDK / Radeon Linux wheels do not always populate torch.version.hip;
   without this, main.py would silently skip BNB_ROCM_VERSION while worker.py
   set it.

2. studio/install_python_stack.py _install_bnb_windows_rocm: init _ok = False
   before the try block. Without this, if pip_install_try itself raises
   (e.g. OSError on uv binary missing), the finally block restored env vars
   correctly but the subsequent `if not _ok:` raised UnboundLocalError,
   masking the original exception.

3. studio/install_python_stack.py _detect_windows_gfx_arch:
   - Rewrote to use re.findall (not re.search) on both hipinfo and amd-smi
     output, dedup tokens preserving order, and select via new
     _pick_visible_index() helper.
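   - HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES (first comma entry, integer)
     now selects the GPU on multi-AMD-GPU hosts. The index is applied to
     the deduplicated token list, so it matches the HIP device ordinal only
     when every GPU reports a distinct gfx arch. Empty, "-1", out-of-range
     or non-int values fall back to the first GPU (matches detect_host
     behaviour in install_llama_prebuilt.py).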

4. studio/install_python_stack.py Strix override now consults the runtime
   target before flipping:
   - Previous behaviour intersected gfx_codes with {gfx1151, gfx1150} and
     picked the first Strix arch, ignoring whether HIP_VISIBLE_DEVICES
     selected a non-Strix sibling (e.g. discrete RX 7900 in a mixed APU+dGPU
     box). Could install Strix-specific wheels onto a gfx1100 dGPU.
   - Now resolves the runtime gfx via _pick_visible_index() and only
     overrides when that runtime target is in the Strix set.

5. studio/backend/main.py + studio/backend/core/training/worker.py: ROCm
   version dir scan no longer sorts lexically. A lexical sort ranks "10.0"
   below "7.0", so the reverse-sorted scan would try the 7.x bin dir ahead
   of 10.x once AMD ships ROCm 10. New _ver_key() splits on "." and
   compares numeric chunks as integers, with a string fallback for
   non-numeric chunks.

6. install.sh Strix override URL: replaced ${var%/} (strips one trailing
   slash) with a while-loop that strips all trailing slashes, matching
   Python's .rstrip("/"). A user setting UNSLOTH_AMD_ROCM_MIRROR with
   "http://corp/whl///" no longer ends up with "http://corp/whl///gfx1151/"
   which strict pip proxies (artifactory, sonatype) 404 on.

7. studio/install_python_stack.py: bumped torch import probe timeout from
   30s to 90s. PyTorch's lazy .so loading can take 60-90s on cold NFS or
   USB-backed venvs. The shorter timeout was producing a false "torch
   missing" classification and reinstalling a working ROCm torch.

Tests: 231 passed, 1 skipped. sim_5301 30 cases pass (added 7 new sims for
multi-GPU detection, Strix sibling handling, and _ok-init regression).
---
 install.sh                                |   8 +-
 studio/backend/core/training/worker.py    |  13 ++-
 studio/backend/main.py                    |  20 +++-
 studio/install_python_stack.py            | 130 ++++++++++++++++------
 tests/studio/install/test_rocm_support.py |   2 +-
 5 files changed, 132 insertions(+), 41 deletions(-)

diff --git a/install.sh b/install.sh
index a13ed0a3d0..92531a1f88 100755
--- a/install.sh
+++ b/install.sh
@@ -1852,7 +1852,13 @@ case "$TORCH_INDEX_URL" in
             # over the pytorch.org rocm7.2 fallback because it exercises the real GPU
             # kernel path. Set UNSLOTH_AMD_ROCM_MIRROR to override for air-gapped installs.
             _amd_strix_base="${UNSLOTH_AMD_ROCM_MIRROR:-https://repo.amd.com/rocm/whl}"
-            TORCH_INDEX_URL="${_amd_strix_base%/}/${_strix_gfx}/"
+            # Strip ALL trailing slashes to match Python's .rstrip("/") -- a
+            # double-/triple-slash mirror URL would otherwise produce 404s on
+            # strict pip proxies (artifactory, sonatype).
+            while [ "${_amd_strix_base%/}" != "$_amd_strix_base" ]; do
+                _amd_strix_base="${_amd_strix_base%/}"
+            done
+            TORCH_INDEX_URL="${_amd_strix_base}/${_strix_gfx}/"
             TORCH_CONSTRAINT="torch>=2.11.0,<2.12.0"
             _amd_gpu_radeon=false
         fi
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index aa98a20366..3bcf605323 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -91,9 +91,20 @@ def _add_rocm_dll_dirs_worker() -> None:
         _default_root = os.path.join(
             os.environ.get("ProgramFiles", r"C:\Program Files"), "AMD", "ROCm"
         )
+
+        def _ver_key(name: str) -> tuple:
+            # Numeric tuple key so "10.0" sorts after "7.0"; non-numeric chunks fall back to string.
+            parts = []
+            for chunk in name.split("."):
+                try:
+                    parts.append((0, int(chunk)))
+                except ValueError:
+                    parts.append((1, chunk))
+            return tuple(parts)
+
         try:
             if os.path.isdir(_default_root):
-                for _ver in sorted(os.listdir(_default_root), reverse = True):
+                for _ver in sorted(os.listdir(_default_root), key = _ver_key, reverse = True):
                     _bin = os.path.join(_default_root, _ver, "bin")
                     if os.path.isdir(_bin):
                         _candidates.append(_bin)
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 5d1670c182..576e15e21e 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -32,9 +32,19 @@ def _add_rocm_dll_dirs() -> None:
         _default_root = os.path.join(
             os.environ.get("ProgramFiles", r"C:\Program Files"), "AMD", "ROCm"
         )
+        def _ver_key(name: str) -> tuple:
+            # Numeric tuple key so "10.0" sorts after "7.0"; non-numeric chunks fall back to string.
+            parts = []
+            for chunk in name.split("."):
+                try:
+                    parts.append((0, int(chunk)))
+                except ValueError:
+                    parts.append((1, chunk))
+            return tuple(parts)
+
         try:
             if os.path.isdir(_default_root):
-                for _ver in sorted(os.listdir(_default_root), reverse = True):
+                for _ver in sorted(os.listdir(_default_root), key = _ver_key, reverse = True):
                     _bin = os.path.join(_default_root, _ver, "bin")
                     if os.path.isdir(_bin):
                         candidates.append(_bin)
@@ -66,7 +76,13 @@ def _add_rocm_dll_dirs() -> None:
         try:
             import torch as _torch_probe
 
-            _is_rocm_host = bool(getattr(_torch_probe.version, "hip", None))
+            # Broad check: torch.version.hip OR "rocm" in torch.__version__ --
+            # AMD SDK / Radeon wheels may not populate torch.version.hip but
+            # still encode "rocm" in __version__. Matches worker.py + hardware.py.
+            _is_rocm_host = bool(
+                getattr(getattr(_torch_probe, "version", None), "hip", None)
+                or "rocm" in getattr(_torch_probe, "__version__", "").lower()
+            )
             del _torch_probe
         except Exception:
             pass
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index c2bdd30370..43c972b5ba 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -227,6 +227,28 @@ def _detect_rocm_version() -> tuple[int, int] | None:
     return None
 
 
+def _pick_visible_index(num_tokens: int) -> int:
+    """Resolve HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES to an integer
+    index into a list of length num_tokens. Returns 0 (first GPU) for
+    unset, empty, '-1', UUID-style, or out-of-range values."""
+    for _env in ("HIP_VISIBLE_DEVICES", "ROCR_VISIBLE_DEVICES"):
+        _val = os.environ.get(_env)
+        if _val is None:
+            continue
+        _val = _val.strip()
+        if _val == "" or _val == "-1":
+            return 0
+        _first = _val.split(",")[0].strip()
+        try:
+            _idx = int(_first)
+            if 0 <= _idx < num_tokens:
+                return _idx
+        except ValueError:
+            pass
+        return 0
+    return 0
+
+
 def _detect_windows_gfx_arch() -> str | None:
     """Return the gcnArchName on Windows (e.g. 'gfx1200'), or None.
 
@@ -234,6 +256,10 @@ def _detect_windows_gfx_arch() -> str | None:
     then hipinfo (PATH or HIP_PATH / ROCM_PATH bin), then amd-smi. Without
     the amd-smi fallback, runtime-only AMD installs without hipinfo on PATH
     return early and `studio update` cannot repair a CPU-only venv.
+
+    On multi-GPU hosts, all detected gfx tokens are deduplicated (preserving
+    enumeration order) and HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES selects
+    which one to install for. The first GPU is used when no env var is set.
     """
     import re
 
@@ -242,6 +268,15 @@ def _detect_windows_gfx_arch() -> str | None:
     if _override and _override.strip():
         return _override.strip().lower()
 
+    def _dedup_pick(tokens: list[str]) -> "str | None":
+        if not tokens:
+            return None
+        _seen: list[str] = []
+        for _t in tokens:
+            if _t not in _seen:
+                _seen.append(_t)
+        return _seen[_pick_visible_index(len(_seen))]
+
     # 2. hipinfo via PATH, then HIP_PATH\bin / ROCM_PATH\bin.
     hipinfo = shutil.which("hipinfo")
     if not hipinfo:
@@ -262,10 +297,14 @@ def _detect_windows_gfx_arch() -> str | None:
             )
             if result.returncode == 0:
                 text = result.stdout.decode(errors = "replace")
-                m = re.search(r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text)
-                if m:
-                    # Lowercase -- hipinfo sometimes emits "Gfx1151".
-                    return m.group(1).strip().lower()
+                # findall picks every gcnArchName line so multi-GPU hosts
+                # are enumerable and HIP_VISIBLE_DEVICES selects correctly.
+                _tokens = [t.strip().lower() for t in re.findall(
+                    r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text
+                )]
+                _pick = _dedup_pick(_tokens)
+                if _pick:
+                    return _pick
         except Exception:
             pass
 
@@ -283,20 +322,17 @@ def _detect_windows_gfx_arch() -> str | None:
                 if result.returncode != 0:
                     continue
                 text = result.stdout.decode(errors = "replace")
-                # Anchor on a labelled gfx line (e.g. "TARGET_GRAPHICS_VERSION: gfx1151"
-                # or "Arch:  gfx1151") to avoid catching stray gfx mentions in
-                # warnings or device-name strings. Fall back to a bare token match
-                # only if no labelled line is found.
-                m = re.search(
+                # Prefer labelled gfx lines; fall back to bare tokens.
+                _labelled = re.findall(
                     r"(?im)^\s*(?:target_graphics_version|gfx|arch|asic)\b[^:\r\n]*:\s*(gfx[1-9][0-9a-z]{2,3})\b",
                     text,
                 )
-                if not m:
-                    m = re.search(r"\bgfx[1-9][0-9a-z]{2,3}\b", text.lower())
-                    if m:
-                        return m.group(0)
-                    continue
-                return m.group(1).lower()
+                _tokens = [t.lower() for t in _labelled]
+                if not _tokens:
+                    _tokens = re.findall(r"\bgfx[1-9][0-9a-z]{2,3}\b", text.lower())
+                _pick = _dedup_pick(_tokens)
+                if _pick:
+                    return _pick
             except Exception:
                 continue
     return None
@@ -436,6 +472,7 @@ def _install_bnb_windows_rocm() -> bool:
         return False
     _prev = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
     os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
+    _ok = False  # init so a raise inside pip_install_try does not produce UnboundLocalError
     try:
         _ok = pip_install_try(
             "bitsandbytes (AMD Windows, pre-release main)",
@@ -491,7 +528,7 @@ def _ensure_rocm_torch() -> None:
                 ],
                 stdout = subprocess.DEVNULL,
                 stderr = subprocess.DEVNULL,
-                timeout = 30,
+                timeout = 90,
             )
             _torch_ok = _probe.returncode == 0
         except (OSError, subprocess.TimeoutExpired):
@@ -529,7 +566,7 @@ def _ensure_rocm_torch() -> None:
                 ],
                 stdout = subprocess.PIPE,
                 stderr = subprocess.DEVNULL,
-                timeout = 30,
+                timeout = 90,
             )
             if probe.returncode == 0 and probe.stdout.decode().strip() == "yes":
                 _torch_already_rocm = True
@@ -609,7 +646,7 @@ def _ensure_rocm_torch() -> None:
             ],
             stdout = subprocess.PIPE,
             stderr = subprocess.DEVNULL,
-            timeout = 30,
+            timeout = 90,
         )
     except (OSError, subprocess.TimeoutExpired):
         probe = None
@@ -625,6 +662,10 @@ def _ensure_rocm_torch() -> None:
     # in torch._grouped_mm. AMD's per-gfx repo ships torch 2.11.0+rocm7.13.0
     # with the real fix, so route those hosts there instead of the generic
     # pytorch.org rocm7.1 wheel. Mirrors install.sh's Strix override.
+    # On mixed hosts (Strix iGPU + non-Strix dGPU), only route to the AMD
+    # per-gfx index when the GPU HIP will actually run on is the Strix one --
+    # otherwise the dGPU would get an incompatible wheel. Use HIP_VISIBLE_DEVICES
+    # to determine the runtime target.
     _strix_override_url: "str | None" = None
     _strix_override_pkgs: "tuple[str, str, str] | None" = None
     if ver < (7, 2):
@@ -632,25 +673,42 @@ def _ensure_rocm_torch() -> None:
         _strix_gfx = {"gfx1151", "gfx1150"}
         _detected_strix = _strix_gfx.intersection(gfx_codes)
         if _detected_strix:
-            _gfx_str = ", ".join(sorted(_detected_strix))
-            _selected_gfx = sorted(_detected_strix)[0]
-            _amd_mirror = (
-                os.environ.get("UNSLOTH_AMD_ROCM_MIRROR")
-                or "https://repo.amd.com/rocm/whl"
-            ).rstrip("/")
-            _strix_override_url = f"{_amd_mirror}/{_selected_gfx}/"
-            _strix_override_pkgs = (
-                "torch>=2.11.0,<2.12.0",
-                "torchvision",
-                "torchaudio",
-            )
-            print(
-                f"\n   ⚠️  {_gfx_str} (AMD Strix) detected with ROCm {ver[0]}.{ver[1]}.\n"
-                f"   ROCm 7.1 has a known _grouped_mm segfault on this GPU;\n"
-                f"   routing torch install to AMD's arch-specific index\n"
-                f"   ({_strix_override_url}) which serves torch 2.11.0+rocm7.13.0\n"
-                f"   with the upstream fix.\n"
+            # Pick the runtime-visible GPU. If HIP_VISIBLE_DEVICES selects a
+            # specific index into gfx_codes, use that gfx; else default to the
+            # first listed GPU. Skip the override unless the resolved GPU is
+            # Strix.
+            _runtime_gfx = (
+                gfx_codes[_pick_visible_index(len(gfx_codes))]
+                if gfx_codes
+                else None
             )
+            if _runtime_gfx in _strix_gfx:
+                _selected_gfx = _runtime_gfx
+                _amd_mirror = (
+                    os.environ.get("UNSLOTH_AMD_ROCM_MIRROR")
+                    or "https://repo.amd.com/rocm/whl"
+                ).rstrip("/")
+                _strix_override_url = f"{_amd_mirror}/{_selected_gfx}/"
+                _strix_override_pkgs = (
+                    "torch>=2.11.0,<2.12.0",
+                    "torchvision",
+                    "torchaudio",
+                )
+                print(
+                    f"\n   {_selected_gfx} (AMD Strix) is the runtime target with ROCm "
+                    f"{ver[0]}.{ver[1]}.\n"
+                    f"   ROCm 7.1 has a known _grouped_mm segfault on this GPU;\n"
+                    f"   routing torch install to AMD's arch-specific index\n"
+                    f"   ({_strix_override_url}) which serves torch 2.11.0+rocm7.13.0\n"
+                    f"   with the upstream fix.\n"
+                )
+            else:
+                _gfx_str = ", ".join(sorted(_detected_strix))
+                print(
+                    f"\n   Strix GPU ({_gfx_str}) present but HIP_VISIBLE_DEVICES "
+                    f"selects a non-Strix runtime target ({_runtime_gfx});\n"
+                    f"   skipping AMD per-gfx index override.\n"
+                )
 
     if not has_hip_torch:
         if _strix_override_url is not None and _strix_override_pkgs is not None:
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index bf6ab2a9bc..3b3ebd206d 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2319,7 +2319,7 @@ def test_rocm71_override_to_amd_arch_index_in_install_sh(self):
         # The URL must incorporate the detected gfx arch so gfx1151 → .../gfx1151/
         strix_idx = source.find("_amd_strix_base")
         assert strix_idx != -1
-        ctx = source[strix_idx : strix_idx + 200]
+        ctx = source[strix_idx : strix_idx + 500]
         assert "_strix_gfx" in ctx
 
     def test_radeon_repo_bypassed_for_strix_in_install_sh(self):

From 825cbf5f2126657f42603d4de06b3a6d525a8997 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 10:55:05 +0000
Subject: [PATCH 109/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py |  4 +++-
 studio/backend/main.py                 |  5 ++++-
 studio/install_python_stack.py         | 11 +++++------
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 3bcf605323..9a0de1cd68 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -104,7 +104,9 @@ def _ver_key(name: str) -> tuple:
 
         try:
             if os.path.isdir(_default_root):
-                for _ver in sorted(os.listdir(_default_root), key = _ver_key, reverse = True):
+                for _ver in sorted(
+                    os.listdir(_default_root), key = _ver_key, reverse = True
+                ):
                     _bin = os.path.join(_default_root, _ver, "bin")
                     if os.path.isdir(_bin):
                         _candidates.append(_bin)
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 576e15e21e..0e2175200e 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -32,6 +32,7 @@ def _add_rocm_dll_dirs() -> None:
         _default_root = os.path.join(
             os.environ.get("ProgramFiles", r"C:\Program Files"), "AMD", "ROCm"
         )
+
         def _ver_key(name: str) -> tuple:
             # Numeric tuple key so "10.0" sorts after "7.0"; non-numeric chunks fall back to string.
             parts = []
@@ -44,7 +45,9 @@ def _ver_key(name: str) -> tuple:
 
         try:
             if os.path.isdir(_default_root):
-                for _ver in sorted(os.listdir(_default_root), key = _ver_key, reverse = True):
+                for _ver in sorted(
+                    os.listdir(_default_root), key = _ver_key, reverse = True
+                ):
                     _bin = os.path.join(_default_root, _ver, "bin")
                     if os.path.isdir(_bin):
                         candidates.append(_bin)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 43c972b5ba..b9a5031374 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -299,9 +299,10 @@ def _dedup_pick(tokens: list[str]) -> "str | None":
                 text = result.stdout.decode(errors = "replace")
                 # findall picks every gcnArchName line so multi-GPU hosts
                 # are enumerable and HIP_VISIBLE_DEVICES selects correctly.
-                _tokens = [t.strip().lower() for t in re.findall(
-                    r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text
-                )]
+                _tokens = [
+                    t.strip().lower()
+                    for t in re.findall(r"(?im)^\s*gcnArchName\s*:\s*(\S+)", text)
+                ]
                 _pick = _dedup_pick(_tokens)
                 if _pick:
                     return _pick
@@ -678,9 +679,7 @@ def _ensure_rocm_torch() -> None:
             # first listed GPU. Skip the override unless the resolved GPU is
             # Strix.
             _runtime_gfx = (
-                gfx_codes[_pick_visible_index(len(gfx_codes))]
-                if gfx_codes
-                else None
+                gfx_codes[_pick_visible_index(len(gfx_codes))] if gfx_codes else None
             )
             if _runtime_gfx in _strix_gfx:
                 _selected_gfx = _runtime_gfx

From 8c3024133d3702f18012584ef4a03df15cccaa83 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 19 May 2026 12:14:14 +0000
Subject: [PATCH 110/165] fix(studio/rocm): worker BNB/grouped_mm broad gate,
 install.sh Strix visibility, runtime-only ROCm detection

Round-5 robustness pass based on 20 parallel reviewers of head 96b9e465.

1. studio/backend/core/training/worker.py - BNB version pin / dynamo disable
   / _grouped_mm fallback block was still gated on torch.version.hip alone
   despite the torchao stub block above already using the broad check. AMD
   SDK / Radeon Windows wheels (torch.__version__ contains "rocm" but
   torch.version.hip is None) silently skipped the Windows ROCm runtime
   patches. Aligned to the same broad check (8/20 reviewers).

2. studio/backend/core/training/worker.py - _hip_ver_at_least() now also
   parses the ROCm version out of torch.__version__ (e.g. "2.11.0+rocm7.13.0")
   when torch.version.hip is missing, so the kernel-fix gate is correct for
   SDK / Radeon wheels too.

3. studio/backend/core/training/worker.py - _grouped_mm_safe_impl with
   offs=None now picks torch.bmm/matmul for 3-D inputs instead of always
   calling torch.mm. The real _grouped_mm accepts 3-D batched matmul; the
   prior fallback raised "self must be a matrix" on MoE workloads (2/20).

4. studio/backend/main.py - dropped the HIP_PATH / ROCM_PATH env-var gate
   from the BNB block; probe torch directly. Runtime-only Radeon / AMD SDK
   Windows installs do not set those SDK env vars but still ship ROCm torch
   (5/20 reviewers).

5. install.sh - Strix override now collects every gfx token from
   rocminfo / amd-smi (in enumeration order), then indexes by
   HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES so a mixed Strix iGPU + non-
   Strix dGPU host where the user selected the dGPU does NOT get rerouted
   to the Strix per-gfx index. Mirrors the Python update path (5/20 reviewers).

6. install.sh - Strix detection chain now also probes `amd-smi static --asic`,
   matching the PowerShell installer (1/20). Closes the gap on runtime-only
   Strix hosts where `amd-smi list` does not surface a gfx token.

7. studio/install_python_stack.py - _has_rocm_gpu() now has the sysfs KFD
   topology fallback (/sys/class/kfd/kfd/topology/nodes/*/gpu_id), matching
   install.sh. On minimal package-managed installs without the rocminfo /
   amd-smi command-line tools, `studio update` can now detect the GPU and
   repair the venv instead of returning early (2/20).

8. studio/install_python_stack.py - _detect_amd_gfx_codes() now falls back
   to `amd-smi list` and `amd-smi static --asic` when rocminfo is missing
   (2/20). Strix routing on runtime-only Radeon hosts now matches what
   install.sh has done for a while.

9. studio/install_python_stack.py - Strix override now applies even when
   has_hip_torch is True. The whole point of the override is to repair an
   existing broken torch.version.hip == "7.1" install; skipping the
   reinstall left users on the known _grouped_mm segfaulting stack (3/20).

Tests: 231 passed, 1 skipped. sim_5301 30 cases pass. sim_cross 12 pass.
---
 install.sh                                |  40 +++++-
 studio/backend/core/training/worker.py    |  54 +++++++-
 studio/backend/main.py                    |  33 +++--
 studio/install_python_stack.py            | 153 +++++++++++++---------
 tests/studio/install/test_rocm_support.py |   2 +-
 5 files changed, 192 insertions(+), 90 deletions(-)

diff --git a/install.sh b/install.sh
index 92531a1f88..c52f2669a9 100755
--- a/install.sh
+++ b/install.sh
@@ -1832,13 +1832,45 @@ esac
 # Detect these GPUs when TORCH_INDEX_URL is rocm7.1 and override to rocm7.2.
 case "$TORCH_INDEX_URL" in
     */rocm7.1|*/rocm7.1.*)
-        _strix_gfx=""
+        # Collect every gfx token in rocminfo / amd-smi enumeration order
+        # (skip duplicates), then index by HIP_VISIBLE_DEVICES /
+        # ROCR_VISIBLE_DEVICES so a mixed Strix iGPU + non-Strix dGPU box
+        # where the user selected the dGPU does NOT get rerouted to the
+        # Strix per-gfx index.
+        _gfx_all=""
         if command -v rocminfo >/dev/null 2>&1; then
-            _strix_gfx=$(rocminfo 2>/dev/null | grep -oE 'gfx1151|gfx1150' | head -1)
+            _gfx_all=$(rocminfo 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' | awk '!seen[$0]++')
         fi
-        if [ -z "$_strix_gfx" ] && command -v amd-smi >/dev/null 2>&1; then
-            _strix_gfx=$(amd-smi list 2>/dev/null | grep -oE 'gfx1151|gfx1150' | head -1)
+        if [ -z "$_gfx_all" ] && command -v amd-smi >/dev/null 2>&1; then
+            _gfx_all=$(amd-smi list 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' | awk '!seen[$0]++')
+            # PowerShell paths also probe `amd-smi static --asic`; mirror it
+            # so a host with hipinfo-less amd-smi reports the gfx target.
+            if [ -z "$_gfx_all" ]; then
+                _gfx_all=$(amd-smi static --asic 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' | awk '!seen[$0]++')
+            fi
         fi
+        _runtime_gfx=""
+        if [ -n "$_gfx_all" ]; then
+            _vis="${HIP_VISIBLE_DEVICES:-${ROCR_VISIBLE_DEVICES:-}}"
+            _idx=0
+            if [ -n "$_vis" ] && [ "$_vis" != "-1" ]; then
+                _first=${_vis%%,*}
+                case "$_first" in
+                    ''|*[!0-9]*) _idx=0 ;;
+                    *) _idx=$_first ;;
+                esac
+            fi
+            _runtime_gfx=$(printf '%s\n' "$_gfx_all" | awk -v idx="$_idx" '
+                NF { vals[n++] = $0 }
+                END {
+                    if (idx < 0 || idx >= n) idx = 0
+                    if (n > 0) print vals[idx]
+                }')
+        fi
+        _strix_gfx=""
+        case "$_runtime_gfx" in
+            gfx1151|gfx1150) _strix_gfx="$_runtime_gfx" ;;
+        esac
         if [ -n "$_strix_gfx" ]; then
             echo "" >&2
             echo "  [WARN] $_strix_gfx (Strix) + ROCm 7.1 detected -- known _grouped_mm segfault" >&2
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 9a0de1cd68..59c5958389 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2060,9 +2060,24 @@ def find_spec(self, fullname, path, target = None):
     global _WINDOWS_ROCM_GROUPED_MM_LIB
     if sys.platform == "win32":
         _torch_for_rocm = sys.modules.get("torch")
-        if _torch_for_rocm is not None and getattr(
-            getattr(_torch_for_rocm, "version", None), "hip", None
-        ):
+        # Broad check: torch.version.hip OR "rocm" in torch.__version__.
+        # AMD SDK / Radeon Windows wheels do not always populate
+        # torch.version.hip; without the broad check the BNB version pin,
+        # dynamo-disable, and _grouped_mm fallback below silently skip
+        # (matches the torchao stub gate above and main.py).
+        _build_version_for_rocm = (
+            getattr(_torch_for_rocm, "__version__", "").lower()
+            if _torch_for_rocm is not None
+            else ""
+        )
+        _is_win_rocm_torch = bool(
+            _torch_for_rocm is not None
+            and (
+                getattr(getattr(_torch_for_rocm, "version", None), "hip", None)
+                or "rocm" in _build_version_for_rocm
+            )
+        )
+        if _is_win_rocm_torch:
             # Disable dynamo (belt-and-suspenders; JitDecomp patch below is the
             # real fix, but keeping dynamo off avoids any other compile paths).
             if "TORCHDYNAMO_DISABLE" not in os.environ:
@@ -2112,12 +2127,24 @@ def find_spec(self, fullname, path, target = None):
 
             # Parse HIP version for the kernel-fix gate below.
             # torch.version.hip can be "7.13.99004", "7.2.0", etc.
-            # We only need major.minor for the comparison.
+            # AMD SDK / Radeon wheels may leave torch.version.hip unset and
+            # encode the ROCm version in torch.__version__ instead
+            # (e.g. "2.11.0+rocm7.13.0" or "2.9.0+rocmsdk20251116"); fall back
+            # to that string when version.hip is missing.
             def _hip_ver_at_least(major: int, minor: int) -> bool:
+                import re as _re_ver
                 _hip_str = getattr(
                     getattr(_torch_for_rocm, "version", None), "hip", None
                 )
                 if not _hip_str:
+                    _ver_match = _re_ver.search(
+                        r"rocm(\d+)\.(\d+)", _build_version_for_rocm
+                    )
+                    if _ver_match:
+                        return (
+                            int(_ver_match.group(1)),
+                            int(_ver_match.group(2)),
+                        ) >= (major, minor)
                     return False
                 try:
                     _parts = [int(x) for x in str(_hip_str).split(".")[:2]]
@@ -2138,11 +2165,24 @@ def _hip_ver_at_least(major: int, minor: int) -> bool:
                     def _grouped_mm_safe_impl(
                         self, mat2, offs = None, bias = None, out_dtype = None
                     ):
-                        """Python mm fallback for _grouped_mm on gfx1200 (null HIP kernel, ROCm ≤ 7.12)."""
+                        """Python mm/bmm fallback for _grouped_mm on gfx1200 (null HIP kernel, ROCm ≤ 7.12)."""
                         _t = _torch_for_rocm
                         if offs is None:
-                            # Simple case: plain matrix multiply.
-                            result = _t.mm(self.contiguous(), mat2.contiguous())
+                            # No offsets: behave like the real op, which
+                            # accepts either (M, K) x (K, N) -> mm, or 3-D
+                            # batched inputs -> bmm. Picking torch.mm
+                            # unconditionally previously raised "self must be
+                            # a matrix" on 3-D MoE workloads.
+                            if self.dim() == 3 and mat2.dim() == 3:
+                                result = _t.bmm(self.contiguous(), mat2.contiguous())
+                            elif self.dim() == 3 and mat2.dim() == 2:
+                                # Broadcast 2-D mat2 across the batch dim.
+                                result = _t.matmul(self.contiguous(), mat2.contiguous())
+                            elif self.dim() == 2 and mat2.dim() == 3:
+                                # Broadcast 2-D self across batch via matmul semantics.
+                                result = _t.matmul(self.contiguous(), mat2.contiguous())
+                            else:
+                                result = _t.mm(self.contiguous(), mat2.contiguous())
                         else:
                             # Grouped case: offs[i] is the exclusive end-row of
                             # group i in `self`; mat2 may be 3-D or 2-D.
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 0e2175200e..fac528f1e1 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -70,25 +70,22 @@ def _ver_key(name: str) -> tuple:
     # this the server process crashes with "Configured ROCm binary not found".
     # Detect the available DLL, fall back to "72", and set BNB_ROCM_VERSION
     # before any import that pulls in bitsandbytes (mirrors worker.py logic).
-    # Gate on the active torch runtime, not env-var presence -- HIP_PATH /
-    # ROCM_PATH stay set after a user installs the HIP SDK and reverts to a
-    # CUDA torch wheel, and setting BNB_ROCM_VERSION there makes bitsandbytes
-    # look for a ROCm DLL that doesn't exist and crash the CUDA backend.
+    # Gate on the active torch runtime only. AMD SDK / Radeon Windows wheels
+    # may not set HIP_PATH / ROCM_PATH, but they do populate torch.version.hip
+    # or encode "rocm" in torch.__version__. A previous version of this gate
+    # required HIP_PATH / ROCM_PATH and silently skipped BNB_ROCM_VERSION for
+    # those wheels.
     _is_rocm_host = False
-    if os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH"):
-        try:
-            import torch as _torch_probe
-
-            # Broad check: torch.version.hip OR "rocm" in torch.__version__ --
-            # AMD SDK / Radeon wheels may not populate torch.version.hip but
-            # still encode "rocm" in __version__. Matches worker.py + hardware.py.
-            _is_rocm_host = bool(
-                getattr(getattr(_torch_probe, "version", None), "hip", None)
-                or "rocm" in getattr(_torch_probe, "__version__", "").lower()
-            )
-            del _torch_probe
-        except Exception:
-            pass
+    try:
+        import torch as _torch_probe
+
+        _is_rocm_host = bool(
+            getattr(getattr(_torch_probe, "version", None), "hip", None)
+            or "rocm" in getattr(_torch_probe, "__version__", "").lower()
+        )
+        del _torch_probe
+    except Exception:
+        pass
     if _is_rocm_host and "BNB_ROCM_VERSION" not in os.environ:
         import glob as _glob
 
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index b9a5031374..eb2f9b763d 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -405,6 +405,26 @@ def _has_rocm_gpu() -> bool:
         if result.returncode == 0 and result.stdout.strip():
             if check_fn(result.stdout):
                 return True
+    # sysfs KFD topology fallback (Linux only) -- matches install.sh's
+    # runtime-only detection. On minimal package-managed installs (no
+    # rocminfo / no amd-smi GUI tools), the kernel exposes AMD GPUs via
+    # /sys/class/kfd so `studio update` can still detect the GPU and
+    # repair the venv.
+    if sys.platform != "win32":
+        try:
+            kfd_nodes = "/sys/class/kfd/kfd/topology/nodes"
+            if os.path.isdir(kfd_nodes):
+                for entry in os.listdir(kfd_nodes):
+                    gpu_id_path = os.path.join(kfd_nodes, entry, "gpu_id")
+                    try:
+                        with open(gpu_id_path) as fh:
+                            gpu_id = fh.read().strip()
+                    except OSError:
+                        continue
+                    if gpu_id and gpu_id != "0":  # gpu_id 0 = CPU node
+                        return True
+        except OSError:
+            pass
     return False
 
 
@@ -429,30 +449,40 @@ def _has_usable_nvidia_gpu() -> bool:
 def _detect_amd_gfx_codes() -> list[str]:
     """Return the list of AMD gfx ISA strings visible to ROCm (e.g. ['gfx1151']).
 
-    Parses ``rocminfo`` output for ``ISA Info`` / ``gfx`` entries.  Returns an
-    empty list when rocminfo is not found or no GPU agents are present.
+    Probes rocminfo first, then falls back to ``amd-smi list`` and
+    ``amd-smi static --asic`` for runtime-only Radeon hosts that ship
+    amd-smi but no rocminfo. Returns an empty list when no probe yields
+    a gfx target.
     """
     import re
 
-    exe = shutil.which("rocminfo")
-    if not exe:
-        return []
-    try:
-        result = subprocess.run(
-            [exe],
-            stdout = subprocess.PIPE,
-            stderr = subprocess.DEVNULL,
-            text = True,
-            timeout = 15,
-        )
-    except Exception:
-        return []
-    if result.returncode != 0:
-        return []
-    # Match lines like "  Name:                    gfx1151" or ISA strings
-    # "amdgcn-amd-amdhsa--gfx1151".  Exclude the CPU agent (gfx000).
-    codes = re.findall(r"gfx([1-9][0-9a-z]{2,3})", result.stdout.lower())
-    return list(dict.fromkeys(f"gfx{c}" for c in codes))  # deduplicate, preserve order
+    def _extract(text: str) -> list[str]:
+        codes = re.findall(r"gfx([1-9][0-9a-z]{2,3})", text.lower())
+        return list(dict.fromkeys(f"gfx{c}" for c in codes))
+
+    probes: list[list[str]] = []
+    if shutil.which("rocminfo"):
+        probes.append(["rocminfo"])
+    if shutil.which("amd-smi"):
+        probes.append(["amd-smi", "list"])
+        probes.append(["amd-smi", "static", "--asic"])
+    for cmd in probes:
+        try:
+            result = subprocess.run(
+                cmd,
+                stdout = subprocess.PIPE,
+                stderr = subprocess.DEVNULL,
+                text = True,
+                timeout = 15,
+            )
+        except Exception:
+            continue
+        if result.returncode != 0 or not result.stdout.strip():
+            continue
+        codes = _extract(result.stdout)
+        if codes:
+            return codes
+    return []
 
 
 # Set by _ensure_rocm_torch() on success; suppresses the post-install AMD warning.
@@ -709,13 +739,49 @@ def _ensure_rocm_torch() -> None:
                     f"   skipping AMD per-gfx index override.\n"
                 )
 
-    if not has_hip_torch:
-        if _strix_override_url is not None and _strix_override_pkgs is not None:
-            index_url = _strix_override_url
-            _torch_pkg, _vision_pkg, _audio_pkg = _strix_override_pkgs
-            print(f"   Strix ROCm 7.1 override -- installing torch from {index_url}")
+    # Strix override on ROCm 7.1 must fire even when has_hip_torch is True --
+    # an existing torch with `torch.version.hip == "7.1"` is exactly the broken
+    # combo the override is meant to repair, so skipping it leaves users on
+    # the known _grouped_mm segfault.
+    if _strix_override_url is not None and _strix_override_pkgs is not None:
+        index_url = _strix_override_url
+        _torch_pkg, _vision_pkg, _audio_pkg = _strix_override_pkgs
+        print(f"   Strix ROCm 7.1 override -- installing torch from {index_url}")
+        pip_install(
+            "ROCm torch (Strix arch-specific)",
+            "--force-reinstall",
+            "--no-cache-dir",
+            _torch_pkg,
+            _vision_pkg,
+            _audio_pkg,
+            "--index-url",
+            index_url,
+            constrain = False,
+        )
+        rocm_torch_ready = True
+    elif not has_hip_torch:
+        # Select best matching wheel tag (newest ROCm version <= installed)
+        tag = next(
+            (
+                t
+                for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
+                if ver >= (maj, mn)
+            ),
+            None,
+        )
+        if tag is None:
+            print(
+                f"   No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- "
+                f"skipping torch reinstall"
+            )
+        else:
+            index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
+            print(f"   ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
+            _torch_pkg, _vision_pkg, _audio_pkg = _ROCM_TORCH_PKG_SPECS.get(
+                tag, _ROCM_TORCH_PKG_SPECS["_default"]
+            )
             pip_install(
-                "ROCm torch (Strix arch-specific)",
+                f"ROCm torch ({tag})",
                 "--force-reinstall",
                 "--no-cache-dir",
                 _torch_pkg,
@@ -726,39 +792,6 @@ def _ensure_rocm_torch() -> None:
                 constrain = False,
             )
             rocm_torch_ready = True
-        else:
-            # Select best matching wheel tag (newest ROCm version <= installed)
-            tag = next(
-                (
-                    t
-                    for (maj, mn), t in sorted(_ROCM_TORCH_INDEX.items(), reverse = True)
-                    if ver >= (maj, mn)
-                ),
-                None,
-            )
-            if tag is None:
-                print(
-                    f"   No PyTorch wheel for ROCm {ver[0]}.{ver[1]} -- "
-                    f"skipping torch reinstall"
-                )
-            else:
-                index_url = f"{_PYTORCH_WHL_BASE}/{tag}"
-                print(f"   ROCm {ver[0]}.{ver[1]} -- installing torch from {index_url}")
-                _torch_pkg, _vision_pkg, _audio_pkg = _ROCM_TORCH_PKG_SPECS.get(
-                    tag, _ROCM_TORCH_PKG_SPECS["_default"]
-                )
-                pip_install(
-                    f"ROCm torch ({tag})",
-                    "--force-reinstall",
-                    "--no-cache-dir",
-                    _torch_pkg,
-                    _vision_pkg,
-                    _audio_pkg,
-                    "--index-url",
-                    index_url,
-                    constrain = False,
-                )
-                rocm_torch_ready = True
 
     # Install bitsandbytes only when torch links against ROCm. Prefers the
     # continuous-release_main wheel (bnb PR #1887 4-bit GEMV fix) and falls
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 3b3ebd206d..6e759afecc 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -2338,7 +2338,7 @@ def test_strix_override_only_fires_on_rocm71(self):
         strix_idx = source.find("_strix_gfx")
         assert strix_idx != -1
         # Look back for the rocm7.1 pattern within 600 chars before _strix_gfx
-        context_before = source[max(0, strix_idx - 600) : strix_idx]
+        context_before = source[max(0, strix_idx - 2400) : strix_idx]
         assert "rocm7.1" in context_before
 
     def test_torch_constraint_updated_for_strix_amd_index(self):

From e6cc98e75ca791c1413b4422a5156eb87e87686c Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 12:14:33 +0000
Subject: [PATCH 111/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 59c5958389..ac811d32e9 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2133,6 +2133,7 @@ def find_spec(self, fullname, path, target = None):
             # to that string when version.hip is missing.
             def _hip_ver_at_least(major: int, minor: int) -> bool:
                 import re as _re_ver
+
                 _hip_str = getattr(
                     getattr(_torch_for_rocm, "version", None), "hip", None
                 )

From 06f28e4d03468c58e3d46f6b02cdead90c538932 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 19 May 2026 14:35:40 -0500
Subject: [PATCH 112/165] fix(studio/rocm): code review hardening pass

- main.py: numeric DLL sort (string sort picked rocm72 over rocm713);
  add basename() to regex; log warning on detection failure; log info
  when BNB_ROCM_VERSION is set (mirrors worker.py)
- worker.py: explicit len-guard in _hip_ver_at_least() with warning
  logs instead of silent IndexError/ValueError swallow
- hardware.py: isinstance(result, dict) guard before result.get() in
  _smi_query() to prevent AttributeError on non-dict backend returns
- amd.py: round() before int() on parsed GPU IDs; log a warning when a
  non-integer ID has to be rounded, and raise the unparseable-ID log from
  debug to warning (defensive against malformed amd-smi output)
- setup.sh: quote --gcc-install-dir value in CMAKE_HIP_FLAGS so paths
  with spaces do not break the CMake argument; derive the gcc multiarch
  directory instead of hard-coding x86_64-linux-gnu
- install.ps1, setup.ps1: apply colon-split + ToLower() to hipinfo
  gcnArchName match (consistent with each other and with setup.sh)
- install.sh: tighten ROCm tag case patterns to explicit
  rocmX.Y|rocmX.Y.* to avoid unintended prefix matches
---
 install.ps1                               |  2 +-
 install.sh                                | 16 +++++++-------
 studio/backend/core/training/worker.py    | 18 +++++++++++++++-
 studio/backend/main.py                    | 26 ++++++++++++++++++-----
 studio/backend/utils/hardware/amd.py      | 12 +++++++++--
 studio/backend/utils/hardware/hardware.py |  2 +-
 studio/setup.ps1                          |  2 +-
 studio/setup.sh                           |  7 +++---
 8 files changed, 63 insertions(+), 22 deletions(-)

diff --git a/install.ps1 b/install.ps1
index a3309485f3..a0922a14c9 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1241,7 +1241,7 @@ shell.Run cmd, 0, False
                 if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                     $HasROCm = $true
                     if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
-                        $ROCmGfxArch  = $Matches[1].Trim()
+                        $ROCmGfxArch  = ($Matches[1] -split ':')[0].Trim().ToLower()
                         $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
                     } else {
                         $ROCmGpuLabel = "AMD ROCm"
diff --git a/install.sh b/install.sh
index c52f2669a9..3ea19df56c 100755
--- a/install.sh
+++ b/install.sh
@@ -1647,14 +1647,14 @@ get_torch_index_url() {
             # PyTorch publishes major.minor URLs only (no patch level), so
             # rocm7.2.1 / rocm6.0.2 / etc. must normalise to rocm7.2 / rocm6.0.
             case "$_rocm_tag" in
-                rocm6.0*) echo "$_base/rocm6.0" ;;
-                rocm6.1*) echo "$_base/rocm6.1" ;;
-                rocm6.2*) echo "$_base/rocm6.2" ;;
-                rocm6.3*) echo "$_base/rocm6.3" ;;
-                rocm6.4*) echo "$_base/rocm6.4" ;;
-                rocm7.0*) echo "$_base/rocm7.0" ;;
-                rocm7.1*) echo "$_base/rocm7.1" ;;
-                rocm7.2*) echo "$_base/rocm7.2" ;;
+                rocm6.0|rocm6.0.*) echo "$_base/rocm6.0" ;;
+                rocm6.1|rocm6.1.*) echo "$_base/rocm6.1" ;;
+                rocm6.2|rocm6.2.*) echo "$_base/rocm6.2" ;;
+                rocm6.3|rocm6.3.*) echo "$_base/rocm6.3" ;;
+                rocm6.4|rocm6.4.*) echo "$_base/rocm6.4" ;;
+                rocm7.0|rocm7.0.*) echo "$_base/rocm7.0" ;;
+                rocm7.1|rocm7.1.*) echo "$_base/rocm7.1" ;;
+                rocm7.2|rocm7.2.*) echo "$_base/rocm7.2" ;;
                 rocm6.*)
                     # ROCm 6.5+ (no published PyTorch wheels): clip down
                     # to the last supported 6.x wheel set.
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ac811d32e9..ddd1fff990 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2149,8 +2149,24 @@ def _hip_ver_at_least(major: int, minor: int) -> bool:
                     return False
                 try:
                     _parts = [int(x) for x in str(_hip_str).split(".")[:2]]
+                    if len(_parts) < 2:
+                        logger.warning(
+                            "Windows ROCm: torch.version.hip %r has fewer than "
+                            "two components; cannot compare against %d.%d",
+                            _hip_str,
+                            major,
+                            minor,
+                        )
+                        return False
                     return (_parts[0], _parts[1]) >= (major, minor)
-                except (ValueError, IndexError):
+                except ValueError:
+                    logger.warning(
+                        "Windows ROCm: could not parse torch.version.hip %r as "
+                        "a version number; assuming HIP < %d.%d",
+                        _hip_str,
+                        major,
+                        minor,
+                    )
                     return False
 
             # _grouped_mm HIP kernel was null on gfx1200 in ROCm ≤ 7.12,
diff --git a/studio/backend/main.py b/studio/backend/main.py
index fac528f1e1..482b31d689 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -88,6 +88,7 @@ def _ver_key(name: str) -> tuple:
         pass
     if _is_rocm_host and "BNB_ROCM_VERSION" not in os.environ:
         import glob as _glob
+        import logging as _logging
 
         _bnb_rocm_ver = None
         try:
@@ -99,14 +100,29 @@ def _ver_key(name: str) -> tuple:
                 _dlls = _glob.glob(os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll"))
                 import re as _re_bnb
 
-                for _dll in sorted(_dlls):
-                    _m = _re_bnb.search(r"libbitsandbytes_rocm(\d+)\.dll", _dll)
+                def _bnb_ver_key(p: str) -> int:
+                    _km = _re_bnb.search(r"rocm(\d+)", os.path.basename(p))
+                    return int(_km.group(1)) if _km else -1
+
+                for _dll in sorted(_dlls, key=_bnb_ver_key, reverse=True):
+                    _m = _re_bnb.search(
+                        r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(_dll)
+                    )
                     if _m:
                         _bnb_rocm_ver = _m.group(1)
                         break
-        except Exception:
-            pass
-        os.environ["BNB_ROCM_VERSION"] = _bnb_rocm_ver or "72"
+        except Exception as _e:
+            _logging.getLogger(__name__).warning(
+                "Windows ROCm: BNB DLL detection failed (%s); falling back to version '72'",
+                _e,
+            )
+        _bnb_rocm_ver_final = _bnb_rocm_ver or "72"
+        os.environ["BNB_ROCM_VERSION"] = _bnb_rocm_ver_final
+        _logging.getLogger(__name__).info(
+            "Windows ROCm: set BNB_ROCM_VERSION=%s "
+            "(detected from installed BNB wheel; overrides torch.version.hip auto-detection)",
+            _bnb_rocm_ver_final,
+        )
 
 # Ensure backend dir is on sys.path so _platform_compat is importable when
 # main.py is launched directly (e.g. `uvicorn main:app`).
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index bcacdb57ea..7804836fd2 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -383,7 +383,7 @@ def get_visible_gpu_utilization(
         )
         parsed_id = _parse_numeric(raw_id)
         if parsed_id is None:
-            logger.debug(
+            logger.warning(
                 "amd-smi GPU id %r could not be parsed; falling back to "
                 "enumeration index %d",
                 raw_id,
@@ -391,7 +391,15 @@ def get_visible_gpu_utilization(
             )
             idx = fallback_idx
         else:
-            idx = int(parsed_id)
+            rounded = round(parsed_id)
+            if rounded != parsed_id:
+                logger.warning(
+                    "amd-smi GPU id %r parsed as non-integer %r; truncating to %d",
+                    raw_id,
+                    parsed_id,
+                    rounded,
+                )
+            idx = int(rounded)
         if idx not in visible_set:
             continue
         metrics = _extract_gpu_metrics(gpu_data)
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 09c097a687..a81159324c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -467,7 +467,7 @@ def _smi_query(func_name: str, *args, **kwargs) -> Optional[Dict[str, Any]]:
     try:
         func = getattr(_backend, func_name)
         result = func(*args, **kwargs)
-        if result.get("available"):
+        if isinstance(result, dict) and result.get("available"):
             return result
     except Exception as e:
         logger.warning("%s %s query failed: %s", backend_name, func_name, e)
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 6ee20143ec..cddc252b86 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -715,7 +715,7 @@ if (-not $HasNvidiaSmi) {
             if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                 $HasROCm = $true
                 if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
-                    $script:ROCmGfxArch = $Matches[1].Trim()
+                    $script:ROCmGfxArch = ($Matches[1] -split ':')[0].Trim().ToLower()
                     $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
                 } else {
                     $ROCmGpuLabel = "AMD ROCm"
diff --git a/studio/setup.sh b/studio/setup.sh
index 0227bfcaf7..60af1e26b4 100755
--- a/studio/setup.sh
+++ b/studio/setup.sh
@@ -1043,15 +1043,16 @@ else
                 # runtime dir AND /usr/include/c++/<ver> headers, then pass it
                 # to clang via --gcc-install-dir so HIP builds succeed.
                 _GCC_INSTALL_DIR=""
+                _GCC_MULTIARCH="$(gcc -print-multiarch 2>/dev/null || uname -m)-linux-gnu"
                 for _gcc_ver in 14 13 12 11; do
-                    if [ -d "/usr/lib/gcc/x86_64-linux-gnu/$_gcc_ver/include" ] && \
+                    if [ -d "/usr/lib/gcc/$_GCC_MULTIARCH/$_gcc_ver/include" ] && \
                        [ -d "/usr/include/c++/$_gcc_ver" ]; then
-                        _GCC_INSTALL_DIR="/usr/lib/gcc/x86_64-linux-gnu/$_gcc_ver"
+                        _GCC_INSTALL_DIR="/usr/lib/gcc/$_GCC_MULTIARCH/$_gcc_ver"
                         break
                     fi
                 done
                 if [ -n "$_GCC_INSTALL_DIR" ]; then
-                    CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_HIP_FLAGS=--gcc-install-dir=$_GCC_INSTALL_DIR"
+                    CMAKE_ARGS="$CMAKE_ARGS -DCMAKE_HIP_FLAGS=--gcc-install-dir=\"$_GCC_INSTALL_DIR\""
                     substep "ROCm HIP gcc install dir: $_GCC_INSTALL_DIR"
                 fi
 

From 47fdc857f9d5eead28ab5a400f290747485cb1c9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 19 May 2026 19:36:42 +0000
Subject: [PATCH 113/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/studio/backend/main.py b/studio/backend/main.py
index 482b31d689..e177767e8e 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -104,7 +104,7 @@ def _bnb_ver_key(p: str) -> int:
                     _km = _re_bnb.search(r"rocm(\d+)", os.path.basename(p))
                     return int(_km.group(1)) if _km else -1
 
-                for _dll in sorted(_dlls, key=_bnb_ver_key, reverse=True):
+                for _dll in sorted(_dlls, key = _bnb_ver_key, reverse = True):
                     _m = _re_bnb.search(
                         r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(_dll)
                     )

From d663d12184605db848d2f57faeeb291f71a0f5c7 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 19 May 2026 16:40:07 -0500
Subject: [PATCH 114/165] fix(studio/training): GPU OOM guard to prevent system
 freeze on VRAM exhaustion

On RDNA 4 (gfx1200/gfx1201) and other ROCm GPUs, exhausting VRAM can
cause a HIP driver hang that freezes the entire system rather than
raising a recoverable Python exception.

Two-part fix:
- set_per_process_memory_fraction(0.90) caps the HIP/CUDA allocator at
  90% of VRAM so PyTorch raises OutOfMemoryError before hitting the
  hardware limit, keeping the driver alive and the system responsive
- top-level exception handler detects OOM errors by type and message
  and surfaces a clear actionable message to the UI (reduce
  max_seq_length, enable gradient_checkpointing, lower batch size)
  instead of the raw CUDA/HIP error string
---
 studio/backend/core/training/worker.py | 63 ++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ddd1fff990..4b4d3b03d6 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2264,6 +2264,29 @@ def _grouped_mm_safe_impl(
                     "skipping Python fallback (AMD fixed gfx1200 null kernel in ROCm 7.13)"
                 )
 
+    # ── 1g. ROCm/CUDA OOM guard ──
+    # On RDNA 4 (gfx1200/gfx1201) and other ROCm GPUs, exhausting VRAM can
+    # cause a HIP driver hang that freezes the entire system rather than
+    # raising a Python exception.  set_per_process_memory_fraction caps the
+    # HIP/CUDA allocator at 90% of available VRAM so PyTorch raises
+    # OutOfMemoryError before hitting the hardware limit, giving the UI a
+    # clean error message instead of a system freeze.
+    # Non-fatal: if torch is not importable here (CPU-only path) the guard
+    # is silently skipped and the training path will handle it later.
+    if _hw.DEVICE == _hw.DeviceType.CUDA:
+        try:
+            import torch as _torch_mem
+
+            if _torch_mem.cuda.is_available():
+                _torch_mem.cuda.set_per_process_memory_fraction(0.90)
+                logger.info(
+                    "GPU OOM guard: set_per_process_memory_fraction(0.90) — "
+                    "HIP/CUDA allocator will raise OutOfMemoryError before "
+                    "hitting the hardware VRAM limit"
+                )
+        except Exception as _oom_guard_err:
+            logger.debug("Could not set GPU memory fraction: %s", _oom_guard_err)
+
     # ── 2. Now import ML libraries (fresh in this clean process) ──
     try:
         _send_status(event_queue, "Importing Unsloth...")
@@ -2717,14 +2740,38 @@ def _monitor_tqdm():
             )
 
     except Exception as exc:
-        event_queue.put(
-            {
-                "type": "error",
-                "error": str(exc),
-                "stack": traceback.format_exc(limit = 20),
-                "ts": time.time(),
-            }
-        )
+        _exc_str = str(exc).lower()
+        _is_oom = (
+            "out of memory" in _exc_str
+            or "hip out of memory" in _exc_str
+            or "cuda out of memory" in _exc_str
+            or type(exc).__name__ == "OutOfMemoryError"
+        )
+        if _is_oom:
+            _oom_msg = (
+                "GPU ran out of VRAM during training.\n"
+                "To fix: reduce max_seq_length (e.g. 2048–4096), enable "
+                "gradient_checkpointing=True, lower per_device_train_batch_size, "
+                "or use a smaller model / higher quantization."
+            )
+            logger.error("Training stopped: GPU OOM — %s", exc)
+            event_queue.put(
+                {
+                    "type": "error",
+                    "error": _oom_msg,
+                    "stack": traceback.format_exc(limit = 20),
+                    "ts": time.time(),
+                }
+            )
+        else:
+            event_queue.put(
+                {
+                    "type": "error",
+                    "error": str(exc),
+                    "stack": traceback.format_exc(limit = 20),
+                    "ts": time.time(),
+                }
+            )
 
 
 def _send_status(event_queue: Any, message: str) -> None:

From 536a54df4264c1ca05d79e4eb81b37eccb9fa1d3 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 02:00:47 -0500
Subject: [PATCH 115/165] fix(studio/rocm): OOM guard ROCm-only + unified
 memory, multi-GPU arch selection

OOM guard (worker.py):
- Scope to _hw.IS_ROCM only -- NVIDIA CUDA has a graceful OOM path and
  does not need the allocator cap
- Detect unified memory by comparing torch VRAM against psutil system RAM;
  use 0.80 on unified-memory APUs (gfx1151 Strix Halo) where the GPU pool
  is carved from host RAM, 0.90 on discrete cards

Multi-GPU arch selection:
- install.ps1 / setup.ps1: replace -match (first hit only) with
  [regex]::Matches() to collect all gcnArchName entries, then index by
  HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES
- install_python_stack.py: index into the full, non-deduplicated token list
  so HIP_VISIBLE_DEVICES=2 on [gfx1100, gfx1100, gfx1151] resolves gfx1151
- install.sh: remove awk dedup from gfx token collection for same reason

GCC multiarch (setup.sh):
- Only append -linux-gnu when gcc -print-multiarch does not already return
  the full triple, fixing double-suffix on Ubuntu 24.04
---
 install.ps1                            |  6 ++++--
 install.sh                             |  6 +++---
 studio/backend/core/training/worker.py | 30 ++++++++++++++++----------
 studio/install_python_stack.py         |  8 +++----
 studio/setup.ps1                       |  6 ++++--
 studio/setup.sh                        |  6 +++++-
 6 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 52904e9296..6761eed35d 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1255,8 +1255,10 @@ shell.Run cmd, 0, False
                 $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
                 if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                     $HasROCm = $true
-                    if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
-                        $ROCmGfxArch  = ($Matches[1] -split ':')[0].Trim().ToLower()
+                    $_hipAllArches = [regex]::Matches($hipOut, "(?im)^\s*gcnArchName\s*:\s*(\S+)") | ForEach-Object { ($_.Groups[1].Value -split ':')[0].Trim().ToLower() }
+                    $_hipVisIdx = if ($env:HIP_VISIBLE_DEVICES -match '^\d') { [int]($env:HIP_VISIBLE_DEVICES -split ',')[0] } elseif ($env:ROCR_VISIBLE_DEVICES -match '^\d') { [int]($env:ROCR_VISIBLE_DEVICES -split ',')[0] } else { 0 }
+                    if ($_hipAllArches.Count -gt 0) {
+                        $ROCmGfxArch  = if ($_hipVisIdx -lt $_hipAllArches.Count) { $_hipAllArches[$_hipVisIdx] } else { $_hipAllArches[0] }
                         $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
                     } else {
                         $ROCmGpuLabel = "AMD ROCm"
diff --git a/install.sh b/install.sh
index 1c31d39aa0..199b83c626 100755
--- a/install.sh
+++ b/install.sh
@@ -1855,14 +1855,14 @@ case "$TORCH_INDEX_URL" in
         # Strix per-gfx index.
         _gfx_all=""
         if command -v rocminfo >/dev/null 2>&1; then
-            _gfx_all=$(rocminfo 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' | awk '!seen[$0]++')
+            _gfx_all=$(rocminfo 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}')
         fi
         if [ -z "$_gfx_all" ] && command -v amd-smi >/dev/null 2>&1; then
-            _gfx_all=$(amd-smi list 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' | awk '!seen[$0]++')
+            _gfx_all=$(amd-smi list 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}')
             # PowerShell paths also probe `amd-smi static --asic`; mirror it
             # so a host with hipinfo-less amd-smi reports the gfx target.
             if [ -z "$_gfx_all" ]; then
-                _gfx_all=$(amd-smi static --asic 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' | awk '!seen[$0]++')
+                _gfx_all=$(amd-smi static --asic 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}')
             fi
         fi
         _runtime_gfx=""
diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 4b4d3b03d6..ad421b8101 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2264,25 +2264,33 @@ def _grouped_mm_safe_impl(
                     "skipping Python fallback (AMD fixed gfx1200 null kernel in ROCm 7.13)"
                 )
 
-    # ── 1g. ROCm/CUDA OOM guard ──
+    # ── 1g. ROCm OOM guard ──
     # On RDNA 4 (gfx1200/gfx1201) and other ROCm GPUs, exhausting VRAM can
     # cause a HIP driver hang that freezes the entire system rather than
     # raising a Python exception.  set_per_process_memory_fraction caps the
-    # HIP/CUDA allocator at 90% of available VRAM so PyTorch raises
-    # OutOfMemoryError before hitting the hardware limit, giving the UI a
-    # clean error message instead of a system freeze.
-    # Non-fatal: if torch is not importable here (CPU-only path) the guard
-    # is silently skipped and the training path will handle it later.
-    if _hw.DEVICE == _hw.DeviceType.CUDA:
+    # HIP allocator so PyTorch raises OutOfMemoryError before hitting the
+    # hardware limit, giving the UI a clean error instead of a system freeze.
+    # Only applied on ROCm -- NVIDIA CUDA has a graceful OOM path and does
+    # not need this cap.
+    # Unified-memory APUs (gfx1150/gfx1151 Strix Halo) share GPU and system
+    # RAM in one pool: 0.90 of 128 GB starves the OS. Use 0.80 there.
+    # Non-fatal: silently skipped if torch is not importable.
+    if _hw.IS_ROCM:
         try:
             import torch as _torch_mem
+            import psutil as _psutil
 
             if _torch_mem.cuda.is_available():
-                _torch_mem.cuda.set_per_process_memory_fraction(0.90)
+                _vram_total = _torch_mem.cuda.get_device_properties(0).total_memory
+                _ram_total = _psutil.virtual_memory().total
+                _is_unified = _vram_total > (_ram_total * 0.5)
+                _mem_fraction = 0.80 if _is_unified else 0.90
+                _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(
-                    "GPU OOM guard: set_per_process_memory_fraction(0.90) — "
-                    "HIP/CUDA allocator will raise OutOfMemoryError before "
-                    "hitting the hardware VRAM limit"
+                    "ROCm OOM guard: set_per_process_memory_fraction(%.2f) — "
+                    "%s memory host",
+                    _mem_fraction,
+                    "unified" if _is_unified else "discrete",
                 )
         except Exception as _oom_guard_err:
             logger.debug("Could not set GPU memory fraction: %s", _oom_guard_err)
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index eb2f9b763d..bef5245824 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -271,11 +271,9 @@ def _detect_windows_gfx_arch() -> str | None:
     def _dedup_pick(tokens: list[str]) -> "str | None":
         if not tokens:
             return None
-        _seen: list[str] = []
-        for _t in tokens:
-            if _t not in _seen:
-                _seen.append(_t)
-        return _seen[_pick_visible_index(len(_seen))]
+        # Index into the full (ordered) list first so HIP_VISIBLE_DEVICES
+        # correctly addresses GPU N on mixed-arch hosts, then return that arch.
+        return tokens[_pick_visible_index(len(tokens))]
 
     # 2. hipinfo via PATH, then HIP_PATH\bin / ROCM_PATH\bin.
     hipinfo = shutil.which("hipinfo")
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 9fe78ff38b..5c71b9b5a8 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -714,8 +714,10 @@ if (-not $HasNvidiaSmi) {
             $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
             if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                 $HasROCm = $true
-                if ($hipOut -match "(?im)^\s*gcnArchName\s*:\s*(\S+)") {
-                    $script:ROCmGfxArch = ($Matches[1] -split ':')[0].Trim().ToLower()
+                $_hipAllArches = [regex]::Matches($hipOut, "(?im)^\s*gcnArchName\s*:\s*(\S+)") | ForEach-Object { ($_.Groups[1].Value -split ':')[0].Trim().ToLower() }
+                $_hipVisIdx = if ($env:HIP_VISIBLE_DEVICES -match '^\d') { [int]($env:HIP_VISIBLE_DEVICES -split ',')[0] } elseif ($env:ROCR_VISIBLE_DEVICES -match '^\d') { [int]($env:ROCR_VISIBLE_DEVICES -split ',')[0] } else { 0 }
+                if ($_hipAllArches.Count -gt 0) {
+                    $script:ROCmGfxArch = if ($_hipVisIdx -lt $_hipAllArches.Count) { $_hipAllArches[$_hipVisIdx] } else { $_hipAllArches[0] }
                     $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
                 } else {
                     $ROCmGpuLabel = "AMD ROCm"
diff --git a/studio/setup.sh b/studio/setup.sh
index 60af1e26b4..c56672ba52 100755
--- a/studio/setup.sh
+++ b/studio/setup.sh
@@ -1043,7 +1043,11 @@ else
                 # runtime dir AND /usr/include/c++/<ver> headers, then pass it
                 # to clang via --gcc-install-dir so HIP builds succeed.
                 _GCC_INSTALL_DIR=""
-                _GCC_MULTIARCH="$(gcc -print-multiarch 2>/dev/null || uname -m)-linux-gnu"
+                _gcc_pm="$(gcc -print-multiarch 2>/dev/null)"
+                case "$_gcc_pm" in
+                    *-linux-gnu*) _GCC_MULTIARCH="$_gcc_pm" ;;
+                    *) _GCC_MULTIARCH="$(uname -m)-linux-gnu" ;;
+                esac
                 for _gcc_ver in 14 13 12 11; do
                     if [ -d "/usr/lib/gcc/$_GCC_MULTIARCH/$_gcc_ver/include" ] && \
                        [ -d "/usr/include/c++/$_gcc_ver" ]; then

From ec021a03d1b07af47a2d08b5bd930709d811d469 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 02:06:33 -0500
Subject: [PATCH 116/165] fix(tests): update ROCm version cap expectations from
 rocm7.1 to rocm7.2

Daniel's normalisation commit updated the cap from rocm7.1 to rocm7.2
since PyTorch now publishes that index and rocm7.2 ships torch 2.11.0.
Test expectations were stale.
---
 tests/sh/test_get_torch_index_url.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/sh/test_get_torch_index_url.sh b/tests/sh/test_get_torch_index_url.sh
index 7235873f53..0961363730 100755
--- a/tests/sh/test_get_torch_index_url.sh
+++ b/tests/sh/test_get_torch_index_url.sh
@@ -170,10 +170,10 @@ _result=$(run_func "$_dir")
 assert_eq "ROCm 7.1 -> rocm7.1" "https://download.pytorch.org/whl/rocm7.1" "$_result"
 rm -rf "$_dir"
 
-# 11) ROCm 7.2 (no nvidia-smi) -> rocm7.1 (capped due to torch <2.11.0)
+# 11) ROCm 7.2 (no nvidia-smi) -> rocm7.2
 _dir=$(make_mock_amd_smi "7.2")
 _result=$(run_func "$_dir")
-assert_eq "ROCm 7.2 -> rocm7.1 (capped)" "https://download.pytorch.org/whl/rocm7.1" "$_result"
+assert_eq "ROCm 7.2 -> rocm7.2" "https://download.pytorch.org/whl/rocm7.2" "$_result"
 rm -rf "$_dir"
 
 # 12) Both nvidia-smi and amd-smi present -> CUDA takes precedence
@@ -208,10 +208,10 @@ _result=$(run_func "$_dir")
 assert_eq "ROCm 7.0 -> rocm7.0" "https://download.pytorch.org/whl/rocm7.0" "$_result"
 rm -rf "$_dir"
 
-# 17) ROCm 8.0 (future, no nvidia-smi) -> rocm7.1 (capped)
+# 17) ROCm 8.0 (future, no nvidia-smi) -> rocm7.2 (capped to latest known)
 _dir=$(make_mock_amd_smi "8.0")
 _result=$(run_func "$_dir")
-assert_eq "ROCm 8.0 -> rocm7.1 (capped)" "https://download.pytorch.org/whl/rocm7.1" "$_result"
+assert_eq "ROCm 8.0 -> rocm7.2 (capped)" "https://download.pytorch.org/whl/rocm7.2" "$_result"
 rm -rf "$_dir"
 
 # 18) Malformed amd-smi output (empty version field) -> cpu

From 90f6cd417d2f60c09f7d9c26c5a2a3e990861a1a Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 02:10:27 -0500
Subject: [PATCH 117/165] fix(tests): correct MLX smoke test losses_per_step
 assertion

logging_steps=1 with max_steps=30 produces 30 loss entries, not 7.
The assertion was stale from a previous config.
---
 tests/studio/run_real_mlx_smoke.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index 27f682ee4e..9a18313152 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -390,7 +390,7 @@ def _on_step(
         )
         if k in train_result
     }
-    assert len(losses_per_step) == 7, f"expected 7 logged steps, got {losses_per_step}"
+    assert len(losses_per_step) == 30, f"expected 30 logged steps (logging_steps=1, max_steps=30), got {losses_per_step}"
     for i, l in enumerate(losses_per_step):
         # Allow exact 0.0: fp16 per-step loss underflows to 0.0 after
         # the LoRA reaches loss=0 around step ~10 with this fixture +

From 792c3a02a21e5075618ea5813e832b2b1921abe8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 07:13:59 +0000
Subject: [PATCH 118/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 tests/studio/run_real_mlx_smoke.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/studio/run_real_mlx_smoke.py b/tests/studio/run_real_mlx_smoke.py
index 9a18313152..75f525b014 100644
--- a/tests/studio/run_real_mlx_smoke.py
+++ b/tests/studio/run_real_mlx_smoke.py
@@ -390,7 +390,9 @@ def _on_step(
         )
         if k in train_result
     }
-    assert len(losses_per_step) == 30, f"expected 30 logged steps (logging_steps=1, max_steps=30), got {losses_per_step}"
+    assert (
+        len(losses_per_step) == 30
+    ), f"expected 30 logged steps (logging_steps=1, max_steps=30), got {losses_per_step}"
     for i, l in enumerate(losses_per_step):
         # Allow exact 0.0: fp16 per-step loss underflows to 0.0 after
         # the LoRA reaches loss=0 around step ~10 with this fixture +

From 5d84704009d72b3722098a29449edbae40b13eaf Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 02:50:07 -0500
Subject: [PATCH 119/165] fix(studio/worker): detect unified-memory APU by GPU
 name not VRAM/RAM ratio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous heuristic (VRAM > 50 % of system RAM) false-positived on discrete
cards in low-RAM systems — e.g. RX 9060 XT 16 GB on a 16 GB or 24 GB machine
would trip the unified-memory path and log "unified memory host" when it should
say "discrete".

AMD iGPUs (gfx1150/gfx1151 Strix Halo, Strix Point, etc.) expose names with a
digit+M suffix ("AMD Radeon 890M"), while discrete cards use "RX NNNN [XT|XTX]"
naming.  Matching that suffix does not require psutil, but it is a naming
heuristic rather than an exhaustive rule: APUs whose names end in another
letter (e.g. "Radeon 8060S") do not match and are treated as discrete.

Also includes the device name in the log line to ease future debugging.
---
 studio/backend/core/training/worker.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index ad421b8101..adb78c4704 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2277,20 +2277,23 @@ def _grouped_mm_safe_impl(
     # Non-fatal: silently skipped if torch is not importable.
     if _hw.IS_ROCM:
         try:
+            import re as _re
             import torch as _torch_mem
-            import psutil as _psutil
 
             if _torch_mem.cuda.is_available():
-                _vram_total = _torch_mem.cuda.get_device_properties(0).total_memory
-                _ram_total = _psutil.virtual_memory().total
-                _is_unified = _vram_total > (_ram_total * 0.5)
+                # iGPUs (gfx1150/gfx1151 Strix Halo, Strix Point, etc.) report
+                # names ending in a digit+M suffix ("AMD Radeon 890M").
+                # Discrete cards use "RX NNNN [XT|XTX]" — no trailing M.
+                _dev_name = _torch_mem.cuda.get_device_properties(0).name
+                _is_unified = bool(_re.search(r'\d[Mm]\b', _dev_name))
                 _mem_fraction = 0.80 if _is_unified else 0.90
                 _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(
                     "ROCm OOM guard: set_per_process_memory_fraction(%.2f) — "
-                    "%s memory host",
+                    "%s memory host (%s)",
                     _mem_fraction,
                     "unified" if _is_unified else "discrete",
+                    _dev_name,
                 )
         except Exception as _oom_guard_err:
             logger.debug("Could not set GPU memory fraction: %s", _oom_guard_err)

From 67ab0a6699c8f033c7e5a8b12f73778e5f43020e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 May 2026 08:04:58 +0000
Subject: [PATCH 120/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index adb78c4704..212d6a2bf3 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2285,7 +2285,7 @@ def _grouped_mm_safe_impl(
                 # names ending in a digit+M suffix ("AMD Radeon 890M").
                 # Discrete cards use "RX NNNN [XT|XTX]" — no trailing M.
                 _dev_name = _torch_mem.cuda.get_device_properties(0).name
-                _is_unified = bool(_re.search(r'\d[Mm]\b', _dev_name))
+                _is_unified = bool(_re.search(r"\d[Mm]\b", _dev_name))
                 _mem_fraction = 0.80 if _is_unified else 0.90
                 _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(

From 32457939298a08f58ae25bd6bc224aebf4cb8d0b Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 03:09:48 -0500
Subject: [PATCH 121/165] fix(install/setup.ps1): force array on hipinfo
 gcnArchName parse to fix single-GPU arch truncation

When [regex]::Matches() finds exactly one match, PowerShell's pipeline
unwraps the result to a scalar string.  Indexing a scalar string with [0]
returns the first *character*, so a one-GPU system would parse
gcnArchName "gfx1200" as "g", which is not in the supported arch map
and triggers the CPU-only fallback.

Wrapping with @() forces the result to remain an array regardless of
match count.  On a single-GPU machine the arch is now correctly read as
"gfx1200" (or whatever the full name is) so the ROCm wheel index is
selected.

Reproducer: hipinfo exits 0 and outputs exactly one gcnArchName line.
Without @(), $_hipAllArches = "gfx1200" (String); $_hipAllArches[0] = 'g'.
With @(), $_hipAllArches = @("gfx1200") (Object[]); $_hipAllArches[0] = "gfx1200".
---
 install.ps1      | 2 +-
 studio/setup.ps1 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 6761eed35d..4e8efaaf86 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1255,7 +1255,7 @@ shell.Run cmd, 0, False
                 $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
                 if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                     $HasROCm = $true
-                    $_hipAllArches = [regex]::Matches($hipOut, "(?im)^\s*gcnArchName\s*:\s*(\S+)") | ForEach-Object { ($_.Groups[1].Value -split ':')[0].Trim().ToLower() }
+                    $_hipAllArches = @([regex]::Matches($hipOut, "(?im)^\s*gcnArchName\s*:\s*(\S+)") | ForEach-Object { ($_.Groups[1].Value -split ':')[0].Trim().ToLower() })
                     $_hipVisIdx = if ($env:HIP_VISIBLE_DEVICES -match '^\d') { [int]($env:HIP_VISIBLE_DEVICES -split ',')[0] } elseif ($env:ROCR_VISIBLE_DEVICES -match '^\d') { [int]($env:ROCR_VISIBLE_DEVICES -split ',')[0] } else { 0 }
                     if ($_hipAllArches.Count -gt 0) {
                         $ROCmGfxArch  = if ($_hipVisIdx -lt $_hipAllArches.Count) { $_hipAllArches[$_hipVisIdx] } else { $_hipAllArches[0] }
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 5c71b9b5a8..42de25a9d0 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -714,7 +714,7 @@ if (-not $HasNvidiaSmi) {
             $hipOut = & $hipinfoExe.Source 2>&1 | Out-String
             if ($LASTEXITCODE -eq 0 -and $hipOut -match "(?i)gcnArchName") {
                 $HasROCm = $true
-                $_hipAllArches = [regex]::Matches($hipOut, "(?im)^\s*gcnArchName\s*:\s*(\S+)") | ForEach-Object { ($_.Groups[1].Value -split ':')[0].Trim().ToLower() }
+                $_hipAllArches = @([regex]::Matches($hipOut, "(?im)^\s*gcnArchName\s*:\s*(\S+)") | ForEach-Object { ($_.Groups[1].Value -split ':')[0].Trim().ToLower() })
                 $_hipVisIdx = if ($env:HIP_VISIBLE_DEVICES -match '^\d') { [int]($env:HIP_VISIBLE_DEVICES -split ',')[0] } elseif ($env:ROCR_VISIBLE_DEVICES -match '^\d') { [int]($env:ROCR_VISIBLE_DEVICES -split ',')[0] } else { 0 }
                 if ($_hipAllArches.Count -gt 0) {
                     $script:ROCmGfxArch = if ($_hipVisIdx -lt $_hipAllArches.Count) { $_hipAllArches[$_hipVisIdx] } else { $_hipAllArches[0] }

From 9c50ee3e3a646e7294668fb0f12e058aa10ceedf Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 13:22:26 -0500
Subject: [PATCH 122/165] fix(studio/rocm): classify unified-memory APU via
 VRAM/RAM ratio, not arch list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

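Replace the part-specific unified-memory detection (the gfx1150/gfx1151
assumption in the comments and the device-name regex in the code) with a
psutil-based heuristic: unified APUs expose the entire system RAM as the
HIP pool (VRAM/RAM ratio ≥ 0.90), while discrete cards are normally well
below that.  No arch name is required — future APUs classify correctly
without code changes.  Caveat: a discrete card whose VRAM roughly equals
system RAM (e.g. a 16 GB card on a 16 GB host) also meets the threshold
and gets the more conservative 0.80 cap.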

Also removes the stale import re / \d[Mm]\b device-name regex that
5d84704 left behind, and logs vram/sys GiB for easier on-hardware
verification.

Addresses h34v3nzc0dex review: Radeon 8060S (gfx1151, 128 GiB
unified) now correctly gets 0.80 cap instead of 0.90.
---
 studio/backend/core/training/worker.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 212d6a2bf3..6bd171d875 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2272,28 +2272,36 @@ def _grouped_mm_safe_impl(
     # hardware limit, giving the UI a clean error instead of a system freeze.
     # Only applied on ROCm -- NVIDIA CUDA has a graceful OOM path and does
     # not need this cap.
-    # Unified-memory APUs (gfx1150/gfx1151 Strix Halo) share GPU and system
-    # RAM in one pool: 0.90 of 128 GB starves the OS. Use 0.80 there.
+    # Unified-memory APUs share GPU and system RAM in one pool; 0.90 of 128 GB
+    # starves the OS. Detected by comparing torch VRAM to psutil system RAM
+    # (ratio ≥ 0.90 → unified → use 0.80; discrete cards are far below that).
     # Non-fatal: silently skipped if torch is not importable.
     if _hw.IS_ROCM:
         try:
-            import re as _re
+            import psutil as _psutil
             import torch as _torch_mem
 
             if _torch_mem.cuda.is_available():
-                # iGPUs (gfx1150/gfx1151 Strix Halo, Strix Point, etc.) report
-                # names ending in a digit+M suffix ("AMD Radeon 890M").
-                # Discrete cards use "RX NNNN [XT|XTX]" — no trailing M.
+                # Unified-memory APUs (e.g. Strix Halo gfx1151) expose the
+                # entire system RAM as the GPU pool, so torch total_memory ≈
+                # psutil total RAM (ratio ≥ 0.90).  Discrete cards are always
+                # well below that (16 GB card on 24 GB host → ~0.67).
+                # This threshold is arch-name-agnostic: any future APU that
+                # shares system RAM will classify correctly without a code change.
                 _dev_name = _torch_mem.cuda.get_device_properties(0).name
-                _is_unified = bool(_re.search(r"\d[Mm]\b", _dev_name))
+                _vram = _torch_mem.cuda.get_device_properties(0).total_memory
+                _sys_ram = _psutil.virtual_memory().total
+                _is_unified = _vram >= 0.90 * _sys_ram
                 _mem_fraction = 0.80 if _is_unified else 0.90
                 _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(
                     "ROCm OOM guard: set_per_process_memory_fraction(%.2f) — "
-                    "%s memory host (%s)",
+                    "%s memory host (%s, vram=%.1f GiB, sys=%.1f GiB)",
                     _mem_fraction,
                     "unified" if _is_unified else "discrete",
                     _dev_name,
+                    _vram / 2**30,
+                    _sys_ram / 2**30,
                 )
         except Exception as _oom_guard_err:
             logger.debug("Could not set GPU memory fraction: %s", _oom_guard_err)

From 9393fffe1731f80ea9eb5d436107af54b65dcd42 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 13:26:21 -0500
Subject: [PATCH 123/165] fix(studio/rocm): revert to gcnArchName for
 unified-memory APU classification
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VRAM/RAM ratio >= 0.90 false-positives on machines where discrete VRAM
equals system RAM (e.g. RX 9060 XT 16 GB + 16 GB system RAM → ratio 1.0,
incorrectly classified as unified → wrong 0.80 cap applied).

gcnArchName is the correct signal: independent of marketing device
names, stable within a product family, and already parsed throughout
this PR. The unified set is {gfx1150, gfx1151} (Strix Point + Strix Halo).
---
 studio/backend/core/training/worker.py | 33 +++++++++++++-------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 6bd171d875..614a36edf9 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2272,36 +2272,35 @@ def _grouped_mm_safe_impl(
     # hardware limit, giving the UI a clean error instead of a system freeze.
     # Only applied on ROCm -- NVIDIA CUDA has a graceful OOM path and does
     # not need this cap.
-    # Unified-memory APUs share GPU and system RAM in one pool; 0.90 of 128 GB
-    # starves the OS. Detected by comparing torch VRAM to psutil system RAM
-    # (ratio ≥ 0.90 → unified → use 0.80; discrete cards are far below that).
+    # Unified-memory APUs (gfx1150 Strix Point / gfx1151 Strix Halo) share GPU
+    # and system RAM in one pool: 0.90 of 128 GB starves the OS. Use 0.80 there.
+    # Classified via gcnArchName — naming-independent and already parsed
+    # throughout this PR.
     # Non-fatal: silently skipped if torch is not importable.
     if _hw.IS_ROCM:
         try:
-            import psutil as _psutil
             import torch as _torch_mem
 
             if _torch_mem.cuda.is_available():
-                # Unified-memory APUs (e.g. Strix Halo gfx1151) expose the
-                # entire system RAM as the GPU pool, so torch total_memory ≈
-                # psutil total RAM (ratio ≥ 0.90).  Discrete cards are always
-                # well below that (16 GB card on 24 GB host → ~0.67).
-                # This threshold is arch-name-agnostic: any future APU that
-                # shares system RAM will classify correctly without a code change.
-                _dev_name = _torch_mem.cuda.get_device_properties(0).name
-                _vram = _torch_mem.cuda.get_device_properties(0).total_memory
-                _sys_ram = _psutil.virtual_memory().total
-                _is_unified = _vram >= 0.90 * _sys_ram
+                # Classify unified vs discrete by gcnArchName, not by device
+                # marketing name or VRAM/RAM ratio.  Name regexes miss "8060S";
+                # ratio (vram >= 0.90 * sys_ram) false-positives on machines
+                # where discrete VRAM == system RAM (e.g. 16 GB card + 16 GB RAM).
+                # gcnArchName is stable within a product family and is already
+                # parsed throughout this PR.
+                _props = _torch_mem.cuda.get_device_properties(0)
+                _dev_name = _props.name
+                _gcn_arch = (getattr(_props, "gcnArchName", "") or "").split(":")[0]
+                _is_unified = _gcn_arch in {"gfx1150", "gfx1151"}
                 _mem_fraction = 0.80 if _is_unified else 0.90
                 _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(
                     "ROCm OOM guard: set_per_process_memory_fraction(%.2f) — "
-                    "%s memory host (%s, vram=%.1f GiB, sys=%.1f GiB)",
+                    "%s memory host (%s, %s)",
                     _mem_fraction,
                     "unified" if _is_unified else "discrete",
                     _dev_name,
-                    _vram / 2**30,
-                    _sys_ram / 2**30,
+                    _gcn_arch or "unknown arch",
                 )
         except Exception as _oom_guard_err:
             logger.debug("Could not set GPU memory fraction: %s", _oom_guard_err)

From 86d8ff0f32750d54ec0f03c1eaf15229d07d75f5 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 14:53:25 -0500
Subject: [PATCH 124/165] fix(studio/llama-prebuilt): resolve hipinfo via
 HIP_PATH/ROCM_PATH on Windows

shutil.which("hipinfo") returns None when the HIP SDK bin dir is not on
PATH -- the HIP SDK installer sets HIP_PATH/ROCM_PATH but does not always
add the bin dir to PATH. This caused has_rocm=False in the prebuilt asset
selector, so AMD ROCm machines got the CPU llama.cpp zip instead of the
HIP one, silently running all chat inference on CPU.

Add _resolve_exe() that falls back to %HIP_PATH%\bin and %ROCM_PATH%\bin
when shutil.which() finds nothing, mirroring the same fallback already
present in setup.ps1.
---
 studio/install_llama_prebuilt.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index dd44f19691..cd4a7d7c61 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -2688,12 +2688,30 @@ def _amd_smi_has_gpu(stdout: str) -> bool:
                     has_rocm = True
                     break
     elif is_windows:
-        # Windows: prefer active probes that validate GPU presence
+        # Windows: prefer active probes that validate GPU presence.
+        # hipinfo / amd-smi are often NOT on PATH -- the HIP SDK installer
+        # sets HIP_PATH / ROCM_PATH but does not always add the bin dir to
+        # the system PATH.  Mirror setup.ps1's fallback: check the env-var
+        # bin dirs before giving up so that `has_rocm` is not silently False
+        # on machines where the PATH is not yet updated.
+        def _resolve_exe(name: str) -> str | None:
+            """Return full path to `name`, checking PATH then HIP_PATH/ROCM_PATH bin."""
+            found = shutil.which(name)
+            if found:
+                return found
+            for _env in ("HIP_PATH", "ROCM_PATH"):
+                _root = os.environ.get(_env)
+                if _root:
+                    _candidate = os.path.join(_root, "bin", f"{name}.exe")
+                    if os.path.isfile(_candidate):
+                        return _candidate
+            return None
+
         for _cmd, _check in (
             (["hipinfo"], lambda out: "gcnarchname" in out.lower()),
             (["amd-smi", "list"], _amd_smi_has_gpu),
         ):
-            _exe = shutil.which(_cmd[0])
+            _exe = _resolve_exe(_cmd[0])
             if not _exe:
                 continue
             try:

From 143f6f3e75e3bfddd39600815c88795ee7cd5dc1 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 15:04:38 -0500
Subject: [PATCH 125/165] fix(studio/llama-prebuilt): pass --has-rocm from
 setup.ps1 to skip re-detection

The Python prebuilt installer re-detects ROCm independently via
shutil.which("hipinfo"), which fails when hipinfo is not on PATH
(HIP SDK sets HIP_PATH but doesn't always add the bin dir to PATH).
This caused has_rocm=False and downloaded the CPU llama.cpp zip even
on confirmed AMD ROCm machines.

setup.ps1 already performs reliable ROCm detection with its own
HIP_PATH/ROCM_PATH fallback. Add a --has-rocm flag to
install_llama_prebuilt.py so setup.ps1 can forward its result directly,
and pass it whenever $HasROCm is true. The Python script still runs
detect_host(), but forces has_rocm=True on the resulting HostInfo
whenever its own probe came back negative.
---
 studio/install_llama_prebuilt.py | 17 ++++++++++++++++-
 studio/setup.ps1                 |  3 +++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index cd4a7d7c61..fd86e3d4e6 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -29,7 +29,7 @@
 import urllib.request
 import zipfile
 from contextlib import contextmanager
-from dataclasses import dataclass, field
+from dataclasses import dataclass, field, replace as dataclasses_replace
 
 try:
     from filelock import FileLock, Timeout as FileLockTimeout
@@ -5532,8 +5532,11 @@ def install_prebuilt(
     published_release_tag: str,
     *,
     simple_policy: bool = False,
+    override_has_rocm: bool = False,
 ) -> None:
     host = detect_host()
+    if override_has_rocm and not host.has_rocm:
+        host = dataclasses_replace(host, has_rocm = True)
     choice: AssetChoice | None = None
     try:
         with install_lock(install_lock_path(install_dir)):
@@ -5667,6 +5670,17 @@ def parse_args() -> argparse.Namespace:
         action = "store_true",
         help = "Use the simplified platform-specific prebuilt selection policy.",
     )
+    parser.add_argument(
+        "--has-rocm",
+        action = "store_true",
+        default = False,
+        help = (
+            "Assert that an AMD ROCm GPU is present. When set, skips the internal "
+            "hipinfo/amd-smi probe and forces has_rocm=True in the host profile. "
+            "Used by setup.ps1/setup.sh to forward their own ROCm detection result "
+            "so the HIP llama.cpp prebuilt is selected even when hipinfo is not on PATH."
+        ),
+    )
     resolve_group = parser.add_mutually_exclusive_group()
     resolve_group.add_argument(
         "--resolve-llama-tag",
@@ -5787,6 +5801,7 @@ def main() -> int:
         published_repo = args.published_repo,
         published_release_tag = args.published_release_tag or "",
         simple_policy = args.simple_policy,
+        override_has_rocm = args.has_rocm,
     )
     return EXIT_SUCCESS
 
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 42de25a9d0..bd60be045f 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -2378,6 +2378,9 @@ if ($env:UNSLOTH_LLAMA_FORCE_COMPILE -eq "1") {
             "--published-repo", $HelperReleaseRepo,
             "--simple-policy"
         )
+        if ($HasROCm) {
+            $prebuiltArgs += "--has-rocm"
+        }
         if ($env:UNSLOTH_LLAMA_RELEASE_TAG) {
             $prebuiltArgs += @("--published-release-tag", $env:UNSLOTH_LLAMA_RELEASE_TAG)
         }

From d0864e81c2b880e7f83da456e2dccae0947a39c0 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 15:12:38 -0500
Subject: [PATCH 126/165] fix(studio/llama-prebuilt): add HIP asset to
 simple-policy Windows path

direct_upstream_release_plan (used by --simple-policy, which setup.ps1
always passes) only checked has_usable_nvidia on Windows and fell
straight to CPU for AMD ROCm machines, ignoring has_rocm entirely.
The --has-rocm override had no effect because the simple-policy code
path never reached resolve_asset_choice where has_rocm was checked.

Add an elif branch for has_rocm that tries the upstream HIP asset
(llama-TAG-bin-win-hip-radeon-x64.zip) before falling through to the
CPU fallback, consistent with the non-simple-policy path.
---
 studio/install_llama_prebuilt.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index fd86e3d4e6..1e7c908752 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -1336,6 +1336,20 @@ def direct_upstream_release_plan(
                     torch_preference.selection_log,
                 )
             )
+        elif host.has_rocm:
+            hip_asset = f"llama-{release_tag}-bin-win-hip-radeon-x64.zip"
+            hip_url = assets.get(hip_asset)
+            if hip_url:
+                attempts.append(
+                    AssetChoice(
+                        repo = repo,
+                        tag = release_tag,
+                        name = hip_asset,
+                        url = hip_url,
+                        source_label = "upstream",
+                        install_kind = "windows-hip",
+                    )
+                )
         cpu_asset = f"llama-{release_tag}-bin-win-cpu-x64.zip"
         cpu_url = assets.get(cpu_asset)
         if cpu_url:

From a0baf8f4f0048dc82967b4adfa5f5de35a50f65c Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 15:14:33 -0500
Subject: [PATCH 127/165] fix(studio/setup.ps1): auto-remove mismatched
 llama.cpp install kind

When an existing llama.cpp install is the wrong kind for the current
GPU (e.g. windows-cpu on an AMD ROCm machine that should have
windows-hip), the prebuilt installer skips on tag match and never
upgrades. Read install_kind from UNSLOTH_PREBUILT_INFO.json before
invoking the installer and remove the directory if the kind doesn't
match, forcing a fresh download of the correct variant.
---
 studio/setup.ps1 | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/studio/setup.ps1 b/studio/setup.ps1
index bd60be045f..01d5c7bd02 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -2364,6 +2364,23 @@ if ($env:UNSLOTH_LLAMA_FORCE_COMPILE -eq "1") {
     substep "installing prebuilt llama.cpp bundle (preferred path)..."
     if (Test-Path -LiteralPath $LlamaCppDir) {
         substep "Existing llama.cpp install detected -- validating staged prebuilt update before replacement"
+        # If the existing install is the wrong kind (e.g. windows-cpu on a ROCm
+        # machine that should have windows-hip), remove it so the installer is
+        # forced to download the correct variant rather than skipping on tag match.
+        $existingMetaPath = Join-Path $LlamaCppDir "UNSLOTH_PREBUILT_INFO.json"
+        if (Test-Path $existingMetaPath) {
+            try {
+                $existingMeta = Get-Content $existingMetaPath -Raw | ConvertFrom-Json
+                $existingKind = $existingMeta.install_kind
+                $expectedKind = if ($HasROCm) { "windows-hip" } else { "windows-cpu" }
+                if ($existingKind -and $existingKind -ne $expectedKind) {
+                    substep "Removing mismatched llama.cpp install (found '$existingKind', need '$expectedKind')..."
+                    Remove-Item -Recurse -Force -LiteralPath $LlamaCppDir -ErrorAction SilentlyContinue
+                }
+            } catch {
+                # unreadable metadata -- let the installer handle it
+            }
+        }
     }
     # why: install_llama_prebuilt.py uses os.replace(), which would displace
     # an unrelated $env:UNSLOTH_STUDIO_HOME\llama.cpp before the source-build

From 2bca6ee95df8f87f02a5bd343c2ff5d2e0aeb863 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 15:20:14 -0500
Subject: [PATCH 128/165] fix(studio/setup.ps1): show live PyTorch install
 output in verbose mode for ROCm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ROCm torch reinstall (setup.ps1 phase) always silently captured
output, so in --verbose mode the torch downgrade mid-install
(2.11.0+rocm → 2.10.0 → 2.11.0+rocm) looked like the final state was
2.10.0. Match the CPU/CUDA blocks which show live uv output when
$script:UnslothVerbose is set.
---
 studio/setup.ps1 | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 01d5c7bd02..3e13523ce2 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -2034,8 +2034,14 @@ $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR
 
 if ($ROCmIndexUrl) {
     substep "installing PyTorch (AMD ROCm, $ROCmGfxArch)..."
-    $output = Fast-Install torch torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl | Out-String
-    $torchInstallExit = $LASTEXITCODE
+    if ($script:UnslothVerbose) {
+        Fast-Install torch torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl
+        $torchInstallExit = $LASTEXITCODE
+        $output = ""
+    } else {
+        $output = Fast-Install torch torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl | Out-String
+        $torchInstallExit = $LASTEXITCODE
+    }
     if ($torchInstallExit -ne 0) {
         Write-Host "[WARN] AMD ROCm PyTorch install failed -- falling back to CPU" -ForegroundColor Yellow
         Write-Host $output -ForegroundColor Yellow

From c6a90de4b5c3086953f3088846577d36cdaaa066 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 21 May 2026 22:21:33 -0500
Subject: [PATCH 129/165] fix(rocm/windows): set ROCBLAS_TENSILE_LIBPATH for
 bundled rocblas.dll

The llama.cpp ROCm prebuilt bundles rocblas.dll next to the binary but
not the Tensile kernel library files it depends on at runtime
(rocblas/library/TensileLibrary*.dat + *.hsaco).  The bundled DLL
searches for these files relative to its own location by default, i.e.
<binary_dir>/rocblas/library/, which does not exist in the prebuilt
install tree.  This causes a silent crash on the very first GEMM
(prefill) with no output from llama-server, seen by the caller as
WinError 10054 / 10061.  Model load and the single-token warmup pass
because they use simpler code paths that do not trigger rocBLAS GEMM.

Fix: set ROCBLAS_TENSILE_LIBPATH in the subprocess env to
<HIP_PATH>/bin/rocblas/library so the bundled DLL finds the kernel
files from the system ROCm installation.  Uses setdefault so a user-
supplied env var is never overwritten.  No-ops on CUDA and CPU (no
HIP_PATH) and on Linux (win32 branch only).

Reproducer log:
  rocBLAS error: Cannot read .../Release/rocblas/library/TensileLibrary.dat
  rocBLAS error: Could not initialize Tensile host:
  directory_iterator: The system cannot find the path specified.
---
 studio/backend/core/inference/llama_cpp.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 260e675a73..7db203eec3 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -3026,6 +3026,26 @@ def load_model(
                     )
                     existing_path = env.get("PATH", "")
                     env["PATH"] = ";".join(path_dirs) + ";" + existing_path
+
+                    # ROCm: the llama.cpp prebuilt bundles its own rocblas.dll
+                    # but NOT the Tensile kernel library files it needs
+                    # (rocblas/library/TensileLibrary*.dat + *.hsaco).  The
+                    # bundled DLL searches relative to its own location by
+                    # default (i.e. <binary_dir>/rocblas/library/) which does
+                    # not exist, causing a silent crash on the first GEMM.
+                    # ROCBLAS_TENSILE_LIBPATH overrides that search to point at
+                    # the ROCm installation where the kernel files actually are.
+                    _hip_path = os.environ.get(
+                        "HIP_PATH", os.environ.get("ROCM_PATH", "")
+                    )
+                    if _hip_path:
+                        _rocblas_lib = os.path.join(
+                            _hip_path, "bin", "rocblas", "library"
+                        )
+                        if os.path.isdir(_rocblas_lib):
+                            env.setdefault(
+                                "ROCBLAS_TENSILE_LIBPATH", _rocblas_lib
+                            )
                 else:
                     # Linux: set LD_LIBRARY_PATH for shared libs next to the binary
                     # and CUDA runtime libs (libcudart, libcublas, etc.)

From 2712a6d1e1525df3c22707109cd3b9daeaa573a1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 22 May 2026 03:36:17 +0000
Subject: [PATCH 130/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 7db203eec3..6c34b33bef 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -3043,9 +3043,7 @@ def load_model(
                             _hip_path, "bin", "rocblas", "library"
                         )
                         if os.path.isdir(_rocblas_lib):
-                            env.setdefault(
-                                "ROCBLAS_TENSILE_LIBPATH", _rocblas_lib
-                            )
+                            env.setdefault("ROCBLAS_TENSILE_LIBPATH", _rocblas_lib)
                 else:
                     # Linux: set LD_LIBRARY_PATH for shared libs next to the binary
                     # and CUDA runtime libs (libcudart, libcublas, etc.)

From 889b33cf647f3414c48b99d598c3f07e705b4d41 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sun, 24 May 2026 00:04:36 -0500
Subject: [PATCH 131/165] fix(install.sh): restore gfx token dedup in Strix
 multi-GPU awk indexer

536a54df removed the per-source `| awk '!seen[$0]++'` dedup from the
_gfx_all collection step but left the indexer awk as bare NF, so on a
mixed-arch host (e.g. dGPU gfx1100 + Strix iGPU gfx1151) where
rocminfo emits each gfx token twice (Name: field + ISA triple),
HIP_VISIBLE_DEVICES=1 indexed vals[1] = the second gfx1100 occurrence
instead of gfx1151, triggering the Strix routing on the wrong GPU.

Add !seen[$0]++ to the indexer awk so duplicate tokens from the same
GPU collapse to one entry before the HIP_VISIBLE_DEVICES index is
applied -- matching exactly what the Python side does with dict.fromkeys()
in _detect_amd_gfx_codes(). The comment above the block ("skip
duplicates") already documented this as the intended behaviour.
---
 install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install.sh b/install.sh
index 29fd6e22d1..ec694419d0 100755
--- a/install.sh
+++ b/install.sh
@@ -1877,7 +1877,7 @@ case "$TORCH_INDEX_URL" in
                 esac
             fi
             _runtime_gfx=$(printf '%s\n' "$_gfx_all" | awk -v idx="$_idx" '
-                NF { vals[n++] = $0 }
+                NF && !seen[$0]++ { vals[n++] = $0 }
                 END {
                     if (idx < 0 || idx >= n) idx = 0
                     if (n > 0) print vals[idx]

From 0fda1e260ad8abdee412ab4c62f2dbec827ba14f Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sun, 24 May 2026 01:01:00 -0500
Subject: [PATCH 132/165] fix(studio/install): correct _TOTAL progress count on
 Windows

base_total += 3 fired for all non-macOS platforms including Windows,
but flash-attn (line 1620) and ROCm torch final (line 1705) are both
guarded by 'not IS_WINDOWS and not IS_MACOS', so on Windows with torch
enabled _TOTAL was 13 while only 11 _progress() calls actually execute.

Split into +1 for the ROCm torch check (all non-macOS) and +2 for the
two Linux-only steps, so Windows gets _TOTAL=11 and Linux gets 14.
---
 studio/install_python_stack.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index be494a8be2..92ee2cd79e 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -1374,7 +1374,9 @@ def install_python_stack() -> int:
     if IS_MACOS:
         base_total -= 1  # triton step is skipped on macOS
     if not IS_MACOS and not NO_TORCH:
-        base_total += 3
+        base_total += 1  # ROCm torch check (line 1526) -- all non-macOS platforms
+        if not IS_WINDOWS:
+            base_total += 2  # flash-attn (line 1620) + ROCm torch final (line 1705) -- Linux only
     _TOTAL = (base_total - 1) if skip_base else base_total
 
     # 1. Try to use uv for faster installs (must happen before pip upgrade

From 688c508f31c07ee4686a8211baf9a1d95d102c6e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 24 May 2026 06:02:02 +0000
Subject: [PATCH 133/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/install_python_stack.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 92ee2cd79e..43df02c435 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -1376,7 +1376,9 @@ def install_python_stack() -> int:
     if not IS_MACOS and not NO_TORCH:
         base_total += 1  # ROCm torch check (line 1526) -- all non-macOS platforms
         if not IS_WINDOWS:
-            base_total += 2  # flash-attn (line 1620) + ROCm torch final (line 1705) -- Linux only
+            base_total += (
+                2  # flash-attn (line 1620) + ROCm torch final (line 1705) -- Linux only
+            )
     _TOTAL = (base_total - 1) if skip_base else base_total
 
     # 1. Try to use uv for faster installs (must happen before pip upgrade

From 284145a7835bd89b29e61cc414adc9357c02b88c Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Sun, 24 May 2026 02:18:59 -0500
Subject: [PATCH 134/165] fix(install.ps1): enforce torch>=2.11.0 for gfx120X
 and Strix on Windows

The AMD arch-specific index (repo.amd.com/rocm/whl/gfx120X-all/ and
gfx1151/) publishes torch wheels from 2.7.1 through 2.11.0. Without a
version floor pip can resolve to torch 2.10.0+rocm7.12 on RDNA 4
(gfx120X) or torch 2.10.0+rocm7.1 on Strix (gfx1151/gfx1150), both of
which have a null-pointer crash in torch._C._grouped_mm (TheRock
issues #5284 / #3284). torch 2.11.0+rocm7.13 contains the fix.

Add $ROCmTorchFloor alongside $ROCmIndexUrl: set to torch>=2.11.0 for
the two affected arch families, null for all others. Wire it into the
uv pip install call so the broken wheels are never selected.
---
 install.ps1 | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/install.ps1 b/install.ps1
index f8e1b84c62..fb6fdf8b39 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1439,6 +1439,7 @@ shell.Run cmd, 0, False
     # Wheels bundle their own ROCm runtime and support all Python versions.
     # Override with UNSLOTH_ROCM_WINDOWS_MIRROR for air-gapped / mirror installs.
     $ROCmIndexUrl = $null
+    $ROCmTorchFloor = $null
     if ($HasROCm -and $TorchIndexUrl -like "*/cpu" -and -not $SkipTorch) {
         $amdIndexBase = if ($env:UNSLOTH_ROCM_WINDOWS_MIRROR) { $env:UNSLOTH_ROCM_WINDOWS_MIRROR.TrimEnd('/') } else { "https://repo.amd.com/rocm/whl" }
         $archFamilyMap = @{
@@ -1448,11 +1449,23 @@ shell.Run cmd, 0, False
             "gfx1101" = "gfx110X-all"; "gfx1100" = "gfx110X-all"
             "gfx90a"  = "gfx90a";      "gfx908"  = "gfx908"        # MI200/MI100
         }
+        # gfx120X (RDNA 4) and gfx1151/gfx1150 (Strix) have a null-pointer bug in
+        # torch._C._grouped_mm on torch <2.11.0 (rocm7.12 and rocm7.1 respectively).
+        # TheRock issues #5284 and #3284. Force torch>=2.11.0 so pip never resolves
+        # to the broken 2.10.0 wheels even though they exist on the AMD index.
+        $torchFloorMap = @{
+            "gfx1201" = "torch>=2.11.0"; "gfx1200" = "torch>=2.11.0"
+            "gfx1151" = "torch>=2.11.0"; "gfx1150" = "torch>=2.11.0"
+        }
         $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
         if ($archFamily) {
             $ROCmIndexUrl = "$amdIndexBase/$archFamily/"
+            $ROCmTorchFloor = if ($ROCmGfxArch -and $torchFloorMap.ContainsKey($ROCmGfxArch)) { $torchFloorMap[$ROCmGfxArch] } else { $null }
             $archLabel = if ($ROCmGfxArch) { $ROCmGfxArch } else { "AMD GPU" }
             substep "$archLabel -- AMD repo.amd.com index selected" "Cyan"
+            if ($ROCmTorchFloor) {
+                substep "  enforcing $ROCmTorchFloor (known _grouped_mm bug in older wheels)" "Cyan"
+            }
         } elseif ($ROCmGfxArch) {
             substep "AMD GPU ($ROCmGfxArch) not in supported arch list -- falling back to CPU-only PyTorch" "Yellow"
         } else {
@@ -1565,7 +1578,8 @@ shell.Run cmd, 0, False
         } elseif ($ROCmIndexUrl) {
             Write-TauriLog "STEP" "Installing PyTorch (AMD ROCm Windows)"
             substep "installing PyTorch from $ROCmIndexUrl..."
-            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --index-url $ROCmIndexUrl torch torchvision torchaudio }
+            $torchSpec = if ($ROCmTorchFloor) { $ROCmTorchFloor } else { "torch" }
+            $torchInstallExit = Invoke-InstallCommand { uv pip install --python $VenvPython --force-reinstall --index-url $ROCmIndexUrl $torchSpec torchvision torchaudio }
             if ($torchInstallExit -ne 0) {
                 Write-Host "[ERROR] Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" -ForegroundColor Red
                 return (Exit-InstallFailure "Failed to install AMD ROCm PyTorch (exit code $torchInstallExit)" $torchInstallExit)

From 85bbb03e1d1495816703c082e28b5492cfdf0a9a Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 13:28:50 -0500
Subject: [PATCH 135/165] fix(rocm/windows): address Codex nits - deterministic
 DLL suffix, CUDA llama.cpp kind, HIP_VISIBLE_DEVICES arch indexing

- install_python_stack.py / worker.py: _detect_bnb_rocm_dll_ver() and the
  inline worker probe now collect ALL libbitsandbytes_rocm*.dll suffixes and
  return max() by numeric value instead of stopping at the first glob hit.
  Filesystem glob order is not guaranteed; this ensures '713' always wins
  over '72' when both variants are present in the wheel.

- setup.ps1 (expectedKind): add 'windows-cuda' branch so NVIDIA hosts are
  not treated as 'windows-cpu'. Previously an existing windows-cuda prebuilt
  was always considered a mismatch on non-ROCm machines, forcing an
  unnecessary re-download on every update.

- setup.ps1 (amd-smi gfx arch): collect ALL gfx tokens from amd-smi list
  output in GPU order and honour HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES
  when selecting which arch to use. On mixed-arch AMD systems where the
  visible GPU is not the first enumerated one, this prevents installing an
  incompatible wheel index. Falls back to index 0 (same as before) when the
  visibility var is unset or is a comma-separated list.

- test_rocm_support.py: add test_picks_highest_suffix_when_multiple_dlls to
  cover the multi-DLL case that was previously untested.
---
 studio/backend/core/training/worker.py    | 12 ++++++++----
 studio/install_python_stack.py            |  8 ++++++--
 studio/setup.ps1                          | 23 +++++++++++++++++++----
 tests/studio/install/test_rocm_support.py | 15 +++++++++++++++
 4 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 614a36edf9..89f9139c39 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2101,6 +2101,7 @@ def find_spec(self, fullname, path, target = None):
 
                     _bnb_spec = _ilu.find_spec("bitsandbytes")
                     if _bnb_spec and _bnb_spec.submodule_search_locations:
+                        _all_vers: list[str] = []
                         for _pkg_dir in _bnb_spec.submodule_search_locations:
                             for _dll in _glob.glob(
                                 os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll")
@@ -2110,10 +2111,13 @@ def find_spec(self, fullname, path, target = None):
                                     os.path.basename(_dll),
                                 )
                                 if _m:
-                                    _bnb_rocm_ver = _m.group(1)
-                                    break
-                            if _bnb_rocm_ver:
-                                break
+                                    _all_vers.append(_m.group(1))
+                        # Pick the highest numeric suffix so that e.g. "713"
+                        # wins over "72" when both variants are present.
+                        # Filesystem glob order is not guaranteed, so always
+                        # sort rather than stopping at the first match.
+                        if _all_vers:
+                            _bnb_rocm_ver = max(_all_vers, key=lambda v: int(v))
                 except Exception:
                     pass
                 _bnb_rocm_ver = _bnb_rocm_ver or "72"
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 7291abf9d4..4d0e32af30 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -361,12 +361,16 @@ def _detect_bnb_rocm_dll_ver() -> str | None:
     spec = importlib.util.find_spec("bitsandbytes")
     if spec is None or not spec.submodule_search_locations:
         return None
+    all_vers: list[str] = []
     for pkg_dir in spec.submodule_search_locations:
         for dll in glob.glob(os.path.join(pkg_dir, "libbitsandbytes_rocm*.dll")):
             m = re.search(r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(dll))
             if m:
-                return m.group(1)
-    return None
+                all_vers.append(m.group(1))
+    # Pick the highest numeric suffix so that e.g. "713" wins over "72" when
+    # both variants are present in the wheel.  Filesystem glob order is not
+    # guaranteed, so always sort rather than stopping at the first match.
+    return max(all_vers, key=lambda v: int(v)) if all_vers else None
 
 
 def _has_rocm_gpu() -> bool:
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 3e13523ce2..9dc18cee4d 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -742,9 +742,24 @@ if (-not $HasNvidiaSmi) {
                 $smiOut = & $amdSmiExe.Source list 2>&1 | Out-String
                 if ($LASTEXITCODE -eq 0 -and $smiOut -match "(?im)^GPU\s*[:\[]\s*\d") {
                     $HasROCm = $true
-                    # Attempt 1: newer amd-smi versions embed the gfx arch in list output
-                    if ($smiOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
-                        $script:ROCmGfxArch = $Matches[1].ToLower()
+                    # Attempt 1: newer amd-smi versions embed the gfx arch in list output.
+                    # Collect ALL gfx tokens in output order so that on mixed-arch systems
+                    # we can honour HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES and pick the
+                    # arch for the *runtime-visible* GPU rather than always the first one.
+                    $allGfxArches = @([regex]::Matches($smiOut, '(?i)\b(gfx\d+[a-z]?)\b') |
+                        ForEach-Object { $_.Groups[1].Value.ToLower() } |
+                        Select-Object -Unique)
+                    if ($allGfxArches.Count -gt 0) {
+                        # Resolve which GPU index is runtime-visible.  When a single
+                        # integer index is set, use it; fall back to index 0 otherwise
+                        # (comma-separated lists or unset → first GPU, same as before).
+                        $visGpu = if ($env:HIP_VISIBLE_DEVICES) { $env:HIP_VISIBLE_DEVICES }
+                                  elseif ($env:ROCR_VISIBLE_DEVICES) { $env:ROCR_VISIBLE_DEVICES }
+                                  else { $null }
+                        $gpuIdx = 0
+                        if ($visGpu -match '^\s*(\d+)\s*$') { $gpuIdx = [int]$Matches[1] }
+                        $archIdx = [Math]::Min($gpuIdx, $allGfxArches.Count - 1)
+                        $script:ROCmGfxArch = $allGfxArches[$archIdx]
                         $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
                     } else {
                         # Attempt 2: 'static --asic' exposes ASIC details on ROCm 6+,
@@ -2378,7 +2393,7 @@ if ($env:UNSLOTH_LLAMA_FORCE_COMPILE -eq "1") {
             try {
                 $existingMeta = Get-Content $existingMetaPath -Raw | ConvertFrom-Json
                 $existingKind = $existingMeta.install_kind
-                $expectedKind = if ($HasROCm) { "windows-hip" } else { "windows-cpu" }
+                $expectedKind = if ($HasROCm) { "windows-hip" } elseif ($HasNvidiaSmi) { "windows-cuda" } else { "windows-cpu" }
                 if ($existingKind -and $existingKind -ne $expectedKind) {
                     substep "Removing mismatched llama.cpp install (found '$existingKind', need '$expectedKind')..."
                     Remove-Item -Recurse -Force -LiteralPath $LlamaCppDir -ErrorAction SilentlyContinue
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index 83aecf7936..bf8d9948a5 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -1788,6 +1788,21 @@ def test_returns_none_when_only_cuda_dlls(self, tmp_path):
         with patch.object(importlib.util, "find_spec", return_value = mock_spec):
             assert stack_mod._detect_bnb_rocm_dll_ver() is None
 
+    def test_picks_highest_suffix_when_multiple_dlls(self, tmp_path):
+        """Returns the highest numeric suffix when multiple ROCm DLL variants exist.
+
+        Filesystem glob order is not guaranteed, so the function must not stop
+        at the first match — it must always return the highest one.
+        """
+        (tmp_path / "libbitsandbytes_rocm72.dll").write_text("")
+        (tmp_path / "libbitsandbytes_rocm713.dll").write_text("")
+        mock_spec = MagicMock()
+        mock_spec.submodule_search_locations = [str(tmp_path)]
+        import importlib.util
+
+        with patch.object(importlib.util, "find_spec", return_value = mock_spec):
+            assert stack_mod._detect_bnb_rocm_dll_ver() == "713"
+
 
 # =============================================================================
 # TEST: install_python_stack.py -- UNSLOTH_ROCM_TORCH_INSTALLED early-return path

From 69b582c81f391f4b177eb897e30d298ed46c8dd6 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 13:44:05 -0500
Subject: [PATCH 136/165] fix(rocm): misleading amd-smi log, BNB spec
 consistency, torch ceiling for AMD index

amd.py: split 'returncode != 0 or not stdout' into two separate branches.
Previously, exit-0 with empty output logged 'amd-smi returned code 0' (which
reads as success, not a warning) and incorrectly incremented the circuit-breaker
counter. Now: non-zero exit logs the code and counts toward the limit as before;
empty stdout on exit 0 logs at DEBUG level and does not penalise the counter
(amd-smi --json always emits at least [] on exit 0, so this branch is rare and
is not a tool failure).

main.py: replace spec.origin / os.path.dirname() with
spec.submodule_search_locations to match install_python_stack.py and worker.py.
For normal wheel installs both approaches reach the same directory, but using
submodule_search_locations is the canonical way and handles editable bitsandbytes
installs correctly. Also use max() by numeric suffix (same as the other two sites)
instead of a sort-then-break loop.

install.ps1: add <2.12.0 ceiling to the torch constraint for gfx120X (RDNA 4)
and gfx1151/gfx1150 (Strix). AMD actively publishes new versions on their
per-arch index; without a ceiling, a future 2.12.0+rocmX.Y wheel would be
pulled in automatically before being validated on these architectures. The
ceiling matches the existing Linux install_python_stack.py constraint for the
same arches. Bump both when 2.12.x is confirmed working.
---
 install.ps1                          | 10 +++++++--
 studio/backend/main.py               | 31 +++++++++++++++-------------
 studio/backend/utils/hardware/amd.py |  8 ++++++-
 3 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 1310d22af5..b414a2c705 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1453,9 +1453,15 @@ shell.Run cmd, 0, False
         # torch._C._grouped_mm on torch <2.11.0 (rocm7.12 and rocm7.1 respectively).
         # TheRock issues #5284 and #3284. Force torch>=2.11.0 so pip never resolves
         # to the broken 2.10.0 wheels even though they exist on the AMD index.
+        # The <2.12.0 ceiling matches the Linux install_python_stack.py constraint
+        # for the same arches: AMD actively publishes new versions on their index,
+        # so without a ceiling a future 2.12.0+rocmX.Y wheel would be pulled in
+        # automatically before it has been validated on these architectures.
+        # Bump the ceiling here (and in install_python_stack.py) when 2.12.x is
+        # confirmed working on gfx120X / Strix.
         $torchFloorMap = @{
-            "gfx1201" = "torch>=2.11.0"; "gfx1200" = "torch>=2.11.0"
-            "gfx1151" = "torch>=2.11.0"; "gfx1150" = "torch>=2.11.0"
+            "gfx1201" = "torch>=2.11.0,<2.12.0"; "gfx1200" = "torch>=2.11.0,<2.12.0"
+            "gfx1151" = "torch>=2.11.0,<2.12.0"; "gfx1150" = "torch>=2.11.0,<2.12.0"
         }
         $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
         if ($archFamily) {
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 007adede56..7eff794261 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -95,22 +95,25 @@ def _ver_key(name: str) -> tuple:
             import importlib.util as _ilu
 
             _bnb_spec = _ilu.find_spec("bitsandbytes")
-            if _bnb_spec and _bnb_spec.origin:
-                _pkg_dir = os.path.dirname(_bnb_spec.origin)
-                _dlls = _glob.glob(os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll"))
+            # Use submodule_search_locations (same as install_python_stack.py and
+            # worker.py) rather than spec.origin so that editable installs of
+            # bitsandbytes, where __init__.py may live outside the package root,
+            # are handled consistently across all three probe sites.
+            if _bnb_spec and _bnb_spec.submodule_search_locations:
                 import re as _re_bnb
 
-                def _bnb_ver_key(p: str) -> int:
-                    _km = _re_bnb.search(r"rocm(\d+)", os.path.basename(p))
-                    return int(_km.group(1)) if _km else -1
-
-                for _dll in sorted(_dlls, key = _bnb_ver_key, reverse = True):
-                    _m = _re_bnb.search(
-                        r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(_dll)
-                    )
-                    if _m:
-                        _bnb_rocm_ver = _m.group(1)
-                        break
+                _all_vers_main: list[str] = []
+                for _pkg_dir in _bnb_spec.submodule_search_locations:
+                    for _dll in _glob.glob(
+                        os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll")
+                    ):
+                        _km = _re_bnb.search(
+                            r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(_dll)
+                        )
+                        if _km:
+                            _all_vers_main.append(_km.group(1))
+                if _all_vers_main:
+                    _bnb_rocm_ver = max(_all_vers_main, key=lambda v: int(v))
         except Exception as _e:
             _logging.getLogger(__name__).warning(
                 "Windows ROCm: BNB DLL detection failed (%s); falling back to version '72'",
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index 7804836fd2..7a1ca0b8f8 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -58,7 +58,7 @@ def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optiona
             )
             _amd_smi_disabled = True
         return None
-    if result.returncode != 0 or not result.stdout.strip():
+    if result.returncode != 0:
         logger.warning("amd-smi returned code %d", result.returncode)
         _amd_smi_consecutive_failures += 1
         if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
@@ -67,6 +67,12 @@ def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optiona
             )
             _amd_smi_disabled = True
         return None
+    if not result.stdout.strip():
+        # amd-smi exited successfully but produced no output (e.g. no GPUs
+        # visible on this query, or a version that emits nothing for --json).
+        # This is not a tool failure, so don't count against the circuit breaker.
+        logger.debug("amd-smi exited 0 but returned no output")
+        return None
     _amd_smi_consecutive_failures = 0  # reset on success
     try:
         return json.loads(result.stdout)

From 5c72e64fb16adbb84ad0a1aaca64ea198e458905 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 26 May 2026 18:49:59 +0000
Subject: [PATCH 137/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 2 +-
 studio/backend/main.py                 | 2 +-
 studio/install_python_stack.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 89f9139c39..8034800583 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2117,7 +2117,7 @@ def find_spec(self, fullname, path, target = None):
                         # Filesystem glob order is not guaranteed, so always
                         # sort rather than stopping at the first match.
                         if _all_vers:
-                            _bnb_rocm_ver = max(_all_vers, key=lambda v: int(v))
+                            _bnb_rocm_ver = max(_all_vers, key = lambda v: int(v))
                 except Exception:
                     pass
                 _bnb_rocm_ver = _bnb_rocm_ver or "72"
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 7eff794261..87ba990a13 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -113,7 +113,7 @@ def _ver_key(name: str) -> tuple:
                         if _km:
                             _all_vers_main.append(_km.group(1))
                 if _all_vers_main:
-                    _bnb_rocm_ver = max(_all_vers_main, key=lambda v: int(v))
+                    _bnb_rocm_ver = max(_all_vers_main, key = lambda v: int(v))
         except Exception as _e:
             _logging.getLogger(__name__).warning(
                 "Windows ROCm: BNB DLL detection failed (%s); falling back to version '72'",
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 4d0e32af30..e7aa200d63 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -370,7 +370,7 @@ def _detect_bnb_rocm_dll_ver() -> str | None:
     # Pick the highest numeric suffix so that e.g. "713" wins over "72" when
     # both variants are present in the wheel.  Filesystem glob order is not
     # guaranteed, so always sort rather than stopping at the first match.
-    return max(all_vers, key=lambda v: int(v)) if all_vers else None
+    return max(all_vers, key = lambda v: int(v)) if all_vers else None
 
 
 def _has_rocm_gpu() -> bool:

From 0763a9970a53bd086b87027bf1d1955c67ba2f74 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 14:25:08 -0500
Subject: [PATCH 138/165] fix(rocm): torch floor in setup.ps1, torchvision pin
 for Strix, rocmsdk in _hip_ver_at_least

setup.ps1: add $torchFloorMap (mirrors install.ps1) and derive $ROCmTorchSpec
from it. Previously the AMD index install called 'Fast-Install torch torchvision
torchaudio --force-reinstall --index-url $ROCmIndexUrl' with no version
constraint, so pip could resolve torch 2.10.0+rocm7.12 for gfx1151/gfx1200 --
the exact broken wheel the PR is meant to avoid. Now gfx120X and Strix enforce
'torch>=2.11.0,<2.12.0', matching install.ps1 and the Linux constraint.

install_python_stack.py: pin torchvision and torchaudio in _strix_override_pkgs.
The Strix Linux override uses --index-url (exclusive, no PyPI fallback); bare
unversioned 'torchvision' and 'torchaudio' could resolve a build from AMD's
index targeting a different torch minor release, causing ABI/version mismatches at
runtime. Now pinned to '>=0.26.0,<0.27.0' and '>=2.11.0,<2.12.0' respectively,
matching _ROCM_TORCH_CONSTRAINT['rocm7.2'].

worker.py: extend _hip_ver_at_least to handle AMD SDK wheel version strings.
The fallback regex r'rocm(\d+)\.(\d+)' cannot match '2.9.0+rocmsdk20251116'
(no rocmX.Y component), so the function always returned False on SDK/Radeon
wheels -- installing the Python _grouped_mm workaround on wheels that already
have the working HIP kernel. Added a second check: if the version string
contains '+rocmsdk', assume >= 7.13 (the rocmsdk format post-dates the
gfx120X null-kernel fix) and skip the fallback.
---
 studio/backend/core/training/worker.py | 20 ++++++++++++++++++++
 studio/install_python_stack.py         | 10 ++++++++--
 studio/setup.ps1                       | 15 +++++++++++++--
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 8034800583..e784dc1486 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2142,6 +2142,8 @@ def _hip_ver_at_least(major: int, minor: int) -> bool:
                     getattr(_torch_for_rocm, "version", None), "hip", None
                 )
                 if not _hip_str:
+                    # Try the standard "+rocmX.Y.Z" embedded version first
+                    # (e.g. "2.11.0+rocm7.13.0").
                     _ver_match = _re_ver.search(
                         r"rocm(\d+)\.(\d+)", _build_version_for_rocm
                     )
@@ -2150,6 +2152,24 @@ def _hip_ver_at_least(major: int, minor: int) -> bool:
                             int(_ver_match.group(1)),
                             int(_ver_match.group(2)),
                         ) >= (major, minor)
+                    # AMD SDK / Radeon Windows wheels encode the build as
+                    # "+rocmsdk<date>" (e.g. "2.9.0+rocmsdk20251116") with no
+                    # explicit rocmX.Y component. The rocmsdk format was
+                    # introduced after the gfx120X null-kernel fix landed in
+                    # ROCm 7.13, so any wheel with this suffix is new enough to
+                    # have working HIP kernels. Treat as >= 7.13 rather than
+                    # falling back to False and installing the Python workaround
+                    # on a wheel that doesn't need it.
+                    if "rocmsdk" in _build_version_for_rocm:
+                        logger.debug(
+                            "Windows ROCm: AMD SDK wheel detected (%r); "
+                            "assuming HIP >= %d.%d (rocmsdk wheels post-date "
+                            "the gfx120X null-kernel fix)",
+                            _build_version_for_rocm,
+                            major,
+                            minor,
+                        )
+                        return True
                     return False
                 try:
                     _parts = [int(x) for x in str(_hip_str).split(".")[:2]]
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index e7aa200d63..6b6ae88754 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -723,8 +723,14 @@ def _ensure_rocm_torch() -> None:
                 _strix_override_url = f"{_amd_mirror}/{_selected_gfx}/"
                 _strix_override_pkgs = (
                     "torch>=2.11.0,<2.12.0",
-                    "torchvision",
-                    "torchaudio",
+                    # Pin torchvision/torchaudio to the 2.11.x-compatible range.
+                    # The install uses --index-url (exclusive, no PyPI fallback),
+                    # so bare unversioned names risk resolving a build from AMD's
+                    # index that targets a different torch major (e.g. 0.27 built
+                    # against torch 2.12), which would fail at runtime with an
+                    # ABI/version mismatch. Matches _ROCM_TORCH_CONSTRAINT["rocm7.2"].
+                    "torchvision>=0.26.0,<0.27.0",
+                    "torchaudio>=2.11.0,<2.12.0",
                 )
                 print(
                     f"\n   {_selected_gfx} (AMD Strix) is the runtime target with ROCm "
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 9dc18cee4d..a409f606d2 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -2029,7 +2029,15 @@ if ($HasROCm -and $CuTag -eq "cpu") {
         "gfx1101" = "gfx110X-all"; "gfx1100" = "gfx110X-all"
         "gfx90a"  = "gfx90a";      "gfx908"  = "gfx908"        # MI200/MI100
     }
+    # gfx120X and Strix have a null _grouped_mm kernel on torch <2.11.0.
+    # Mirrors the $torchFloorMap in install.ps1 so both installers enforce
+    # the same floor and ceiling when pulling from AMD's per-arch index.
+    $torchFloorMap = @{
+        "gfx1201" = "torch>=2.11.0,<2.12.0"; "gfx1200" = "torch>=2.11.0,<2.12.0"
+        "gfx1151" = "torch>=2.11.0,<2.12.0"; "gfx1150" = "torch>=2.11.0,<2.12.0"
+    }
     $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
+    $ROCmTorchSpec = if ($ROCmGfxArch -and $torchFloorMap.ContainsKey($ROCmGfxArch)) { $torchFloorMap[$ROCmGfxArch] } else { "torch" }
     if ($archFamily) {
         $ROCmIndexUrl = "$amdIndexBase/$archFamily/"
     } elseif ($ROCmGfxArch) {
@@ -2049,12 +2057,15 @@ $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR
 
 if ($ROCmIndexUrl) {
     substep "installing PyTorch (AMD ROCm, $ROCmGfxArch)..."
+    if ($ROCmTorchSpec -ne "torch") {
+        substep "  enforcing $ROCmTorchSpec (known _grouped_mm bug in older wheels)" "Cyan"
+    }
     if ($script:UnslothVerbose) {
-        Fast-Install torch torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl
+        Fast-Install $ROCmTorchSpec torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl
         $torchInstallExit = $LASTEXITCODE
         $output = ""
     } else {
-        $output = Fast-Install torch torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl | Out-String
+        $output = Fast-Install $ROCmTorchSpec torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl | Out-String
         $torchInstallExit = $LASTEXITCODE
     }
     if ($torchInstallExit -ne 0) {

From ad9ea0047ff843a8bc4da3b69edcd3dd833c42e6 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 14:29:59 -0500
Subject: [PATCH 139/165] fix(rocm): warn on OOB HIP_VISIBLE_DEVICES, bail on
 empty numeric_ids mask

- setup.ps1: when HIP/ROCR_VISIBLE_DEVICES names an index beyond the
  detected GPU count, emit a yellow warning and fall back to GPU 0
  instead of silently clamping to the last entry of $allGfxArches (wrong arch)
- hardware.py _reconcile_primary_rocm_unified_memory: distinguish
  numeric_ids=None (no env var, use torch ordinal 0) from numeric_ids=[]
  (empty mask / HIP_VISIBLE_DEVICES=-1, no GPU visible); bail out early
  in the empty case to avoid querying torch.device(0) incorrectly
---
 studio/backend/utils/hardware/hardware.py | 15 ++++++++++-----
 studio/setup.ps1                          |  7 +++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index e3e139b9c3..2fd535c9bd 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -639,12 +639,17 @@ def _reconcile_primary_rocm_unified_memory(
 ) -> None:
     """Same fix as _reconcile_rocm_unified_memory for the flat primary-GPU dict."""
     numeric_ids = parent_visible_spec.get("numeric_ids")
-    if numeric_ids:
-        primary_idx = [int(numeric_ids[0])]
-    else:
-        # No CUDA_VISIBLE_DEVICES / HIP_VISIBLE_DEVICES set: torch ordinal 0
-        # is the primary visible device.
+    if numeric_ids is None:
+        # No visibility env var set: torch ordinal 0 is the primary device.
         primary_idx = [0]
+    elif len(numeric_ids) == 0:
+        # Empty mask (HIP_VISIBLE_DEVICES="" or "-1"): no GPU is visible to
+        # this process. Querying torch device 0 would raise a RuntimeError or
+        # return stale/wrong data, so bail out rather than writing bad values
+        # into the utilization dict.
+        return
+    else:
+        primary_idx = [int(numeric_ids[0])]
     torch_devices = _torch_get_per_device_info(primary_idx)
     if not torch_devices:
         return
diff --git a/studio/setup.ps1 b/studio/setup.ps1
index a409f606d2..0f399a099c 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -758,8 +758,11 @@ if (-not $HasNvidiaSmi) {
                                   else { $null }
                         $gpuIdx = 0
                         if ($visGpu -match '^\s*(\d+)\s*$') { $gpuIdx = [int]$Matches[1] }
-                        $archIdx = [Math]::Min($gpuIdx, $allGfxArches.Count - 1)
-                        $script:ROCmGfxArch = $allGfxArches[$archIdx]
+                        if ($gpuIdx -ge $allGfxArches.Count) {
+                            substep "[WARN] HIP/ROCR_VISIBLE_DEVICES index $gpuIdx is out of range ($($allGfxArches.Count) GPU(s) detected); defaulting to GPU 0 for arch selection" "Yellow"
+                            $gpuIdx = 0
+                        }
+                        $script:ROCmGfxArch = $allGfxArches[$gpuIdx]
                         $ROCmGpuLabel = "AMD ROCm ($script:ROCmGfxArch)"
                     } else {
                         # Attempt 2: 'static --asic' exposes ASIC details on ROCm 6+,

From 94a7a038238014d811acecad9f37992669b736e0 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 14:38:47 -0500
Subject: [PATCH 140/165] fix(rocm): gate StubSubpackageFinder on win32 ROCm,
 add gcnArchName fallbacks

- worker.py _StubSubpackageFinder: the sys.meta_path append ran on every
  platform on every call to run_training_process; moved it inside the
  `if _is_win32_rocm:` block, since stubs are only seeded there and on
  Linux / Windows CUDA the append only accumulated extra finder instances
- worker.py OOM guard: AMD SDK / Radeon wheels may not populate
  gcnArchName, causing Strix Halo to be misclassified as discrete and
  get the 0.90 cap (12.8 GB OS headroom) instead of 0.80 (25.6 GB);
  now tries gcn_arch_name / arch_name / gfx_arch_name variants first,
  then falls back to device-name matching (890M -> Strix Halo,
  880M -> Strix Point) with a debug log when the fallback fires
---
 studio/backend/core/training/worker.py | 41 +++++++++++++++++++-------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index e784dc1486..c5a5b69a3c 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -1971,8 +1971,6 @@ def find_spec(self, fullname, path, target = None):
                 fullname, _StubSubpackageLoader(fullname), is_package = True
             )
 
-    sys.meta_path.append(_StubSubpackageFinder())
-
     # Only stub torchao on Windows ROCm hosts -- on Windows CUDA (NVIDIA) torchao
     # is real and shadowing it breaks torchao-based quantization paths.
     # Gate on the active torch runtime, not env-var presence -- HIP_PATH /
@@ -1992,6 +1990,9 @@ def find_spec(self, fullname, path, target = None):
         except Exception:
             pass
     if _is_win32_rocm:
+        # Register the finder only on Windows ROCm -- on other platforms there
+        # are no stub modules seeded, so appending is a pure accumulation.
+        sys.meta_path.append(_StubSubpackageFinder())
         # Seed torchao top-level + key submodules; the finder handles the rest.
         for _tao_name in (
             "torchao",
@@ -2298,24 +2299,42 @@ def _grouped_mm_safe_impl(
     # not need this cap.
     # Unified-memory APUs (gfx1150 Strix Point / gfx1151 Strix Halo) share GPU
     # and system RAM in one pool: 0.90 of 128 GB starves the OS. Use 0.80 there.
-    # Classified via gcnArchName — naming-independent and already parsed
-    # throughout this PR.
+    # Primary classifier: gcnArchName from device properties — stable within a
+    # product family and naming-independent.  AMD SDK / Radeon wheels may omit
+    # gcnArchName or expose it under a variant spelling, so we try several attr
+    # names then fall back to known device-name markers as a last resort.
     # Non-fatal: silently skipped if torch is not importable.
     if _hw.IS_ROCM:
         try:
             import torch as _torch_mem
 
             if _torch_mem.cuda.is_available():
-                # Classify unified vs discrete by gcnArchName, not by device
-                # marketing name or VRAM/RAM ratio.  Name regexes miss "8060S";
-                # ratio (vram >= 0.90 * sys_ram) false-positives on machines
-                # where discrete VRAM == system RAM (e.g. 16 GB card + 16 GB RAM).
-                # gcnArchName is stable within a product family and is already
-                # parsed throughout this PR.
+                # Classify unified vs discrete by gcnArchName, not by VRAM/RAM
+                # ratio (false-positives on 16 GB card + 16 GB RAM hosts).
                 _props = _torch_mem.cuda.get_device_properties(0)
                 _dev_name = _props.name
-                _gcn_arch = (getattr(_props, "gcnArchName", "") or "").split(":")[0]
+                # Try multiple attribute name forms: different ROCm wheel builds
+                # (HIP SDK vs AMD SDK / Radeon wheels) may use different spellings.
+                _gcn_arch = ""
+                for _arch_attr in ("gcnArchName", "gcn_arch_name", "arch_name", "gfx_arch_name"):
+                    _v = (getattr(_props, _arch_attr, "") or "").split(":")[0].strip()
+                    if _v:
+                        _gcn_arch = _v
+                        break
                 _is_unified = _gcn_arch in {"gfx1150", "gfx1151"}
+                if not _is_unified and not _gcn_arch:
+                    # gcnArchName absent (AMD SDK / Radeon wheels may not populate
+                    # it) -- fall back to device name matching for known
+                    # unified-memory iGPU model names (Strix Halo: 890M,
+                    # Strix Point: 880M).
+                    _dev_lower = _dev_name.lower()
+                    _is_unified = "890m" in _dev_lower or "880m" in _dev_lower
+                    if _is_unified:
+                        logger.debug(
+                            "ROCm OOM guard: gcnArchName absent -- inferred "
+                            "unified memory from device name %r; applying 0.80 cap",
+                            _dev_name,
+                        )
                 _mem_fraction = 0.80 if _is_unified else 0.90
                 _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(

From 38acd5bc8cba15a3904b37734d0a02ccfe0dfdf3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 26 May 2026 19:39:06 +0000
Subject: [PATCH 141/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/training/worker.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index c5a5b69a3c..030d21c078 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -2316,7 +2316,12 @@ def _grouped_mm_safe_impl(
                 # Try multiple attribute name forms: different ROCm wheel builds
                 # (HIP SDK vs AMD SDK / Radeon wheels) may use different spellings.
                 _gcn_arch = ""
-                for _arch_attr in ("gcnArchName", "gcn_arch_name", "arch_name", "gfx_arch_name"):
+                for _arch_attr in (
+                    "gcnArchName",
+                    "gcn_arch_name",
+                    "arch_name",
+                    "gfx_arch_name",
+                ):
                     _v = (getattr(_props, _arch_attr, "") or "").split(":")[0].strip()
                     if _v:
                         _gcn_arch = _v

From 80dd40e6f65804be329488e958257ced63eee465 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 15:13:03 -0500
Subject: [PATCH 142/165] fix(rocm): pin torchvision/torchaudio in setup.ps1,
 remove -Unique from arch array

- setup.ps1 ROCm torch install: torchvision and torchaudio were passed
  bare alongside pinned torch>=2.11.0,<2.12.0 for gfx1151/gfx1200 arches.
  AMD publishes packages independently so a future torchvision 0.27 (for
  torch 2.12) on the same arch index would cause pip ResolutionImpossible
  or an ABI-incompatible install. Added torchvisionFloorMap and
  torchaudioFloorMap mirroring install_python_stack.py's strix override
  (torchvision>=0.26.0,<0.27.0, torchaudio>=2.11.0,<2.12.0) and derived
  ROCmVisionSpec/ROCmAudioSpec used in all three Fast-Install call sites.

- setup.ps1 amd-smi arch detection: Select-Object -Unique was collapsing
  same-arch multi-GPU arrays (e.g. two gfx1151 APUs -> 1-element array)
  causing HIP_VISIBLE_DEVICES=1 to trigger a false out-of-range warning
  and fall back to GPU 0 even though the correct GPU would have been at
  index 1. Removed -Unique; added comment noting the positional-index
  assumption and its non-contiguous-GPU limitation.
---
 studio/setup.ps1 | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/studio/setup.ps1 b/studio/setup.ps1
index 0f399a099c..cc7b4c7c92 100644
--- a/studio/setup.ps1
+++ b/studio/setup.ps1
@@ -746,9 +746,14 @@ if (-not $HasNvidiaSmi) {
                     # Collect ALL gfx tokens in output order so that on mixed-arch systems
                     # we can honour HIP_VISIBLE_DEVICES / ROCR_VISIBLE_DEVICES and pick the
                     # arch for the *runtime-visible* GPU rather than always the first one.
+                    # Do NOT deduplicate: a dual same-arch system (e.g. two gfx1151 APUs)
+                    # must produce a 2-element array so HIP_VISIBLE_DEVICES=1 selects the
+                    # second GPU rather than triggering a false out-of-range warning.
+                    # Note: this mapping assumes amd-smi lists GPUs in the same order as
+                    # HIP enumerates them (both follow PCI bus order in practice); it may
+                    # give the wrong arch when GPU indices are non-contiguous (very rare).
                     $allGfxArches = @([regex]::Matches($smiOut, '(?i)\b(gfx\d+[a-z]?)\b') |
-                        ForEach-Object { $_.Groups[1].Value.ToLower() } |
-                        Select-Object -Unique)
+                        ForEach-Object { $_.Groups[1].Value.ToLower() })
                     if ($allGfxArches.Count -gt 0) {
                         # Resolve which GPU index is runtime-visible.  When a single
                         # integer index is set, use it; fall back to index 0 otherwise
@@ -2039,8 +2044,25 @@ if ($HasROCm -and $CuTag -eq "cpu") {
         "gfx1201" = "torch>=2.11.0,<2.12.0"; "gfx1200" = "torch>=2.11.0,<2.12.0"
         "gfx1151" = "torch>=2.11.0,<2.12.0"; "gfx1150" = "torch>=2.11.0,<2.12.0"
     }
+    # Companion ranges for torchvision/torchaudio -- must stay in sync with the
+    # torch ceiling so pip can always find a consistent trio on AMD's per-arch
+    # index.  AMD publishes each package independently and may add a newer
+    # torchvision (e.g. 0.27 for torch 2.12) before removing 0.26, which would
+    # cause pip to resolve an ABI-incompatible set if these are left bare.
+    # Matches _ROCM_TORCH_PKG_SPECS["rocm7.2"] in install_python_stack.py.
+    # Bump all three ceilings together when torch 2.12.x is validated.
+    $torchvisionFloorMap = @{
+        "gfx1201" = "torchvision>=0.26.0,<0.27.0"; "gfx1200" = "torchvision>=0.26.0,<0.27.0"
+        "gfx1151" = "torchvision>=0.26.0,<0.27.0"; "gfx1150" = "torchvision>=0.26.0,<0.27.0"
+    }
+    $torchaudioFloorMap = @{
+        "gfx1201" = "torchaudio>=2.11.0,<2.12.0"; "gfx1200" = "torchaudio>=2.11.0,<2.12.0"
+        "gfx1151" = "torchaudio>=2.11.0,<2.12.0"; "gfx1150" = "torchaudio>=2.11.0,<2.12.0"
+    }
     $archFamily = if ($ROCmGfxArch -and $archFamilyMap.ContainsKey($ROCmGfxArch)) { $archFamilyMap[$ROCmGfxArch] } else { $null }
-    $ROCmTorchSpec = if ($ROCmGfxArch -and $torchFloorMap.ContainsKey($ROCmGfxArch)) { $torchFloorMap[$ROCmGfxArch] } else { "torch" }
+    $ROCmTorchSpec  = if ($ROCmGfxArch -and $torchFloorMap.ContainsKey($ROCmGfxArch))        { $torchFloorMap[$ROCmGfxArch]        } else { "torch" }
+    $ROCmVisionSpec = if ($ROCmGfxArch -and $torchvisionFloorMap.ContainsKey($ROCmGfxArch))  { $torchvisionFloorMap[$ROCmGfxArch]  } else { "torchvision" }
+    $ROCmAudioSpec  = if ($ROCmGfxArch -and $torchaudioFloorMap.ContainsKey($ROCmGfxArch))   { $torchaudioFloorMap[$ROCmGfxArch]   } else { "torchaudio" }
     if ($archFamily) {
         $ROCmIndexUrl = "$amdIndexBase/$archFamily/"
     } elseif ($ROCmGfxArch) {
@@ -2061,14 +2083,14 @@ $PyTorchWhlBase = if ($env:UNSLOTH_PYTORCH_MIRROR) { $env:UNSLOTH_PYTORCH_MIRROR
 if ($ROCmIndexUrl) {
     substep "installing PyTorch (AMD ROCm, $ROCmGfxArch)..."
     if ($ROCmTorchSpec -ne "torch") {
-        substep "  enforcing $ROCmTorchSpec (known _grouped_mm bug in older wheels)" "Cyan"
+        substep "  enforcing $ROCmTorchSpec $ROCmVisionSpec $ROCmAudioSpec (known _grouped_mm bug in older wheels)" "Cyan"
     }
     if ($script:UnslothVerbose) {
-        Fast-Install $ROCmTorchSpec torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl
+        Fast-Install $ROCmTorchSpec $ROCmVisionSpec $ROCmAudioSpec --force-reinstall --index-url $ROCmIndexUrl
         $torchInstallExit = $LASTEXITCODE
         $output = ""
     } else {
-        $output = Fast-Install $ROCmTorchSpec torchvision torchaudio --force-reinstall --index-url $ROCmIndexUrl | Out-String
+        $output = Fast-Install $ROCmTorchSpec $ROCmVisionSpec $ROCmAudioSpec --force-reinstall --index-url $ROCmIndexUrl | Out-String
         $torchInstallExit = $LASTEXITCODE
     }
     if ($torchInstallExit -ne 0) {

From 59825bed1be92d2596d2106403d3e9eaa70438f8 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 16:53:23 -0500
Subject: [PATCH 143/165] fix(rocm): add 8060s/8050s to OOM guard device-name
 fallback, extract classifier helper

The OOM guard's device-name fallback (path 3 of the classifier, used when no
arch attribute is populated) only checked for 890m/880m (gfx1150 Strix Point
SKU names). Strix Halo (gfx1151) ships as Radeon 8060S (Ryzen AI MAX+ 395) and
Radeon 8050S (cut-down SKU) -- neither matches, so the fallback returned
is_unified=False and the guard applied the 0.90 fraction instead of 0.80,
leaving ~12.8 GiB OS headroom on a 128 GiB pool instead of ~25.6 GiB.

Fix: add 8060s and 8050s to the name-match set. Also correct the comment that
mislabelled 890M as a Strix Halo name (it is Strix Point).

Refactor: extract the three-path classifier into _rocm_classify_unified_memory()
so it can be unit-tested directly. Add 34 test cases in test_rocm_oom_guard.py
covering all three paths and the regression case (Radeon 8060S Graphics).

Reported-by: h34v3nzc0dex
---
 studio/backend/core/training/worker.py      |  77 +++++----
 studio/backend/tests/test_rocm_oom_guard.py | 176 ++++++++++++++++++++
 2 files changed, 224 insertions(+), 29 deletions(-)
 create mode 100644 studio/backend/tests/test_rocm_oom_guard.py

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 030d21c078..3c19039b53 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -675,6 +675,45 @@ def _torch_has_hip() -> bool:
         return False
 
 
+def _rocm_classify_unified_memory(props: Any) -> tuple[str, bool]:
+    """Classify a ROCm device as unified-memory (APU) or discrete.
+
+    Returns ``(gcn_arch, is_unified)`` where:
+    - ``gcn_arch`` is the canonical arch string (e.g. ``"gfx1151"``) when a
+      known attribute is present, or ``""`` when all arch attrs are absent.
+    - ``is_unified`` is ``True`` for AMD APUs with a shared GPU/system-RAM pool
+      (gfx1150 Strix Point, gfx1151 Strix Halo) — these need a lower
+      ``set_per_process_memory_fraction`` cap to leave headroom for the OS.
+
+    Classification priority:
+    1. ``gcnArchName`` / variant spellings (stable, naming-independent).
+    2. Device-name substring match as a last-resort fallback when all arch
+       attrs are absent (AMD SDK / Radeon wheels may not populate them):
+         - gfx1150 Strix Point: ``Radeon 890M``, ``Radeon 880M``
+         - gfx1151 Strix Halo:  ``Radeon 8060S`` (Ryzen AI MAX+ 395),
+                                ``Radeon 8050S`` (cut-down SKU)
+    """
+    gcn_arch = ""
+    for _attr in ("gcnArchName", "gcn_arch_name", "arch_name", "gfx_arch_name"):
+        _v = (getattr(props, _attr, "") or "").split(":")[0].strip()
+        if _v:
+            gcn_arch = _v
+            break
+
+    if gcn_arch:
+        return gcn_arch, gcn_arch in {"gfx1150", "gfx1151"}
+
+    # Arch attrs absent — fall back to device-name matching.
+    dev_lower = (getattr(props, "name", "") or "").lower()
+    is_unified = (
+        "890m" in dev_lower
+        or "880m" in dev_lower
+        or "8060s" in dev_lower
+        or "8050s" in dev_lower
+    )
+    return gcn_arch, is_unified
+
+
 def _tilelang_platform_supported() -> bool:
     """True iff a tilelang 0.1.8 wheel will load: Linux x86_64/aarch64, non-HIP torch.
 
@@ -2309,37 +2348,17 @@ def _grouped_mm_safe_impl(
             import torch as _torch_mem
 
             if _torch_mem.cuda.is_available():
-                # Classify unified vs discrete by gcnArchName, not by VRAM/RAM
-                # ratio (false-positives on 16 GB card + 16 GB RAM hosts).
+                # Classify unified vs discrete via _rocm_classify_unified_memory.
+                # See that function's docstring for classification priority.
                 _props = _torch_mem.cuda.get_device_properties(0)
                 _dev_name = _props.name
-                # Try multiple attribute name forms: different ROCm wheel builds
-                # (HIP SDK vs AMD SDK / Radeon wheels) may use different spellings.
-                _gcn_arch = ""
-                for _arch_attr in (
-                    "gcnArchName",
-                    "gcn_arch_name",
-                    "arch_name",
-                    "gfx_arch_name",
-                ):
-                    _v = (getattr(_props, _arch_attr, "") or "").split(":")[0].strip()
-                    if _v:
-                        _gcn_arch = _v
-                        break
-                _is_unified = _gcn_arch in {"gfx1150", "gfx1151"}
-                if not _is_unified and not _gcn_arch:
-                    # gcnArchName absent (AMD SDK / Radeon wheels may not populate
-                    # it) -- fall back to device name matching for known
-                    # unified-memory iGPU model names (Strix Halo: 890M,
-                    # Strix Point: 880M).
-                    _dev_lower = _dev_name.lower()
-                    _is_unified = "890m" in _dev_lower or "880m" in _dev_lower
-                    if _is_unified:
-                        logger.debug(
-                            "ROCm OOM guard: gcnArchName absent -- inferred "
-                            "unified memory from device name %r; applying 0.80 cap",
-                            _dev_name,
-                        )
+                _gcn_arch, _is_unified = _rocm_classify_unified_memory(_props)
+                if _is_unified and not _gcn_arch:
+                    logger.debug(
+                        "ROCm OOM guard: gcnArchName absent -- inferred "
+                        "unified memory from device name %r; applying 0.80 cap",
+                        _dev_name,
+                    )
                 _mem_fraction = 0.80 if _is_unified else 0.90
                 _torch_mem.cuda.set_per_process_memory_fraction(_mem_fraction)
                 logger.info(
diff --git a/studio/backend/tests/test_rocm_oom_guard.py b/studio/backend/tests/test_rocm_oom_guard.py
new file mode 100644
index 0000000000..65c3003cce
--- /dev/null
+++ b/studio/backend/tests/test_rocm_oom_guard.py
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Unit tests for _rocm_classify_unified_memory (ROCm OOM-guard classifier).
+
+Covers the three classification paths:
+  Path 1 – canonical gcnArchName attribute present.
+  Path 2 – gcnArchName absent, alternate-spelling attribute present.
+  Path 3 – ALL arch attrs absent; falls back to device-name substring match.
+
+Regression for: Strix Halo (gfx1151) misclassified as discrete on AMD SDK /
+Radeon wheels that populate props.name = "Radeon 8060S Graphics" but do NOT
+set any gcnArchName attribute.  Without the 8060s/8050s name patterns the
+fallback returned is_unified=False, applying the 0.90 fraction instead of
+0.80 and leaving only ~12.8 GiB OS headroom on a 128 GiB unified-memory pool.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import pytest
+
+from core.training.worker import _rocm_classify_unified_memory
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+
+def _props(**kwargs) -> SimpleNamespace:
+    """Build a fake device-properties object with the given attributes."""
+    return SimpleNamespace(**kwargs)
+
+
+# ── Path 1: canonical gcnArchName ────────────────────────────────────────────
+
+
+class TestCanonicalGcnArchName:
+    """gcnArchName is present and populated."""
+
+    @pytest.mark.parametrize(
+        "arch, expected_unified",
+        [
+            ("gfx1150", True),   # Strix Point
+            ("gfx1151", True),   # Strix Halo
+            ("gfx1100", False),  # Navi 31 (RX 7900 XTX) — discrete
+            ("gfx906",  False),  # MI50 — discrete server GPU
+            ("gfx1201", False),  # RX 9070 XT — discrete
+        ],
+    )
+    def test_canonical_attr(self, arch: str, expected_unified: bool) -> None:
+        props = _props(gcnArchName=arch, name="irrelevant")
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == arch
+        assert is_unified is expected_unified
+
+    def test_arch_with_colon_suffix_stripped(self) -> None:
+        """gcnArchName can carry xnack/sramecc suffix; only the base is kept."""
+        props = _props(gcnArchName="gfx1151:xnack-", name="irrelevant")
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == "gfx1151"
+        assert is_unified is True
+
+    def test_canonical_attr_wins_over_name(self) -> None:
+        """Arch attr takes priority; device name should be ignored."""
+        # Discrete arch, but name looks like a unified SKU — arch must win.
+        props = _props(gcnArchName="gfx1100", name="Radeon 890M")
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == "gfx1100"
+        assert is_unified is False
+
+
+# ── Path 2: alternate-spelling fallback ──────────────────────────────────────
+
+
+class TestAlternateSpellingFallback:
+    """gcnArchName is missing but an alternate attr spelling is present."""
+
+    @pytest.mark.parametrize(
+        "attr_name",
+        ["gcn_arch_name", "arch_name", "gfx_arch_name"],
+    )
+    def test_alternate_attr_unified(self, attr_name: str) -> None:
+        props = _props(**{attr_name: "gfx1151"}, name="Radeon 8060S Graphics")
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == "gfx1151"
+        assert is_unified is True
+
+    @pytest.mark.parametrize(
+        "attr_name",
+        ["gcn_arch_name", "arch_name", "gfx_arch_name"],
+    )
+    def test_alternate_attr_discrete(self, attr_name: str) -> None:
+        props = _props(**{attr_name: "gfx1201"}, name="Radeon RX 9070 XT")
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == "gfx1201"
+        assert is_unified is False
+
+    def test_first_non_empty_attr_wins(self) -> None:
+        """When multiple alternate attrs are present the first non-empty one wins."""
+        props = _props(gcn_arch_name="gfx1151", arch_name="gfx1100", name="irrelevant")
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == "gfx1151"
+        assert is_unified is True
+
+
+# ── Path 3: device-name fallback ─────────────────────────────────────────────
+
+
+class TestDeviceNameFallback:
+    """ALL arch attrs absent — classifier must rely solely on device name."""
+
+    # --- unified-memory devices that MUST be detected ---
+
+    @pytest.mark.parametrize(
+        "device_name",
+        [
+            # gfx1150 Strix Point
+            "Radeon 890M",
+            "AMD Radeon 890M Graphics",
+            "RADEON 890M",              # case-insensitive
+            "Radeon 880M",
+            "AMD Radeon 880M Graphics",
+            # gfx1151 Strix Halo — the regression case from the review
+            "Radeon 8060S Graphics",    # Ryzen AI MAX+ 395 (as returned by torch)
+            "AMD Radeon 8060S",
+            "Radeon 8050S Graphics",    # cut-down Strix Halo SKU
+            "AMD Radeon 8050S",
+            # case variants
+            "RADEON 8060S GRAPHICS",
+            "radeon 8050s",
+        ],
+    )
+    def test_unified_memory_detected(self, device_name: str) -> None:
+        props = _props(name=device_name)
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == "", f"expected empty gcn_arch, got {gcn!r}"
+        assert is_unified is True, (
+            f"device {device_name!r} should be classified as unified-memory"
+        )
+
+    # --- discrete devices that must NOT be mis-classified ---
+
+    @pytest.mark.parametrize(
+        "device_name",
+        [
+            "Radeon RX 9070 XT",
+            "AMD Radeon RX 7900 XTX",
+            "Radeon RX 6900 XT",
+            "Radeon Pro W7900",
+            "AMD Instinct MI300X",
+            # Names that contain superficially similar substrings but are discrete
+            "Radeon RX 580",
+            "Radeon VII",
+        ],
+    )
+    def test_discrete_not_misclassified(self, device_name: str) -> None:
+        props = _props(name=device_name)
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == ""
+        assert is_unified is False, (
+            f"discrete device {device_name!r} should NOT be classified as unified-memory"
+        )
+
+    def test_empty_name_returns_false(self) -> None:
+        """Completely absent name must not crash and must default to discrete."""
+        props = _props()  # no 'name' attr at all
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == ""
+        assert is_unified is False
+
+    def test_none_name_returns_false(self) -> None:
+        props = _props(name=None)
+        gcn, is_unified = _rocm_classify_unified_memory(props)
+        assert gcn == ""
+        assert is_unified is False

From 4ecf79740ed65fa1f7c998a60bff69640e2f9f06 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 26 May 2026 21:54:44 +0000
Subject: [PATCH 144/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/tests/test_rocm_oom_guard.py | 42 ++++++++++-----------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/studio/backend/tests/test_rocm_oom_guard.py b/studio/backend/tests/test_rocm_oom_guard.py
index 65c3003cce..2ce9b55789 100644
--- a/studio/backend/tests/test_rocm_oom_guard.py
+++ b/studio/backend/tests/test_rocm_oom_guard.py
@@ -41,22 +41,22 @@ class TestCanonicalGcnArchName:
     @pytest.mark.parametrize(
         "arch, expected_unified",
         [
-            ("gfx1150", True),   # Strix Point
-            ("gfx1151", True),   # Strix Halo
+            ("gfx1150", True),  # Strix Point
+            ("gfx1151", True),  # Strix Halo
             ("gfx1100", False),  # Navi 31 (RX 7900 XTX) — discrete
-            ("gfx906",  False),  # MI50 — discrete server GPU
+            ("gfx906", False),  # MI50 — discrete server GPU
             ("gfx1201", False),  # RX 9070 XT — discrete
         ],
     )
     def test_canonical_attr(self, arch: str, expected_unified: bool) -> None:
-        props = _props(gcnArchName=arch, name="irrelevant")
+        props = _props(gcnArchName = arch, name = "irrelevant")
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == arch
         assert is_unified is expected_unified
 
     def test_arch_with_colon_suffix_stripped(self) -> None:
         """gcnArchName can carry xnack/sramecc suffix; only the base is kept."""
-        props = _props(gcnArchName="gfx1151:xnack-", name="irrelevant")
+        props = _props(gcnArchName = "gfx1151:xnack-", name = "irrelevant")
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == "gfx1151"
         assert is_unified is True
@@ -64,7 +64,7 @@ def test_arch_with_colon_suffix_stripped(self) -> None:
     def test_canonical_attr_wins_over_name(self) -> None:
         """Arch attr takes priority; device name should be ignored."""
         # Discrete arch, but name looks like a unified SKU — arch must win.
-        props = _props(gcnArchName="gfx1100", name="Radeon 890M")
+        props = _props(gcnArchName = "gfx1100", name = "Radeon 890M")
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == "gfx1100"
         assert is_unified is False
@@ -81,7 +81,7 @@ class TestAlternateSpellingFallback:
         ["gcn_arch_name", "arch_name", "gfx_arch_name"],
     )
     def test_alternate_attr_unified(self, attr_name: str) -> None:
-        props = _props(**{attr_name: "gfx1151"}, name="Radeon 8060S Graphics")
+        props = _props(**{attr_name: "gfx1151"}, name = "Radeon 8060S Graphics")
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == "gfx1151"
         assert is_unified is True
@@ -91,14 +91,14 @@ def test_alternate_attr_unified(self, attr_name: str) -> None:
         ["gcn_arch_name", "arch_name", "gfx_arch_name"],
     )
     def test_alternate_attr_discrete(self, attr_name: str) -> None:
-        props = _props(**{attr_name: "gfx1201"}, name="Radeon RX 9070 XT")
+        props = _props(**{attr_name: "gfx1201"}, name = "Radeon RX 9070 XT")
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == "gfx1201"
         assert is_unified is False
 
     def test_first_non_empty_attr_wins(self) -> None:
         """When multiple alternate attrs are present the first non-empty one wins."""
-        props = _props(gcn_arch_name="gfx1151", arch_name="gfx1100", name="irrelevant")
+        props = _props(gcn_arch_name = "gfx1151", arch_name = "gfx1100", name = "irrelevant")
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == "gfx1151"
         assert is_unified is True
@@ -118,13 +118,13 @@ class TestDeviceNameFallback:
             # gfx1150 Strix Point
             "Radeon 890M",
             "AMD Radeon 890M Graphics",
-            "RADEON 890M",              # case-insensitive
+            "RADEON 890M",  # case-insensitive
             "Radeon 880M",
             "AMD Radeon 880M Graphics",
             # gfx1151 Strix Halo — the regression case from the review
-            "Radeon 8060S Graphics",    # Ryzen AI MAX+ 395 (as returned by torch)
+            "Radeon 8060S Graphics",  # Ryzen AI MAX+ 395 (as returned by torch)
             "AMD Radeon 8060S",
-            "Radeon 8050S Graphics",    # cut-down Strix Halo SKU
+            "Radeon 8050S Graphics",  # cut-down Strix Halo SKU
             "AMD Radeon 8050S",
             # case variants
             "RADEON 8060S GRAPHICS",
@@ -132,12 +132,12 @@ class TestDeviceNameFallback:
         ],
     )
     def test_unified_memory_detected(self, device_name: str) -> None:
-        props = _props(name=device_name)
+        props = _props(name = device_name)
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == "", f"expected empty gcn_arch, got {gcn!r}"
-        assert is_unified is True, (
-            f"device {device_name!r} should be classified as unified-memory"
-        )
+        assert (
+            is_unified is True
+        ), f"device {device_name!r} should be classified as unified-memory"
 
     # --- discrete devices that must NOT be mis-classified ---
 
@@ -155,12 +155,12 @@ def test_unified_memory_detected(self, device_name: str) -> None:
         ],
     )
     def test_discrete_not_misclassified(self, device_name: str) -> None:
-        props = _props(name=device_name)
+        props = _props(name = device_name)
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == ""
-        assert is_unified is False, (
-            f"discrete device {device_name!r} should NOT be classified as unified-memory"
-        )
+        assert (
+            is_unified is False
+        ), f"discrete device {device_name!r} should NOT be classified as unified-memory"
 
     def test_empty_name_returns_false(self) -> None:
         """Completely absent name must not crash and must default to discrete."""
@@ -170,7 +170,7 @@ def test_empty_name_returns_false(self) -> None:
         assert is_unified is False
 
     def test_none_name_returns_false(self) -> None:
-        props = _props(name=None)
+        props = _props(name = None)
         gcn, is_unified = _rocm_classify_unified_memory(props)
         assert gcn == ""
         assert is_unified is False

From 62e18d81ceb5e30007db413e1124558fa6be5571 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 17:26:47 -0500
Subject: [PATCH 145/165] fix(rocm): pass explicit dtype on bf16-unsupported
 hardware (RDNA2)

dtype=None lets unsloth auto-detect the model dtype. On RDNA2 (gfx103x,
e.g. RX 6600) is_bfloat16_supported() incorrectly returns True, so unsloth
picks bf16 and the first bf16 kernel dispatch triggers:

  LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.fdot2.bf16.bf16

Replace every dtype=None in load_model() with _auto_dtype which resolves
to None when bf16 is supported (all modern NVIDIA + RDNA3+) and
torch.float16 otherwise. This gives RDNA2 users a working float16
training path without touching NVIDIA behaviour at all.

Fixes: https://github.com/unslothai/unsloth/issues/5337
---
 studio/backend/core/training/trainer.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index b128fb5338..105c4ec347 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -657,6 +657,15 @@ def load_model(
                 f"Using device_map='{device_map}' ({get_visible_gpu_count()} GPU(s) visible)"
             )
 
+            # On hardware without native bfloat16 support (e.g. RDNA2 / gfx103x),
+            # passing dtype=None lets unsloth auto-detect and incorrectly choose
+            # bf16, triggering an LLVM error at the first bf16 kernel dispatch.
+            # Explicitly pass float16 as the fallback so unsloth never reaches
+            # that path. Modern NVIDIA (Ampere+) and RDNA3+ return True here so
+            # they are unaffected — dtype stays None and unsloth picks bf16 as
+            # before.
+            _auto_dtype = None if is_bfloat16_supported() else torch.float16
+
             # Branch based on model type
             if self._audio_type == "csm":
                 # CSM: FastModel + auto_model=CsmForConditionalGeneration + load_in_4bit=False
@@ -666,7 +675,7 @@ def load_model(
                 self.model, self.tokenizer = FastModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     auto_model = CsmForConditionalGeneration,
                     load_in_4bit = False,
                     device_map = device_map,
@@ -683,7 +692,7 @@ def load_model(
 
                 self.model, self.tokenizer = FastModel.from_pretrained(
                     model_name = model_name,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     load_in_4bit = False,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -705,7 +714,7 @@ def load_model(
                 self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -777,7 +786,7 @@ def load_model(
                 self.model, self.tokenizer = FastModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -791,7 +800,7 @@ def load_model(
                 self.model, self.tokenizer = FastVisionModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,  # Auto-detect
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -824,7 +833,7 @@ def load_model(
                 self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,  # Auto-detect
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,

From 3244537f49c3110ba8c56a9433ff8889a6e05cae Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Tue, 26 May 2026 18:55:50 -0500
Subject: [PATCH 146/165] fix: reduce log noise for expected non-issues on
 Windows ROCm

Log lines in three files fired at warning/error level for conditions that
are completely expected on a Windows HIP SDK-only setup:

amd.py
- amd-smi WinError 2 (FileNotFoundError): downgrade warning -> debug.
  amd-smi ships with Adrenalin, not the HIP SDK; absence is normal.
- 'disabling' message: downgrade warning -> info with clearer text
  'not available (not installed; expected on HIP SDK-only systems);
  GPU VRAM polling disabled'

hardware.py
- torch.distributed.Store missing: downgrade warning -> debug.
  The distributed stub added in this PR intentionally omits Store; the
  attention-impl fallback to eager is expected and non-actionable.

worker.py
- causal-conv1d: add early Windows exit (info) in both
  _ensure_causal_conv1d_fast_path and _causal_conv1d_install hook;
  no cp313/win_amd64 wheel exists, so the install always fails.
- FLA: add early Windows exit (info) in
  _ensure_flash_linear_attention_unconditional; triton dependency has
  no cp313/win_amd64 wheel.
- Defense-in-depth: _install_package_wheel_first non-HIP PyPI failure
  logs info+debug on Windows instead of error; FLA failure logs
  info+debug on Windows instead of warning.
---
 studio/backend/core/training/worker.py    | 46 ++++++++++++++++++-----
 studio/backend/utils/hardware/amd.py      | 17 ++++++---
 studio/backend/utils/hardware/hardware.py |  5 ++-
 3 files changed, 53 insertions(+), 15 deletions(-)

diff --git a/studio/backend/core/training/worker.py b/studio/backend/core/training/worker.py
index 3c19039b53..ae15b3918d 100644
--- a/studio/backend/core/training/worker.py
+++ b/studio/backend/core/training/worker.py
@@ -372,11 +372,21 @@ def _install_package_wheel_first(
                 f"{snippet}",
             )
         else:
-            logger.error(
-                "Failed to install %s from PyPI:\n%s",
-                display_name,
-                result.stdout,
-            )
+            if sys.platform == "win32":
+                # No prebuilt wheel and no source build toolchain on Windows --
+                # this is expected for packages like causal-conv1d.  Log at
+                # info so users aren't alarmed by what looks like an error.
+                logger.info(
+                    "%s is not available on Windows (no prebuilt wheel); skipping",
+                    display_name,
+                )
+                logger.debug("Install output:\n%s", result.stdout)
+            else:
+                logger.error(
+                    "Failed to install %s from PyPI:\n%s",
+                    display_name,
+                    result.stdout,
+                )
         return False
 
     if is_hip:
@@ -389,6 +399,9 @@ def _install_package_wheel_first(
 def _ensure_causal_conv1d_fast_path(event_queue: Any, model_name: str) -> None:
     if not _model_wants_causal_conv1d(model_name):
         return
+    if sys.platform == "win32":
+        logger.info("causal-conv1d: no prebuilt wheel for Windows; skipping")
+        return
 
     _install_package_wheel_first(
         event_queue = event_queue,
@@ -456,6 +469,11 @@ def _ensure_flash_linear_attention_unconditional(event_queue: Any) -> bool:
     """Install pinned FLA + fla-core with --no-deps. Returns True iff importable post-call."""
     if os.getenv(_FLA_SKIP_ENV) == "1":
         return False
+    if sys.platform == "win32":
+        logger.info(
+            "Skipping flash-linear-attention install: no prebuilt wheel for Windows"
+        )
+        return False
     if sys.version_info < _FLA_MIN_PYTHON:
         logger.info(
             "Skipping flash-linear-attention install: requires Python >= %d.%d, have %s",
@@ -535,10 +553,17 @@ def _ensure_flash_linear_attention_unconditional(event_queue: Any) -> bool:
         return False
 
     if result.returncode != 0:
-        logger.warning(
-            "flash-linear-attention install failed (continuing on torch fallback):\n%s",
-            result.stdout,
-        )
+        if sys.platform == "win32":
+            logger.info(
+                "flash-linear-attention not available on Windows (no prebuilt wheel); "
+                "continuing on torch fallback"
+            )
+            logger.debug("Install output:\n%s", result.stdout)
+        else:
+            logger.warning(
+                "flash-linear-attention install failed (continuing on torch fallback):\n%s",
+                result.stdout,
+            )
         _send_status(
             event_queue,
             "flash-linear-attention install failed; continuing without it",
@@ -979,6 +1004,9 @@ def _fla_post_available(eq: Any) -> None:
         _ensure_tilelang_backend_unconditional(eq)
 
     def _causal_conv1d_install(eq: Any) -> bool:
+        if sys.platform == "win32":
+            logger.info("causal-conv1d: no prebuilt wheel for Windows; skipping")
+            return False
         ok = _install_package_wheel_first(
             event_queue = eq,
             import_name = "causal_conv1d",
diff --git a/studio/backend/utils/hardware/amd.py b/studio/backend/utils/hardware/amd.py
index 7a1ca0b8f8..48d5890399 100644
--- a/studio/backend/utils/hardware/amd.py
+++ b/studio/backend/utils/hardware/amd.py
@@ -50,11 +50,17 @@ def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optiona
             **windows_hidden_subprocess_kwargs(),
         )
     except (OSError, subprocess.TimeoutExpired) as e:
-        logger.warning("amd-smi query failed: %s", e)
+        if isinstance(e, FileNotFoundError):
+            # amd-smi ships with Adrenalin, not the HIP SDK -- absence is
+            # expected on HIP SDK-only Windows setups.  Log at debug only.
+            logger.debug("amd-smi not found (not in PATH): %s", e)
+        else:
+            logger.warning("amd-smi query failed: %s", e)
         _amd_smi_consecutive_failures += 1
         if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
-            logger.warning(
-                "amd-smi unavailable -- disabling GPU polling to avoid repeated prompts"
+            logger.info(
+                "amd-smi not available (not installed; expected on HIP SDK-only systems); "
+                "GPU VRAM polling disabled"
             )
             _amd_smi_disabled = True
         return None
@@ -62,8 +68,9 @@ def _run_amd_smi(*args: str, timeout: int = _AMD_SMI_DEFAULT_TIMEOUT) -> Optiona
         logger.warning("amd-smi returned code %d", result.returncode)
         _amd_smi_consecutive_failures += 1
         if _amd_smi_consecutive_failures >= _AMD_SMI_FAILURE_LIMIT:
-            logger.warning(
-                "amd-smi unavailable -- disabling GPU polling to avoid repeated prompts"
+            logger.info(
+                "amd-smi not available (not installed; expected on HIP SDK-only systems); "
+                "GPU VRAM polling disabled"
             )
             _amd_smi_disabled = True
         return None
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 2fd535c9bd..b56360b0e5 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -1202,7 +1202,10 @@ def estimate_required_model_memory_gb(
                 _determine_attention_impl_for_gpu_estimate(config)
             )
         except Exception as e:
-            logger.warning(
+            # Log at debug: on Windows ROCm the torch.distributed stub does
+            # not implement Store, so this fires on every estimate call.
+            # It is expected and non-actionable -- eager is the safe fallback.
+            logger.debug(
                 "Could not resolve attention implementation for '%s': %s",
                 estimate_model,
                 e,

From f5c2e8afd8b496b65f9b3b12c608db41ecbda68f Mon Sep 17 00:00:00 2001
From: Erland366 <erland.pg366@gmail.com>
Date: Wed, 27 May 2026 18:44:49 +0000
Subject: [PATCH 147/165] [AMD] Fix installation of bitsandbytes when it's from
 .dev and skip rebuilding llama.cpp if we build it manually.

---
 install.sh                                | 15 +++++++++++++--
 studio/install_python_stack.py            | 10 +++++++---
 studio/setup.sh                           | 16 ++++++++++++++++
 tests/studio/install/test_rocm_support.py |  2 ++
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/install.sh b/install.sh
index bfb8eb94ab..adca624bb1 100755
--- a/install.sh
+++ b/install.sh
@@ -183,10 +183,21 @@ _install_bnb_rocm() {
     fi
     if [ -n "$_bnb_whl_url" ]; then
         substep "installing bitsandbytes for AMD ROCm (pre-release, PR #1887)..."
-        if run_install_cmd "$_label (pre-release)" "$_venv_py" -m pip install \
-            --force-reinstall --no-cache-dir --no-deps "$_bnb_whl_url"; then
+        _bnb_log=$(mktemp)
+        if "$_venv_py" -m pip install \
+            --disable-pip-version-check \
+            --force-reinstall --no-cache-dir --no-deps \
+            --retries 8 --timeout 90 \
+            "$_bnb_whl_url" >"$_bnb_log" 2>&1; then
+            rm -f "$_bnb_log"
             return 0
         fi
+        _bnb_rc=$?
+        if _is_verbose; then
+            cat "$_bnb_log" >&2
+        fi
+        rm -f "$_bnb_log"
+        step "warning" "$_label (pre-release) failed (exit code $_bnb_rc)" "$C_WARN" >&2
         substep "[WARN] bnb pre-release install failed; falling back to PyPI (4-bit decode broken on ROCm)" "$C_WARN"
     fi
     run_install_cmd "$_label (pypi fallback)" "$_venv_py" -m pip install \
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 13d790b396..40d6634534 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -816,7 +816,9 @@ def _ensure_rocm_torch() -> None:
 
     # Install bitsandbytes only when torch links against ROCm. Prefers the
     # continuous-release_main wheel (bnb PR #1887 4-bit GEMV fix) and falls
-    # back to PyPI when the pre-release URL is unreachable.
+    # back to PyPI when the pre-release wheel cannot be installed. Use pip for
+    # the pre-release wheel because uv rejects the wheel's filename/metadata
+    # version mismatch.
     if rocm_torch_ready:
         _bnb_url = _bnb_rocm_prerelease_url()
         _bnb_installed = False
@@ -828,11 +830,12 @@ def _ensure_rocm_torch() -> None:
                 "--no-deps",
                 _bnb_url,
                 constrain = False,
+                force_pip = True,
             )
             if not _bnb_installed:
                 print(
                     _red(
-                        "   bnb pre-release unreachable; falling back to PyPI "
+                        "   bnb pre-release install failed; falling back to PyPI "
                         "(4-bit decode will be broken on ROCm)"
                     )
                 )
@@ -1279,6 +1282,7 @@ def pip_install_try(
     label: str,
     *args: str,
     constrain: bool = True,
+    force_pip: bool = False,
 ) -> bool:
     """Like pip_install but returns False on failure instead of exiting.
     For optional installs with a follow-up fallback.
@@ -1289,7 +1293,7 @@ def pip_install_try(
         constraint_args_pip = ["-c", str(CONSTRAINTS)]
         constraint_args_uv = ["-c", _uv_safe_path(CONSTRAINTS)]
 
-    if USE_UV:
+    if USE_UV and not force_pip:
         cmd = _build_uv_cmd(args) + constraint_args_uv
     else:
         cmd = _build_pip_cmd(args) + constraint_args_pip
diff --git a/studio/setup.sh b/studio/setup.sh
index 64c2ab18af..a66cbb3132 100755
--- a/studio/setup.sh
+++ b/studio/setup.sh
@@ -793,6 +793,22 @@ else
     fi
 fi
 
+# Source-built llama.cpp installs do not have the prebuilt metadata used above
+# for exact release matching. Reuse a complete local source build unless the
+# caller explicitly requested a rebuild or a PR-specific llama.cpp checkout.
+if [ "$_NEED_LLAMA_SOURCE_BUILD" = true ] && \
+   [ "$_LLAMA_FORCE_COMPILE" != "1" ] && \
+   [ -z "$_LLAMA_PR" ] && \
+   [ -x "$LLAMA_CPP_DIR/build/bin/llama-server" ] && \
+   [ -x "$LLAMA_CPP_DIR/build/bin/llama-quantize" ]; then
+    step "llama.cpp" "existing source build found; skipping rebuild"
+    ln -sf build/bin/llama-quantize "$LLAMA_CPP_DIR/llama-quantize"
+    if [ "$_STUDIO_HOME_IS_CUSTOM" = true ]; then
+        : > "$LLAMA_CPP_DIR/$_STUDIO_OWNED_MARKER" 2>/dev/null || true
+    fi
+    _NEED_LLAMA_SOURCE_BUILD=false
+fi
+
 # ── 8. WSL: pre-install GGUF build dependencies for fallback source builds ──
 # On WSL, sudo requires a password and can't be entered during GGUF export
 # (runs in a non-interactive subprocess). Install build deps here instead.
diff --git a/tests/studio/install/test_rocm_support.py b/tests/studio/install/test_rocm_support.py
index bf8d9948a5..ecb1b9be15 100644
--- a/tests/studio/install/test_rocm_support.py
+++ b/tests/studio/install/test_rocm_support.py
@@ -620,6 +620,7 @@ def test_cpu_torch_gets_rocm_reinstall(
         assert "rocm7.1" in str(mock_pip.call_args_list[0])
         assert mock_pip_try.call_count >= 1
         assert "bitsandbytes" in str(mock_pip_try.call_args_list[0])
+        assert mock_pip_try.call_args.kwargs["force_pip"] is True
 
     @patch.object(stack_mod, "IS_WINDOWS", False)
     @patch.object(stack_mod, "pip_install")
@@ -704,6 +705,7 @@ def test_probe_timeout_triggers_reinstall(
         assert mock_pip.call_count == 1
         assert "rocm7.1" in str(mock_pip.call_args_list[0])
         assert mock_pip_try.call_count >= 1
+        assert mock_pip_try.call_args.kwargs["force_pip"] is True
 
     @patch.object(stack_mod, "pip_install")
     @patch.object(stack_mod, "_has_usable_nvidia_gpu", return_value = False)

From 2ec5d004d5f2e90b2a86ee8df68f9305d2714d6f Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Wed, 27 May 2026 16:08:53 -0500
Subject: [PATCH 148/165] fix: use force_pip for Windows ROCm bitsandbytes
 prebuilt wheel install

uv rejects the bnb continuous-release wheel due to filename/metadata
version mismatch (1.33.7.preview vs 0.50.0.dev0). Switch to force_pip=True
(pip bypass) instead of the UV_SKIP_WHEEL_FILENAME_CHECK env var workaround
-- cleaner and consistent with how the Linux path handles it.

BNB_ROCM_VERSION is still set post-install to the detected DLL suffix so
the worker subprocess loads the correct libbitsandbytes_rocm{VER}.dll even
when torch.version.hip reports a newer HIP version than the wheel ships.
---
 studio/install_python_stack.py | 40 +++++++++++++++-------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index 40d6634534..d505c57d9a 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -509,35 +509,29 @@ def _install_bnb_windows_rocm() -> bool:
 
     The continuous-release wheel is intentionally mismatched: the filename
     encodes version 1.33.7.preview (parsed as 1.33.7rc0 by PEP 440) while the
-    wheel metadata reports 0.50.0.dev0.  uv rejects this by default; we set
-    UV_SKIP_WHEEL_FILENAME_CHECK=1 only for this install and restore the env
-    afterwards.
+    wheel metadata reports 0.50.0.dev0.  uv rejects this filename/metadata
+    mismatch; use pip directly (force_pip=True) to bypass that check.
     """
     _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
     if _bnb_win_url is None:
         return False
-    _prev = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
-    os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
-    _ok = False  # init so a raise inside pip_install_try does not produce UnboundLocalError
-    try:
-        _ok = pip_install_try(
-            "bitsandbytes (AMD Windows, pre-release main)",
-            "--force-reinstall",
-            "--no-cache-dir",
-            "--no-deps",
-            _bnb_win_url,
-            constrain = False,
-        )
-    finally:
-        if _prev is None:
-            os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
-        else:
-            os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = _prev
+    _ok = pip_install_try(
+        "bitsandbytes (AMD Windows, pre-release main)",
+        "--force-reinstall",
+        "--no-cache-dir",
+        "--no-deps",
+        _bnb_win_url,
+        constrain = False,
+        force_pip = True,
+    )
     if not _ok:
         return False
-    # After install: detect the actual ROCm DLL suffix from the wheel so any
-    # post-install BNB import in this process loads the correct DLL.
-    # The worker subprocess does the same detection independently (worker.py §1f).
+    # After install: detect the actual ROCm DLL suffix shipped in the wheel and
+    # set BNB_ROCM_VERSION so bitsandbytes loads the correct DLL regardless of
+    # what torch.version.hip reports.  The wheel may ship an older suffix (e.g.
+    # "72") while torch reports a newer HIP version (e.g. 7.13); the env var
+    # override ensures bitsandbytes does not fail looking for a non-existent DLL.
+    # The worker subprocess inherits this env var automatically.
     # Fall back to "72" if detection fails (e.g. install was a no-op / dry-run).
     if "BNB_ROCM_VERSION" not in os.environ:
         _ver = _detect_bnb_rocm_dll_ver() or "72"

From 7be61bbabdccb48b03623c1ee0fb009015360ae0 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 28 May 2026 03:38:00 -0500
Subject: [PATCH 149/165] fix: three small correctness fixes found in PR review

- _install_bnb_windows_rocm: use UV_SKIP_WHEEL_FILENAME_CHECK=1 with
  try/finally instead of force_pip=True so the env var is always
  restored and the failing CI test passes
- _determine_attention_impl_for_gpu_estimate: gate torch._C distributed
  stubs on IS_ROCM so Windows CUDA users keep the real extension
- install.ps1 amd-smi fallback: collect all gfx tokens and index by
  HIP_VISIBLE_DEVICES, matching the hipinfo path on multi-GPU hosts
---
 install.ps1                               | 15 ++++++++----
 studio/backend/utils/hardware/hardware.py |  2 +-
 studio/install_python_stack.py            | 28 +++++++++++++++--------
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/install.ps1 b/install.ps1
index 304ded8595..cab66f5ae1 100644
--- a/install.ps1
+++ b/install.ps1
@@ -1297,17 +1297,22 @@ shell.Run cmd, 0, False
                     $smiOut = & $amdSmiExe.Source list 2>&1 | Out-String
                     if ($LASTEXITCODE -eq 0 -and $smiOut -match "(?im)^GPU\s*[:\[]\s*\d") {
                         $HasROCm = $true
-                        # Attempt 1: newer amd-smi versions embed the gfx arch in list output
-                        if ($smiOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
-                            $ROCmGfxArch = $Matches[1].ToLower()
+                        # Mirror the hipinfo path: collect all gfx tokens in enumeration
+                        # order and pick the runtime-visible one via HIP_VISIBLE_DEVICES.
+                        $_smiVisIdx = if ($env:HIP_VISIBLE_DEVICES -match '^\d') { [int]($env:HIP_VISIBLE_DEVICES -split ',')[0] } elseif ($env:ROCR_VISIBLE_DEVICES -match '^\d') { [int]($env:ROCR_VISIBLE_DEVICES -split ',')[0] } else { 0 }
+                        # Attempt 1: newer amd-smi versions embed the gfx arch in list output.
+                        $_smiGfxTokens = @([regex]::Matches($smiOut, "(?i)\b(gfx\d+[a-z]?)\b") | ForEach-Object { $_.Groups[1].Value.ToLower() })
+                        if ($_smiGfxTokens.Count -gt 0) {
+                            $ROCmGfxArch = if ($_smiVisIdx -lt $_smiGfxTokens.Count) { $_smiGfxTokens[$_smiVisIdx] } else { $_smiGfxTokens[0] }
                             $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
                         } else {
                             # Attempt 2: 'static --asic' exposes ASIC details on ROCm 6+,
                             # including the GFX target needed for wheel index selection.
                             $smiAsicOut = ""
                             try { $smiAsicOut = & $amdSmiExe.Source static --asic 2>&1 | Out-String } catch {}
-                            if ($smiAsicOut -match "(?i)\b(gfx\d+[a-z]?)\b") {
-                                $ROCmGfxArch = $Matches[1].ToLower()
+                            $_asicGfxTokens = @([regex]::Matches($smiAsicOut, "(?i)\b(gfx\d+[a-z]?)\b") | ForEach-Object { $_.Groups[1].Value.ToLower() })
+                            if ($_asicGfxTokens.Count -gt 0) {
+                                $ROCmGfxArch = if ($_smiVisIdx -lt $_asicGfxTokens.Count) { $_asicGfxTokens[$_smiVisIdx] } else { $_asicGfxTokens[0] }
                                 $ROCmGpuLabel = "AMD ROCm ($ROCmGfxArch)"
                             } elseif ($smiAsicOut -match "(?im)Market.?Name\s*[:\|]\s*([^\r\n]+)") {
                                 $ROCmGpuLabel = "AMD ROCm ($($Matches[1].Trim()))"
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index b56360b0e5..210f1209b4 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -960,7 +960,7 @@ def _determine_attention_impl_for_gpu_estimate(config) -> str:
     import sys as _sys
     import types as _types
 
-    if _sys.platform == "win32":
+    if _sys.platform == "win32" and IS_ROCM:
         # Dummy class for any name torch.distributed tries to import from these stubs
         class _Dummy:
             pass
diff --git a/studio/install_python_stack.py b/studio/install_python_stack.py
index d505c57d9a..786b668e7b 100644
--- a/studio/install_python_stack.py
+++ b/studio/install_python_stack.py
@@ -510,20 +510,28 @@ def _install_bnb_windows_rocm() -> bool:
     The continuous-release wheel is intentionally mismatched: the filename
     encodes version 1.33.7.preview (parsed as 1.33.7rc0 by PEP 440) while the
     wheel metadata reports 0.50.0.dev0.  uv rejects this filename/metadata
-    mismatch; use pip directly (force_pip=True) to bypass that check.
+    mismatch; set UV_SKIP_WHEEL_FILENAME_CHECK=1 to bypass that check, then
+    restore the previous value (or remove the var) when done.
     """
     _bnb_win_url = _BNB_ROCM_PRERELEASE_URLS.get("win_amd64")
     if _bnb_win_url is None:
         return False
-    _ok = pip_install_try(
-        "bitsandbytes (AMD Windows, pre-release main)",
-        "--force-reinstall",
-        "--no-cache-dir",
-        "--no-deps",
-        _bnb_win_url,
-        constrain = False,
-        force_pip = True,
-    )
+    _old = os.environ.get("UV_SKIP_WHEEL_FILENAME_CHECK")
+    os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = "1"
+    try:
+        _ok = pip_install_try(
+            "bitsandbytes (AMD Windows, pre-release main)",
+            "--force-reinstall",
+            "--no-cache-dir",
+            "--no-deps",
+            _bnb_win_url,
+            constrain = False,
+        )
+    finally:
+        if _old is None:
+            os.environ.pop("UV_SKIP_WHEEL_FILENAME_CHECK", None)
+        else:
+            os.environ["UV_SKIP_WHEEL_FILENAME_CHECK"] = _old
     if not _ok:
         return False
     # After install: detect the actual ROCm DLL suffix shipped in the wheel and

From bd1162adc3e23493380d8b1ca9451a4b0b8cfbcd Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Thu, 28 May 2026 14:20:18 -0500
Subject: [PATCH 150/165] fix: stub torchao in export subprocess on Windows
 ROCm

On Windows, the ROCm build of PyTorch ships without the distributed
C extension (torch._C._distributed_c10d). torchao, which is pulled in
transitively by transformers.quantizers at import time, walks into
torch.distributed._functional_collectives -> distributed_c10d and
crashes with:

  No module named 'torch._C._distributed_c10d'; 'torch._C' is not a package

This only affected the export subprocess because the training subprocess
already applied an identical torchao stub (introduced separately to fix
the same root cause). The export subprocess had no such guard and died
during 'Importing Unsloth...' before any model loading could happen.

Fix: apply the same _StubSubpackageFinder / torchao stub pattern to the
export subprocess entry point, gated on Windows ROCm detection, before
any import of transformers or unsloth_zoo.

Root cause tracked in ROCm/TheRock#3284 (libuv / torch.distributed
missing on Windows ROCm builds).

Ref: https://github.com/ROCm/TheRock/issues/3284
---
 studio/backend/core/export/worker.py | 96 ++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/studio/backend/core/export/worker.py b/studio/backend/core/export/worker.py
index f77b1966c4..1c0d3a8e72 100644
--- a/studio/backend/core/export/worker.py
+++ b/studio/backend/core/export/worker.py
@@ -439,6 +439,102 @@ def run_export_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
+    # ── 1c. Stub torchao on Windows ROCm ──
+    # torchao (pulled in by transformers.quantizers) imports
+    # torch.distributed._functional_collectives at module level, which imports
+    # distributed_c10d.py unconditionally — that file crashes on Windows ROCm
+    # because torch._C._distributed_c10d (the RCCL backend) is absent.
+    # Stubbing torchao short-circuits the crash entirely.
+    # Must run before any import of transformers / unsloth_zoo.
+    import types as _types
+    import importlib.machinery as _ilm
+    import importlib.abc as _ilabc
+
+    _STUB_SENTINEL = object()
+
+    class _StubTypeMeta(type):
+        def __instancecheck__(cls, instance):
+            return False
+
+        def __subclasscheck__(cls, subclass):
+            return False
+
+        def __getattr__(cls, attr):
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child = _StubTypeMeta(attr, (), {})
+            setattr(cls, attr, child)
+            return child
+
+        def __call__(cls, *args, **kwargs):
+            return None
+
+    def _make_stub_type(name):
+        return _StubTypeMeta(name, (), {})
+
+    def _make_mod_stub(mod_name):
+        m = _types.ModuleType(mod_name)
+        m.__path__ = []
+        m.__package__ = mod_name
+        m._unsloth_stub = _STUB_SENTINEL
+        m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
+
+        def _ga(attr, _m=m, _n=mod_name):
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            child = _make_stub_type(f"{_n}.{attr}")
+            setattr(_m, attr, child)
+            return child
+
+        m.__getattr__ = _ga
+        return m
+
+    class _StubSubpackageLoader(_ilabc.Loader):
+        def __init__(self, mod_name):
+            self._mod_name = mod_name
+
+        def create_module(self, spec):
+            return _make_mod_stub(self._mod_name)
+
+        def exec_module(self, module):
+            pass
+
+    class _StubSubpackageFinder(_ilabc.MetaPathFinder):
+        def find_spec(self, fullname, path, target=None):
+            if "." not in fullname:
+                return None
+            parent = sys.modules.get(fullname.rsplit(".", 1)[0])
+            if parent is None:
+                return None
+            if getattr(parent, "_unsloth_stub", None) is not _STUB_SENTINEL:
+                return None
+            return _ilm.ModuleSpec(
+                fullname, _StubSubpackageLoader(fullname), is_package=True
+            )
+
+    _is_win32_rocm = False
+    if sys.platform == "win32":
+        try:
+            import torch as _torch_probe
+            _is_win32_rocm = bool(
+                getattr(getattr(_torch_probe, "version", None), "hip", None)
+                or "rocm" in getattr(_torch_probe, "__version__", "").lower()
+            )
+            del _torch_probe
+        except Exception:
+            pass
+    if _is_win32_rocm:
+        sys.meta_path.append(_StubSubpackageFinder())
+        for _tao_name in (
+            "torchao",
+            "torchao.quantization",
+            "torchao.dtypes",
+            "torchao.float8",
+            "torchao.utils",
+        ):
+            if _tao_name not in sys.modules:
+                sys.modules[_tao_name] = _make_mod_stub(_tao_name)
+
     # ── 2. Import ML libraries (fresh in this clean process) ──
     try:
         _send_response(

From b3a87920d30d42937880f8f46ff3ccbfee773d79 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 28 May 2026 19:20:44 +0000
Subject: [PATCH 151/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/export/worker.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/studio/backend/core/export/worker.py b/studio/backend/core/export/worker.py
index 1c0d3a8e72..77205bc298 100644
--- a/studio/backend/core/export/worker.py
+++ b/studio/backend/core/export/worker.py
@@ -477,9 +477,9 @@ def _make_mod_stub(mod_name):
         m.__path__ = []
         m.__package__ = mod_name
         m._unsloth_stub = _STUB_SENTINEL
-        m.__spec__ = _ilm.ModuleSpec(mod_name, loader=None, is_package=True)
+        m.__spec__ = _ilm.ModuleSpec(mod_name, loader = None, is_package = True)
 
-        def _ga(attr, _m=m, _n=mod_name):
+        def _ga(attr, _m = m, _n = mod_name):
             if attr.startswith("__"):
                 raise AttributeError(attr)
             child = _make_stub_type(f"{_n}.{attr}")
@@ -500,7 +500,7 @@ def exec_module(self, module):
             pass
 
     class _StubSubpackageFinder(_ilabc.MetaPathFinder):
-        def find_spec(self, fullname, path, target=None):
+        def find_spec(self, fullname, path, target = None):
             if "." not in fullname:
                 return None
             parent = sys.modules.get(fullname.rsplit(".", 1)[0])
@@ -509,13 +509,14 @@ def find_spec(self, fullname, path, target=None):
             if getattr(parent, "_unsloth_stub", None) is not _STUB_SENTINEL:
                 return None
             return _ilm.ModuleSpec(
-                fullname, _StubSubpackageLoader(fullname), is_package=True
+                fullname, _StubSubpackageLoader(fullname), is_package = True
             )
 
     _is_win32_rocm = False
     if sys.platform == "win32":
         try:
             import torch as _torch_probe
+
             _is_win32_rocm = bool(
                 getattr(getattr(_torch_probe, "version", None), "hip", None)
                 or "rocm" in getattr(_torch_probe, "__version__", "").lower()

From 3a199906d1d80db411d81617e0e424af8e814201 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 01:00:27 -0500
Subject: [PATCH 152/165] install.sh, setup.sh: add GPU arch step logging to
 match PS1 scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both shell scripts were missing the step "gpu" terminal log block that
install.ps1 and setup.ps1 emit. This adds equivalent output: GPU label
with gfx arch (e.g. "AMD ROCm (gfx1151)"), ROCm root path, hipconfig
version, and marketing name substep. Includes the same gfx arch detection
chain (rocminfo → amd-smi list → amd-smi static --asic), UNSLOTH_ROCM_GFX_ARCH
env override, and name-based arch inference table (Strix Halo/Point, RDNA 3/4)
as the PS1 versions. install.sh also replaces bare echo blocks for the AMD
ROCm and CPU-only cases with formatted substep output.
---
 install.sh      | 92 ++++++++++++++++++++++++++++++++++++++++++-------
 studio/setup.sh | 75 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+), 13 deletions(-)

diff --git a/install.sh b/install.sh
index adca624bb1..91fdad8fb4 100755
--- a/install.sh
+++ b/install.sh
@@ -1944,27 +1944,93 @@ fi
 _TAURI_GPU_BRANCH=$(_tauri_gpu_branch "$_TAURI_TORCH_INDEX_FAMILY" "$_amd_gpu_radeon")
 tauri_diag_marker "$_TAURI_GPU_BRANCH" "$_TAURI_TORCH_INDEX_FAMILY"
 
-# ── Print CPU-only hint when no GPU detected ──
+# ── GPU detection summary (mirrors install.ps1 step "gpu" block) ──
+if _has_usable_nvidia_gpu; then
+    step "gpu" "NVIDIA GPU detected"
+elif case "$TORCH_INDEX_URL" in */rocm*|*/gfx*) true ;; *) false ;; esac; then
+    # Probe gfx arch for the display label, honouring HIP_VISIBLE_DEVICES
+    _gpu_disp_gfx_all=""
+    _gpu_disp_mkt=""
+    if command -v rocminfo >/dev/null 2>&1; then
+        _gpu_disp_gfx_all=$(rocminfo 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' || true)
+        _gpu_disp_mkt=$(rocminfo 2>/dev/null | awk -F': ' \
+            '/Marketing Name:/{gsub(/^[[:space:]]+|[[:space:]]+$/,"", $2); if($2){print $2; exit}}' || true)
+    fi
+    if [ -z "$_gpu_disp_gfx_all" ] && command -v amd-smi >/dev/null 2>&1; then
+        _gpu_disp_gfx_all=$(amd-smi list 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' || true)
+        [ -z "$_gpu_disp_gfx_all" ] && \
+            _gpu_disp_gfx_all=$(amd-smi static --asic 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' || true)
+    fi
+    if [ -z "$_gpu_disp_mkt" ] && command -v amd-smi >/dev/null 2>&1; then
+        _gpu_disp_mkt=$(amd-smi static --asic 2>/dev/null | awk -F'[:|]' \
+            '/[Mm]arket.?[Nn]ame/{gsub(/^[[:space:]]+|[[:space:]]+$/,"", $2); if($2){print $2; exit}}' || true)
+    fi
+    _gpu_vis="${HIP_VISIBLE_DEVICES:-${ROCR_VISIBLE_DEVICES:-}}"
+    _gpu_vis_idx=0
+    if [ -n "$_gpu_vis" ] && [ "$_gpu_vis" != "-1" ]; then
+        _gpu_first="${_gpu_vis%%,*}"
+        case "$_gpu_first" in ''|*[!0-9]*) ;; *) _gpu_vis_idx=$_gpu_first ;; esac
+    fi
+    _gpu_disp_gfx=$(printf '%s\n' "$_gpu_disp_gfx_all" | awk -v idx="$_gpu_vis_idx" \
+        'NF && !seen[$0]++ { a[n++]=$0 } END { if(idx>=n) idx=0; if(n>0) print a[idx] }')
+    # UNSLOTH_ROCM_GFX_ARCH env override (mirrors install.ps1)
+    if [ -n "${UNSLOTH_ROCM_GFX_ARCH:-}" ]; then
+        _gpu_disp_gfx="${UNSLOTH_ROCM_GFX_ARCH}"
+        substep "gfx arch from UNSLOTH_ROCM_GFX_ARCH env override: $_gpu_disp_gfx"
+    # Name-based arch inference when tools don't report gfx (mirrors install.ps1 nameArchTable)
+    elif [ -z "$_gpu_disp_gfx" ] && [ -n "$_gpu_disp_mkt" ]; then
+        case "$_gpu_disp_mkt" in
+            *"9070 XT"*|*9080*)                                                     _gpu_disp_gfx="gfx1201" ;;  # RDNA 4
+            *9070*|*9060*)                                                          _gpu_disp_gfx="gfx1200" ;;  # RDNA 4
+            *"8060S"*|*"890M"*|*"Strix Halo"*|*"HX 37"*|*"HX 38"*|*"AI 9 HX"*)  _gpu_disp_gfx="gfx1151" ;;  # RDNA 3.5 iGPU
+            *"880M"*|*"Strix Point"*|*"AI 9 36"*|*"AI 7 35"*|*"AI 5 34"*)        _gpu_disp_gfx="gfx1150" ;;  # RDNA 3.5 iGPU
+            *"RX 7900"*|*"RX 7800"*|*"RX 7700"*)                                  _gpu_disp_gfx="gfx1100" ;;  # RDNA 3 desktop
+            *"RX 7600"*)                                                           _gpu_disp_gfx="gfx1102" ;;  # RDNA 3
+            *"780M"*|*"760M"*|*"740M"*|*"Phoenix"*)                               _gpu_disp_gfx="gfx1103" ;;  # RDNA 3 iGPU
+        esac
+        if [ -n "$_gpu_disp_gfx" ]; then
+            substep "gfx arch inferred from GPU name: $_gpu_disp_gfx"
+            substep "Tip: set UNSLOTH_ROCM_GFX_ARCH=$_gpu_disp_gfx to skip inference next time"
+        fi
+    fi
+    # ROCm version via hipconfig, then amd-smi
+    _gpu_rocm_ver=""
+    if command -v hipconfig >/dev/null 2>&1; then
+        _gpu_rocm_ver=$(hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]/{print; exit}' || true)
+    fi
+    if [ -z "$_gpu_rocm_ver" ] && command -v amd-smi >/dev/null 2>&1; then
+        _gpu_rocm_ver=$(amd-smi version 2>/dev/null | awk -F'ROCm version: ' \
+            'NF>1{gsub(/[[:space:]]/,"", $2); print $2; exit}' || true)
+    fi
+    if [ -n "$_gpu_disp_gfx" ]; then
+        step "gpu" "AMD ROCm ($_gpu_disp_gfx)"
+    else
+        step "gpu" "AMD ROCm"
+    fi
+    _rocm_root="${ROCM_PATH:-${HIP_PATH:-/opt/rocm}}"
+    substep "ROCm: $_rocm_root"
+    [ -n "$_gpu_rocm_ver" ] && substep "hipconfig: $_gpu_rocm_ver"
+    [ -n "$_gpu_disp_mkt" ] && [ -n "$_gpu_disp_gfx" ] && substep "GPU: $_gpu_disp_mkt"
+else
+    step "gpu" "none (CPU-only)" "$C_WARN"
+fi
+
+# ── PyTorch wheel index note ──
 case "$TORCH_INDEX_URL" in
     */cpu)
         if [ "$SKIP_TORCH" = false ] && [ "$OS" != "macos" ]; then
-            echo ""
-            echo "  NOTE: No GPU detected (nvidia-smi and ROCm not found)."
-            echo "  Installing CPU-only PyTorch. If you only need GGUF chat/inference,"
-            echo "  re-run with --no-torch for a faster, lighter install:"
-            echo "    curl -fsSL https://unsloth.ai/install.sh | sh -s -- --no-torch"
-            echo "  AMD ROCm users: see https://docs.unsloth.ai/get-started/install-and-update/amd"
-            echo ""
+            substep "No GPU detected -- installing CPU-only PyTorch." "$C_WARN"
+            substep "AMD ROCm users: see https://docs.unsloth.ai/get-started/install-and-update/amd"
+            substep "Re-run with --no-torch for GGUF-only (faster, no PyTorch):"
+            substep "  curl -fsSL https://unsloth.ai/install.sh | sh -s -- --no-torch"
         fi
         ;;
-    */rocm*)
-        echo ""
+    */rocm*|*/gfx*)
         if [ "$_amd_gpu_radeon" = true ]; then
-            echo "  AMD Radeon + ROCm detected -- installing PyTorch wheels from repo.radeon.com"
+            substep "wheels: repo.radeon.com (Radeon)"
         else
-            echo "  AMD ROCm detected -- installing ROCm-enabled PyTorch ($TORCH_INDEX_URL)"
+            substep "wheels: $TORCH_INDEX_URL"
         fi
-        echo ""
         ;;
 esac
 
diff --git a/studio/setup.sh b/studio/setup.sh
index a66cbb3132..5c88da111f 100755
--- a/studio/setup.sh
+++ b/studio/setup.sh
@@ -635,6 +635,81 @@ if [ "$_NEED_T5_INSTALL" = true ]; then
 fi
 fi
 
+# ── GPU detection summary (mirrors setup.ps1 step "gpu" block) ──
+_setup_amd_detected=false
+_setup_gfx_all=""
+_setup_mkt=""
+if command -v rocminfo >/dev/null 2>&1 && \
+   rocminfo 2>/dev/null | awk '/Name:[[:space:]]*gfx[1-9][0-9]/{found=1} END{exit !found}'; then
+    _setup_amd_detected=true
+    _setup_gfx_all=$(rocminfo 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' || true)
+    _setup_mkt=$(rocminfo 2>/dev/null | awk -F': ' \
+        '/Marketing Name:/{gsub(/^[[:space:]]+|[[:space:]]+$/,"", $2); if($2){print $2; exit}}' || true)
+elif command -v amd-smi >/dev/null 2>&1 && \
+     amd-smi list 2>/dev/null | awk '/^GPU[[:space:]]*[:\[][[:space:]]*[0-9]/{ found=1 } END{ exit !found }'; then
+    _setup_amd_detected=true
+    _setup_gfx_all=$(amd-smi list 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' || true)
+    [ -z "$_setup_gfx_all" ] && \
+        _setup_gfx_all=$(amd-smi static --asic 2>/dev/null | grep -oE 'gfx[1-9][0-9a-z]{2,3}' || true)
+    _setup_mkt=$(amd-smi static --asic 2>/dev/null | awk -F'[:|]' \
+        '/[Mm]arket.?[Nn]ame/{gsub(/^[[:space:]]+|[[:space:]]+$/,"", $2); if($2){print $2; exit}}' || true)
+fi
+
+if command -v nvidia-smi >/dev/null 2>&1 && \
+   nvidia-smi -L 2>/dev/null | awk '/^GPU[[:space:]]+[0-9]+:/{found=1} END{exit !found}'; then
+    step "gpu" "NVIDIA GPU detected"
+elif [ "$_setup_amd_detected" = true ]; then
+    _setup_vis="${HIP_VISIBLE_DEVICES:-${ROCR_VISIBLE_DEVICES:-}}"
+    _setup_vis_idx=0
+    if [ -n "$_setup_vis" ] && [ "$_setup_vis" != "-1" ]; then
+        _setup_first="${_setup_vis%%,*}"
+        case "$_setup_first" in ''|*[!0-9]*) ;; *) _setup_vis_idx=$_setup_first ;; esac
+    fi
+    _setup_gfx=$(printf '%s\n' "$_setup_gfx_all" | awk -v idx="$_setup_vis_idx" \
+        'NF && !seen[$0]++ { a[n++]=$0 } END { if(idx>=n) idx=0; if(n>0) print a[idx] }')
+    # UNSLOTH_ROCM_GFX_ARCH env override (mirrors setup.ps1)
+    if [ -n "${UNSLOTH_ROCM_GFX_ARCH:-}" ]; then
+        _setup_gfx="${UNSLOTH_ROCM_GFX_ARCH}"
+        substep "gfx arch from UNSLOTH_ROCM_GFX_ARCH env override: $_setup_gfx"
+    # Name-based arch inference when tools don't report gfx (mirrors setup.ps1 nameArchTable)
+    elif [ -z "$_setup_gfx" ] && [ -n "$_setup_mkt" ]; then
+        case "$_setup_mkt" in
+            *"9070 XT"*|*9080*)                                                     _setup_gfx="gfx1201" ;;  # RDNA 4
+            *9070*|*9060*)                                                          _setup_gfx="gfx1200" ;;  # RDNA 4
+            *"8060S"*|*"890M"*|*"Strix Halo"*|*"HX 37"*|*"HX 38"*|*"AI 9 HX"*)  _setup_gfx="gfx1151" ;;  # RDNA 3.5 iGPU
+            *"880M"*|*"Strix Point"*|*"AI 9 36"*|*"AI 7 35"*|*"AI 5 34"*)        _setup_gfx="gfx1150" ;;  # RDNA 3.5 iGPU
+            *"RX 7900"*|*"RX 7800"*|*"RX 7700"*)                                  _setup_gfx="gfx1100" ;;  # RDNA 3 desktop
+            *"RX 7600"*)                                                           _setup_gfx="gfx1102" ;;  # RDNA 3
+            *"780M"*|*"760M"*|*"740M"*|*"Phoenix"*)                               _setup_gfx="gfx1103" ;;  # RDNA 3 iGPU
+        esac
+        if [ -n "$_setup_gfx" ]; then
+            substep "gfx arch inferred from GPU name: $_setup_gfx"
+            substep "Tip: set UNSLOTH_ROCM_GFX_ARCH=$_setup_gfx to skip inference next time"
+        fi
+    fi
+    # ROCm version via hipconfig, then amd-smi
+    _setup_rocm_ver=""
+    if command -v hipconfig >/dev/null 2>&1; then
+        _setup_rocm_ver=$(hipconfig --version 2>/dev/null | awk 'NR==1 && /^[0-9]/{print; exit}' || true)
+    fi
+    if [ -z "$_setup_rocm_ver" ] && command -v amd-smi >/dev/null 2>&1; then
+        _setup_rocm_ver=$(amd-smi version 2>/dev/null | awk -F'ROCm version: ' \
+            'NF>1{gsub(/[[:space:]]/,"", $2); print $2; exit}' || true)
+    fi
+    if [ -n "$_setup_gfx" ]; then
+        step "gpu" "AMD ROCm ($_setup_gfx)"
+    else
+        step "gpu" "AMD ROCm"
+    fi
+    _setup_rocm_root="${ROCM_PATH:-${HIP_PATH:-/opt/rocm}}"
+    substep "ROCm: $_setup_rocm_root"
+    [ -n "$_setup_rocm_ver" ] && substep "hipconfig: $_setup_rocm_ver"
+    [ -n "$_setup_mkt" ] && [ -n "$_setup_gfx" ] && substep "GPU: $_setup_mkt"
+else
+    step "gpu" "none (chat-only / GGUF)" "$C_WARN"
+    substep "Training and GPU inference require an NVIDIA or AMD ROCm GPU."
+fi
+
 # ── 7. Prefer prebuilt llama.cpp bundles before any source build path ──
 # Nest llama.cpp under $STUDIO_HOME only for real env-overrides; legacy
 # default keeps ~/.unsloth/llama.cpp so pre-PR builds are still discovered.

From c8c60ab3955dcca0cb14ed078e0e6557fefb84e3 Mon Sep 17 00:00:00 2001
From: danielhanchen <michaelhan2050@gmail.com>
Date: Fri, 29 May 2026 10:23:02 +0000
Subject: [PATCH 153/165] Fix BNB_ROCM_VERSION gate, ROCm GPU mask preference,
 APU unified memory and Release build for PR #5301

- main.py: gate BNB_ROCM_VERSION on the rocm bnb DLL or HIP_PATH/ROCM_PATH instead of importing torch on every Windows host
- hardware.py: prefer HIP/ROCR visible-device masks only on ROCm hosts so a stale mask cannot override CUDA_VISIBLE_DEVICES on NVIDIA
- llama_cpp.py: set GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 only for unified-memory APUs (gfx1150/gfx1151)
- setup.sh: pass -DCMAKE_BUILD_TYPE=Release for the HIP source build
- add test_amd_apu_unified_memory.py
---
 studio/backend/core/inference/llama_cpp.py    | 34 ++++++++++++
 studio/backend/main.py                        | 45 ++++++----------
 .../tests/test_amd_apu_unified_memory.py      | 52 +++++++++++++++++++
 studio/backend/utils/hardware/hardware.py     |  5 +-
 studio/setup.sh                               |  3 +-
 5 files changed, 109 insertions(+), 30 deletions(-)
 create mode 100644 studio/backend/tests/test_amd_apu_unified_memory.py

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 0faa37d95b..67a3ee82d5 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1238,6 +1238,32 @@ def _get_gguf_size_bytes(model_path: str) -> int:
 
         return total
 
+    @staticmethod
+    def _amd_apu_wants_unified_memory() -> bool:
+        """True only for AMD unified-memory APUs (gfx1150/gfx1151), where
+        GGML_CUDA_ENABLE_UNIFIED_MEMORY lets llama.cpp use shared system RAM.
+        False for discrete AMD, NVIDIA, CPU and macOS (the env hurts discrete
+        GPUs). ROCm reuses torch.cuda.*; the gcnArchName suffix is stripped."""
+        try:
+            import torch
+
+            if getattr(torch.version, "hip", None) is None:
+                return False
+            if not (hasattr(torch, "cuda") and torch.cuda.is_available()):
+                return False
+            for _i in range(torch.cuda.device_count()):
+                try:
+                    _arch = getattr(
+                        torch.cuda.get_device_properties(_i), "gcnArchName", ""
+                    ) or ""
+                except Exception:
+                    continue
+                if _arch.split(":")[0].strip().lower() in {"gfx1150", "gfx1151"}:
+                    return True
+        except Exception:
+            return False
+        return False
+
     @staticmethod
     def _get_gpu_free_memory() -> list[tuple[int, int]]:
         """Query free memory per GPU.
@@ -3158,6 +3184,14 @@ def load_model(
                 env = child_env_without_native_path_secret()
                 binary_dir = str(Path(binary).parent)
 
+                # AMD unified-memory APUs (gfx1150/gfx1151): let llama.cpp use
+                # shared system RAM. setdefault so a user value wins.
+                if self._amd_apu_wants_unified_memory():
+                    env.setdefault("GGML_CUDA_ENABLE_UNIFIED_MEMORY", "1")
+                    logger.info(
+                        "AMD unified-memory APU: set GGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
+                    )
+
                 if sys.platform == "win32":
                     # See _build_windows_path_dirs for ordering. #5106.
                     path_dirs = self._build_windows_path_dirs(
diff --git a/studio/backend/main.py b/studio/backend/main.py
index 35d0d901f9..93cc387b2b 100644
--- a/studio/backend/main.py
+++ b/studio/backend/main.py
@@ -70,35 +70,22 @@ def _ver_key(name: str) -> tuple:
     # this the server process crashes with "Configured ROCm binary not found".
     # Detect the available DLL, fall back to "72", and set BNB_ROCM_VERSION
     # before any import that pulls in bitsandbytes (mirrors worker.py logic).
-    # Gate on the active torch runtime only. AMD SDK / Radeon Windows wheels
-    # may not set HIP_PATH / ROCM_PATH, but they do populate torch.version.hip
-    # or encode "rocm" in torch.__version__. A previous version of this gate
-    # required HIP_PATH / ROCM_PATH and silently skipped BNB_ROCM_VERSION for
-    # those wheels.
-    _is_rocm_host = False
-    try:
-        import torch as _torch_probe
-
-        _is_rocm_host = bool(
-            getattr(getattr(_torch_probe, "version", None), "hip", None)
-            or "rocm" in getattr(_torch_probe, "__version__", "").lower()
-        )
-        del _torch_probe
-    except Exception:
-        pass
-    if _is_rocm_host and "BNB_ROCM_VERSION" not in os.environ:
+    # Gate on the rocm bnb DLL (the exact file this configures) or HIP_PATH/
+    # ROCM_PATH, not on torch.version.hip: that needed importing torch on every
+    # Windows host (NVIDIA/CPU included), adding seconds to startup. Radeon
+    # wheels without HIP_PATH still ship the rocm bnb DLL, so they are covered.
+    if "BNB_ROCM_VERSION" not in os.environ:
         import glob as _glob
         import logging as _logging
 
+        _hip_env = bool(os.environ.get("HIP_PATH") or os.environ.get("ROCM_PATH"))
         _bnb_rocm_ver = None
+        _found_rocm_bnb = False
         try:
             import importlib.util as _ilu
 
             _bnb_spec = _ilu.find_spec("bitsandbytes")
-            # Use submodule_search_locations (same as install_python_stack.py and
-            # worker.py) rather than spec.origin so that editable installs of
-            # bitsandbytes, where __init__.py may live outside the package root,
-            # are handled consistently across all three probe sites.
+            # submodule_search_locations (not spec.origin) handles editable installs.
             if _bnb_spec and _bnb_spec.submodule_search_locations:
                 import re as _re_bnb
 
@@ -107,6 +94,7 @@ def _ver_key(name: str) -> tuple:
                     for _dll in _glob.glob(
                         os.path.join(_pkg_dir, "libbitsandbytes_rocm*.dll")
                     ):
+                        _found_rocm_bnb = True
                         _km = _re_bnb.search(
                             r"libbitsandbytes_rocm(\d+)\.dll", os.path.basename(_dll)
                         )
@@ -119,13 +107,14 @@ def _ver_key(name: str) -> tuple:
                 "Windows ROCm: BNB DLL detection failed (%s); falling back to version '72'",
                 _e,
             )
-        _bnb_rocm_ver_final = _bnb_rocm_ver or "72"
-        os.environ["BNB_ROCM_VERSION"] = _bnb_rocm_ver_final
-        _logging.getLogger(__name__).info(
-            "Windows ROCm: set BNB_ROCM_VERSION=%s "
-            "(detected from installed BNB wheel; overrides torch.version.hip auto-detection)",
-            _bnb_rocm_ver_final,
-        )
+        # rocm bnb DLL present, or HIP_PATH/ROCM_PATH set (DLL unparsable -> "72").
+        if _found_rocm_bnb or _hip_env:
+            _bnb_rocm_ver_final = _bnb_rocm_ver or "72"
+            os.environ["BNB_ROCM_VERSION"] = _bnb_rocm_ver_final
+            _logging.getLogger(__name__).info(
+                "Windows ROCm: set BNB_ROCM_VERSION=%s (from installed BNB wheel)",
+                _bnb_rocm_ver_final,
+            )
 
 # Ensure backend dir is on sys.path so _platform_compat is importable when
 # main.py is launched directly (e.g. `uvicorn main:app`).
diff --git a/studio/backend/tests/test_amd_apu_unified_memory.py b/studio/backend/tests/test_amd_apu_unified_memory.py
new file mode 100644
index 0000000000..e99b5b1b54
--- /dev/null
+++ b/studio/backend/tests/test_amd_apu_unified_memory.py
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""GGML_CUDA_ENABLE_UNIFIED_MEMORY must be set only for AMD unified-memory APUs
+(gfx1150/gfx1151), never for discrete AMD, NVIDIA, CPU or macOS."""
+
+from __future__ import annotations
+
+import sys
+import types
+
+import pytest
+
+from core.inference.llama_cpp import LlamaCppBackend
+
+
+def _fake_torch(hip, archs, *, cuda_ok = True):
+    t = types.ModuleType("torch")
+    t.version = types.SimpleNamespace(hip = hip)
+    t.cuda = types.SimpleNamespace(
+        is_available = lambda: cuda_ok,
+        device_count = lambda: len(archs),
+        get_device_properties = lambda i: types.SimpleNamespace(gcnArchName = archs[i]),
+    )
+    return t
+
+
+@pytest.mark.parametrize(
+    "hip,archs,expected",
+    [
+        ("6.2.0", ["gfx1151:xnack-"], True),     # Strix Halo APU (suffix stripped)
+        ("6.2.0", ["gfx1150"], True),            # Strix Point APU
+        ("6.2.0", ["gfx1100"], False),           # discrete RDNA3
+        ("6.2.0", ["gfx1201"], False),           # discrete RDNA4
+        ("6.2.0", ["gfx942"], False),            # MI300X (data center)
+        (None, ["sm_90"], False),                # NVIDIA (no torch.version.hip)
+        ("6.2.0", ["gfx1100", "gfx1151"], True), # mixed dGPU + APU
+    ],
+)
+def test_apu_unified_memory_gating(monkeypatch, hip, archs, expected):
+    monkeypatch.setitem(sys.modules, "torch", _fake_torch(hip, archs))
+    assert LlamaCppBackend._amd_apu_wants_unified_memory() is expected
+
+
+def test_cpu_no_cuda_returns_false(monkeypatch):
+    monkeypatch.setitem(sys.modules, "torch", _fake_torch("6.2.0", [], cuda_ok = False))
+    assert LlamaCppBackend._amd_apu_wants_unified_memory() is False
+
+
+def test_missing_torch_returns_false(monkeypatch):
+    monkeypatch.setitem(sys.modules, "torch", None)
+    assert LlamaCppBackend._amd_apu_wants_unified_memory() is False
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 210f1209b4..1c05a9a56a 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -771,8 +771,11 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
     # Use explicit None checks (not `or`) so empty string "" is honoured
     # as "no visible GPUs" rather than falling through to CUDA_VISIBLE_DEVICES.
     cuda_visible = None
+    # Prefer ROCm masks only on a ROCm host, or when no CUDA mask is set, so a
+    # stale HIP_VISIBLE_DEVICES on an NVIDIA host can't override CUDA_VISIBLE_DEVICES.
     _is_rocm_spec = IS_ROCM or (
-        "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ
+        "CUDA_VISIBLE_DEVICES" not in os.environ
+        and ("HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ)
     )
     if _is_rocm_spec:
         hip_vis = os.environ.get("HIP_VISIBLE_DEVICES")
diff --git a/studio/setup.sh b/studio/setup.sh
index 5c88da111f..64ad68f5a6 100755
--- a/studio/setup.sh
+++ b/studio/setup.sh
@@ -1041,7 +1041,8 @@ else
         fi
 
         if [ "$BUILD_OK" = true ]; then
-            CMAKE_ARGS="-DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=ON"
+            # Set Release explicitly (llama.cpp only defaults to it on non-MSVC/Xcode).
+            CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_NATIVE=ON"
             _TRY_METAL_CPU_FALLBACK=false
             _HOST_SYSTEM="$(uname -s 2>/dev/null || true)"
             _HOST_MACHINE="$(uname -m 2>/dev/null || true)"

From 57165b63b372163f02ea719714e22c60e805394d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 29 May 2026 10:24:29 +0000
Subject: [PATCH 154/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py         |  7 ++++---
 .../backend/tests/test_amd_apu_unified_memory.py   | 14 +++++++-------
 studio/backend/utils/hardware/hardware.py          |  4 +++-
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 67a3ee82d5..83c4b4f4da 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1253,9 +1253,10 @@ def _amd_apu_wants_unified_memory() -> bool:
                 return False
             for _i in range(torch.cuda.device_count()):
                 try:
-                    _arch = getattr(
-                        torch.cuda.get_device_properties(_i), "gcnArchName", ""
-                    ) or ""
+                    _arch = (
+                        getattr(torch.cuda.get_device_properties(_i), "gcnArchName", "")
+                        or ""
+                    )
                 except Exception:
                     continue
                 if _arch.split(":")[0].strip().lower() in {"gfx1150", "gfx1151"}:
diff --git a/studio/backend/tests/test_amd_apu_unified_memory.py b/studio/backend/tests/test_amd_apu_unified_memory.py
index e99b5b1b54..e0b819d54b 100644
--- a/studio/backend/tests/test_amd_apu_unified_memory.py
+++ b/studio/backend/tests/test_amd_apu_unified_memory.py
@@ -28,13 +28,13 @@ def _fake_torch(hip, archs, *, cuda_ok = True):
 @pytest.mark.parametrize(
     "hip,archs,expected",
     [
-        ("6.2.0", ["gfx1151:xnack-"], True),     # Strix Halo APU (suffix stripped)
-        ("6.2.0", ["gfx1150"], True),            # Strix Point APU
-        ("6.2.0", ["gfx1100"], False),           # discrete RDNA3
-        ("6.2.0", ["gfx1201"], False),           # discrete RDNA4
-        ("6.2.0", ["gfx942"], False),            # MI300X (data center)
-        (None, ["sm_90"], False),                # NVIDIA (no torch.version.hip)
-        ("6.2.0", ["gfx1100", "gfx1151"], True), # mixed dGPU + APU
+        ("6.2.0", ["gfx1151:xnack-"], True),  # Strix Halo APU (suffix stripped)
+        ("6.2.0", ["gfx1150"], True),  # Strix Point APU
+        ("6.2.0", ["gfx1100"], False),  # discrete RDNA3
+        ("6.2.0", ["gfx1201"], False),  # discrete RDNA4
+        ("6.2.0", ["gfx942"], False),  # MI300X (data center)
+        (None, ["sm_90"], False),  # NVIDIA (no torch.version.hip)
+        ("6.2.0", ["gfx1100", "gfx1151"], True),  # mixed dGPU + APU
     ],
 )
 def test_apu_unified_memory_gating(monkeypatch, hip, archs, expected):
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 1c05a9a56a..4361c8814d 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -775,7 +775,9 @@ def _get_parent_visible_gpu_spec() -> Dict[str, Any]:
     # stale HIP_VISIBLE_DEVICES on an NVIDIA host can't override CUDA_VISIBLE_DEVICES.
     _is_rocm_spec = IS_ROCM or (
         "CUDA_VISIBLE_DEVICES" not in os.environ
-        and ("HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ)
+        and (
+            "HIP_VISIBLE_DEVICES" in os.environ or "ROCR_VISIBLE_DEVICES" in os.environ
+        )
     )
     if _is_rocm_spec:
         hip_vis = os.environ.get("HIP_VISIBLE_DEVICES")

From afb343fa08afc9b9d0faec660f434e6b87a1ed55 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 11:57:53 -0500
Subject: [PATCH 155/165] fix: guard recompile_limit + fix AMD VRAM monitor
 fallback

trainer.py: torch._dynamo.config.recompile_limit does not exist in
some ROCm torch builds (e.g. pytorch.org/whl/rocm6.2 wheels). Guard
the assignment so training doesn't crash on RDNA2/RDNA3.

hardware.py: when amd-smi/nvidia-smi is unavailable or returns no
usable data (HIP SDK-only Windows, Docker, unexpected JSON format),
the existing fallback used torch.cuda.memory_allocated() which is
process-specific and reads near-zero even with a fully loaded model.
Switch to torch.cuda.mem_get_info() via _torch_get_per_device_info()
which reports system-wide VRAM occupancy so the GPU monitor shows
real usage on all AMD systems without requiring amd-smi.
---
 studio/backend/core/training/trainer.py   |  5 ++++-
 studio/backend/utils/hardware/hardware.py | 25 +++++++++++++++++++++++
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
index 5473c6fe1e..f1faaee4fb 100644
--- a/studio/backend/core/training/trainer.py
+++ b/studio/backend/core/training/trainer.py
@@ -42,7 +42,10 @@
     get_visible_gpu_count,
 )
 
-torch._dynamo.config.recompile_limit = 64
+# recompile_limit does not exist in some ROCm torch builds (e.g. pytorch.org/whl/rocm6.2).
+# Guard so training doesn't crash on RDNA2/RDNA3 with older ROCm torch wheels.
+if hasattr(torch._dynamo.config, "recompile_limit"):
+    torch._dynamo.config.recompile_limit = 64
 from unsloth import FastLanguageModel, FastVisionModel, is_bfloat16_supported
 from unsloth.chat_templates import get_chat_template
 
diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 4361c8814d..1113187628 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -522,6 +522,31 @@ def get_gpu_utilization() -> Dict[str, Any]:
                     result, _get_parent_visible_gpu_spec()
                 )
             return result
+        # amd-smi / nvidia-smi unavailable or returned no usable data (common on
+        # HIP SDK-only Windows, Docker containers, and some amd-smi JSON versions).
+        # Fall back to torch.cuda.mem_get_info which gives system-wide VRAM
+        # occupancy rather than process-only memory_allocated, so the monitor
+        # shows real GPU usage even without an SMI tool present.
+        _visible_spec = _get_parent_visible_gpu_spec()
+        _numeric_ids = _visible_spec.get("numeric_ids") or [0]
+        _primary_idx = [_numeric_ids[0]] if _numeric_ids else [0]
+        _torch_devices = _torch_get_per_device_info(_primary_idx)
+        if _torch_devices:
+            _td = _torch_devices[0]
+            _total = _td["total_gb"]
+            _used = _td["used_gb"]
+            return {
+                "available": True,
+                "backend": _backend_label(device),
+                "gpu_utilization_pct": None,
+                "temperature_c": None,
+                "vram_used_gb": _used,
+                "vram_total_gb": _total,
+                "vram_utilization_pct": round((_used / _total) * 100, 1) if _total > 0 else None,
+                "power_draw_w": None,
+                "power_limit_w": None,
+                "power_utilization_pct": None,
+            }
 
     # MLX path: single _read_apple_gpu_stats() call carries both VRAM-used
     # bytes and GPU utilization %. psutil for unified-memory total is cheap.

From 3c65ef6659b7b93b23eabf1fa4e7f243c456099a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 29 May 2026 16:58:26 +0000
Subject: [PATCH 156/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 1113187628..0f8a4e5582 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -542,7 +542,9 @@ def get_gpu_utilization() -> Dict[str, Any]:
                 "temperature_c": None,
                 "vram_used_gb": _used,
                 "vram_total_gb": _total,
-                "vram_utilization_pct": round((_used / _total) * 100, 1) if _total > 0 else None,
+                "vram_utilization_pct": round((_used / _total) * 100, 1)
+                if _total > 0
+                else None,
                 "power_draw_w": None,
                 "power_limit_w": None,
                 "power_utilization_pct": None,

From a0a8b026292d4aa5e05652d5d1c494e6d91a0788 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 14:53:59 -0500
Subject: [PATCH 157/165] fix: Windows VRAM monitor via Performance Counter API

When amd-smi/nvidia-smi is unavailable on Windows, query dedicated GPU
VRAM via Windows Performance Counters (same source as Task Manager).
This gives system-wide cross-process usage, fixing the near-zero reading
caused by torch.cuda.mem_get_info only seeing the Studio server process.

The Linux fallback path is unchanged (mem_get_info is system-wide on ROCm).
---
 studio/backend/utils/hardware/hardware.py | 58 +++++++++++++++++++++--
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 0f8a4e5582..621acfd39a 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -508,6 +508,38 @@ def _read_apple_gpu_stats() -> Dict[str, Any]:
     }
 
 
+def _windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
+    """Query system-wide dedicated GPU VRAM via Windows Performance Counters.
+
+    Uses the same data source as Task Manager so it reflects cross-process
+    usage accurately. Works for any GPU vendor without amd-smi or nvidia-smi.
+    Returns (used_gb, total_gb) or (None, None) on failure.
+    """
+    import subprocess as _sp
+    if platform.system() != "Windows":
+        return None, None
+    try:
+        ps = (
+            "$s=(Get-Counter '\\GPU Adapter Memory(*)\\Dedicated Usage'"
+            " -ErrorAction SilentlyContinue).CounterSamples;"
+            "if($s){($s|Measure-Object CookedValue -Sum).Sum}else{-1}"
+        )
+        r = _sp.run(
+            ["powershell", "-NoProfile", "-NonInteractive", "-Command", ps],
+            capture_output=True, text=True, timeout=5,
+        )
+        if r.returncode != 0 or not r.stdout.strip():
+            return None, None
+        used_bytes = float(r.stdout.strip())
+        if used_bytes < 0:
+            return None, None
+        import torch as _torch
+        total_bytes = _torch.cuda.get_device_properties(0).total_memory
+        return round(used_bytes / (1024 ** 3), 2), round(total_bytes / (1024 ** 3), 2)
+    except Exception:
+        return None, None
+
+
 def get_gpu_utilization() -> Dict[str, Any]:
     """Return a live snapshot of device utilization information."""
     device = get_device()
@@ -522,11 +554,27 @@ def get_gpu_utilization() -> Dict[str, Any]:
                     result, _get_parent_visible_gpu_spec()
                 )
             return result
-        # amd-smi / nvidia-smi unavailable or returned no usable data (common on
-        # HIP SDK-only Windows, Docker containers, and some amd-smi JSON versions).
-        # Fall back to torch.cuda.mem_get_info which gives system-wide VRAM
-        # occupancy rather than process-only memory_allocated, so the monitor
-        # shows real GPU usage even without an SMI tool present.
+        # SMI tool unavailable or returned no usable data. On Windows, query
+        # the Performance Counter API (same source as Task Manager) for
+        # system-wide dedicated VRAM — covers cross-process usage that
+        # torch.cuda.mem_get_info cannot see from the Studio server process.
+        if platform.system() == "Windows":
+            _win_used, _win_total = _windows_perf_counter_vram_gb()
+            if _win_used is not None and _win_total is not None:
+                return {
+                    "available": True,
+                    "backend": _backend_label(device),
+                    "gpu_utilization_pct": None,
+                    "temperature_c": None,
+                    "vram_used_gb": _win_used,
+                    "vram_total_gb": _win_total,
+                    "vram_utilization_pct": round((_win_used / _win_total) * 100, 1)
+                    if _win_total > 0 else None,
+                    "power_draw_w": None,
+                    "power_limit_w": None,
+                    "power_utilization_pct": None,
+                }
+        # Linux/macOS fallback: torch.cuda.mem_get_info is system-wide on Linux ROCm.
         _visible_spec = _get_parent_visible_gpu_spec()
         _numeric_ids = _visible_spec.get("numeric_ids") or [0]
         _primary_idx = [_numeric_ids[0]] if _numeric_ids else [0]

From 82dad78b583134e479a6132848e368901dc06964 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 29 May 2026 19:54:52 +0000
Subject: [PATCH 158/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 621acfd39a..0dc9412820 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -516,6 +516,7 @@ def _windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
     Returns (used_gb, total_gb) or (None, None) on failure.
     """
     import subprocess as _sp
+
     if platform.system() != "Windows":
         return None, None
     try:
@@ -526,7 +527,9 @@ def _windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
         )
         r = _sp.run(
             ["powershell", "-NoProfile", "-NonInteractive", "-Command", ps],
-            capture_output=True, text=True, timeout=5,
+            capture_output = True,
+            text = True,
+            timeout = 5,
         )
         if r.returncode != 0 or not r.stdout.strip():
             return None, None
@@ -534,8 +537,9 @@ def _windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
         if used_bytes < 0:
             return None, None
         import torch as _torch
+
         total_bytes = _torch.cuda.get_device_properties(0).total_memory
-        return round(used_bytes / (1024 ** 3), 2), round(total_bytes / (1024 ** 3), 2)
+        return round(used_bytes / (1024**3), 2), round(total_bytes / (1024**3), 2)
     except Exception:
         return None, None
 
@@ -569,7 +573,8 @@ def get_gpu_utilization() -> Dict[str, Any]:
                     "vram_used_gb": _win_used,
                     "vram_total_gb": _win_total,
                     "vram_utilization_pct": round((_win_used / _win_total) * 100, 1)
-                    if _win_total > 0 else None,
+                    if _win_total > 0
+                    else None,
                     "power_draw_w": None,
                     "power_limit_w": None,
                     "power_utilization_pct": None,

From 8519d418f256e5f6f7cb39114fe6a608d687698c Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 14:56:46 -0500
Subject: [PATCH 159/165] fix: rename to _rocm_windows_perf_counter_vram_gb,
 scope to IS_ROCM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The function is specific to AMD ROCm: amd-smi is absent on Windows when
only the HIP SDK is installed. The call is now scoped to IS_ROCM so the
NVIDIA Windows path is untouched (nvidia-smi handles that case).
---
 studio/backend/utils/hardware/hardware.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 0dc9412820..cb036dbbaf 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -508,7 +508,7 @@ def _read_apple_gpu_stats() -> Dict[str, Any]:
     }
 
 
-def _windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
+def _rocm_windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
     """Query system-wide dedicated GPU VRAM via Windows Performance Counters.
 
     Uses the same data source as Task Manager so it reflects cross-process
@@ -562,8 +562,8 @@ def get_gpu_utilization() -> Dict[str, Any]:
         # the Performance Counter API (same source as Task Manager) for
         # system-wide dedicated VRAM — covers cross-process usage that
         # torch.cuda.mem_get_info cannot see from the Studio server process.
-        if platform.system() == "Windows":
-            _win_used, _win_total = _windows_perf_counter_vram_gb()
+        if IS_ROCM and platform.system() == "Windows":
+            _win_used, _win_total = _rocm_windows_perf_counter_vram_gb()
             if _win_used is not None and _win_total is not None:
                 return {
                     "available": True,

From 0c2f5821ff7e460b876d9dd2138300f0989210e2 Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 14:59:48 -0500
Subject: [PATCH 160/165] =?UTF-8?q?fix:=20AMD=20VRAM=20monitor=20=E2=80=94?=
 =?UTF-8?q?=20Linux=20DRM=20sysfs=20+=20Windows=20perf=20counter?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linux: read /sys/class/drm/card*/device/mem_info_vram_used|total for
system-wide GPU memory across all processes. No tools required, always
present on Linux AMD systems.

Windows: Windows Performance Counter API (already added).

Both paths are gated on IS_ROCM and only fire when amd-smi is absent or
returns no usable data. torch mem_get_info remains as the last resort
(process-local).
---
 studio/backend/utils/hardware/hardware.py | 43 ++++++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index cb036dbbaf..7b6b992739 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -508,6 +508,30 @@ def _read_apple_gpu_stats() -> Dict[str, Any]:
     }
 
 
+def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
+    """Query system-wide AMD GPU VRAM via Linux DRM sysfs.
+
+    Reads /sys/class/drm/card*/device/mem_info_vram_* which the kernel
+    updates in real-time across all processes. No tools required.
+    Returns (used_gb, total_gb) or (None, None) on failure.
+    """
+    import glob as _glob
+    if platform.system() != "Linux":
+        return None, None
+    try:
+        used_files = _glob.glob("/sys/class/drm/card*/device/mem_info_vram_used")
+        total_files = _glob.glob("/sys/class/drm/card*/device/mem_info_vram_total")
+        if not used_files or not total_files:
+            return None, None
+        used_bytes = sum(int(open(f).read().strip()) for f in used_files)
+        total_bytes = sum(int(open(f).read().strip()) for f in total_files)
+        if total_bytes == 0:
+            return None, None
+        return round(used_bytes / (1024 ** 3), 2), round(total_bytes / (1024 ** 3), 2)
+    except Exception:
+        return None, None
+
+
 def _rocm_windows_perf_counter_vram_gb() -> tuple[Optional[float], Optional[float]]:
     """Query system-wide dedicated GPU VRAM via Windows Performance Counters.
 
@@ -579,7 +603,24 @@ def get_gpu_utilization() -> Dict[str, Any]:
                     "power_limit_w": None,
                     "power_utilization_pct": None,
                 }
-        # Linux/macOS fallback: torch.cuda.mem_get_info is system-wide on Linux ROCm.
+        # Linux: DRM sysfs gives system-wide VRAM across all processes, no tools needed.
+        if IS_ROCM and platform.system() == "Linux":
+            _linux_used, _linux_total = _rocm_linux_sysfs_vram_gb()
+            if _linux_used is not None and _linux_total is not None:
+                return {
+                    "available": True,
+                    "backend": _backend_label(device),
+                    "gpu_utilization_pct": None,
+                    "temperature_c": None,
+                    "vram_used_gb": _linux_used,
+                    "vram_total_gb": _linux_total,
+                    "vram_utilization_pct": round((_linux_used / _linux_total) * 100, 1)
+                    if _linux_total > 0 else None,
+                    "power_draw_w": None,
+                    "power_limit_w": None,
+                    "power_utilization_pct": None,
+                }
+        # Last resort: torch mem_get_info (process-local).
         _visible_spec = _get_parent_visible_gpu_spec()
         _numeric_ids = _visible_spec.get("numeric_ids") or [0]
         _primary_idx = [_numeric_ids[0]] if _numeric_ids else [0]

From 32fd3c4f1142f2bb10a76cb503733566cc1f61fb Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 29 May 2026 20:00:02 +0000
Subject: [PATCH 161/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 7b6b992739..4b7f6b0e49 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -516,6 +516,7 @@ def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
     Returns (used_gb, total_gb) or (None, None) on failure.
     """
     import glob as _glob
+
     if platform.system() != "Linux":
         return None, None
     try:
@@ -527,7 +528,7 @@ def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
         total_bytes = sum(int(open(f).read().strip()) for f in total_files)
         if total_bytes == 0:
             return None, None
-        return round(used_bytes / (1024 ** 3), 2), round(total_bytes / (1024 ** 3), 2)
+        return round(used_bytes / (1024**3), 2), round(total_bytes / (1024**3), 2)
     except Exception:
         return None, None
 
@@ -615,7 +616,8 @@ def get_gpu_utilization() -> Dict[str, Any]:
                     "vram_used_gb": _linux_used,
                     "vram_total_gb": _linux_total,
                     "vram_utilization_pct": round((_linux_used / _linux_total) * 100, 1)
-                    if _linux_total > 0 else None,
+                    if _linux_total > 0
+                    else None,
                     "power_draw_w": None,
                     "power_limit_w": None,
                     "power_utilization_pct": None,

From f5a4e3ca7374930903cb6ecd18761ef8d9eecdbb Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 23:28:21 -0500
Subject: [PATCH 162/165] =?UTF-8?q?fix:=20AMD=20GPU=20monitor=20=E2=80=94?=
 =?UTF-8?q?=20utilization,=20temperature,=20and=20power=20for=20Windows=20?=
 =?UTF-8?q?and=20Linux=20fallback=20paths?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Windows: GPU utilization via \GPU Engine(*engtype_3D*)\Utilization Percentage perf counter
- Windows: temperature and power via ADL (atiadlxx.dll, ships with Adrenalin)
- Linux: GPU utilization via DRM sysfs gpu_busy_percent
- Linux: temperature via hwmon temp1_input (millidegrees C)
- Linux: power via hwmon power1_average / power1_input (microwatts)

All paths are no-op fallbacks (None) when the source is unavailable.
Mirrors what nvidia-smi provides on the CUDA path.
---
 studio/backend/utils/hardware/hardware.py | 165 +++++++++++++++++++++-
 1 file changed, 159 insertions(+), 6 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 4b7f6b0e49..3a391b680c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -508,6 +508,154 @@ def _read_apple_gpu_stats() -> Dict[str, Any]:
     }
 
 
+def _rocm_linux_sysfs_gpu_busy_pct() -> Optional[float]:
+    """Query AMD GPU compute utilization via Linux DRM sysfs gpu_busy_percent."""
+    import glob as _glob
+
+    if platform.system() != "Linux":
+        return None
+    try:
+        files = _glob.glob("/sys/class/drm/card*/device/gpu_busy_percent")
+        if not files:
+            return None
+        values = [int(open(f).read().strip()) for f in files]
+        return round(sum(values) / len(values), 1)
+    except Exception:
+        return None
+
+
+def _rocm_linux_sysfs_temp_c() -> Optional[float]:
+    """Query AMD GPU edge temperature via Linux DRM hwmon sysfs (temp1_input, millidegrees C)."""
+    import glob as _glob
+
+    if platform.system() != "Linux":
+        return None
+    try:
+        files = _glob.glob("/sys/class/drm/card*/device/hwmon/hwmon*/temp1_input")
+        if not files:
+            return None
+        temps = [int(open(f).read().strip()) / 1000.0 for f in files]
+        return round(max(temps), 1)
+    except Exception:
+        return None
+
+
+def _rocm_linux_sysfs_power_w() -> Optional[float]:
+    """Query AMD GPU average power draw via Linux DRM hwmon sysfs (microwatts)."""
+    import glob as _glob
+
+    if platform.system() != "Linux":
+        return None
+    try:
+        for pattern in (
+            "/sys/class/drm/card*/device/hwmon/hwmon*/power1_average",
+            "/sys/class/drm/card*/device/hwmon/hwmon*/power1_input",
+        ):
+            files = _glob.glob(pattern)
+            if files:
+                watts = sum(int(open(f).read().strip()) / 1_000_000.0 for f in files)
+                return round(watts, 1)
+        return None
+    except Exception:
+        return None
+
+
+def _rocm_windows_adl_temp_and_power() -> tuple[Optional[float], Optional[float]]:
+    """Query AMD GPU temperature and power via ADL (atiadlxx.dll).
+
+    atiadlxx.dll ships with Adrenalin drivers. ctypes is stdlib — no new packages.
+    Uses ADL2_New_QueryPMLogData_Get which returns all PM sensors in one call.
+    Returns (temp_c, power_w) or (None, None) on failure.
+    """
+    if platform.system() != "Windows":
+        return None, None
+    import ctypes
+
+    ADL_OK = 0
+    ADL_PMLOG_TEMPERATURE_EDGE = 7
+    ADL_PMLOG_ASIC_POWER = 17
+    ADL_PMLOG_MAX_SENSORS = 256
+    ADL_SENSOR_UNAVAILABLE = 0xFFFFFFFF
+
+    class _ADLPMLogSample(ctypes.Structure):
+        _fields_ = [("ulSensorInd", ctypes.c_uint), ("ulValue", ctypes.c_uint)]
+
+    class _ADLPMLogData(ctypes.Structure):
+        _fields_ = [
+            ("ulVersion", ctypes.c_uint),
+            ("ulActiveSampleRate", ctypes.c_uint),
+            ("ulLastUpdated", ctypes.c_longlong),
+            ("ulValues", _ADLPMLogSample * ADL_PMLOG_MAX_SENSORS),
+            ("ulReserved", ctypes.c_uint * 256),
+        ]
+
+    class _ADLPMLogDataOutput(ctypes.Structure):
+        _fields_ = [("iSize", ctypes.c_int), ("log", _ADLPMLogData)]
+
+    try:
+        try:
+            _adl = ctypes.cdll.LoadLibrary("atiadlxx.dll")
+        except OSError:
+            return None, None
+
+        _ADL_MALLOC_CB = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_int)
+        _bufs: list = []
+
+        @_ADL_MALLOC_CB
+        def _malloc_cb(size: int) -> int:
+            buf = (ctypes.c_char * size)()
+            _bufs.append(buf)
+            return ctypes.cast(buf, ctypes.c_void_p).value
+
+        ctx = ctypes.c_void_p(None)
+        if _adl.ADL2_Main_Control_Create(_malloc_cb, 1, ctypes.byref(ctx)) != ADL_OK:
+            return None, None
+
+        try:
+            pm = _ADLPMLogDataOutput()
+            pm.iSize = ctypes.sizeof(_ADLPMLogDataOutput)
+            if _adl.ADL2_New_QueryPMLogData_Get(ctx, 0, ctypes.byref(pm)) != ADL_OK:
+                return None, None
+
+            temp_raw = pm.log.ulValues[ADL_PMLOG_TEMPERATURE_EDGE].ulValue
+            power_raw = pm.log.ulValues[ADL_PMLOG_ASIC_POWER].ulValue
+
+            temp = round(float(temp_raw), 1) if temp_raw not in (0, ADL_SENSOR_UNAVAILABLE) else None
+            power = round(float(power_raw), 1) if power_raw not in (0, ADL_SENSOR_UNAVAILABLE) else None
+            return temp, power
+        finally:
+            _adl.ADL2_Main_Control_Destroy(ctx)
+    except Exception:
+        return None, None
+
+
+def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
+    """Query AMD GPU compute utilization via Windows Performance Counters (3D engine nodes)."""
+    import subprocess as _sp
+
+    if platform.system() != "Windows":
+        return None
+    try:
+        ps = (
+            "$s=(Get-Counter '\\GPU Engine(*engtype_3D*)\\Utilization Percentage'"
+            " -ErrorAction SilentlyContinue).CounterSamples;"
+            "if($s){[math]::Min(($s|Measure-Object CookedValue -Sum).Sum,100)}else{-1}"
+        )
+        r = _sp.run(
+            ["powershell", "-NoProfile", "-NonInteractive", "-Command", ps],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        if r.returncode != 0 or not r.stdout.strip():
+            return None
+        val = float(r.stdout.strip())
+        return round(val, 1) if val >= 0 else None
+    except Exception:
+        return None
+
+
+
 def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
     """Query system-wide AMD GPU VRAM via Linux DRM sysfs.
 
@@ -590,17 +738,19 @@ def get_gpu_utilization() -> Dict[str, Any]:
         if IS_ROCM and platform.system() == "Windows":
             _win_used, _win_total = _rocm_windows_perf_counter_vram_gb()
             if _win_used is not None and _win_total is not None:
+                _win_util = _rocm_windows_perf_counter_gpu_util_pct()
+                _win_temp, _win_power = _rocm_windows_adl_temp_and_power()
                 return {
                     "available": True,
                     "backend": _backend_label(device),
-                    "gpu_utilization_pct": None,
-                    "temperature_c": None,
+                    "gpu_utilization_pct": _win_util,
+                    "temperature_c": _win_temp,
                     "vram_used_gb": _win_used,
                     "vram_total_gb": _win_total,
                     "vram_utilization_pct": round((_win_used / _win_total) * 100, 1)
                     if _win_total > 0
                     else None,
-                    "power_draw_w": None,
+                    "power_draw_w": _win_power,
                     "power_limit_w": None,
                     "power_utilization_pct": None,
                 }
@@ -608,17 +758,20 @@ def get_gpu_utilization() -> Dict[str, Any]:
         if IS_ROCM and platform.system() == "Linux":
             _linux_used, _linux_total = _rocm_linux_sysfs_vram_gb()
             if _linux_used is not None and _linux_total is not None:
+                _linux_util = _rocm_linux_sysfs_gpu_busy_pct()
+                _linux_temp = _rocm_linux_sysfs_temp_c()
+                _linux_power = _rocm_linux_sysfs_power_w()
                 return {
                     "available": True,
                     "backend": _backend_label(device),
-                    "gpu_utilization_pct": None,
-                    "temperature_c": None,
+                    "gpu_utilization_pct": _linux_util,
+                    "temperature_c": _linux_temp,
                     "vram_used_gb": _linux_used,
                     "vram_total_gb": _linux_total,
                     "vram_utilization_pct": round((_linux_used / _linux_total) * 100, 1)
                     if _linux_total > 0
                     else None,
-                    "power_draw_w": None,
+                    "power_draw_w": _linux_power,
                     "power_limit_w": None,
                     "power_utilization_pct": None,
                 }

From da6469c5e6565d45de46fdecef4a39600c0e4327 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 30 May 2026 04:29:18 +0000
Subject: [PATCH 163/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 3a391b680c..88019d7685 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -620,8 +620,16 @@ def _malloc_cb(size: int) -> int:
             temp_raw = pm.log.ulValues[ADL_PMLOG_TEMPERATURE_EDGE].ulValue
             power_raw = pm.log.ulValues[ADL_PMLOG_ASIC_POWER].ulValue
 
-            temp = round(float(temp_raw), 1) if temp_raw not in (0, ADL_SENSOR_UNAVAILABLE) else None
-            power = round(float(power_raw), 1) if power_raw not in (0, ADL_SENSOR_UNAVAILABLE) else None
+            temp = (
+                round(float(temp_raw), 1)
+                if temp_raw not in (0, ADL_SENSOR_UNAVAILABLE)
+                else None
+            )
+            power = (
+                round(float(power_raw), 1)
+                if power_raw not in (0, ADL_SENSOR_UNAVAILABLE)
+                else None
+            )
             return temp, power
         finally:
             _adl.ADL2_Main_Control_Destroy(ctx)
@@ -643,9 +651,9 @@ def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
         )
         r = _sp.run(
             ["powershell", "-NoProfile", "-NonInteractive", "-Command", ps],
-            capture_output=True,
-            text=True,
-            timeout=5,
+            capture_output = True,
+            text = True,
+            timeout = 5,
         )
         if r.returncode != 0 or not r.stdout.strip():
             return None
@@ -655,7 +663,6 @@ def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
         return None
 
 
-
 def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
     """Query system-wide AMD GPU VRAM via Linux DRM sysfs.
 

From 271160743c56c5fe23790488d2d34c70a465cb3f Mon Sep 17 00:00:00 2001
From: LeoBorcherding <borchborchmail@gmail.com>
Date: Fri, 29 May 2026 23:38:08 -0500
Subject: [PATCH 164/165] =?UTF-8?q?fix:=20remove=20ADL=20ctypes=20?=
 =?UTF-8?q?=E2=80=94=20does=20not=20support=20AMD=20iGPU=20(Strix=20Halo)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 studio/backend/utils/hardware/hardware.py | 88 ++---------------------
 1 file changed, 6 insertions(+), 82 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index 88019d7685..db3ad1aa1c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -560,82 +560,6 @@ def _rocm_linux_sysfs_power_w() -> Optional[float]:
         return None
 
 
-def _rocm_windows_adl_temp_and_power() -> tuple[Optional[float], Optional[float]]:
-    """Query AMD GPU temperature and power via ADL (atiadlxx.dll).
-
-    atiadlxx.dll ships with Adrenalin drivers. ctypes is stdlib — no new packages.
-    Uses ADL2_New_QueryPMLogData_Get which returns all PM sensors in one call.
-    Returns (temp_c, power_w) or (None, None) on failure.
-    """
-    if platform.system() != "Windows":
-        return None, None
-    import ctypes
-
-    ADL_OK = 0
-    ADL_PMLOG_TEMPERATURE_EDGE = 7
-    ADL_PMLOG_ASIC_POWER = 17
-    ADL_PMLOG_MAX_SENSORS = 256
-    ADL_SENSOR_UNAVAILABLE = 0xFFFFFFFF
-
-    class _ADLPMLogSample(ctypes.Structure):
-        _fields_ = [("ulSensorInd", ctypes.c_uint), ("ulValue", ctypes.c_uint)]
-
-    class _ADLPMLogData(ctypes.Structure):
-        _fields_ = [
-            ("ulVersion", ctypes.c_uint),
-            ("ulActiveSampleRate", ctypes.c_uint),
-            ("ulLastUpdated", ctypes.c_longlong),
-            ("ulValues", _ADLPMLogSample * ADL_PMLOG_MAX_SENSORS),
-            ("ulReserved", ctypes.c_uint * 256),
-        ]
-
-    class _ADLPMLogDataOutput(ctypes.Structure):
-        _fields_ = [("iSize", ctypes.c_int), ("log", _ADLPMLogData)]
-
-    try:
-        try:
-            _adl = ctypes.cdll.LoadLibrary("atiadlxx.dll")
-        except OSError:
-            return None, None
-
-        _ADL_MALLOC_CB = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_int)
-        _bufs: list = []
-
-        @_ADL_MALLOC_CB
-        def _malloc_cb(size: int) -> int:
-            buf = (ctypes.c_char * size)()
-            _bufs.append(buf)
-            return ctypes.cast(buf, ctypes.c_void_p).value
-
-        ctx = ctypes.c_void_p(None)
-        if _adl.ADL2_Main_Control_Create(_malloc_cb, 1, ctypes.byref(ctx)) != ADL_OK:
-            return None, None
-
-        try:
-            pm = _ADLPMLogDataOutput()
-            pm.iSize = ctypes.sizeof(_ADLPMLogDataOutput)
-            if _adl.ADL2_New_QueryPMLogData_Get(ctx, 0, ctypes.byref(pm)) != ADL_OK:
-                return None, None
-
-            temp_raw = pm.log.ulValues[ADL_PMLOG_TEMPERATURE_EDGE].ulValue
-            power_raw = pm.log.ulValues[ADL_PMLOG_ASIC_POWER].ulValue
-
-            temp = (
-                round(float(temp_raw), 1)
-                if temp_raw not in (0, ADL_SENSOR_UNAVAILABLE)
-                else None
-            )
-            power = (
-                round(float(power_raw), 1)
-                if power_raw not in (0, ADL_SENSOR_UNAVAILABLE)
-                else None
-            )
-            return temp, power
-        finally:
-            _adl.ADL2_Main_Control_Destroy(ctx)
-    except Exception:
-        return None, None
-
 
 def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
     """Query AMD GPU compute utilization via Windows Performance Counters (3D engine nodes)."""
@@ -651,9 +575,9 @@ def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
         )
         r = _sp.run(
             ["powershell", "-NoProfile", "-NonInteractive", "-Command", ps],
-            capture_output = True,
-            text = True,
-            timeout = 5,
+            capture_output=True,
+            text=True,
+            timeout=5,
         )
         if r.returncode != 0 or not r.stdout.strip():
             return None
@@ -663,6 +587,7 @@ def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
         return None
 
 
+
 def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
     """Query system-wide AMD GPU VRAM via Linux DRM sysfs.
 
@@ -746,18 +671,17 @@ def get_gpu_utilization() -> Dict[str, Any]:
             _win_used, _win_total = _rocm_windows_perf_counter_vram_gb()
             if _win_used is not None and _win_total is not None:
                 _win_util = _rocm_windows_perf_counter_gpu_util_pct()
-                _win_temp, _win_power = _rocm_windows_adl_temp_and_power()
                 return {
                     "available": True,
                     "backend": _backend_label(device),
                     "gpu_utilization_pct": _win_util,
-                    "temperature_c": _win_temp,
+                    "temperature_c": None,
                     "vram_used_gb": _win_used,
                     "vram_total_gb": _win_total,
                     "vram_utilization_pct": round((_win_used / _win_total) * 100, 1)
                     if _win_total > 0
                     else None,
-                    "power_draw_w": _win_power,
+                    "power_draw_w": None,
                     "power_limit_w": None,
                     "power_utilization_pct": None,
                 }

From 5e99a47c7c97a00a3f290611a12b25913ef96976 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 30 May 2026 04:38:33 +0000
Subject: [PATCH 165/165] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/utils/hardware/hardware.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/studio/backend/utils/hardware/hardware.py b/studio/backend/utils/hardware/hardware.py
index db3ad1aa1c..ebac6a357c 100644
--- a/studio/backend/utils/hardware/hardware.py
+++ b/studio/backend/utils/hardware/hardware.py
@@ -560,7 +560,6 @@ def _rocm_linux_sysfs_power_w() -> Optional[float]:
         return None
 
 
-
 def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
     """Query AMD GPU compute utilization via Windows Performance Counters (3D engine nodes)."""
     import subprocess as _sp
@@ -575,9 +574,9 @@ def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
         )
         r = _sp.run(
             ["powershell", "-NoProfile", "-NonInteractive", "-Command", ps],
-            capture_output=True,
-            text=True,
-            timeout=5,
+            capture_output = True,
+            text = True,
+            timeout = 5,
         )
         if r.returncode != 0 or not r.stdout.strip():
             return None
@@ -587,7 +586,6 @@ def _rocm_windows_perf_counter_gpu_util_pct() -> Optional[float]:
         return None
 
 
-
 def _rocm_linux_sysfs_vram_gb() -> tuple[Optional[float], Optional[float]]:
     """Query system-wide AMD GPU VRAM via Linux DRM sysfs.