unslothai · danielhanchen · May 30, 2026 · May 5, 2026 · May 5, 2026 · May 6, 2026
diff --git a/install.ps1 b/install.ps1
diff --git a/install.sh b/install.sh
diff --git a/studio/backend/core/export/worker.py b/studio/backend/core/export/worker.py
@@ -439,6 +439,103 @@ def run_export_process(
                 'Install for better performance: pip install "triton-windows<3.7"'
             )
 
+    # ── 1c. Stub torchao on Windows ROCm ──
+    # torchao (pulled in by transformers.quantizers) imports
+    # torch.distributed._functional_collectives at module level, which imports
+    # distributed_c10d.py unconditionally — that file crashes on Windows ROCm
+    # because torch._C._distributed_c10d (the RCCL backend) is absent.
+    # Stubbing torchao short-circuits the crash entirely.
+    # Must run before any import of transformers / unsloth_zoo.
+    import types as _types
+    import importlib.machinery as _ilm
+    import importlib.abc as _ilabc
+
+    _STUB_SENTINEL = object()
+
+    class _StubTypeMeta(type):
+        """Metaclass behind every placeholder type exposed by a torchao stub.
+
+        Nothing is reported as an instance or subclass of a stub type, calling
+        one produces None, and looking up a missing non-dunder attribute
+        creates a child stub type that is cached on the parent.
+        """
+
+        def __instancecheck__(cls, instance):
+            return False
+
+        def __subclasscheck__(cls, subclass):
+            return False
+
+        def __call__(cls, *args, **kwargs):
+            # "Instantiating" a stub never builds an object.
+            return None
+
+        def __getattr__(cls, attr):
+            if not attr.startswith("__"):
+                nested = _StubTypeMeta(attr, (), {})
+                setattr(cls, attr, nested)
+                return nested
+            # Missing dunder names keep raising AttributeError.
+            raise AttributeError(attr)
+
+    def _make_stub_type(name):
+        """Return a new placeholder type named *name* (no bases, empty namespace)."""
+        stub_type = _StubTypeMeta(name, (), {})
+        return stub_type
+
+    def _make_mod_stub(mod_name):
+        """Create an empty package-like module that stands in for *mod_name*.
+
+        The module is tagged with _STUB_SENTINEL, carries a loader-less package
+        spec, and answers every missing non-dunder attribute with a stub type
+        named "<mod_name>.<attr>" that is cached on the module.
+        """
+        stub = _types.ModuleType(mod_name)
+
+        def _lazy_attr(attr):
+            # Module-level __getattr__ hook: only runs for names not yet set.
+            if attr.startswith("__"):
+                raise AttributeError(attr)
+            placeholder = _make_stub_type(f"{mod_name}.{attr}")
+            setattr(stub, attr, placeholder)
+            return placeholder
+
+        stub.__path__ = []  # having __path__ makes the module a package
+        stub.__package__ = mod_name
+        stub._unsloth_stub = _STUB_SENTINEL
+        stub.__spec__ = _ilm.ModuleSpec(mod_name, loader = None, is_package = True)
+        stub.__getattr__ = _lazy_attr
+        return stub
+
+    class _StubSubpackageLoader(_ilabc.Loader):
+        """Loader that materialises a stub module for one dotted name."""
+
+        def __init__(self, mod_name):
+            self._mod_name = mod_name
+
+        def create_module(self, spec):
+            # Built from the name given at construction; *spec* is not read.
+            stub_name = self._mod_name
+            return _make_mod_stub(stub_name)
+
+        def exec_module(self, module):
+            """Do nothing: the stub is fully set up by create_module."""
+            return None
+
+    class _StubSubpackageFinder(_ilabc.MetaPathFinder):
+        """Meta-path finder that resolves any submodule of a stubbed package."""
+
+        def find_spec(self, fullname, path, target = None):
+            """Return a stub package spec for *fullname* when its parent module
+            in sys.modules carries _STUB_SENTINEL; otherwise return None."""
+            parent_name, dot, _leaf = fullname.rpartition(".")
+            if not dot:
+                # Top-level names are never handled here.
+                return None
+            marker = getattr(sys.modules.get(parent_name), "_unsloth_stub", None)
+            if marker is _STUB_SENTINEL:
+                loader = _StubSubpackageLoader(fullname)
+                return _ilm.ModuleSpec(fullname, loader, is_package = True)
+            return None
+
+    # Detect a Windows ROCm torch build: either torch.version.hip is truthy or
+    # the torch version string mentions "rocm". The probe only runs on win32.
+    _is_win32_rocm = False
+    if sys.platform == "win32":
+        try:
+            import torch as _torch_probe
+
+            _is_win32_rocm = bool(
+                getattr(getattr(_torch_probe, "version", None), "hip", None)
+                or "rocm" in getattr(_torch_probe, "__version__", "").lower()
+            )
+            del _torch_probe
+        except Exception:
+            # Best-effort probe: if torch cannot be imported or inspected the
+            # flag stays False and no stubbing happens.
+            pass
+    if _is_win32_rocm:
+        # Appended last so the regular finders are consulted first; this finder
+        # only answers for submodules whose parent is one of the stubs below.
+        sys.meta_path.append(_StubSubpackageFinder())
+        for _tao_name in (
+            "torchao",
+            "torchao.quantization",
+            "torchao.dtypes",
+            "torchao.float8",
+            "torchao.utils",
+        ):
+            # Never replace a torchao module that is already in sys.modules.
+            if _tao_name not in sys.modules:
+                sys.modules[_tao_name] = _make_mod_stub(_tao_name)
+
     # ── 2. Import ML libraries (fresh in this clean process) ──
     try:
         _send_response(

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -1238,6 +1238,33 @@ def _get_gguf_size_bytes(model_path: str) -> int:
 
         return total
 
+    @staticmethod
+    def _amd_apu_wants_unified_memory() -> bool:
+        """Report whether llama.cpp should be given unified memory on this host.
+
+        True only for AMD unified-memory APUs (gfx1150/gfx1151), where
+        GGML_CUDA_ENABLE_UNIFIED_MEMORY lets llama.cpp use shared system RAM.
+        Discrete AMD, NVIDIA, CPU-only and macOS hosts get False because the
+        variable hurts discrete GPUs. ROCm devices are enumerated through
+        torch.cuda.*, and anything after the first ":" in gcnArchName is
+        ignored. Any probing error is treated as "not an APU".
+        """
+        unified_archs = ("gfx1150", "gfx1151")
+        try:
+            import torch
+
+            is_hip_build = getattr(torch.version, "hip", None) is not None
+            if not is_hip_build or not hasattr(torch, "cuda"):
+                return False
+            if not torch.cuda.is_available():
+                return False
+            for device_index in range(torch.cuda.device_count()):
+                try:
+                    props = torch.cuda.get_device_properties(device_index)
+                    arch_name = getattr(props, "gcnArchName", "") or ""
+                except Exception:
+                    continue
+                base_arch = arch_name.split(":")[0].strip().lower()
+                if base_arch in unified_archs:
+                    return True
+            return False
+        except Exception:
+            return False
+
     @staticmethod
     def _get_gpu_free_memory() -> list[tuple[int, int]]:
         """Query free memory per GPU.
@@ -3158,6 +3185,14 @@ def load_model(
                 env = child_env_without_native_path_secret()
                 binary_dir = str(Path(binary).parent)
 
+                # AMD unified-memory APUs (gfx1150/gfx1151): let llama.cpp use
+                # shared system RAM. setdefault so a user value wins.
+                if self._amd_apu_wants_unified_memory():
+                    env.setdefault("GGML_CUDA_ENABLE_UNIFIED_MEMORY", "1")
+                    logger.info(
+                        "AMD unified-memory APU: set GGML_CUDA_ENABLE_UNIFIED_MEMORY=1"
+                    )
+
                 if sys.platform == "win32":
                     # See _build_windows_path_dirs for ordering. #5106.
                     path_dirs = self._build_windows_path_dirs(
@@ -3167,6 +3202,24 @@ def load_model(
                     )
                     existing_path = env.get("PATH", "")
                     env["PATH"] = ";".join(path_dirs) + ";" + existing_path
+
+                    # ROCm: the llama.cpp prebuilt bundles its own rocblas.dll
+                    # but NOT the Tensile kernel library files it needs
+                    # (rocblas/library/TensileLibrary*.dat + *.hsaco).  The
+                    # bundled DLL searches relative to its own location by
+                    # default (i.e. <binary_dir>/rocblas/library/) which does
+                    # not exist, causing a silent crash on the first GEMM.
+                    # ROCBLAS_TENSILE_LIBPATH overrides that search to point at
+                    # the ROCm installation where the kernel files actually are.
+                    _hip_path = os.environ.get(
+                        "HIP_PATH", os.environ.get("ROCM_PATH", "")
+                    )
+                    if _hip_path:
+                        _rocblas_lib = os.path.join(
+                            _hip_path, "bin", "rocblas", "library"
+                        )
+                        if os.path.isdir(_rocblas_lib):
+                            env.setdefault("ROCBLAS_TENSILE_LIBPATH", _rocblas_lib)
                 else:
                     # Linux: set LD_LIBRARY_PATH for shared libs next to the binary
                     # and CUDA runtime libs (libcudart, libcublas, etc.)

diff --git a/studio/backend/core/training/trainer.py b/studio/backend/core/training/trainer.py
@@ -42,7 +42,10 @@
     get_visible_gpu_count,
 )
 
-torch._dynamo.config.recompile_limit = 64
+# recompile_limit is missing from older torch builds (e.g. the
+# pytorch.org/whl/rocm6.2 wheels), where the same Dynamo limit is still named
+# cache_size_limit. Raise whichever one exists so training on RDNA2/RDNA3
+# neither crashes on the assignment nor silently keeps the small default.
+if hasattr(torch._dynamo.config, "recompile_limit"):
+    torch._dynamo.config.recompile_limit = 64
+elif hasattr(torch._dynamo.config, "cache_size_limit"):
+    torch._dynamo.config.cache_size_limit = 64
 from unsloth import FastLanguageModel, FastVisionModel, is_bfloat16_supported
 from unsloth.chat_templates import get_chat_template
 
@@ -657,6 +660,15 @@ def load_model(
                 f"Using device_map='{device_map}' ({get_visible_gpu_count()} GPU(s) visible)"
             )
 
+            # On hardware without native bfloat16 support (e.g. RDNA2 / gfx103x),
+            # passing dtype=None lets unsloth auto-detect and incorrectly choose
+            # bf16, triggering an LLVM error at the first bf16 kernel dispatch.
+            # Explicitly pass float16 as the fallback so unsloth never reaches
+            # that path. Modern NVIDIA (Ampere+) and RDNA3+ return True here so
+            # they are unaffected — dtype stays None and unsloth picks bf16 as
+            # before.
+            _auto_dtype = None if is_bfloat16_supported() else torch.float16
+
             # Branch based on model type
             if self._audio_type == "csm":
                 # CSM: FastModel + auto_model=CsmForConditionalGeneration + load_in_4bit=False
@@ -666,7 +678,7 @@ def load_model(
                 self.model, self.tokenizer = FastModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     auto_model = CsmForConditionalGeneration,
                     load_in_4bit = False,
                     device_map = device_map,
@@ -683,7 +695,7 @@ def load_model(
 
                 self.model, self.tokenizer = FastModel.from_pretrained(
                     model_name = model_name,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     load_in_4bit = False,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -705,7 +717,7 @@ def load_model(
                 self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -777,7 +789,7 @@ def load_model(
                 self.model, self.tokenizer = FastModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -791,7 +803,7 @@ def load_model(
                 self.model, self.tokenizer = FastVisionModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,  # Auto-detect
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,
@@ -824,7 +836,7 @@ def load_model(
                 self.model, self.tokenizer = FastLanguageModel.from_pretrained(
                     model_name = model_name,
                     max_seq_length = max_seq_length,
-                    dtype = None,  # Auto-detect
+                    dtype = _auto_dtype,
                     load_in_4bit = load_in_4bit,
                     device_map = device_map,
                     full_finetuning = full_finetuning,