From 9e9729be37cc95a91847f745279e9acf42fc04f3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 May 2026 08:58:05 -0700
Subject: [PATCH 01/13] Studio: add Vulkan llama.cpp support

---
 studio/backend/core/inference/llama_cpp.py | 167 +++++++++++++++++++--
 studio/install_llama_prebuilt.py           | 117 ++++++++++++++-
 2 files changed, 270 insertions(+), 14 deletions(-)
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 0325620b2d..171609ce33 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -600,6 +600,48 @@ def _backfill_usage_from_timings(usage, timings):
     return out
 
 
+# Probe script run in a short-lived subprocess so the Vulkan instance never
+# lives in the long-running backend process. Loads the bundled ggml Vulkan
+# backend and prints "<idx>\t<free_bytes>\t<total_bytes>" per device. The
+# indices are ggml's own Vulkan device ordinals -- the space
+# GGML_VK_VISIBLE_DEVICES expects -- which need not match nvidia-smi order.
+_VULKAN_PROBE_SCRIPT = r"""
+import ctypes, os, sys
+bindir = sys.argv[1]
+if sys.platform == "win32":
+    base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll"
+    try:
+        os.add_dll_directory(bindir)
+    except Exception:
+        pass
+else:
+    base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
+try:
+    ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL)
+    lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL)
+except OSError:
+    sys.exit(0)
+lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
+lib.ggml_backend_vk_get_device_count.argtypes = []
+lib.ggml_backend_vk_get_device_memory.restype = None
+lib.ggml_backend_vk_get_device_memory.argtypes = [
+    ctypes.c_int,
+    ctypes.POINTER(ctypes.c_size_t),
+    ctypes.POINTER(ctypes.c_size_t),
+]
+rows = []
+for i in range(lib.ggml_backend_vk_get_device_count()):
+    free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
+    lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
+    rows.append("%d\t%d\t%d" % (i, free.value, total.value))
+sys.stdout.write("\n".join(rows))
+"""
+
+
+def _vulkan_lib_filename() -> str:
+    return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so"
+
+
 class LlamaCppBackend:
     """
     Manages a llama-server subprocess for GGUF model inference.
@@ -1233,7 +1275,41 @@ def _get_gguf_size_bytes(model_path: str) -> int:
         return total
 
     @staticmethod
-    def _get_gpu_free_memory() -> list[tuple[int, int]]:
+    def _is_vulkan_backend(binary: Optional[str] = None) -> bool:
+        """True if the installed llama.cpp build is the Vulkan one.
+
+        Builds are single-backend, so the presence of the Vulkan ggml
+        backend library next to llama-server is sufficient. Used to keep
+        the free-memory probe and the GPU pin in the same device-index
+        space (ggml's Vulkan ordinals, not nvidia-smi order).
+        """
+        binary = binary or LlamaCppBackend._find_llama_server_binary()
+        if not binary:
+            return False
+        return (Path(binary).parent / _vulkan_lib_filename()).is_file()
+
+    @staticmethod
+    def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
+        """Query free memory per GPU across all supported backends.
+
+        On a Vulkan build, the ggml Vulkan probe is authoritative so the
+        returned indices are Vulkan ordinals (the space the GPU pin writes
+        to ``GGML_VK_VISIBLE_DEVICES``). Otherwise ``nvidia-smi`` / torch
+        cover NVIDIA + AMD ROCm (the trailing Vulkan probe returns [] on such builds).
+
+        Returns list of (gpu_index, free_mib) sorted by index. Empty
+        list if no supported GPU is reachable.
+        """
+        binary = binary or LlamaCppBackend._find_llama_server_binary()
+        if LlamaCppBackend._is_vulkan_backend(binary):
+            return LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+        gpus = LlamaCppBackend._get_gpu_free_memory_nvidia_torch()
+        if gpus:
+            return gpus
+        return LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+
+    @staticmethod
+    def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:
         """Query free memory per GPU.
 
         Order:
@@ -1356,6 +1432,64 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
             logger.debug(f"torch GPU probe failed: {e}")
             return []
 
+    @staticmethod
+    def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]:
+        """Query free VRAM per device via the bundled ggml Vulkan backend.
+
+        Loads ``libggml-vulkan`` in a short-lived subprocess and calls
+        ``ggml_backend_vk_get_device_memory`` for each device, so no Vulkan
+        instance is created in this process. Returns list of
+        (device_index, free_mib) sorted by index, where the index is ggml's
+        own Vulkan device ordinal (the space ``GGML_VK_VISIBLE_DEVICES``
+        expects). Returns [] when no Vulkan build is installed or no device
+        is reachable.
+        """
+        binary = binary or LlamaCppBackend._find_llama_server_binary()
+        if not binary:
+            return []
+        binary_dir = Path(binary).parent
+        if not (binary_dir / _vulkan_lib_filename()).is_file():
+            return []
+
+        env = child_env_without_native_path_secret()
+        if sys.platform != "win32":
+            # Let the loader resolve sibling ggml libs next to the binary.
+            existing_ld = env.get("LD_LIBRARY_PATH", "")
+            env["LD_LIBRARY_PATH"] = (
+                f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir)
+            )
+        try:
+            result = subprocess.run(
+                [sys.executable, "-c", _VULKAN_PROBE_SCRIPT, str(binary_dir)],
+                capture_output = True,
+                text = True,
+                timeout = 15,
+                env = env,
+                **_windows_hidden_subprocess_kwargs(),
+            )
+        except Exception as e:
+            logger.debug(f"vulkan GPU probe failed: {e}")
+            return []
+
+        gpus: list[tuple[int, int]] = []
+        for line in result.stdout.strip().splitlines():
+            parts = line.split("\t")
+            if len(parts) != 3:
+                continue
+            try:
+                idx = int(parts[0])
+                free_mib = int(parts[1]) // (1024 * 1024)
+            except ValueError:
+                continue
+            gpus.append((idx, free_mib))
+        gpus.sort(key = lambda g: g[0])
+        if gpus:
+            logger.info(
+                "Vulkan GPU memory detected: "
+                + ", ".join(f"VK{idx}={free}MiB" for idx, free in gpus)
+            )
+        return gpus
+
     # Skip the wait when the last kill is older than this; the GPU
     # driver has already reclaimed the prior process's allocations.
     _VRAM_SETTLE_WINDOW_S: float = 15.0
@@ -2670,6 +2804,7 @@ def load_model(
                     "Run setup.sh to build it, install llama.cpp, "
                     "or set LLAMA_SERVER_PATH environment variable."
                 )
+            is_vulkan_backend = self._is_vulkan_backend(binary)
 
             # ── Phase 2: download (NO lock held, so cancel can proceed) ──
             # Scope HF_HUB_OFFLINE to the download block only when DNS is
@@ -2729,7 +2864,7 @@ def load_model(
                 gpus: list[tuple[int, int]] = []
                 try:
                     model_size = self._get_gguf_size_bytes(model_path)
-                    gpus = self._get_gpu_free_memory()
+                    gpus = self._get_gpu_free_memory(binary)
 
                     # Resolve effective context: 0 means let llama-server use the
                     # model's native length.  Only expand to a known native length
@@ -3217,17 +3352,23 @@ def load_model(
                 # the full HIP/ROCR set the parent inherited.
                 if gpu_indices is not None:
                     pinned = ",".join(str(i) for i in gpu_indices)
-                    env["CUDA_VISIBLE_DEVICES"] = pinned
-                    try:
-                        import torch as _torch
-
-                        if getattr(_torch.version, "hip", None) is not None:
-                            env["HIP_VISIBLE_DEVICES"] = pinned
-                            env["ROCR_VISIBLE_DEVICES"] = pinned
-                    except Exception as e:
-                        logger.debug(
-                            "Failed to set ROCm visibility env vars for child: %s", e
-                        )
+                    if is_vulkan_backend:
+                        # gpu_indices are ggml Vulkan ordinals (see
+                        # _get_gpu_free_memory); the Vulkan backend ignores
+                        # CUDA_VISIBLE_DEVICES, so pin via its own mask.
+                        env["GGML_VK_VISIBLE_DEVICES"] = pinned
+                    else:
+                        env["CUDA_VISIBLE_DEVICES"] = pinned
+                        try:
+                            import torch as _torch
+
+                            if getattr(_torch.version, "hip", None) is not None:
+                                env["HIP_VISIBLE_DEVICES"] = pinned
+                                env["ROCR_VISIBLE_DEVICES"] = pinned
+                        except Exception as e:
+                            logger.debug(
+                                "Failed to set ROCm visibility env vars for child: %s", e
+                            )
 
                 # Defensive kill: if a concurrent load slipped past Phase 1
                 # (because its `self._process` was None at the time) and
diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index 394a1c9cd8..53e9dd0c9b 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -9,6 +9,7 @@
 import argparse
 import errno
 import fnmatch
+import glob
 import hashlib
 import json
 import os
@@ -196,6 +197,7 @@ class HostInfo:
     has_physical_nvidia: bool
     has_usable_nvidia: bool
     has_rocm: bool = False
+    has_intel_gpu: bool = False
 
 
 @dataclass
@@ -1336,6 +1338,21 @@ def direct_upstream_release_plan(
                     torch_preference.selection_log,
                 )
             )
+        # Intel GPU with no usable NVIDIA GPU and no ROCm: use the Vulkan prebuilt.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_asset = f"llama-{release_tag}-bin-win-vulkan-x64.zip"
+            vulkan_url = assets.get(vulkan_asset)
+            if vulkan_url:
+                attempts.append(
+                    AssetChoice(
+                        repo = repo,
+                        tag = release_tag,
+                        name = vulkan_asset,
+                        url = vulkan_url,
+                        source_label = "upstream",
+                        install_kind = "windows-vulkan",
+                    )
+                )
         cpu_asset = f"llama-{release_tag}-bin-win-cpu-x64.zip"
         cpu_url = assets.get(cpu_asset)
         if cpu_url:
@@ -1396,6 +1413,21 @@ def direct_upstream_release_plan(
                 )
             )
     elif host.is_linux and host.is_x86_64 and not host.has_usable_nvidia:
+        # Intel GPU with no usable NVIDIA GPU and no ROCm: use the Vulkan prebuilt.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_asset = f"llama-{release_tag}-bin-ubuntu-vulkan-x64.tar.gz"
+            vulkan_url = assets.get(vulkan_asset)
+            if vulkan_url:
+                attempts.append(
+                    AssetChoice(
+                        repo = repo,
+                        tag = release_tag,
+                        name = vulkan_asset,
+                        url = vulkan_url,
+                        source_label = "upstream",
+                        install_kind = "linux-vulkan",
+                    )
+                )
         asset_name = f"llama-{release_tag}-bin-ubuntu-x64.tar.gz"
         asset_url = assets.get(asset_name)
         if asset_url:
@@ -2744,6 +2776,37 @@ def _amd_smi_has_gpu(stdout: str) -> bool:
         # Note: amdhip64.dll presence alone is NOT treated as GPU evidence
         # since the HIP SDK can be installed without an AMD GPU.
 
+    # Detect an Intel GPU; gates the Vulkan prebuilt. Linux reads the DRM
+    # sysfs vendor id (0x8086); Windows queries the WMI video controller list.
+    has_intel_gpu = False
+    if is_linux:
+        for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"):
+            try:
+                with open(_vendor_file) as _vf:
+                    if _vf.read().strip().lower() == "0x8086":
+                        has_intel_gpu = True
+                        break
+            except OSError:
+                continue
+    elif is_windows:
+        _ps = shutil.which("powershell") or shutil.which("pwsh")
+        if _ps:
+            try:
+                _result = run_capture(
+                    [
+                        _ps,
+                        "-NoProfile",
+                        "-Command",
+                        "Get-CimInstance Win32_VideoController | "
+                        "Select-Object -ExpandProperty Name",
+                    ],
+                    timeout = 15,
+                )
+                if _result.returncode == 0 and "intel" in _result.stdout.lower():
+                    has_intel_gpu = True
+            except Exception:
+                pass
+
     return HostInfo(
         system = system,
         machine = machine,
@@ -2759,6 +2822,7 @@ def _amd_smi_has_gpu(stdout: str) -> bool:
         has_physical_nvidia = has_physical_nvidia,
         has_usable_nvidia = has_usable_nvidia,
         has_rocm = has_rocm,
+        has_intel_gpu = has_intel_gpu,
     )
 
 
@@ -3325,6 +3389,21 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                 "falling back to source build with HIP support"
             )
 
+        # Intel GPU with no usable NVIDIA GPU and no ROCm: use the Vulkan prebuilt.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz"
+            if vulkan_name in upstream_assets:
+                log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}")
+                return AssetChoice(
+                    repo = UPSTREAM_REPO,
+                    tag = llama_tag,
+                    name = vulkan_name,
+                    url = upstream_assets[vulkan_name],
+                    source_label = "upstream",
+                    install_kind = "linux-vulkan",
+                )
+            log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU")
+
         upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz"
         if upstream_name not in upstream_assets:
             raise PrebuiltFallback("upstream Linux CPU asset was not found")
@@ -3363,6 +3442,21 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                 "AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU"
             )
 
+        # Intel GPU on Windows with no usable NVIDIA GPU and no ROCm: use Vulkan.
+        if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
+            vulkan_name = f"llama-{llama_tag}-bin-win-vulkan-x64.zip"
+            if vulkan_name in upstream_assets:
+                log(f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}")
+                return AssetChoice(
+                    repo = UPSTREAM_REPO,
+                    tag = llama_tag,
+                    name = vulkan_name,
+                    url = upstream_assets[vulkan_name],
+                    source_label = "upstream",
+                    install_kind = "windows-vulkan",
+                )
+            log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU")
+
         upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip"
         if upstream_name not in upstream_assets:
             raise PrebuiltFallback("upstream Windows CPU asset was not found")
@@ -3870,7 +3964,13 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]:
     # libraries between b9279 and b9283) without us re-enumerating
     # every new file. Studio only invokes llama-server and llama-quantize;
     # other CLIs upstream ships (llama-cli, llama-bench, ...) are skipped.
-    if choice.install_kind in {"linux-cpu", "linux-cuda", "linux-rocm", "linux-arm64"}:
+    if choice.install_kind in {
+        "linux-cpu",
+        "linux-cuda",
+        "linux-rocm",
+        "linux-arm64",
+        "linux-vulkan",
+    }:
         return ["llama-server", "llama-quantize", "lib*.so*"]
     if choice.install_kind in {"macos-arm64", "macos-x64"}:
         return ["llama-server", "llama-quantize", "lib*.dylib"]
@@ -3878,6 +3978,7 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]:
         "windows-cpu",
         "windows-cuda",
         "windows-hip",
+        "windows-vulkan",
         "windows-arm64",
     }:
         return ["llama-server.exe", "llama-quantize.exe", "*.dll"]
@@ -4698,8 +4799,10 @@ def validate_server(
         _gpu_kinds = {
             "linux-cuda",
             "linux-rocm",
+            "linux-vulkan",
             "windows-cuda",
             "windows-hip",
+            "windows-vulkan",
             "macos-arm64",
         }
         if install_kind is not None:
@@ -5265,6 +5368,16 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]:
             ["libmtmd.so*"],
             ["libggml-hip.so*"],
         ]
+    if choice.install_kind == "linux-vulkan":
+        return [
+            ["libllama-common.so*"],
+            ["libllama.so*"],
+            ["libggml.so*"],
+            ["libggml-base.so*"],
+            ["libggml-cpu-*.so*"],
+            ["libmtmd.so*"],
+            ["libggml-vulkan.so*"],
+        ]
     if choice.install_kind in {"windows-cpu", "windows-arm64"}:
         return [["llama.dll"]]
     if choice.install_kind == "windows-cuda":
@@ -5284,6 +5397,8 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]:
         return groups
     if choice.install_kind == "windows-hip":
         return [["llama.dll"], ["*hip*.dll"]]
+    if choice.install_kind == "windows-vulkan":
+        return [["llama.dll"], ["ggml-vulkan.dll"]]
     return []
 
 

From c401f10c7847c882d36db5c88c58c0724d57b1e7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 May 2026 16:33:19 +0000
Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py |  7 +++++--
 studio/install_llama_prebuilt.py           | 16 ++++++++++++----
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 171609ce33..ec63a48b30 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1433,7 +1433,9 @@ def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:
             return []
 
     @staticmethod
-    def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]:
+    def _get_gpu_free_memory_vulkan(
+        binary: Optional[str] = None,
+    ) -> list[tuple[int, int]]:
         """Query free VRAM per device via the bundled ggml Vulkan backend.
 
         Loads ``libggml-vulkan`` in a short-lived subprocess and calls
@@ -3367,7 +3369,8 @@ def load_model(
                                 env["ROCR_VISIBLE_DEVICES"] = pinned
                         except Exception as e:
                             logger.debug(
-                                "Failed to set ROCm visibility env vars for child: %s", e
+                                "Failed to set ROCm visibility env vars for child: %s",
+                                e,
                             )
 
                 # Defensive kill: if a concurrent load slipped past Phase 1
diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index 53e9dd0c9b..ff2233717f 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -3393,7 +3393,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
         if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
             vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz"
             if vulkan_name in upstream_assets:
-                log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}")
+                log(
+                    f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}"
+                )
                 return AssetChoice(
                     repo = UPSTREAM_REPO,
                     tag = llama_tag,
@@ -3402,7 +3404,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                     source_label = "upstream",
                     install_kind = "linux-vulkan",
                 )
-            log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU")
+            log(
+                "Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU"
+            )
 
         upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz"
         if upstream_name not in upstream_assets:
@@ -3446,7 +3450,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
         if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
             vulkan_name = f"llama-{llama_tag}-bin-win-vulkan-x64.zip"
             if vulkan_name in upstream_assets:
-                log(f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}")
+                log(
+                    f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}"
+                )
                 return AssetChoice(
                     repo = UPSTREAM_REPO,
                     tag = llama_tag,
@@ -3455,7 +3461,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                     source_label = "upstream",
                     install_kind = "windows-vulkan",
                 )
-            log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU")
+            log(
+                "Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU"
+            )
 
         upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip"
         if upstream_name not in upstream_assets:

From 84d98a76da5bdfca4a0af49155389038ad550a0d Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 May 2026 09:47:00 -0700
Subject: [PATCH 03/13] Address Gemini's feedback: return [] when the Vulkan probe exits non-zero

---
 studio/backend/core/inference/llama_cpp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index ec63a48b30..8ed0361ed0 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1469,6 +1469,11 @@ def _get_gpu_free_memory_vulkan(
                 env = env,
                 **_windows_hidden_subprocess_kwargs(),
             )
+            if result.returncode != 0:
+                logger.debug(
+                    f"vulkan GPU probe exited {result.returncode}: {result.stderr.strip()}"
+                )
+                return []
         except Exception as e:
             logger.debug(f"vulkan GPU probe failed: {e}")
             return []

From 11acf229c778546aaa96960f068b10fc33a94357 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 May 2026 10:10:06 -0700
Subject: [PATCH 04/13] Studio: move the Vulkan VRAM probe into a standalone
 script

---
 .../backend/core/inference/_vulkan_probe.py   | 57 +++++++++++++++++++
 studio/backend/core/inference/llama_cpp.py    | 41 +------------
 2 files changed, 59 insertions(+), 39 deletions(-)
 create mode 100644 studio/backend/core/inference/_vulkan_probe.py

diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
new file mode 100644
index 0000000000..776b01826f
--- /dev/null
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -0,0 +1,57 @@
+"""Standalone free-VRAM probe for the bundled ggml Vulkan backend.
+
+Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the
+Vulkan instance never lives in the long-running backend process. Loads the
+bundled ggml Vulkan backend from ``<bindir>`` and prints one
+``<idx>\\t<free_bytes>\\t<total_bytes>`` line per device to stdout. The indices
+are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES
+expects), which need not match nvidia-smi order.
+
+Uses only the standard library so it stays runnable as a bare script without
+importing the backend package.
+"""
+import ctypes
+import os
+import sys
+
+
+def main() -> int:
+    if len(sys.argv) < 2:
+        return 0
+    bindir = sys.argv[1]
+
+    if sys.platform == "win32":
+        base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll"
+        try:
+            os.add_dll_directory(bindir)
+        except Exception:
+            pass
+    else:
+        base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
+
+    try:
+        ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL)
+        lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL)
+    except OSError:
+        return 0
+
+    lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
+    lib.ggml_backend_vk_get_device_count.argtypes = []
+    lib.ggml_backend_vk_get_device_memory.restype = None
+    lib.ggml_backend_vk_get_device_memory.argtypes = [
+        ctypes.c_int,
+        ctypes.POINTER(ctypes.c_size_t),
+        ctypes.POINTER(ctypes.c_size_t),
+    ]
+
+    rows = []
+    for i in range(lib.ggml_backend_vk_get_device_count()):
+        free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
+        lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
+        rows.append("%d\t%d\t%d" % (i, free.value, total.value))
+    sys.stdout.write("\n".join(rows))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 8ed0361ed0..e17a490703 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -600,44 +600,6 @@ def _backfill_usage_from_timings(usage, timings):
     return out
 
 
-# Probe script run in a short-lived subprocess so the Vulkan instance never
-# lives in the long-running backend process. Loads the bundled ggml Vulkan
-# backend and prints "<idx>\t<free_bytes>\t<total_bytes>" per device. The
-# indices are ggml's own Vulkan device ordinals -- the space
-# GGML_VK_VISIBLE_DEVICES expects -- which need not match nvidia-smi order.
-_VULKAN_PROBE_SCRIPT = r"""
-import ctypes, os, sys
-bindir = sys.argv[1]
-if sys.platform == "win32":
-    base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll"
-    try:
-        os.add_dll_directory(bindir)
-    except Exception:
-        pass
-else:
-    base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
-try:
-    ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL)
-    lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL)
-except OSError:
-    sys.exit(0)
-lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
-lib.ggml_backend_vk_get_device_count.argtypes = []
-lib.ggml_backend_vk_get_device_memory.restype = None
-lib.ggml_backend_vk_get_device_memory.argtypes = [
-    ctypes.c_int,
-    ctypes.POINTER(ctypes.c_size_t),
-    ctypes.POINTER(ctypes.c_size_t),
-]
-rows = []
-for i in range(lib.ggml_backend_vk_get_device_count()):
-    free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
-    lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
-    rows.append("%d\t%d\t%d" % (i, free.value, total.value))
-sys.stdout.write("\n".join(rows))
-"""
-
-
 def _vulkan_lib_filename() -> str:
     return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so"
 
@@ -1460,9 +1422,10 @@ def _get_gpu_free_memory_vulkan(
             env["LD_LIBRARY_PATH"] = (
                 f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir)
             )
+        probe_script = Path(__file__).with_name("_vulkan_probe.py")
         try:
             result = subprocess.run(
-                [sys.executable, "-c", _VULKAN_PROBE_SCRIPT, str(binary_dir)],
+                [sys.executable, str(probe_script), str(binary_dir)],
                 capture_output = True,
                 text = True,
                 timeout = 15,

From 7dd21f33bc4b93d325850c36451b04c3a4604f0a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Wed, 27 May 2026 17:10:34 +0000
Subject: [PATCH 05/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/_vulkan_probe.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
index 776b01826f..cf918b2577 100644
--- a/studio/backend/core/inference/_vulkan_probe.py
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -10,6 +10,7 @@
 Uses only the standard library so it stays runnable as a bare script without
 importing the backend package.
 """
+
 import ctypes
 import os
 import sys
@@ -30,8 +31,8 @@ def main() -> int:
         base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
 
     try:
-        ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL)
-        lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL)
+        ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
+        lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL)
     except OSError:
         return 0
 
@@ -47,7 +48,9 @@ def main() -> int:
     rows = []
     for i in range(lib.ggml_backend_vk_get_device_count()):
         free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
-        lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
+        lib.ggml_backend_vk_get_device_memory(
+            i, ctypes.byref(free), ctypes.byref(total)
+        )
         rows.append("%d\t%d\t%d" % (i, free.value, total.value))
     sys.stdout.write("\n".join(rows))
     return 0

From e50b0afc03b6c583a7f6affd9559bd51a049eaa8 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Wed, 27 May 2026 21:17:55 -0700
Subject: [PATCH 06/13] Improve Vulkan probe error reporting

---
 studio/backend/core/inference/_vulkan_probe.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
index cf918b2577..0ffc9e47f1 100644
--- a/studio/backend/core/inference/_vulkan_probe.py
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -33,8 +33,9 @@ def main() -> int:
     try:
         ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
         lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL)
-    except OSError:
-        return 0
+    except OSError as e:
+        print(f"ggml-vulkan load failed: {e}", file = sys.stderr)
+        return 1
 
     lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
     lib.ggml_backend_vk_get_device_count.argtypes = []

From 4fefeebfd00e1fa74cfae286d547809ce484f21f Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Thu, 28 May 2026 20:06:51 -0700
Subject: [PATCH 07/13] Resolve llama-server symlink so Vulkan build is
 detected

---
 studio/backend/core/inference/llama_cpp.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index e17a490703..185647bbef 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -604,6 +604,15 @@ def _vulkan_lib_filename() -> str:
     return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so"
 
 
+def _llama_lib_dir(binary: str) -> Path:
+    # The installer exposes llama-server as a top-level symlink
+    # (~/.unsloth/llama.cpp/llama-server) into build/bin/, where the ggml
+    # backend libs actually live. Resolve it so callers looking for sibling
+    # libs (Vulkan detection, LD_LIBRARY_PATH, the probe's bindir) hit the real
+    # directory instead of the symlink's parent.
+    return Path(binary).resolve().parent
+
+
 class LlamaCppBackend:
     """
     Manages a llama-server subprocess for GGUF model inference.
@@ -1248,7 +1257,7 @@ def _is_vulkan_backend(binary: Optional[str] = None) -> bool:
         binary = binary or LlamaCppBackend._find_llama_server_binary()
         if not binary:
             return False
-        return (Path(binary).parent / _vulkan_lib_filename()).is_file()
+        return (_llama_lib_dir(binary) / _vulkan_lib_filename()).is_file()
 
     @staticmethod
     def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
@@ -1411,7 +1420,7 @@ def _get_gpu_free_memory_vulkan(
         binary = binary or LlamaCppBackend._find_llama_server_binary()
         if not binary:
             return []
-        binary_dir = Path(binary).parent
+        binary_dir = _llama_lib_dir(binary)
         if not (binary_dir / _vulkan_lib_filename()).is_file():
             return []
 
@@ -3239,7 +3248,7 @@ def load_model(
                 import sys
 
                 env = child_env_without_native_path_secret()
-                binary_dir = str(Path(binary).parent)
+                binary_dir = str(_llama_lib_dir(binary))
 
                 if sys.platform == "win32":
                     # See _build_windows_path_dirs for ordering. #5106.

From 10faad1e18f39d88482a9781a5f7ca3b88fac774 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 31 May 2026 08:40:23 -0700
Subject: [PATCH 08/13] Drop unreachable Vulkan fallback in GPU free-memory
 dispatcher

---
 studio/backend/core/inference/llama_cpp.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 510d57f74a..a88204113a 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1297,7 +1297,7 @@ def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
         On a Vulkan build, the ggml Vulkan probe is authoritative so the
         returned indices are Vulkan ordinals (the space the GPU pin writes
         to ``GGML_VK_VISIBLE_DEVICES``). Otherwise ``nvidia-smi`` / torch
-        cover NVIDIA + AMD ROCm, with the Vulkan probe as a last resort.
+        cover NVIDIA + AMD ROCm.
 
         Returns list of (gpu_index, free_mib) sorted by index. Empty
         list if no supported GPU is reachable.
@@ -1305,10 +1305,7 @@ def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
         binary = binary or LlamaCppBackend._find_llama_server_binary()
         if LlamaCppBackend._is_vulkan_backend(binary):
             return LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
-        gpus = LlamaCppBackend._get_gpu_free_memory_nvidia_torch()
-        if gpus:
-            return gpus
-        return LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+        return LlamaCppBackend._get_gpu_free_memory_nvidia_torch()
 
     @staticmethod
     def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:

From dafeb795c8121841c7e44cce594cead4c205f007 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 31 May 2026 08:58:22 -0700
Subject: [PATCH 09/13] Skip the Intel GPU probe when NVIDIA or ROCm is present

---
 studio/install_llama_prebuilt.py | 56 +++++++++++++++++---------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index 9508616f7c..1ac41cc648 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -3078,34 +3078,38 @@ def _resolve_exe(name: str) -> str | None:
 
     # Detect an Intel GPU; gates the Vulkan prebuilt. Linux reads the DRM
     # sysfs vendor id (0x8086); Windows queries the WMI video controller list.
+    # Only probed when there is no usable NVIDIA and no ROCm GPU, since the
+    # Vulkan selection branches are gated the same way -- this keeps the probe
+    # (notably the Windows powershell call) off the NVIDIA/AMD path.
     has_intel_gpu = False
-    if is_linux:
-        for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"):
-            try:
-                with open(_vendor_file) as _vf:
-                    if _vf.read().strip().lower() == "0x8086":
+    if not has_usable_nvidia and not has_rocm:
+        if is_linux:
+            for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"):
+                try:
+                    with open(_vendor_file) as _vf:
+                        if _vf.read().strip().lower() == "0x8086":
+                            has_intel_gpu = True
+                            break
+                except OSError:
+                    continue
+        elif is_windows:
+            _ps = shutil.which("powershell") or shutil.which("pwsh")
+            if _ps:
+                try:
+                    _result = run_capture(
+                        [
+                            _ps,
+                            "-NoProfile",
+                            "-Command",
+                            "Get-CimInstance Win32_VideoController | "
+                            "Select-Object -ExpandProperty Name",
+                        ],
+                        timeout = 15,
+                    )
+                    if _result.returncode == 0 and "intel" in _result.stdout.lower():
                         has_intel_gpu = True
-                        break
-            except OSError:
-                continue
-    elif is_windows:
-        _ps = shutil.which("powershell") or shutil.which("pwsh")
-        if _ps:
-            try:
-                _result = run_capture(
-                    [
-                        _ps,
-                        "-NoProfile",
-                        "-Command",
-                        "Get-CimInstance Win32_VideoController | "
-                        "Select-Object -ExpandProperty Name",
-                    ],
-                    timeout = 15,
-                )
-                if _result.returncode == 0 and "intel" in _result.stdout.lower():
-                    has_intel_gpu = True
-            except Exception:
-                pass
+                except Exception:
+                    pass
 
     return HostInfo(
         system = system,

From 1980e59221c127ff4dafc51264b74f05bf72add3 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 31 May 2026 12:29:06 -0700
Subject: [PATCH 10/13] Reserve host RAM headroom for Vulkan integrated GPUs

---
 .../backend/core/inference/_vulkan_probe.py   |  58 +++++++-
 studio/backend/core/inference/llama_cpp.py    |  44 +++++-
 .../tests/test_llama_cpp_vulkan_probe.py      | 131 ++++++++++++++++++
 3 files changed, 225 insertions(+), 8 deletions(-)
 create mode 100644 studio/backend/tests/test_llama_cpp_vulkan_probe.py

diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
index 0ffc9e47f1..dcbbe9096b 100644
--- a/studio/backend/core/inference/_vulkan_probe.py
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -3,9 +3,11 @@
 Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the
 Vulkan instance never lives in the long-running backend process. Loads the
 bundled ggml Vulkan backend from ``<bindir>`` and prints one
-``<idx>\\t<free_bytes>\\t<total_bytes>`` line per device to stdout. The indices
+``<idx>\\t<free_bytes>\\t<is_igpu>`` line per device to stdout. The indices
 are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES
-expects), which need not match nvidia-smi order.
+expects), which need not match nvidia-smi order. ``is_igpu`` is ``1`` for an
+integrated GPU (shared system RAM) and ``0`` otherwise, taken from ggml's own
+device type so the reader needn't guess from VRAM-vs-RAM ratios.
 
 Uses only the standard library so it stays runnable as a bare script without
 importing the backend package.
@@ -15,6 +17,48 @@
 import os
 import sys
 
+# ggml_backend_dev_type enum (ggml-backend.h): CPU=0, GPU=1, IGPU=2, ...
+_GGML_BACKEND_DEVICE_TYPE_IGPU = 2
+
+
+def _igpu_flags(base, lib, count: int) -> list[bool]:
+    """Per-device integrated-GPU flags via ggml's backend registry.
+
+    The Vulkan reg enumerates devices in the same order as
+    ``ggml_backend_vk_get_device_memory`` (ggml-vulkan builds each device
+    context with ``ctx->device = i``), so reg index == device ordinal.
+    Any device whose type can't be read stays False, so the reader never
+    over-caps a discrete card just because the lookup failed.
+    """
+    flags = [False] * count
+    try:
+        lib.ggml_backend_vk_reg.restype = ctypes.c_void_p
+        lib.ggml_backend_vk_reg.argtypes = []
+        base.ggml_backend_reg_dev_count.restype = ctypes.c_size_t
+        base.ggml_backend_reg_dev_count.argtypes = [ctypes.c_void_p]
+        base.ggml_backend_reg_dev_get.restype = ctypes.c_void_p
+        base.ggml_backend_reg_dev_get.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+        base.ggml_backend_dev_type.restype = ctypes.c_int
+        base.ggml_backend_dev_type.argtypes = [ctypes.c_void_p]
+
+        reg = lib.ggml_backend_vk_reg()
+        if not reg:
+            return flags
+        dev_count = base.ggml_backend_reg_dev_count(reg)
+        for i in range(min(count, dev_count)):
+            dev = base.ggml_backend_reg_dev_get(reg, i)
+            if dev:
+                flags[i] = (
+                    base.ggml_backend_dev_type(dev)
+                    == _GGML_BACKEND_DEVICE_TYPE_IGPU
+                )
+    except Exception:
+        # iGPU detection is best-effort: any failure (missing symbol,
+        # registry call error) degrades to "discrete" so the memory
+        # readings still get through instead of crashing the probe.
+        pass
+    return flags
+
 
 def main() -> int:
     if len(sys.argv) < 2:
@@ -31,7 +75,7 @@ def main() -> int:
         base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
 
     try:
-        ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
+        base = ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
         lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL)
     except OSError as e:
         print(f"ggml-vulkan load failed: {e}", file = sys.stderr)
@@ -46,13 +90,17 @@ def main() -> int:
         ctypes.POINTER(ctypes.c_size_t),
     ]
 
+    count = lib.ggml_backend_vk_get_device_count()
+    igpu = _igpu_flags(base, lib, count)
     rows = []
-    for i in range(lib.ggml_backend_vk_get_device_count()):
+    for i in range(count):
         free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
+        # total is a required out-param of the C call but unused: the reader
+        # leaves a flat per-device margin, not a fraction of total.
         lib.ggml_backend_vk_get_device_memory(
             i, ctypes.byref(free), ctypes.byref(total)
         )
-        rows.append("%d\t%d\t%d" % (i, free.value, total.value))
+        rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i])))
     sys.stdout.write("\n".join(rows))
     return 0
 
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index a88204113a..2cda6f8e07 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -611,6 +611,29 @@ def _vulkan_lib_filename() -> str:
     return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so"
 
 
+# Free system RAM to leave on an integrated GPU, mirroring llama.cpp's own
+# auto-fit margin (llama-server --fit-target, default 1024 MiB per device).
+# ggml reports an iGPU's "VRAM" as shared system RAM, so we hold back the same
+# per-device margin --fit would rather than inventing a larger reserve.
+_IGPU_HOST_RESERVE_MIB = 1024
+
+
+def _apply_igpu_host_reserve_mib(free_mib: int, is_igpu: bool) -> int:
+    """Reserve host headroom on an integrated (shared-memory) Vulkan GPU.
+
+    ggml sums every memory heap for an integrated GPU (ggml-vulkan's
+    ggml_backend_vk_get_device_memory), so its reported free "VRAM" is really
+    free system RAM. Sizing context/offload against all of it would crowd out
+    the host and push it into swap or the OOM killer. We leave the same
+    per-device margin llama.cpp's --fit uses (``_IGPU_HOST_RESERVE_MIB``).
+    ``is_igpu`` comes straight from ggml's device type, so a discrete card is
+    never touched. Only ever reduces the budget.
+    """
+    if not is_igpu:
+        return free_mib
+    return max(0, free_mib - _IGPU_HOST_RESERVE_MIB)
+
+
 def _llama_lib_dir(binary: str) -> Path:
     # The installer exposes llama-server as a top-level symlink
     # (~/.unsloth/llama.cpp/llama-server) into build/bin/, where the ggml
@@ -1440,8 +1463,9 @@ def _get_gpu_free_memory_vulkan(
         instance is created in this process. Returns list of
         (device_index, free_mib) sorted by index, where the index is ggml's
         own Vulkan device ordinal (the space ``GGML_VK_VISIBLE_DEVICES``
-        expects). Returns [] when no Vulkan build is installed or no device
-        is reachable.
+        expects). Integrated GPUs leave a per-device host-RAM margin (see
+        ``_apply_igpu_host_reserve_mib``). Returns [] when no Vulkan build is
+        installed or no device is reachable.
         """
         binary = binary or LlamaCppBackend._find_llama_server_binary()
         if not binary:
@@ -1451,6 +1475,12 @@ def _get_gpu_free_memory_vulkan(
             return []
 
         env = child_env_without_native_path_secret()
+        # Enumerate ggml's canonical, full device list. An inherited
+        # GGML_VK_VISIBLE_DEVICES would renumber/restrict the ordinals, but
+        # load_model writes its own pin in that same full space, so letting
+        # the probe see a pre-existing mask would make the pin double-apply
+        # and target the wrong device.
+        env.pop("GGML_VK_VISIBLE_DEVICES", None)
         if sys.platform != "win32":
             # Let the loader resolve sibling ggml libs next to the binary.
             existing_ld = env.get("LD_LIBRARY_PATH", "")
@@ -1484,9 +1514,17 @@ def _get_gpu_free_memory_vulkan(
             try:
                 idx = int(parts[0])
                 free_mib = int(parts[1]) // (1024 * 1024)
+                is_igpu = parts[2] == "1"
             except ValueError:
                 continue
-            gpus.append((idx, free_mib))
+            capped = _apply_igpu_host_reserve_mib(free_mib, is_igpu)
+            if capped < free_mib:
+                logger.info(
+                    f"Vulkan device VK{idx} is an integrated GPU sharing system "
+                    f"RAM; reserving {free_mib - capped}MiB host headroom "
+                    f"({free_mib}->{capped}MiB usable)"
+                )
+            gpus.append((idx, capped))
         gpus.sort(key = lambda g: g[0])
         if gpus:
             logger.info(
diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
new file mode 100644
index 0000000000..c641ad4bd0
--- /dev/null
+++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: AGPL-3.0-only
+# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0
+
+"""Vulkan free-VRAM reader regression tests on a synthetic probe output.
+
+Covers the post-probe handling in
+``LlamaCppBackend._get_gpu_free_memory_vulkan``:
+
+  * integrated GPUs (probe reports is_igpu=1) leave a flat per-device host
+    margin matching llama.cpp's --fit-target, so context auto-sizing can't
+    over-commit shared RAM,
+  * discrete GPUs (is_igpu=0) are left untouched,
+  * an inherited ``GGML_VK_VISIBLE_DEVICES`` is stripped before probing so
+    enumeration stays in ggml's canonical full-device space.
+
+The ggml Vulkan library is never loaded: subprocess.run is mocked to emit
+the tab-separated lines the real ``_vulkan_probe.py`` would print.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import types as _types
+from pathlib import Path
+from unittest import mock
+
+import pytest
+
+_BACKEND_DIR = str(Path(__file__).resolve().parent.parent)
+if _BACKEND_DIR not in sys.path:
+    sys.path.insert(0, _BACKEND_DIR)
+
+import importlib as _importlib  # noqa: E402
+
+
+def _maybe_stub(name: str, builder):
+    try:
+        _importlib.import_module(name)
+    except ImportError:
+        sys.modules[name] = builder()
+
+
+def _build_loggers_stub():
+    m = _types.ModuleType("loggers")
+    m.get_logger = lambda name: __import__("logging").getLogger(name)
+    return m
+
+
+_maybe_stub("loggers", _build_loggers_stub)
+_maybe_stub("structlog", lambda: _types.ModuleType("structlog"))
+
+from core.inference import llama_cpp as _llama_mod  # noqa: E402
+from core.inference.llama_cpp import LlamaCppBackend, _vulkan_lib_filename  # noqa: E402
+
+MIB = 1024 * 1024
+GIB = 1024 * MIB
+
+
+def _make_vulkan_install(tmp_path: Path) -> str:
+    """A binary whose sibling dir holds the Vulkan ggml lib, so the
+    reader's ``is_vulkan_backend`` sibling-file check passes."""
+    bindir = tmp_path / "build" / "bin"
+    bindir.mkdir(parents = True)
+    binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server")
+    binary.write_bytes(b"stub")
+    (bindir / _vulkan_lib_filename()).write_bytes(b"stub")
+    return str(binary)
+
+
+def _mock_probe(rows: list[str], captured_env: dict | None = None):
+    """Patch subprocess.run so the _vulkan_probe.py call returns ``rows``
+    (already tab-formatted), recording the env it was launched with."""
+    real_run = subprocess.run
+
+    def fake_run(cmd, *args, **kwargs):
+        if isinstance(cmd, list) and any("_vulkan_probe" in str(c) for c in cmd):
+            if captured_env is not None:
+                captured_env.clear()
+                captured_env.update(kwargs.get("env") or {})
+            return subprocess.CompletedProcess(
+                args = cmd, returncode = 0, stdout = "\n".join(rows), stderr = ""
+            )
+        return real_run(cmd, *args, **kwargs)
+
+    return mock.patch("subprocess.run", side_effect = fake_run)
+
+
+def _row(idx: int, free_bytes: int, is_igpu: int) -> str:
+    return f"{idx}\t{free_bytes}\t{is_igpu}"
+
+
+def test_integrated_gpu_leaves_host_margin(tmp_path):
+    binary = _make_vulkan_install(tmp_path)
+    # iGPU with 30 GiB free; reserve a flat 1024 MiB (llama.cpp --fit-target).
+    rows = [_row(0, 30 * GIB, is_igpu = 1)]
+    with _mock_probe(rows):
+        gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+    assert gpus == [(0, 30 * 1024 - 1024)], gpus
+
+
+def test_discrete_gpu_free_is_untouched(tmp_path):
+    binary = _make_vulkan_install(tmp_path)
+    rows = [_row(0, 23 * GIB, is_igpu = 0)]
+    with _mock_probe(rows):
+        gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+    assert gpus == [(0, 23 * 1024)], gpus
+
+
+def test_large_discrete_gpu_is_untouched(tmp_path):
+    binary = _make_vulkan_install(tmp_path)
+    # A 48 GiB discrete card stays untouched regardless of size; only the
+    # iGPU flag triggers the host margin, never a VRAM/RAM ratio.
+    rows = [_row(0, 47 * GIB, is_igpu = 0)]
+    with _mock_probe(rows):
+        gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+    assert gpus == [(0, 47 * 1024)], gpus
+
+
+def test_inherited_visible_devices_mask_is_stripped(tmp_path, monkeypatch):
+    binary = _make_vulkan_install(tmp_path)
+    monkeypatch.setenv("GGML_VK_VISIBLE_DEVICES", "1")
+    captured: dict = {}
+    rows = [_row(0, 23 * GIB, is_igpu = 0)]
+    with _mock_probe(rows, captured_env = captured):
+        LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
+    assert "GGML_VK_VISIBLE_DEVICES" not in captured, captured
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))

From 31f4a36f2c5a5172b2083f3160e14c014766fc02 Mon Sep 17 00:00:00 2001
From: oobabooga <112222186+oobabooga@users.noreply.github.com>
Date: Sun, 31 May 2026 14:40:00 -0700
Subject: [PATCH 11/13] Add a `UNSLOTH_FORCE_VULKAN` environment variable

---
 studio/install_llama_prebuilt.py | 54 +++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index 1ac41cc648..314d3eb978 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -6558,6 +6558,37 @@ def validate_prebuilt_attempts(
     raise PrebuiltFallback("no prebuilt bundle passed validation")
 
 
+def force_vulkan_requested() -> bool:
+    """Whether UNSLOTH_FORCE_VULKAN opts this host into the Vulkan llama.cpp
+    prebuilt instead of its detected CUDA/ROCm backend -- e.g. so an AMD user
+    can run the Vulkan build for inference. Scoped to the llama.cpp backend:
+    the torch/training stack is installed separately and still sees the real
+    GPU.
+    """
+    return os.environ.get("UNSLOTH_FORCE_VULKAN", "").strip().lower() in (
+        "1",
+        "true",
+        "yes",
+    )
+
+
+def _vulkan_only_host(host: HostInfo) -> HostInfo:
+    """Rewrite ``host`` so the asset selectors take their Vulkan branch.
+
+    That branch fires on ``has_intel_gpu and not nvidia and not rocm``, so the
+    CUDA/ROCm flags are cleared and ``has_intel_gpu`` is raised. The
+    synthetic ``has_intel_gpu`` flag never leaves install planning -- it only
+    routes the llama.cpp prebuilt choice, not the torch/training stack.
+    """
+    return dataclasses_replace(
+        host,
+        has_usable_nvidia = False,
+        has_physical_nvidia = False,
+        has_rocm = False,
+        has_intel_gpu = True,
+    )
+
+
 def install_prebuilt(
     install_dir: Path,
     llama_tag: str,
@@ -6570,6 +6601,24 @@ def install_prebuilt(
     host = detect_host()
     if override_has_rocm and not host.has_rocm:
         host = dataclasses_replace(host, has_rocm = True)
+    # UNSLOTH_FORCE_VULKAN installs the upstream ggml-org Vulkan prebuilt
+    # instead of the detected CUDA/ROCm backend. The unsloth published repo
+    # ships only CUDA/ROCm assets, hence UPSTREAM_REPO.
+    force_vulkan = False
+    if force_vulkan_requested():
+        if host.is_macos:
+            log(
+                "UNSLOTH_FORCE_VULKAN is set but ignored on macOS "
+                "(Metal is used; there is no Vulkan prebuilt)"
+            )
+        else:
+            log(
+                "UNSLOTH_FORCE_VULKAN is set; installing the upstream Vulkan "
+                "llama.cpp prebuilt instead of the detected GPU backend"
+            )
+            host = _vulkan_only_host(host)
+            published_repo = UPSTREAM_REPO
+            force_vulkan = True
     choice: AssetChoice | None = None
     try:
         with install_lock(install_lock_path(install_dir)):
@@ -6581,7 +6630,10 @@ def install_prebuilt(
                 log(
                     f"no existing llama.cpp install detected at {install_dir}; performing fresh prebuilt install"
                 )
-            if simple_policy:
+            if simple_policy or force_vulkan:
+                # The simple planner is the one that routes a non-unsloth repo
+                # (here UPSTREAM_REPO) through direct_upstream_release_plan,
+                # which carries the Vulkan asset branch.
                 requested_tag, release_plans = resolve_simple_install_release_plans(
                     llama_tag,
                     host,

From c3482d406bfea6759afb1a5f20f6a2bb65deebc0 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 31 May 2026 21:40:35 +0000
Subject: [PATCH 12/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/_vulkan_probe.py      | 3 +--
 studio/backend/tests/test_llama_cpp_vulkan_probe.py | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
index dcbbe9096b..23b91c599c 100644
--- a/studio/backend/core/inference/_vulkan_probe.py
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -49,8 +49,7 @@ def _igpu_flags(base, lib, count: int) -> list[bool]:
             dev = base.ggml_backend_reg_dev_get(reg, i)
             if dev:
                 flags[i] = (
-                    base.ggml_backend_dev_type(dev)
-                    == _GGML_BACKEND_DEVICE_TYPE_IGPU
+                    base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU
                 )
     except Exception:
         # iGPU detection is best-effort: any failure (missing symbol,
diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
index c641ad4bd0..cd92cf21ab 100644
--- a/studio/backend/tests/test_llama_cpp_vulkan_probe.py
+++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
@@ -62,7 +62,9 @@ def _make_vulkan_install(tmp_path: Path) -> str:
     reader's ``is_vulkan_backend`` sibling-file check passes."""
     bindir = tmp_path / "build" / "bin"
     bindir.mkdir(parents = True)
-    binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server")
+    binary = bindir / (
+        "llama-server.exe" if sys.platform == "win32" else "llama-server"
+    )
     binary.write_bytes(b"stub")
     (bindir / _vulkan_lib_filename()).write_bytes(b"stub")
     return str(binary)

From 7563d9143eaafbf5b042be3b2b663595f82db979 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 9 Jun 2026 03:30:16 +0000
Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/_vulkan_probe.py      |  8 ++------
 studio/backend/core/inference/llama_cpp.py          |  5 +----
 studio/backend/tests/test_llama_cpp_vulkan_probe.py |  4 +---
 studio/install_llama_prebuilt.py                    | 12 +++---------
 4 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
index 23b91c599c..f2e225f4c4 100644
--- a/studio/backend/core/inference/_vulkan_probe.py
+++ b/studio/backend/core/inference/_vulkan_probe.py
@@ -48,9 +48,7 @@ def _igpu_flags(base, lib, count: int) -> list[bool]:
         for i in range(min(count, dev_count)):
             dev = base.ggml_backend_reg_dev_get(reg, i)
             if dev:
-                flags[i] = (
-                    base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU
-                )
+                flags[i] = base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU
     except Exception:
         # iGPU detection is best-effort: any failure (missing symbol,
         # registry call error) degrades to "discrete" so the memory
@@ -96,9 +94,7 @@ def main() -> int:
         free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
         # total is a required out-param of the C call but unused: the reader
         # leaves a flat per-device margin, not a fraction of total.
-        lib.ggml_backend_vk_get_device_memory(
-            i, ctypes.byref(free), ctypes.byref(total)
-        )
+        lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
         rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i])))
     sys.stdout.write("\n".join(rows))
     return 0
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 2ab54126a7..fd58532f73 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1431,9 +1431,7 @@ def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:
             return []
 
     @staticmethod
-    def _get_gpu_free_memory_vulkan(
-        binary: Optional[str] = None,
-    ) -> list[tuple[int, int]]:
+    def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]:
         """Query free VRAM per device via the bundled ggml Vulkan backend.
 
         Loads ``libggml-vulkan`` in a short-lived subprocess and calls
@@ -3446,7 +3444,6 @@ def load_model(
                         env["CUDA_VISIBLE_DEVICES"] = pinned
                         try:
                             import torch as _torch
-
                             if getattr(_torch.version, "hip", None) is not None:
                                 env["HIP_VISIBLE_DEVICES"] = pinned
                                 env["ROCR_VISIBLE_DEVICES"] = pinned
diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
index cd92cf21ab..c641ad4bd0 100644
--- a/studio/backend/tests/test_llama_cpp_vulkan_probe.py
+++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py
@@ -62,9 +62,7 @@ def _make_vulkan_install(tmp_path: Path) -> str:
     reader's ``is_vulkan_backend`` sibling-file check passes."""
     bindir = tmp_path / "build" / "bin"
     bindir.mkdir(parents = True)
-    binary = bindir / (
-        "llama-server.exe" if sys.platform == "win32" else "llama-server"
-    )
+    binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server")
     binary.write_bytes(b"stub")
     (bindir / _vulkan_lib_filename()).write_bytes(b"stub")
     return str(binary)
diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py
index 2d97f11cdf..d662ec697e 100644
--- a/studio/install_llama_prebuilt.py
+++ b/studio/install_llama_prebuilt.py
@@ -3926,9 +3926,7 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
         if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm:
             vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz"
             if vulkan_name in upstream_assets:
-                log(
-                    f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}"
-                )
+                log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}")
                 return AssetChoice(
                     repo = UPSTREAM_REPO,
                     tag = llama_tag,
@@ -3937,9 +3935,7 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                     source_label = "upstream",
                     install_kind = "linux-vulkan",
                 )
-            log(
-                "Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU"
-            )
+            log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU")
 
         upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz"
         if upstream_name not in upstream_assets:
@@ -3996,9 +3992,7 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice
                     source_label = "upstream",
                     install_kind = "windows-vulkan",
                 )
-            log(
-                "Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU"
-            )
+            log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU")
 
         upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip"
         if upstream_name not in upstream_assets: