unslothai · oobabooga · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py
@@ -0,0 +1,104 @@
+"""Standalone free-VRAM probe for the bundled ggml Vulkan backend.
+
+Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the
+Vulkan instance never lives in the long-running backend process. Loads the
+bundled ggml Vulkan backend from ``<bindir>`` and prints one
+``<idx>\\t<free_bytes>\\t<is_igpu>`` line per device to stdout. The indices
+are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES
+expects), which need not match nvidia-smi order. ``is_igpu`` is ``1`` for an
+integrated GPU (shared system RAM) and ``0`` otherwise, taken from ggml's own
+device type so the reader needn't guess from VRAM-vs-RAM ratios.
+
+Uses only the standard library so it stays runnable as a bare script without
+importing the backend package.
+"""
+
+import ctypes
+import os
+import sys
+
+# ggml_backend_dev_type enum (ggml-backend.h): CPU=0, GPU=1, IGPU=2, ...
+_GGML_BACKEND_DEVICE_TYPE_IGPU = 2
+
+
+def _igpu_flags(base, lib, count: int) -> list[bool]:
+    """Per-device integrated-GPU flags via ggml's backend registry.
+
+    ``base`` / ``lib`` are the loaded ggml-base / ggml-vulkan ctypes handles and
+    ``count`` the Vulkan device count. The Vulkan reg enumerates devices in the
+    same order as ``ggml_backend_vk_get_device_memory`` (ggml-vulkan builds each
+    device context with ``ctx->device = i``), so reg index == device ordinal.
+    Returns ``count`` bools; devices whose type could not be read stay False so
+    the reader never over-caps a discrete card. Failures are noted on stderr.
+    """
+    flags = [False] * count
+    try:
+        lib.ggml_backend_vk_reg.restype = ctypes.c_void_p
+        lib.ggml_backend_vk_reg.argtypes = []
+        base.ggml_backend_reg_dev_count.restype = ctypes.c_size_t
+        base.ggml_backend_reg_dev_count.argtypes = [ctypes.c_void_p]
+        base.ggml_backend_reg_dev_get.restype = ctypes.c_void_p
+        base.ggml_backend_reg_dev_get.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
+        base.ggml_backend_dev_type.restype = ctypes.c_int
+        base.ggml_backend_dev_type.argtypes = [ctypes.c_void_p]
+        reg = lib.ggml_backend_vk_reg()
+        if not reg:
+            return flags
+        dev_count = base.ggml_backend_reg_dev_count(reg)
+        for i in range(min(count, dev_count)):
+            dev = base.ggml_backend_reg_dev_get(reg, i)
+            if dev:
+                flags[i] = base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU
+    except Exception as e:
+        # iGPU detection is best-effort: any failure (missing symbol,
+        # registry call error) degrades to "discrete" so the memory readings
+        # still get through; note it on stderr rather than hiding it entirely.
+        print(f"iGPU type detection failed: {e}", file = sys.stderr)
+    return flags
+
+
+def main() -> int:
+    if len(sys.argv) < 2:
+        return 0  # no bindir argument: nothing to probe, nothing printed
+    bindir = sys.argv[1]
+    # Per-platform library names (Windows also adds bindir to the DLL search path).
+    if sys.platform == "win32":
+        base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll"
+        try:
+            os.add_dll_directory(bindir)
+        except Exception:
+            pass
+    else:
+        base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"
+    # Load ggml-base before ggml-vulkan; a load failure is reported on stderr (exit 1).
+    try:
+        base = ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
+        lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL)
+    except OSError as e:
+        print(f"ggml-vulkan load failed: {e}", file = sys.stderr)
+        return 1
+    # Declare the C prototypes so ctypes passes the int index and size_t out-pointers.
+    lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
+    lib.ggml_backend_vk_get_device_count.argtypes = []
+    lib.ggml_backend_vk_get_device_memory.restype = None
+    lib.ggml_backend_vk_get_device_memory.argtypes = [
+        ctypes.c_int,
+        ctypes.POINTER(ctypes.c_size_t),
+        ctypes.POINTER(ctypes.c_size_t),
+    ]
+    # Emit one "<idx>\t<free_bytes>\t<is_igpu>" row per device, newline-separated.
+    count = lib.ggml_backend_vk_get_device_count()
+    igpu = _igpu_flags(base, lib, count)
+    rows = []
+    for i in range(count):
+        free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
+        # total is a required out-param of the C call but unused: the reader
+        # leaves a flat per-device margin, not a fraction of total.
+        lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
+        rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i])))
+    sys.stdout.write("\n".join(rows))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -597,6 +597,42 @@ def _backfill_usage_from_timings(usage, timings):
     return out
 
 
+def _vulkan_lib_filename() -> str:
+    return "libggml-vulkan.so" if sys.platform != "win32" else "ggml-vulkan.dll"
+
+
+# Free system RAM to leave on an integrated GPU, mirroring llama.cpp's own
+# auto-fit margin (llama-server --fit-target, default 1024 MiB per device).
+# ggml reports an iGPU's "VRAM" as shared system RAM, so we hold back the same
+# per-device margin that --fit would, rather than inventing a larger reserve.
+_IGPU_HOST_RESERVE_MIB = 1024
+
+
+def _apply_igpu_host_reserve_mib(free_mib: int, is_igpu: bool) -> int:
+    """Return the usable budget in MiB after the integrated-GPU host reserve.
+
+    For an integrated GPU ggml sums every memory heap (ggml-vulkan's
+    ggml_backend_vk_get_device_memory), so the "free VRAM" it reports is in
+    fact free system RAM; budgeting context/offload against all of it could
+    starve the host into swap or the OOM killer. ``_IGPU_HOST_RESERVE_MIB``
+    (the per-device margin llama.cpp's --fit uses) is therefore subtracted,
+    clamping at zero. ``is_igpu`` is ggml's own device type, so a discrete
+    card's ``free_mib`` is returned untouched. Never increases the budget.
+    """
+    if is_igpu:
+        return max(free_mib - _IGPU_HOST_RESERVE_MIB, 0)
+    return free_mib
+
+
+def _llama_lib_dir(binary: str) -> Path:
+    # llama-server is installed as a top-level symlink
+    # (~/.unsloth/llama.cpp/llama-server) pointing into build/bin/, which is
+    # where the ggml backend libs live. Follow the link so sibling-lib lookups
+    # (Vulkan detection, LD_LIBRARY_PATH, the probe's bindir) use that directory.
+    resolved_binary = Path(binary).resolve()
+    return resolved_binary.parent
+
+
 class LlamaCppBackend:
     """
     Manages a llama-server subprocess for GGUF model inference.
@@ -1219,6 +1255,20 @@ def _get_gguf_size_bytes(model_path: str) -> int:
 
         return total
 
+    @staticmethod
+    def _is_vulkan_backend(binary: Optional[str] = None) -> bool:
+        """Report whether the installed llama.cpp build uses the Vulkan backend.
+
+        ``binary`` defaults to the discovered llama-server path. Builds are
+        single-backend, so finding the Vulkan ggml library beside llama-server
+        is enough. Callers use this to keep the free-memory probe and the GPU
+        pin in one device-index space (ggml's Vulkan ordinals, not nvidia-smi).
+        """
+        server = binary if binary else LlamaCppBackend._find_llama_server_binary()
+        if server:
+            return (_llama_lib_dir(server) / _vulkan_lib_filename()).is_file()
+        return False
+
     @staticmethod
     def _amd_apu_wants_unified_memory() -> bool:
         """True only for AMD unified-memory APUs (gfx1150/gfx1151), where
@@ -1244,7 +1294,24 @@ def _amd_apu_wants_unified_memory() -> bool:
         return False
 
     @staticmethod
-    def _get_gpu_free_memory() -> list[tuple[int, int]]:
+    def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
+        """Return free memory per GPU, dispatching on the installed backend.
+
+        A Vulkan build defers to the ggml Vulkan probe, so the indices are
+        Vulkan ordinals (the space the GPU pin writes to
+        ``GGML_VK_VISIBLE_DEVICES``). Any other build uses the ``nvidia-smi`` /
+        torch path, which covers NVIDIA + AMD ROCm.
+
+        Returns a list of (gpu_index, free_mib) sorted by index; the list is
+        empty when no supported GPU is reachable.
+        """
+        server = binary or LlamaCppBackend._find_llama_server_binary()
+        if not LlamaCppBackend._is_vulkan_backend(server):
+            return LlamaCppBackend._get_gpu_free_memory_nvidia_torch()
+        return LlamaCppBackend._get_gpu_free_memory_vulkan(server)
+
+    @staticmethod
+    def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:
         """Query free memory per GPU.
 
         Order:
@@ -1363,6 +1430,85 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
             logger.debug(f"torch GPU probe failed: {e}")
             return []
 
+    @staticmethod
+    def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]:
+        """Query free VRAM per device via the bundled ggml Vulkan backend.
+
+        Loads ``libggml-vulkan`` in a short-lived subprocess and calls
+        ``ggml_backend_vk_get_device_memory`` for each device, so no Vulkan
+        instance is created in this process. Returns list of
+        (device_index, free_mib) sorted by index, where the index is ggml's
+        own Vulkan device ordinal (the space ``GGML_VK_VISIBLE_DEVICES``
+        expects). Integrated GPUs leave a per-device host-RAM margin (see
+        ``_apply_igpu_host_reserve_mib``). Returns [] when no Vulkan build is
+        installed, the probe fails or times out, or it reports no device.
+        """
+        binary = binary or LlamaCppBackend._find_llama_server_binary()
+        if not binary:
+            return []
+        binary_dir = _llama_lib_dir(binary)
+        if not (binary_dir / _vulkan_lib_filename()).is_file():
+            return []
+        # Child env from the project helper (presumably a scrubbed copy — confirm).
+        env = child_env_without_native_path_secret()
+        # Enumerate ggml's canonical, full device list. An inherited
+        # GGML_VK_VISIBLE_DEVICES would renumber/restrict the ordinals, but
+        # load_model writes its own pin in that same full space, so letting
+        # the probe see a pre-existing mask would make the pin double-apply
+        # and target the wrong device.
+        env.pop("GGML_VK_VISIBLE_DEVICES", None)
+        if sys.platform != "win32":
+            # Let the loader resolve sibling ggml libs next to the binary.
+            existing_ld = env.get("LD_LIBRARY_PATH", "")
+            env["LD_LIBRARY_PATH"] = (
+                f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir)
+            )
+        probe_script = Path(__file__).with_name("_vulkan_probe.py")
+        try:
+            result = subprocess.run(
+                [sys.executable, str(probe_script), str(binary_dir)],
+                capture_output = True,
+                text = True,
+                timeout = 15,
+                env = env,
+                **_windows_hidden_subprocess_kwargs(),
+            )
+            if result.returncode != 0:
+                logger.debug(
+                    f"vulkan GPU probe exited {result.returncode}: {result.stderr.strip()}"
+                )
+                return []
+        except Exception as e:
+            logger.debug(f"vulkan GPU probe failed: {e}")
+            return []
+        # Parse "<idx>\t<free_bytes>\t<is_igpu>" rows, skipping any malformed line.
+        gpus: list[tuple[int, int]] = []
+        for line in result.stdout.strip().splitlines():
+            parts = line.split("\t")
+            if len(parts) != 3:
+                continue
+            try:
+                idx = int(parts[0])
+                free_mib = int(parts[1]) // (1024 * 1024)
+                is_igpu = parts[2] == "1"
+            except ValueError:
+                continue
+            capped = _apply_igpu_host_reserve_mib(free_mib, is_igpu)
+            if capped < free_mib:
+                logger.info(
+                    f"Vulkan device VK{idx} is an integrated GPU sharing system "
+                    f"RAM; reserving {free_mib - capped}MiB host headroom "
+                    f"({free_mib}->{capped}MiB usable)"
+                )
+            gpus.append((idx, capped))
+        gpus.sort(key = lambda g: g[0])
+        if gpus:
+            logger.info(
+                "Vulkan GPU memory detected: "
+                + ", ".join(f"VK{idx}={free}MiB" for idx, free in gpus)
+            )
+        return gpus
+
     # Skip the wait when the last kill is older than this; the GPU
     # driver has already reclaimed the prior process's allocations.
     _VRAM_SETTLE_WINDOW_S: float = 15.0
@@ -2724,6 +2870,7 @@ def load_model(
                     "Run setup.sh to build it, install llama.cpp, "
                     "or set LLAMA_SERVER_PATH environment variable."
                 )
+            is_vulkan_backend = self._is_vulkan_backend(binary)
 
             # ── Phase 2: download (NO lock held, so cancel can proceed) ──
             # Scope HF_HUB_OFFLINE to the download block only when DNS is
@@ -2791,7 +2938,7 @@ def load_model(
                 gpus: list[tuple[int, int]] = []
                 try:
                     model_size = self._get_gguf_size_bytes(model_path)
-                    gpus = self._get_gpu_free_memory()
+                    gpus = self._get_gpu_free_memory(binary)
 
                     # Resolve effective context: 0 means let llama-server use the
                     # model's native length.  Only expand to a known native length
@@ -3187,7 +3334,7 @@ def load_model(
                 import sys
 
                 env = child_env_without_native_path_secret()
-                binary_dir = str(Path(binary).parent)
+                binary_dir = str(_llama_lib_dir(binary))
 
                 # AMD unified-memory APUs (gfx1150/gfx1151): let llama.cpp use
                 # shared system RAM. setdefault so a user value wins.
@@ -3288,14 +3435,23 @@ def load_model(
                 # the full HIP/ROCR set the parent inherited.
                 if gpu_indices is not None:
                     pinned = ",".join(str(i) for i in gpu_indices)
-                    env["CUDA_VISIBLE_DEVICES"] = pinned
-                    try:
-                        import torch as _torch
-                        if getattr(_torch.version, "hip", None) is not None:
-                            env["HIP_VISIBLE_DEVICES"] = pinned
-                            env["ROCR_VISIBLE_DEVICES"] = pinned
-                    except Exception as e:
-                        logger.debug("Failed to set ROCm visibility env vars for child: %s", e)
+                    if is_vulkan_backend:
+                        # gpu_indices are ggml Vulkan ordinals (see
+                        # _get_gpu_free_memory); the Vulkan backend ignores
+                        # CUDA_VISIBLE_DEVICES, so pin via its own mask.
+                        env["GGML_VK_VISIBLE_DEVICES"] = pinned
+                    else:
+                        env["CUDA_VISIBLE_DEVICES"] = pinned
+                        try:
+                            import torch as _torch
+                            if getattr(_torch.version, "hip", None) is not None:
+                                env["HIP_VISIBLE_DEVICES"] = pinned
+                                env["ROCR_VISIBLE_DEVICES"] = pinned
+                        except Exception as e:
+                            logger.debug(
+                                "Failed to set ROCm visibility env vars for child: %s",
+                                e,
+                            )
 
                 # Defensive kill: if a concurrent load slipped past Phase 1
                 # (because its `self._process` was None at the time) and