From 9e9729be37cc95a91847f745279e9acf42fc04f3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 May 2026 08:58:05 -0700 Subject: [PATCH 01/13] Studio: add Vulkan llama.cpp support --- studio/backend/core/inference/llama_cpp.py | 167 +++++++++++++++++++-- studio/install_llama_prebuilt.py | 117 ++++++++++++++- 2 files changed, 270 insertions(+), 14 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 0325620b2d..171609ce33 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -600,6 +600,48 @@ def _backfill_usage_from_timings(usage, timings): return out +# Probe script run in a short-lived subprocess so the Vulkan instance never +# lives in the long-running backend process. Loads the bundled ggml Vulkan +# backend and prints "\t\t" per device. The +# indices are ggml's own Vulkan device ordinals -- the space +# GGML_VK_VISIBLE_DEVICES expects -- which need not match nvidia-smi order. 
+_VULKAN_PROBE_SCRIPT = r""" +import ctypes, os, sys +bindir = sys.argv[1] +if sys.platform == "win32": + base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll" + try: + os.add_dll_directory(bindir) + except Exception: + pass +else: + base_name, vk_name = "libggml-base.so", "libggml-vulkan.so" +try: + ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL) + lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL) +except OSError: + sys.exit(0) +lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int +lib.ggml_backend_vk_get_device_count.argtypes = [] +lib.ggml_backend_vk_get_device_memory.restype = None +lib.ggml_backend_vk_get_device_memory.argtypes = [ + ctypes.c_int, + ctypes.POINTER(ctypes.c_size_t), + ctypes.POINTER(ctypes.c_size_t), +] +rows = [] +for i in range(lib.ggml_backend_vk_get_device_count()): + free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) + lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total)) + rows.append("%d\t%d\t%d" % (i, free.value, total.value)) +sys.stdout.write("\n".join(rows)) +""" + + +def _vulkan_lib_filename() -> str: + return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so" + + class LlamaCppBackend: """ Manages a llama-server subprocess for GGUF model inference. @@ -1233,7 +1275,41 @@ def _get_gguf_size_bytes(model_path: str) -> int: return total @staticmethod - def _get_gpu_free_memory() -> list[tuple[int, int]]: + def _is_vulkan_backend(binary: Optional[str] = None) -> bool: + """True if the installed llama.cpp build is the Vulkan one. + + Builds are single-backend, so the presence of the Vulkan ggml + backend library next to llama-server is sufficient. Used to keep + the free-memory probe and the GPU pin in the same device-index + space (ggml's Vulkan ordinals, not nvidia-smi order). 
+ """ + binary = binary or LlamaCppBackend._find_llama_server_binary() + if not binary: + return False + return (Path(binary).parent / _vulkan_lib_filename()).is_file() + + @staticmethod + def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]: + """Query free memory per GPU across all supported backends. + + On a Vulkan build, the ggml Vulkan probe is authoritative so the + returned indices are Vulkan ordinals (the space the GPU pin writes + to ``GGML_VK_VISIBLE_DEVICES``). Otherwise ``nvidia-smi`` / torch + cover NVIDIA + AMD ROCm, with the Vulkan probe as a last resort. + + Returns list of (gpu_index, free_mib) sorted by index. Empty + list if no supported GPU is reachable. + """ + binary = binary or LlamaCppBackend._find_llama_server_binary() + if LlamaCppBackend._is_vulkan_backend(binary): + return LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + gpus = LlamaCppBackend._get_gpu_free_memory_nvidia_torch() + if gpus: + return gpus + return LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + + @staticmethod + def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]: """Query free memory per GPU. Order: @@ -1356,6 +1432,64 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: logger.debug(f"torch GPU probe failed: {e}") return [] + @staticmethod + def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]: + """Query free VRAM per device via the bundled ggml Vulkan backend. + + Loads ``libggml-vulkan`` in a short-lived subprocess and calls + ``ggml_backend_vk_get_device_memory`` for each device, so no Vulkan + instance is created in this process. Returns list of + (device_index, free_mib) sorted by index, where the index is ggml's + own Vulkan device ordinal (the space ``GGML_VK_VISIBLE_DEVICES`` + expects). Returns [] when no Vulkan build is installed or no device + is reachable. 
+ """ + binary = binary or LlamaCppBackend._find_llama_server_binary() + if not binary: + return [] + binary_dir = Path(binary).parent + if not (binary_dir / _vulkan_lib_filename()).is_file(): + return [] + + env = child_env_without_native_path_secret() + if sys.platform != "win32": + # Let the loader resolve sibling ggml libs next to the binary. + existing_ld = env.get("LD_LIBRARY_PATH", "") + env["LD_LIBRARY_PATH"] = ( + f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir) + ) + try: + result = subprocess.run( + [sys.executable, "-c", _VULKAN_PROBE_SCRIPT, str(binary_dir)], + capture_output = True, + text = True, + timeout = 15, + env = env, + **_windows_hidden_subprocess_kwargs(), + ) + except Exception as e: + logger.debug(f"vulkan GPU probe failed: {e}") + return [] + + gpus: list[tuple[int, int]] = [] + for line in result.stdout.strip().splitlines(): + parts = line.split("\t") + if len(parts) != 3: + continue + try: + idx = int(parts[0]) + free_mib = int(parts[1]) // (1024 * 1024) + except ValueError: + continue + gpus.append((idx, free_mib)) + gpus.sort(key = lambda g: g[0]) + if gpus: + logger.info( + "Vulkan GPU memory detected: " + + ", ".join(f"VK{idx}={free}MiB" for idx, free in gpus) + ) + return gpus + # Skip the wait when the last kill is older than this; the GPU # driver has already reclaimed the prior process's allocations. _VRAM_SETTLE_WINDOW_S: float = 15.0 @@ -2670,6 +2804,7 @@ def load_model( "Run setup.sh to build it, install llama.cpp, " "or set LLAMA_SERVER_PATH environment variable." 
) + is_vulkan_backend = self._is_vulkan_backend(binary) # ── Phase 2: download (NO lock held, so cancel can proceed) ── # Scope HF_HUB_OFFLINE to the download block only when DNS is @@ -2729,7 +2864,7 @@ def load_model( gpus: list[tuple[int, int]] = [] try: model_size = self._get_gguf_size_bytes(model_path) - gpus = self._get_gpu_free_memory() + gpus = self._get_gpu_free_memory(binary) # Resolve effective context: 0 means let llama-server use the # model's native length. Only expand to a known native length @@ -3217,17 +3352,23 @@ def load_model( # the full HIP/ROCR set the parent inherited. if gpu_indices is not None: pinned = ",".join(str(i) for i in gpu_indices) - env["CUDA_VISIBLE_DEVICES"] = pinned - try: - import torch as _torch - - if getattr(_torch.version, "hip", None) is not None: - env["HIP_VISIBLE_DEVICES"] = pinned - env["ROCR_VISIBLE_DEVICES"] = pinned - except Exception as e: - logger.debug( - "Failed to set ROCm visibility env vars for child: %s", e - ) + if is_vulkan_backend: + # gpu_indices are ggml Vulkan ordinals (see + # _get_gpu_free_memory); the Vulkan backend ignores + # CUDA_VISIBLE_DEVICES, so pin via its own mask. 
+ env["GGML_VK_VISIBLE_DEVICES"] = pinned + else: + env["CUDA_VISIBLE_DEVICES"] = pinned + try: + import torch as _torch + + if getattr(_torch.version, "hip", None) is not None: + env["HIP_VISIBLE_DEVICES"] = pinned + env["ROCR_VISIBLE_DEVICES"] = pinned + except Exception as e: + logger.debug( + "Failed to set ROCm visibility env vars for child: %s", e + ) # Defensive kill: if a concurrent load slipped past Phase 1 # (because its `self._process` was None at the time) and diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 394a1c9cd8..53e9dd0c9b 100644 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -9,6 +9,7 @@ import argparse import errno import fnmatch +import glob import hashlib import json import os @@ -196,6 +197,7 @@ class HostInfo: has_physical_nvidia: bool has_usable_nvidia: bool has_rocm: bool = False + has_intel_gpu: bool = False @dataclass @@ -1336,6 +1338,21 @@ def direct_upstream_release_plan( torch_preference.selection_log, ) ) + # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt. + if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_asset = f"llama-{release_tag}-bin-win-vulkan-x64.zip" + vulkan_url = assets.get(vulkan_asset) + if vulkan_url: + attempts.append( + AssetChoice( + repo = repo, + tag = release_tag, + name = vulkan_asset, + url = vulkan_url, + source_label = "upstream", + install_kind = "windows-vulkan", + ) + ) cpu_asset = f"llama-{release_tag}-bin-win-cpu-x64.zip" cpu_url = assets.get(cpu_asset) if cpu_url: @@ -1396,6 +1413,21 @@ def direct_upstream_release_plan( ) ) elif host.is_linux and host.is_x86_64 and not host.has_usable_nvidia: + # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt. 
+ if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_asset = f"llama-{release_tag}-bin-ubuntu-vulkan-x64.tar.gz" + vulkan_url = assets.get(vulkan_asset) + if vulkan_url: + attempts.append( + AssetChoice( + repo = repo, + tag = release_tag, + name = vulkan_asset, + url = vulkan_url, + source_label = "upstream", + install_kind = "linux-vulkan", + ) + ) asset_name = f"llama-{release_tag}-bin-ubuntu-x64.tar.gz" asset_url = assets.get(asset_name) if asset_url: @@ -2744,6 +2776,37 @@ def _amd_smi_has_gpu(stdout: str) -> bool: # Note: amdhip64.dll presence alone is NOT treated as GPU evidence # since the HIP SDK can be installed without an AMD GPU. + # Detect an Intel GPU; gates the Vulkan prebuilt. Linux reads the DRM + # sysfs vendor id (0x8086); Windows queries the WMI video controller list. + has_intel_gpu = False + if is_linux: + for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"): + try: + with open(_vendor_file) as _vf: + if _vf.read().strip().lower() == "0x8086": + has_intel_gpu = True + break + except OSError: + continue + elif is_windows: + _ps = shutil.which("powershell") or shutil.which("pwsh") + if _ps: + try: + _result = run_capture( + [ + _ps, + "-NoProfile", + "-Command", + "Get-CimInstance Win32_VideoController | " + "Select-Object -ExpandProperty Name", + ], + timeout = 15, + ) + if _result.returncode == 0 and "intel" in _result.stdout.lower(): + has_intel_gpu = True + except Exception: + pass + return HostInfo( system = system, machine = machine, @@ -2759,6 +2822,7 @@ def _amd_smi_has_gpu(stdout: str) -> bool: has_physical_nvidia = has_physical_nvidia, has_usable_nvidia = has_usable_nvidia, has_rocm = has_rocm, + has_intel_gpu = has_intel_gpu, ) @@ -3325,6 +3389,21 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice "falling back to source build with HIP support" ) + # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt. 
+ if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz" + if vulkan_name in upstream_assets: + log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}") + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = vulkan_name, + url = upstream_assets[vulkan_name], + source_label = "upstream", + install_kind = "linux-vulkan", + ) + log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU") + upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Linux CPU asset was not found") @@ -3363,6 +3442,21 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice "AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU" ) + # Intel (or other non-NVIDIA/non-AMD) GPU on Windows: use Vulkan. + if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_name = f"llama-{llama_tag}-bin-win-vulkan-x64.zip" + if vulkan_name in upstream_assets: + log(f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}") + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = vulkan_name, + url = upstream_assets[vulkan_name], + source_label = "upstream", + install_kind = "windows-vulkan", + ) + log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU") + upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Windows CPU asset was not found") @@ -3870,7 +3964,13 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: # libraries between b9279 and b9283) without us re-enumerating # every new file. Studio only invokes llama-server and llama-quantize; # other CLIs upstream ships (llama-cli, llama-bench, ...) are skipped. 
- if choice.install_kind in {"linux-cpu", "linux-cuda", "linux-rocm", "linux-arm64"}: + if choice.install_kind in { + "linux-cpu", + "linux-cuda", + "linux-rocm", + "linux-arm64", + "linux-vulkan", + }: return ["llama-server", "llama-quantize", "lib*.so*"] if choice.install_kind in {"macos-arm64", "macos-x64"}: return ["llama-server", "llama-quantize", "lib*.dylib"] @@ -3878,6 +3978,7 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: "windows-cpu", "windows-cuda", "windows-hip", + "windows-vulkan", "windows-arm64", }: return ["llama-server.exe", "llama-quantize.exe", "*.dll"] @@ -4698,8 +4799,10 @@ def validate_server( _gpu_kinds = { "linux-cuda", "linux-rocm", + "linux-vulkan", "windows-cuda", "windows-hip", + "windows-vulkan", "macos-arm64", } if install_kind is not None: @@ -5265,6 +5368,16 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: ["libmtmd.so*"], ["libggml-hip.so*"], ] + if choice.install_kind == "linux-vulkan": + return [ + ["libllama-common.so*"], + ["libllama.so*"], + ["libggml.so*"], + ["libggml-base.so*"], + ["libggml-cpu-*.so*"], + ["libmtmd.so*"], + ["libggml-vulkan.so*"], + ] if choice.install_kind in {"windows-cpu", "windows-arm64"}: return [["llama.dll"]] if choice.install_kind == "windows-cuda": @@ -5284,6 +5397,8 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: return groups if choice.install_kind == "windows-hip": return [["llama.dll"], ["*hip*.dll"]] + if choice.install_kind == "windows-vulkan": + return [["llama.dll"], ["ggml-vulkan.dll"]] return [] From c401f10c7847c882d36db5c88c58c0724d57b1e7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 May 2026 16:33:19 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/llama_cpp.py | 7 +++++-- studio/install_llama_prebuilt.py | 16 
++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 171609ce33..ec63a48b30 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1433,7 +1433,9 @@ def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]: return [] @staticmethod - def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]: + def _get_gpu_free_memory_vulkan( + binary: Optional[str] = None, + ) -> list[tuple[int, int]]: """Query free VRAM per device via the bundled ggml Vulkan backend. Loads ``libggml-vulkan`` in a short-lived subprocess and calls @@ -3367,7 +3369,8 @@ def load_model( env["ROCR_VISIBLE_DEVICES"] = pinned except Exception as e: logger.debug( - "Failed to set ROCm visibility env vars for child: %s", e + "Failed to set ROCm visibility env vars for child: %s", + e, ) # Defensive kill: if a concurrent load slipped past Phase 1 diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 53e9dd0c9b..ff2233717f 100644 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -3393,7 +3393,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz" if vulkan_name in upstream_assets: - log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}") + log( + f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}" + ) return AssetChoice( repo = UPSTREAM_REPO, tag = llama_tag, @@ -3402,7 +3404,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice source_label = "upstream", install_kind = "linux-vulkan", ) - log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU") + log( + "Intel GPU detected but no 
Vulkan prebuilt found -- falling back to CPU" + ) upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: @@ -3446,7 +3450,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: vulkan_name = f"llama-{llama_tag}-bin-win-vulkan-x64.zip" if vulkan_name in upstream_assets: - log(f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}") + log( + f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}" + ) return AssetChoice( repo = UPSTREAM_REPO, tag = llama_tag, @@ -3455,7 +3461,9 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice source_label = "upstream", install_kind = "windows-vulkan", ) - log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU") + log( + "Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU" + ) upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: From 84d98a76da5bdfca4a0af49155389038ad550a0d Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 May 2026 09:47:00 -0700 Subject: [PATCH 03/13] Address gemini's feedback --- studio/backend/core/inference/llama_cpp.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index ec63a48b30..8ed0361ed0 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1469,6 +1469,11 @@ def _get_gpu_free_memory_vulkan( env = env, **_windows_hidden_subprocess_kwargs(), ) + if result.returncode != 0: + logger.debug( + f"vulkan GPU probe exited {result.returncode}: {result.stderr.strip()}" + ) + return [] except Exception as e: logger.debug(f"vulkan GPU probe failed: {e}") return [] From 
11acf229c778546aaa96960f068b10fc33a94357 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 May 2026 10:10:06 -0700 Subject: [PATCH 04/13] Studio: move the Vulkan VRAM probe into a standalone script --- .../backend/core/inference/_vulkan_probe.py | 57 +++++++++++++++++++ studio/backend/core/inference/llama_cpp.py | 41 +------------ 2 files changed, 59 insertions(+), 39 deletions(-) create mode 100644 studio/backend/core/inference/_vulkan_probe.py diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py new file mode 100644 index 0000000000..776b01826f --- /dev/null +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -0,0 +1,57 @@ +"""Standalone free-VRAM probe for the bundled ggml Vulkan backend. + +Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the +Vulkan instance never lives in the long-running backend process. Loads the +bundled ggml Vulkan backend from ``<bindir>`` and prints one +``<index>\\t<free_bytes>\\t<total_bytes>`` line per device to stdout. The indices +are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES +expects), which need not match nvidia-smi order. + +Uses only the standard library so it stays runnable as a bare script without +importing the backend package. 
+""" +import ctypes +import os +import sys + + +def main() -> int: + if len(sys.argv) < 2: + return 0 + bindir = sys.argv[1] + + if sys.platform == "win32": + base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll" + try: + os.add_dll_directory(bindir) + except Exception: + pass + else: + base_name, vk_name = "libggml-base.so", "libggml-vulkan.so" + + try: + ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL) + lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL) + except OSError: + return 0 + + lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int + lib.ggml_backend_vk_get_device_count.argtypes = [] + lib.ggml_backend_vk_get_device_memory.restype = None + lib.ggml_backend_vk_get_device_memory.argtypes = [ + ctypes.c_int, + ctypes.POINTER(ctypes.c_size_t), + ctypes.POINTER(ctypes.c_size_t), + ] + + rows = [] + for i in range(lib.ggml_backend_vk_get_device_count()): + free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) + lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total)) + rows.append("%d\t%d\t%d" % (i, free.value, total.value)) + sys.stdout.write("\n".join(rows)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 8ed0361ed0..e17a490703 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -600,44 +600,6 @@ def _backfill_usage_from_timings(usage, timings): return out -# Probe script run in a short-lived subprocess so the Vulkan instance never -# lives in the long-running backend process. Loads the bundled ggml Vulkan -# backend and prints "\t\t" per device. The -# indices are ggml's own Vulkan device ordinals -- the space -# GGML_VK_VISIBLE_DEVICES expects -- which need not match nvidia-smi order. 
-_VULKAN_PROBE_SCRIPT = r""" -import ctypes, os, sys -bindir = sys.argv[1] -if sys.platform == "win32": - base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll" - try: - os.add_dll_directory(bindir) - except Exception: - pass -else: - base_name, vk_name = "libggml-base.so", "libggml-vulkan.so" -try: - ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL) - lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL) -except OSError: - sys.exit(0) -lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int -lib.ggml_backend_vk_get_device_count.argtypes = [] -lib.ggml_backend_vk_get_device_memory.restype = None -lib.ggml_backend_vk_get_device_memory.argtypes = [ - ctypes.c_int, - ctypes.POINTER(ctypes.c_size_t), - ctypes.POINTER(ctypes.c_size_t), -] -rows = [] -for i in range(lib.ggml_backend_vk_get_device_count()): - free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) - lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total)) - rows.append("%d\t%d\t%d" % (i, free.value, total.value)) -sys.stdout.write("\n".join(rows)) -""" - - def _vulkan_lib_filename() -> str: return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so" @@ -1460,9 +1422,10 @@ def _get_gpu_free_memory_vulkan( env["LD_LIBRARY_PATH"] = ( f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir) ) + probe_script = Path(__file__).with_name("_vulkan_probe.py") try: result = subprocess.run( - [sys.executable, "-c", _VULKAN_PROBE_SCRIPT, str(binary_dir)], + [sys.executable, str(probe_script), str(binary_dir)], capture_output = True, text = True, timeout = 15, From 7dd21f33bc4b93d325850c36451b04c3a4604f0a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 May 2026 17:10:34 +0000 Subject: [PATCH 05/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/_vulkan_probe.py | 9 
++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py index 776b01826f..cf918b2577 100644 --- a/studio/backend/core/inference/_vulkan_probe.py +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -10,6 +10,7 @@ Uses only the standard library so it stays runnable as a bare script without importing the backend package. """ + import ctypes import os import sys @@ -30,8 +31,8 @@ def main() -> int: base_name, vk_name = "libggml-base.so", "libggml-vulkan.so" try: - ctypes.CDLL(os.path.join(bindir, base_name), mode=ctypes.RTLD_GLOBAL) - lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode=ctypes.RTLD_GLOBAL) + ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL) + lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL) except OSError: return 0 @@ -47,7 +48,9 @@ def main() -> int: rows = [] for i in range(lib.ggml_backend_vk_get_device_count()): free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) - lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total)) + lib.ggml_backend_vk_get_device_memory( + i, ctypes.byref(free), ctypes.byref(total) + ) rows.append("%d\t%d\t%d" % (i, free.value, total.value)) sys.stdout.write("\n".join(rows)) return 0 From e50b0afc03b6c583a7f6affd9559bd51a049eaa8 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Wed, 27 May 2026 21:17:55 -0700 Subject: [PATCH 06/13] Improve Vulkan probe error reporting --- studio/backend/core/inference/_vulkan_probe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py index cf918b2577..0ffc9e47f1 100644 --- a/studio/backend/core/inference/_vulkan_probe.py +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -33,8 +33,9 @@ def main() -> int: try: ctypes.CDLL(os.path.join(bindir, 
base_name), mode = ctypes.RTLD_GLOBAL) lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL) - except OSError: - return 0 + except OSError as e: + print(f"ggml-vulkan load failed: {e}", file = sys.stderr) + return 1 lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int lib.ggml_backend_vk_get_device_count.argtypes = [] From 4fefeebfd00e1fa74cfae286d547809ce484f21f Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Thu, 28 May 2026 20:06:51 -0700 Subject: [PATCH 07/13] Resolve llama-server symlink so Vulkan build is detected --- studio/backend/core/inference/llama_cpp.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index e17a490703..185647bbef 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -604,6 +604,15 @@ def _vulkan_lib_filename() -> str: return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so" +def _llama_lib_dir(binary: str) -> Path: + # The installer exposes llama-server as a top-level symlink + # (~/.unsloth/llama.cpp/llama-server) into build/bin/, where the ggml + # backend libs actually live. Resolve it so callers looking for sibling + # libs (Vulkan detection, LD_LIBRARY_PATH, the probe's bindir) hit the real + # directory instead of the symlink's parent. + return Path(binary).resolve().parent + + class LlamaCppBackend: """ Manages a llama-server subprocess for GGUF model inference. 
@@ -1248,7 +1257,7 @@ def _is_vulkan_backend(binary: Optional[str] = None) -> bool: binary = binary or LlamaCppBackend._find_llama_server_binary() if not binary: return False - return (Path(binary).parent / _vulkan_lib_filename()).is_file() + return (_llama_lib_dir(binary) / _vulkan_lib_filename()).is_file() @staticmethod def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]: @@ -1411,7 +1420,7 @@ def _get_gpu_free_memory_vulkan( binary = binary or LlamaCppBackend._find_llama_server_binary() if not binary: return [] - binary_dir = Path(binary).parent + binary_dir = _llama_lib_dir(binary) if not (binary_dir / _vulkan_lib_filename()).is_file(): return [] @@ -3239,7 +3248,7 @@ def load_model( import sys env = child_env_without_native_path_secret() - binary_dir = str(Path(binary).parent) + binary_dir = str(_llama_lib_dir(binary)) if sys.platform == "win32": # See _build_windows_path_dirs for ordering. #5106. From 10faad1e18f39d88482a9781a5f7ca3b88fac774 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 May 2026 08:40:23 -0700 Subject: [PATCH 08/13] Drop unreachable Vulkan fallback in GPU free-memory dispatcher --- studio/backend/core/inference/llama_cpp.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 510d57f74a..a88204113a 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1297,7 +1297,7 @@ def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]: On a Vulkan build, the ggml Vulkan probe is authoritative so the returned indices are Vulkan ordinals (the space the GPU pin writes to ``GGML_VK_VISIBLE_DEVICES``). Otherwise ``nvidia-smi`` / torch - cover NVIDIA + AMD ROCm, with the Vulkan probe as a last resort. + cover NVIDIA + AMD ROCm. Returns list of (gpu_index, free_mib) sorted by index. 
Empty list if no supported GPU is reachable. @@ -1305,10 +1305,7 @@ def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]: binary = binary or LlamaCppBackend._find_llama_server_binary() if LlamaCppBackend._is_vulkan_backend(binary): return LlamaCppBackend._get_gpu_free_memory_vulkan(binary) - gpus = LlamaCppBackend._get_gpu_free_memory_nvidia_torch() - if gpus: - return gpus - return LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + return LlamaCppBackend._get_gpu_free_memory_nvidia_torch() @staticmethod def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]: From dafeb795c8121841c7e44cce594cead4c205f007 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 May 2026 08:58:22 -0700 Subject: [PATCH 09/13] Skip the Intel GPU probe when NVIDIA or ROCm is present --- studio/install_llama_prebuilt.py | 56 +++++++++++++++++--------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 9508616f7c..1ac41cc648 100644 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -3078,34 +3078,38 @@ def _resolve_exe(name: str) -> str | None: # Detect an Intel GPU; gates the Vulkan prebuilt. Linux reads the DRM # sysfs vendor id (0x8086); Windows queries the WMI video controller list. + # Only probed when there is no usable NVIDIA and no ROCm GPU, since the + # Vulkan selection branches are gated the same way -- this keeps the probe + # (notably the Windows powershell call) off the NVIDIA/AMD path. 
has_intel_gpu = False - if is_linux: - for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"): - try: - with open(_vendor_file) as _vf: - if _vf.read().strip().lower() == "0x8086": + if not has_usable_nvidia and not has_rocm: + if is_linux: + for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"): + try: + with open(_vendor_file) as _vf: + if _vf.read().strip().lower() == "0x8086": + has_intel_gpu = True + break + except OSError: + continue + elif is_windows: + _ps = shutil.which("powershell") or shutil.which("pwsh") + if _ps: + try: + _result = run_capture( + [ + _ps, + "-NoProfile", + "-Command", + "Get-CimInstance Win32_VideoController | " + "Select-Object -ExpandProperty Name", + ], + timeout = 15, + ) + if _result.returncode == 0 and "intel" in _result.stdout.lower(): has_intel_gpu = True - break - except OSError: - continue - elif is_windows: - _ps = shutil.which("powershell") or shutil.which("pwsh") - if _ps: - try: - _result = run_capture( - [ - _ps, - "-NoProfile", - "-Command", - "Get-CimInstance Win32_VideoController | " - "Select-Object -ExpandProperty Name", - ], - timeout = 15, - ) - if _result.returncode == 0 and "intel" in _result.stdout.lower(): - has_intel_gpu = True - except Exception: - pass + except Exception: + pass return HostInfo( system = system, From 1980e59221c127ff4dafc51264b74f05bf72add3 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 May 2026 12:29:06 -0700 Subject: [PATCH 10/13] Reserve host RAM headroom for Vulkan integrated GPUs --- .../backend/core/inference/_vulkan_probe.py | 58 +++++++- studio/backend/core/inference/llama_cpp.py | 44 +++++- .../tests/test_llama_cpp_vulkan_probe.py | 131 ++++++++++++++++++ 3 files changed, 225 insertions(+), 8 deletions(-) create mode 100644 studio/backend/tests/test_llama_cpp_vulkan_probe.py diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py index 
0ffc9e47f1..dcbbe9096b 100644 --- a/studio/backend/core/inference/_vulkan_probe.py +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -3,9 +3,11 @@ Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the Vulkan instance never lives in the long-running backend process. Loads the bundled ggml Vulkan backend from ``<bindir>`` and prints one -``<index>\\t<free_bytes>\\t<total_bytes>`` line per device to stdout. The indices +``<index>\\t<free_bytes>\\t<is_igpu>`` line per device to stdout. The indices are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES -expects), which need not match nvidia-smi order. +expects), which need not match nvidia-smi order. ``is_igpu`` is ``1`` for an +integrated GPU (shared system RAM) and ``0`` otherwise, taken from ggml's own +device type so the reader needn't guess from VRAM-vs-RAM ratios. Uses only the standard library so it stays runnable as a bare script without importing the backend package. @@ -15,6 +17,48 @@ import os import sys +# ggml_backend_dev_type enum (ggml-backend.h): CPU=0, GPU=1, IGPU=2, ... +_GGML_BACKEND_DEVICE_TYPE_IGPU = 2 + + +def _igpu_flags(base, lib, count: int) -> list[bool]: + """Per-device integrated-GPU flags via ggml's backend registry. + + The Vulkan reg enumerates devices in the same order as + ``ggml_backend_vk_get_device_memory`` (ggml-vulkan builds each device + context with ``ctx->device = i``), so reg index == device ordinal. + Returns all-False on any failure so the reader never over-caps a + discrete card just because the type couldn't be read.
+ """ + flags = [False] * count + try: + lib.ggml_backend_vk_reg.restype = ctypes.c_void_p + lib.ggml_backend_vk_reg.argtypes = [] + base.ggml_backend_reg_dev_count.restype = ctypes.c_size_t + base.ggml_backend_reg_dev_count.argtypes = [ctypes.c_void_p] + base.ggml_backend_reg_dev_get.restype = ctypes.c_void_p + base.ggml_backend_reg_dev_get.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + base.ggml_backend_dev_type.restype = ctypes.c_int + base.ggml_backend_dev_type.argtypes = [ctypes.c_void_p] + + reg = lib.ggml_backend_vk_reg() + if not reg: + return flags + dev_count = base.ggml_backend_reg_dev_count(reg) + for i in range(min(count, dev_count)): + dev = base.ggml_backend_reg_dev_get(reg, i) + if dev: + flags[i] = ( + base.ggml_backend_dev_type(dev) + == _GGML_BACKEND_DEVICE_TYPE_IGPU + ) + except Exception: + # iGPU detection is best-effort: any failure (missing symbol, + # registry call error) degrades to "discrete" so the memory + # readings still get through instead of crashing the probe. + pass + return flags + def main() -> int: if len(sys.argv) < 2: @@ -31,7 +75,7 @@ def main() -> int: base_name, vk_name = "libggml-base.so", "libggml-vulkan.so" try: - ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL) + base = ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL) lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL) except OSError as e: print(f"ggml-vulkan load failed: {e}", file = sys.stderr) @@ -46,13 +90,17 @@ def main() -> int: ctypes.POINTER(ctypes.c_size_t), ] + count = lib.ggml_backend_vk_get_device_count() + igpu = _igpu_flags(base, lib, count) rows = [] - for i in range(lib.ggml_backend_vk_get_device_count()): + for i in range(count): free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) + # total is a required out-param of the C call but unused: the reader + # leaves a flat per-device margin, not a fraction of total. 
lib.ggml_backend_vk_get_device_memory( i, ctypes.byref(free), ctypes.byref(total) ) - rows.append("%d\t%d\t%d" % (i, free.value, total.value)) + rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i]))) sys.stdout.write("\n".join(rows)) return 0 diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index a88204113a..2cda6f8e07 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -611,6 +611,29 @@ def _vulkan_lib_filename() -> str: return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so" +# Free system RAM to leave on an integrated GPU, mirroring llama.cpp's own +# auto-fit margin (llama-server --fit-target, default 1024 MiB per device). +# ggml reports an iGPU's "VRAM" as shared system RAM, so we hold back the same +# per-device margin --fit would rather than inventing a larger reserve. +_IGPU_HOST_RESERVE_MIB = 1024 + + +def _apply_igpu_host_reserve_mib(free_mib: int, is_igpu: bool) -> int: + """Reserve host headroom on an integrated (shared-memory) Vulkan GPU. + + ggml sums every memory heap for an integrated GPU (ggml-vulkan's + ggml_backend_vk_get_device_memory), so its reported free "VRAM" is really + free system RAM. Sizing context/offload against all of it would crowd out + the host and push it into swap or the OOM killer. We leave the same + per-device margin llama.cpp's --fit uses (``_IGPU_HOST_RESERVE_MIB``). + ``is_igpu`` comes straight from ggml's device type, so a discrete card is + never touched. Only ever reduces the budget. + """ + if not is_igpu: + return free_mib + return max(0, free_mib - _IGPU_HOST_RESERVE_MIB) + + def _llama_lib_dir(binary: str) -> Path: # The installer exposes llama-server as a top-level symlink # (~/.unsloth/llama.cpp/llama-server) into build/bin/, where the ggml @@ -1440,8 +1463,9 @@ def _get_gpu_free_memory_vulkan( instance is created in this process. 
Returns list of (device_index, free_mib) sorted by index, where the index is ggml's own Vulkan device ordinal (the space ``GGML_VK_VISIBLE_DEVICES`` - expects). Returns [] when no Vulkan build is installed or no device - is reachable. + expects). Integrated GPUs leave a per-device host-RAM margin (see + ``_apply_igpu_host_reserve_mib``). Returns [] when no Vulkan build is + installed or no device is reachable. """ binary = binary or LlamaCppBackend._find_llama_server_binary() if not binary: @@ -1451,6 +1475,12 @@ def _get_gpu_free_memory_vulkan( return [] env = child_env_without_native_path_secret() + # Enumerate ggml's canonical, full device list. An inherited + # GGML_VK_VISIBLE_DEVICES would renumber/restrict the ordinals, but + # load_model writes its own pin in that same full space, so letting + # the probe see a pre-existing mask would make the pin double-apply + # and target the wrong device. + env.pop("GGML_VK_VISIBLE_DEVICES", None) if sys.platform != "win32": # Let the loader resolve sibling ggml libs next to the binary. existing_ld = env.get("LD_LIBRARY_PATH", "") @@ -1484,9 +1514,17 @@ def _get_gpu_free_memory_vulkan( try: idx = int(parts[0]) free_mib = int(parts[1]) // (1024 * 1024) + is_igpu = parts[2] == "1" except ValueError: continue - gpus.append((idx, free_mib)) + capped = _apply_igpu_host_reserve_mib(free_mib, is_igpu) + if capped < free_mib: + logger.info( + f"Vulkan device VK{idx} is an integrated GPU sharing system " + f"RAM; reserving {free_mib - capped}MiB host headroom " + f"({free_mib}->{capped}MiB usable)" + ) + gpus.append((idx, capped)) gpus.sort(key = lambda g: g[0]) if gpus: logger.info( diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py new file mode 100644 index 0000000000..c641ad4bd0 --- /dev/null +++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. 
All rights reserved. See /studio/LICENSE.AGPL-3.0 + +"""Vulkan free-VRAM reader regression tests on a synthetic probe output. + +Covers the post-probe handling in +``LlamaCppBackend._get_gpu_free_memory_vulkan``: + + * integrated GPUs (probe reports is_igpu=1) leave a flat per-device host + margin matching llama.cpp's --fit-target, so context auto-sizing can't + over-commit shared RAM, + * discrete GPUs (is_igpu=0) are left untouched, + * an inherited ``GGML_VK_VISIBLE_DEVICES`` is stripped before probing so + enumeration stays in ggml's canonical full-device space. + +The ggml Vulkan library is never loaded: subprocess.run is mocked to emit +the tab-separated lines the real ``_vulkan_probe.py`` would print. +""" + +from __future__ import annotations + +import subprocess +import sys +import types as _types +from pathlib import Path +from unittest import mock + +import pytest + +_BACKEND_DIR = str(Path(__file__).resolve().parent.parent) +if _BACKEND_DIR not in sys.path: + sys.path.insert(0, _BACKEND_DIR) + +import importlib as _importlib # noqa: E402 + + +def _maybe_stub(name: str, builder): + try: + _importlib.import_module(name) + except ImportError: + sys.modules[name] = builder() + + +def _build_loggers_stub(): + m = _types.ModuleType("loggers") + m.get_logger = lambda name: __import__("logging").getLogger(name) + return m + + +_maybe_stub("loggers", _build_loggers_stub) +_maybe_stub("structlog", lambda: _types.ModuleType("structlog")) + +from core.inference import llama_cpp as _llama_mod # noqa: E402 +from core.inference.llama_cpp import LlamaCppBackend, _vulkan_lib_filename # noqa: E402 + +MIB = 1024 * 1024 +GIB = 1024 * MIB + + +def _make_vulkan_install(tmp_path: Path) -> str: + """A binary whose sibling dir holds the Vulkan ggml lib, so the + reader's ``is_vulkan_backend`` sibling-file check passes.""" + bindir = tmp_path / "build" / "bin" + bindir.mkdir(parents = True) + binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server") + 
binary.write_bytes(b"stub") + (bindir / _vulkan_lib_filename()).write_bytes(b"stub") + return str(binary) + + +def _mock_probe(rows: list[str], captured_env: dict | None = None): + """Patch subprocess.run so the _vulkan_probe.py call returns ``rows`` + (already tab-formatted), recording the env it was launched with.""" + real_run = subprocess.run + + def fake_run(cmd, *args, **kwargs): + if isinstance(cmd, list) and any("_vulkan_probe" in str(c) for c in cmd): + if captured_env is not None: + captured_env.clear() + captured_env.update(kwargs.get("env") or {}) + return subprocess.CompletedProcess( + args = cmd, returncode = 0, stdout = "\n".join(rows), stderr = "" + ) + return real_run(cmd, *args, **kwargs) + + return mock.patch("subprocess.run", side_effect = fake_run) + + +def _row(idx: int, free_bytes: int, is_igpu: int) -> str: + return f"{idx}\t{free_bytes}\t{is_igpu}" + + +def test_integrated_gpu_leaves_host_margin(tmp_path): + binary = _make_vulkan_install(tmp_path) + # iGPU with 30 GiB free; reserve a flat 1024 MiB (llama.cpp --fit-target). + rows = [_row(0, 30 * GIB, is_igpu = 1)] + with _mock_probe(rows): + gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert gpus == [(0, 30 * 1024 - 1024)], gpus + + +def test_discrete_gpu_free_is_untouched(tmp_path): + binary = _make_vulkan_install(tmp_path) + rows = [_row(0, 23 * GIB, is_igpu = 0)] + with _mock_probe(rows): + gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert gpus == [(0, 23 * 1024)], gpus + + +def test_large_discrete_gpu_is_untouched(tmp_path): + binary = _make_vulkan_install(tmp_path) + # A 48 GiB discrete card stays untouched regardless of size; only the + # iGPU flag triggers the host margin, never a VRAM/RAM ratio. 
+ rows = [_row(0, 47 * GIB, is_igpu = 0)] + with _mock_probe(rows): + gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert gpus == [(0, 47 * 1024)], gpus + + +def test_inherited_visible_devices_mask_is_stripped(tmp_path, monkeypatch): + binary = _make_vulkan_install(tmp_path) + monkeypatch.setenv("GGML_VK_VISIBLE_DEVICES", "1") + captured: dict = {} + rows = [_row(0, 23 * GIB, is_igpu = 0)] + with _mock_probe(rows, captured_env = captured): + LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert "GGML_VK_VISIBLE_DEVICES" not in captured, captured + + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__, "-v"])) From 31f4a36f2c5a5172b2083f3160e14c014766fc02 Mon Sep 17 00:00:00 2001 From: oobabooga <112222186+oobabooga@users.noreply.github.com> Date: Sun, 31 May 2026 14:40:00 -0700 Subject: [PATCH 11/13] Add a `UNSLOTH_FORCE_VULKAN` environment variable --- studio/install_llama_prebuilt.py | 54 +++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index 1ac41cc648..314d3eb978 100644 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -6558,6 +6558,37 @@ def validate_prebuilt_attempts( raise PrebuiltFallback("no prebuilt bundle passed validation") +def force_vulkan_requested() -> bool: + """Whether UNSLOTH_FORCE_VULKAN opts this host into the Vulkan llama.cpp + prebuilt instead of its detected CUDA/ROCm backend -- e.g. so an AMD user + can run the Vulkan build for inference. Scoped to the llama.cpp backend: + the torch/training stack is installed separately and still sees the real + GPU. + """ + return os.environ.get("UNSLOTH_FORCE_VULKAN", "").strip().lower() in ( + "1", + "true", + "yes", + ) + + +def _vulkan_only_host(host: HostInfo) -> HostInfo: + """Rewrite ``host`` so the asset selectors take their Vulkan branch. 
+ + That branch fires on ``has_intel_gpu and not nvidia and not rocm``, so the + CUDA/ROCm flags are cleared and the integrated-GPU flag is raised. The + synthetic integrated-GPU flag never leaves install planning -- it only + routes the llama.cpp prebuilt choice, not the torch/training stack. + """ + return dataclasses_replace( + host, + has_usable_nvidia = False, + has_physical_nvidia = False, + has_rocm = False, + has_intel_gpu = True, + ) + + def install_prebuilt( install_dir: Path, llama_tag: str, @@ -6570,6 +6601,24 @@ def install_prebuilt( host = detect_host() if override_has_rocm and not host.has_rocm: host = dataclasses_replace(host, has_rocm = True) + # UNSLOTH_FORCE_VULKAN installs the upstream ggml-org Vulkan prebuilt + # instead of the detected CUDA/ROCm backend. The unsloth published repo + # ships only CUDA/ROCm assets, hence UPSTREAM_REPO. + force_vulkan = False + if force_vulkan_requested(): + if host.is_macos: + log( + "UNSLOTH_FORCE_VULKAN is set but ignored on macOS " + "(Metal is used; there is no Vulkan prebuilt)" + ) + else: + log( + "UNSLOTH_FORCE_VULKAN is set; installing the upstream Vulkan " + "llama.cpp prebuilt instead of the detected GPU backend" + ) + host = _vulkan_only_host(host) + published_repo = UPSTREAM_REPO + force_vulkan = True choice: AssetChoice | None = None try: with install_lock(install_lock_path(install_dir)): @@ -6581,7 +6630,10 @@ def install_prebuilt( log( f"no existing llama.cpp install detected at {install_dir}; performing fresh prebuilt install" ) - if simple_policy: + if simple_policy or force_vulkan: + # The simple planner is the one that routes a non-unsloth repo + # (here UPSTREAM_REPO) through direct_upstream_release_plan, + # which carries the Vulkan asset branch. 
requested_tag, release_plans = resolve_simple_install_release_plans( llama_tag, host, From c3482d406bfea6759afb1a5f20f6a2bb65deebc0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 31 May 2026 21:40:35 +0000 Subject: [PATCH 12/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/_vulkan_probe.py | 3 +-- studio/backend/tests/test_llama_cpp_vulkan_probe.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py index dcbbe9096b..23b91c599c 100644 --- a/studio/backend/core/inference/_vulkan_probe.py +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -49,8 +49,7 @@ def _igpu_flags(base, lib, count: int) -> list[bool]: dev = base.ggml_backend_reg_dev_get(reg, i) if dev: flags[i] = ( - base.ggml_backend_dev_type(dev) - == _GGML_BACKEND_DEVICE_TYPE_IGPU + base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU ) except Exception: # iGPU detection is best-effort: any failure (missing symbol, diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py index c641ad4bd0..cd92cf21ab 100644 --- a/studio/backend/tests/test_llama_cpp_vulkan_probe.py +++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py @@ -62,7 +62,9 @@ def _make_vulkan_install(tmp_path: Path) -> str: reader's ``is_vulkan_backend`` sibling-file check passes.""" bindir = tmp_path / "build" / "bin" bindir.mkdir(parents = True) - binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server") + binary = bindir / ( + "llama-server.exe" if sys.platform == "win32" else "llama-server" + ) binary.write_bytes(b"stub") (bindir / _vulkan_lib_filename()).write_bytes(b"stub") return str(binary) From 7563d9143eaafbf5b042be3b2b663595f82db979 Mon Sep 17 00:00:00 2001 
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 Jun 2026 03:30:16 +0000 Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/_vulkan_probe.py | 8 ++------ studio/backend/core/inference/llama_cpp.py | 5 +---- studio/backend/tests/test_llama_cpp_vulkan_probe.py | 4 +--- studio/install_llama_prebuilt.py | 12 +++--------- 4 files changed, 7 insertions(+), 22 deletions(-) diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py index 23b91c599c..f2e225f4c4 100644 --- a/studio/backend/core/inference/_vulkan_probe.py +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -48,9 +48,7 @@ def _igpu_flags(base, lib, count: int) -> list[bool]: for i in range(min(count, dev_count)): dev = base.ggml_backend_reg_dev_get(reg, i) if dev: - flags[i] = ( - base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU - ) + flags[i] = base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU except Exception: # iGPU detection is best-effort: any failure (missing symbol, # registry call error) degrades to "discrete" so the memory @@ -96,9 +94,7 @@ def main() -> int: free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) # total is a required out-param of the C call but unused: the reader # leaves a flat per-device margin, not a fraction of total. 
- lib.ggml_backend_vk_get_device_memory( - i, ctypes.byref(free), ctypes.byref(total) - ) + lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total)) rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i]))) sys.stdout.write("\n".join(rows)) return 0 diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 2ab54126a7..fd58532f73 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1431,9 +1431,7 @@ def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]: return [] @staticmethod - def _get_gpu_free_memory_vulkan( - binary: Optional[str] = None, - ) -> list[tuple[int, int]]: + def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]: """Query free VRAM per device via the bundled ggml Vulkan backend. Loads ``libggml-vulkan`` in a short-lived subprocess and calls @@ -3446,7 +3444,6 @@ def load_model( env["CUDA_VISIBLE_DEVICES"] = pinned try: import torch as _torch - if getattr(_torch.version, "hip", None) is not None: env["HIP_VISIBLE_DEVICES"] = pinned env["ROCR_VISIBLE_DEVICES"] = pinned diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py index cd92cf21ab..c641ad4bd0 100644 --- a/studio/backend/tests/test_llama_cpp_vulkan_probe.py +++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py @@ -62,9 +62,7 @@ def _make_vulkan_install(tmp_path: Path) -> str: reader's ``is_vulkan_backend`` sibling-file check passes.""" bindir = tmp_path / "build" / "bin" bindir.mkdir(parents = True) - binary = bindir / ( - "llama-server.exe" if sys.platform == "win32" else "llama-server" - ) + binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server") binary.write_bytes(b"stub") (bindir / _vulkan_lib_filename()).write_bytes(b"stub") return str(binary) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py 
index 2d97f11cdf..d662ec697e 100644 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -3926,9 +3926,7 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz" if vulkan_name in upstream_assets: - log( - f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}" - ) + log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}") return AssetChoice( repo = UPSTREAM_REPO, tag = llama_tag, @@ -3937,9 +3935,7 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice source_label = "upstream", install_kind = "linux-vulkan", ) - log( - "Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU" - ) + log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU") upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: @@ -3996,9 +3992,7 @@ def resolve_upstream_asset_choice(host: HostInfo, llama_tag: str) -> AssetChoice source_label = "upstream", install_kind = "windows-vulkan", ) - log( - "Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU" - ) + log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU") upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: