diff --git a/studio/backend/core/inference/_vulkan_probe.py b/studio/backend/core/inference/_vulkan_probe.py new file mode 100644 index 0000000000..f2e225f4c4 --- /dev/null +++ b/studio/backend/core/inference/_vulkan_probe.py @@ -0,0 +1,104 @@ +"""Standalone free-VRAM probe for the bundled ggml Vulkan backend. + +Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the +Vulkan instance never lives in the long-running backend process. Loads the +bundled ggml Vulkan backend from ``<bindir>`` and prints one +``<index>\\t<free_bytes>\\t<is_igpu>`` line per device to stdout. The indices +are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES +expects), which need not match nvidia-smi order. ``is_igpu`` is ``1`` for an +integrated GPU (shared system RAM) and ``0`` otherwise, taken from ggml's own +device type so the reader needn't guess from VRAM-vs-RAM ratios. + +Uses only the standard library so it stays runnable as a bare script without +importing the backend package. +""" + +import ctypes +import os +import sys + +# ggml_backend_dev_type enum (ggml-backend.h): CPU=0, GPU=1, IGPU=2, ... +_GGML_BACKEND_DEVICE_TYPE_IGPU = 2 + + +def _igpu_flags(base, lib, count: int) -> list[bool]: + """Per-device integrated-GPU flags via ggml's backend registry. + + The Vulkan reg enumerates devices in the same order as + ``ggml_backend_vk_get_device_memory`` (ggml-vulkan builds each device + context with ``ctx->device = i``), so reg index == device ordinal. + Returns all-False on any failure so the reader never over-caps a + discrete card just because the type couldn't be read. 
+ """ + flags = [False] * count + try: + lib.ggml_backend_vk_reg.restype = ctypes.c_void_p + lib.ggml_backend_vk_reg.argtypes = [] + base.ggml_backend_reg_dev_count.restype = ctypes.c_size_t + base.ggml_backend_reg_dev_count.argtypes = [ctypes.c_void_p] + base.ggml_backend_reg_dev_get.restype = ctypes.c_void_p + base.ggml_backend_reg_dev_get.argtypes = [ctypes.c_void_p, ctypes.c_size_t] + base.ggml_backend_dev_type.restype = ctypes.c_int + base.ggml_backend_dev_type.argtypes = [ctypes.c_void_p] + + reg = lib.ggml_backend_vk_reg() + if not reg: + return flags + dev_count = base.ggml_backend_reg_dev_count(reg) + for i in range(min(count, dev_count)): + dev = base.ggml_backend_reg_dev_get(reg, i) + if dev: + flags[i] = base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU + except Exception: + # iGPU detection is best-effort: any failure (missing symbol, + # registry call error) degrades to "discrete" so the memory + # readings still get through instead of crashing the probe. + pass + return flags + + +def main() -> int: + if len(sys.argv) < 2: + return 0 + bindir = sys.argv[1] + + if sys.platform == "win32": + base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll" + try: + os.add_dll_directory(bindir) + except Exception: + pass + else: + base_name, vk_name = "libggml-base.so", "libggml-vulkan.so" + + try: + base = ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL) + lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL) + except OSError as e: + print(f"ggml-vulkan load failed: {e}", file = sys.stderr) + return 1 + + lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int + lib.ggml_backend_vk_get_device_count.argtypes = [] + lib.ggml_backend_vk_get_device_memory.restype = None + lib.ggml_backend_vk_get_device_memory.argtypes = [ + ctypes.c_int, + ctypes.POINTER(ctypes.c_size_t), + ctypes.POINTER(ctypes.c_size_t), + ] + + count = lib.ggml_backend_vk_get_device_count() + igpu = _igpu_flags(base, lib, count) + 
rows = [] + for i in range(count): + free, total = ctypes.c_size_t(0), ctypes.c_size_t(0) + # total is a required out-param of the C call but unused: the reader + # leaves a flat per-device margin, not a fraction of total. + lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total)) + rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i]))) + sys.stdout.write("\n".join(rows)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index b83e4e6961..ec9f18f9c9 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -658,6 +658,42 @@ def _backfill_usage_from_timings(usage, timings): return out +def _vulkan_lib_filename() -> str: + return "ggml-vulkan.dll" if sys.platform == "win32" else "libggml-vulkan.so" + + +# Free system RAM to leave on an integrated GPU, mirroring llama.cpp's own +# auto-fit margin (llama-server --fit-target, default 1024 MiB per device). +# ggml reports an iGPU's "VRAM" as shared system RAM, so we hold back the same +# per-device margin --fit would rather than inventing a larger reserve. +_IGPU_HOST_RESERVE_MIB = 1024 + + +def _apply_igpu_host_reserve_mib(free_mib: int, is_igpu: bool) -> int: + """Reserve host headroom on an integrated (shared-memory) Vulkan GPU. + + ggml sums every memory heap for an integrated GPU (ggml-vulkan's + ggml_backend_vk_get_device_memory), so its reported free "VRAM" is really + free system RAM. Sizing context/offload against all of it would crowd out + the host and push it into swap or the OOM killer. We leave the same + per-device margin llama.cpp's --fit uses (``_IGPU_HOST_RESERVE_MIB``). + ``is_igpu`` comes straight from ggml's device type, so a discrete card is + never touched. Only ever reduces the budget. 
+ """ + if not is_igpu: + return free_mib + return max(0, free_mib - _IGPU_HOST_RESERVE_MIB) + + +def _llama_lib_dir(binary: str) -> Path: + # The installer exposes llama-server as a top-level symlink + # (~/.unsloth/llama.cpp/llama-server) into build/bin/, where the ggml + # backend libs actually live. Resolve it so callers looking for sibling + # libs (Vulkan detection, LD_LIBRARY_PATH, the probe's bindir) hit the real + # directory instead of the symlink's parent. + return Path(binary).resolve().parent + + class LlamaCppBackend: """Manages a llama-server subprocess for GGUF model inference. @@ -1297,6 +1333,20 @@ def _get_gguf_size_bytes(model_path: str) -> int: return total + @staticmethod + def _is_vulkan_backend(binary: Optional[str] = None) -> bool: + """True if the installed llama.cpp build is the Vulkan one. + + Builds are single-backend, so the presence of the Vulkan ggml + backend library next to llama-server is sufficient. Used to keep + the free-memory probe and the GPU pin in the same device-index + space (ggml's Vulkan ordinals, not nvidia-smi order). + """ + binary = binary or LlamaCppBackend._find_llama_server_binary() + if not binary: + return False + return (_llama_lib_dir(binary) / _vulkan_lib_filename()).is_file() + @staticmethod def _amd_apu_wants_unified_memory() -> bool: """True only for AMD unified-memory APUs (gfx1150/gfx1151), where @@ -1322,7 +1372,24 @@ def _amd_apu_wants_unified_memory() -> bool: return False @staticmethod - def _get_gpu_free_memory() -> list[tuple[int, int]]: + def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]: + """Query free memory per GPU across all supported backends. + + On a Vulkan build, the ggml Vulkan probe is authoritative so the + returned indices are Vulkan ordinals (the space the GPU pin writes + to ``GGML_VK_VISIBLE_DEVICES``). Otherwise ``nvidia-smi`` / torch + cover NVIDIA + AMD ROCm. + + Returns list of (gpu_index, free_mib) sorted by index. 
Empty + list if no supported GPU is reachable. + """ + binary = binary or LlamaCppBackend._find_llama_server_binary() + if LlamaCppBackend._is_vulkan_backend(binary): + return LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + return LlamaCppBackend._get_gpu_free_memory_nvidia_torch() + + @staticmethod + def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]: """Query free memory per GPU. Order: @@ -1428,6 +1495,85 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: logger.debug(f"torch GPU probe failed: {e}") return [] + @staticmethod + def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]: + """Query free VRAM per device via the bundled ggml Vulkan backend. + + Loads ``libggml-vulkan`` in a short-lived subprocess and calls + ``ggml_backend_vk_get_device_memory`` for each device, so no Vulkan + instance is created in this process. Returns list of + (device_index, free_mib) sorted by index, where the index is ggml's + own Vulkan device ordinal (the space ``GGML_VK_VISIBLE_DEVICES`` + expects). Integrated GPUs leave a per-device host-RAM margin (see + ``_apply_igpu_host_reserve_mib``). Returns [] when no Vulkan build is + installed or no device is reachable. + """ + binary = binary or LlamaCppBackend._find_llama_server_binary() + if not binary: + return [] + binary_dir = _llama_lib_dir(binary) + if not (binary_dir / _vulkan_lib_filename()).is_file(): + return [] + + env = child_env_without_native_path_secret() + # Enumerate ggml's canonical, full device list. An inherited + # GGML_VK_VISIBLE_DEVICES would renumber/restrict the ordinals, but + # load_model writes its own pin in that same full space, so letting + # the probe see a pre-existing mask would make the pin double-apply + # and target the wrong device. + env.pop("GGML_VK_VISIBLE_DEVICES", None) + if sys.platform != "win32": + # Let the loader resolve sibling ggml libs next to the binary. 
+ existing_ld = env.get("LD_LIBRARY_PATH", "") + env["LD_LIBRARY_PATH"] = ( + f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir) + ) + probe_script = Path(__file__).with_name("_vulkan_probe.py") + try: + result = subprocess.run( + [sys.executable, str(probe_script), str(binary_dir)], + capture_output = True, + text = True, + timeout = 15, + env = env, + **_windows_hidden_subprocess_kwargs(), + ) + if result.returncode != 0: + logger.debug( + f"vulkan GPU probe exited {result.returncode}: {result.stderr.strip()}" + ) + return [] + except Exception as e: + logger.debug(f"vulkan GPU probe failed: {e}") + return [] + + gpus: list[tuple[int, int]] = [] + for line in result.stdout.strip().splitlines(): + parts = line.split("\t") + if len(parts) != 3: + continue + try: + idx = int(parts[0]) + free_mib = int(parts[1]) // (1024 * 1024) + is_igpu = parts[2] == "1" + except ValueError: + continue + capped = _apply_igpu_host_reserve_mib(free_mib, is_igpu) + if capped < free_mib: + logger.info( + f"Vulkan device VK{idx} is an integrated GPU sharing system " + f"RAM; reserving {free_mib - capped}MiB host headroom " + f"({free_mib}->{capped}MiB usable)" + ) + gpus.append((idx, capped)) + gpus.sort(key = lambda g: g[0]) + if gpus: + logger.info( + "Vulkan GPU memory detected: " + + ", ".join(f"VK{idx}={free}MiB" for idx, free in gpus) + ) + return gpus + # Skip the wait when the last kill is older than this; the driver has # already reclaimed the prior process's allocations. _VRAM_SETTLE_WINDOW_S: float = 15.0 @@ -2915,6 +3061,7 @@ def load_model( "Run setup.sh to build it, install llama.cpp, " "or set LLAMA_SERVER_PATH environment variable." 
) + is_vulkan_backend = self._is_vulkan_backend(binary) # ── Phase 2: download (NO lock held, so cancel can proceed) ── # mtp_draft_path arrives set for local Gemma loads (detected @@ -3000,7 +3147,7 @@ def load_model( gpus: list[tuple[int, int]] = [] try: model_size = self._get_gguf_size_bytes(model_path) - gpus = self._get_gpu_free_memory() + gpus = self._get_gpu_free_memory(binary) # Resolve effective context: 0 means let llama-server use # the model's native length. Only expand to a known native @@ -3387,7 +3534,7 @@ def load_model( import sys env = child_env_without_native_path_secret() - binary_dir = str(Path(binary).parent) + binary_dir = str(_llama_lib_dir(binary)) # AMD unified-memory APUs (gfx1150/gfx1151): let llama.cpp use # shared system RAM. setdefault so a user value wins. @@ -3482,26 +3629,32 @@ def load_model( # set, so set HIP_VISIBLE_DEVICES too. if gpu_indices is not None: pinned = ",".join(str(i) for i in gpu_indices) - env["CUDA_VISIBLE_DEVICES"] = pinned - try: - import torch as _torch - if getattr(_torch.version, "hip", None) is not None: - env["HIP_VISIBLE_DEVICES"] = pinned - # Do NOT also set ROCR_VISIBLE_DEVICES to the same - # value. ROCR_VISIBLE_DEVICES filters at the HSA/ROCr - # layer and HIP_VISIBLE_DEVICES at the HIP layer, so - # setting both with the same physical indices applies - # the mask twice: ROCR reduces the visible set and - # re-indexes it from 0, then HIP indexes into the - # already-reduced set. A single non-zero pin (e.g. - # "1") then points out of range at the HIP layer, HIP - # enumerates 0 devices, and llama.cpp falls back to - # CPU ("ggml_cuda_init: no ROCm-capable device is - # detected"). The HIP mask alone narrows correctly; - # clear any inherited ROCR mask so it can't double up. 
- env.pop("ROCR_VISIBLE_DEVICES", None) - except Exception as e: - logger.debug("Failed to set ROCm visibility env vars for child: %s", e) + if is_vulkan_backend: + # gpu_indices are ggml Vulkan ordinals (see + # _get_gpu_free_memory); the Vulkan backend ignores + # CUDA_VISIBLE_DEVICES, so pin via its own mask. + env["GGML_VK_VISIBLE_DEVICES"] = pinned + else: + env["CUDA_VISIBLE_DEVICES"] = pinned + try: + import torch as _torch + if getattr(_torch.version, "hip", None) is not None: + env["HIP_VISIBLE_DEVICES"] = pinned + # Do NOT also set ROCR_VISIBLE_DEVICES to the same + # value. ROCR_VISIBLE_DEVICES filters at the HSA/ROCr + # layer and HIP_VISIBLE_DEVICES at the HIP layer, so + # setting both with the same physical indices applies + # the mask twice: ROCR reduces the visible set and + # re-indexes it from 0, then HIP indexes into the + # already-reduced set. A single non-zero pin (e.g. + # "1") then points out of range at the HIP layer, HIP + # enumerates 0 devices, and llama.cpp falls back to + # CPU ("ggml_cuda_init: no ROCm-capable device is + # detected"). The HIP mask alone narrows correctly; + # clear any inherited ROCR mask so it can't double up. + env.pop("ROCR_VISIBLE_DEVICES", None) + except Exception as e: + logger.debug("Failed to set ROCm visibility env vars for child: %s", e) # Captured before any text-only fallback strips it from cmd. launched_with_mmproj = "--mmproj" in cmd diff --git a/studio/backend/tests/test_llama_cpp_vulkan_probe.py b/studio/backend/tests/test_llama_cpp_vulkan_probe.py new file mode 100644 index 0000000000..c641ad4bd0 --- /dev/null +++ b/studio/backend/tests/test_llama_cpp_vulkan_probe.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: AGPL-3.0-only +# Copyright 2026-present the Unsloth AI Inc. team. All rights reserved. See /studio/LICENSE.AGPL-3.0 + +"""Vulkan free-VRAM reader regression tests on a synthetic probe output. 
+ +Covers the post-probe handling in +``LlamaCppBackend._get_gpu_free_memory_vulkan``: + + * integrated GPUs (probe reports is_igpu=1) leave a flat per-device host + margin matching llama.cpp's --fit-target, so context auto-sizing can't + over-commit shared RAM, + * discrete GPUs (is_igpu=0) are left untouched, + * an inherited ``GGML_VK_VISIBLE_DEVICES`` is stripped before probing so + enumeration stays in ggml's canonical full-device space. + +The ggml Vulkan library is never loaded: subprocess.run is mocked to emit +the tab-separated lines the real ``_vulkan_probe.py`` would print. +""" + +from __future__ import annotations + +import subprocess +import sys +import types as _types +from pathlib import Path +from unittest import mock + +import pytest + +_BACKEND_DIR = str(Path(__file__).resolve().parent.parent) +if _BACKEND_DIR not in sys.path: + sys.path.insert(0, _BACKEND_DIR) + +import importlib as _importlib # noqa: E402 + + +def _maybe_stub(name: str, builder): + try: + _importlib.import_module(name) + except ImportError: + sys.modules[name] = builder() + + +def _build_loggers_stub(): + m = _types.ModuleType("loggers") + m.get_logger = lambda name: __import__("logging").getLogger(name) + return m + + +_maybe_stub("loggers", _build_loggers_stub) +_maybe_stub("structlog", lambda: _types.ModuleType("structlog")) + +from core.inference import llama_cpp as _llama_mod # noqa: E402 +from core.inference.llama_cpp import LlamaCppBackend, _vulkan_lib_filename # noqa: E402 + +MIB = 1024 * 1024 +GIB = 1024 * MIB + + +def _make_vulkan_install(tmp_path: Path) -> str: + """A binary whose sibling dir holds the Vulkan ggml lib, so the + reader's ``is_vulkan_backend`` sibling-file check passes.""" + bindir = tmp_path / "build" / "bin" + bindir.mkdir(parents = True) + binary = bindir / ("llama-server.exe" if sys.platform == "win32" else "llama-server") + binary.write_bytes(b"stub") + (bindir / _vulkan_lib_filename()).write_bytes(b"stub") + return str(binary) + + +def 
_mock_probe(rows: list[str], captured_env: dict | None = None): + """Patch subprocess.run so the _vulkan_probe.py call returns ``rows`` + (already tab-formatted), recording the env it was launched with.""" + real_run = subprocess.run + + def fake_run(cmd, *args, **kwargs): + if isinstance(cmd, list) and any("_vulkan_probe" in str(c) for c in cmd): + if captured_env is not None: + captured_env.clear() + captured_env.update(kwargs.get("env") or {}) + return subprocess.CompletedProcess( + args = cmd, returncode = 0, stdout = "\n".join(rows), stderr = "" + ) + return real_run(cmd, *args, **kwargs) + + return mock.patch("subprocess.run", side_effect = fake_run) + + +def _row(idx: int, free_bytes: int, is_igpu: int) -> str: + return f"{idx}\t{free_bytes}\t{is_igpu}" + + +def test_integrated_gpu_leaves_host_margin(tmp_path): + binary = _make_vulkan_install(tmp_path) + # iGPU with 30 GiB free; reserve a flat 1024 MiB (llama.cpp --fit-target). + rows = [_row(0, 30 * GIB, is_igpu = 1)] + with _mock_probe(rows): + gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert gpus == [(0, 30 * 1024 - 1024)], gpus + + +def test_discrete_gpu_free_is_untouched(tmp_path): + binary = _make_vulkan_install(tmp_path) + rows = [_row(0, 23 * GIB, is_igpu = 0)] + with _mock_probe(rows): + gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert gpus == [(0, 23 * 1024)], gpus + + +def test_large_discrete_gpu_is_untouched(tmp_path): + binary = _make_vulkan_install(tmp_path) + # A 48 GiB discrete card stays untouched regardless of size; only the + # iGPU flag triggers the host margin, never a VRAM/RAM ratio. 
+ rows = [_row(0, 47 * GIB, is_igpu = 0)] + with _mock_probe(rows): + gpus = LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert gpus == [(0, 47 * 1024)], gpus + + +def test_inherited_visible_devices_mask_is_stripped(tmp_path, monkeypatch): + binary = _make_vulkan_install(tmp_path) + monkeypatch.setenv("GGML_VK_VISIBLE_DEVICES", "1") + captured: dict = {} + rows = [_row(0, 23 * GIB, is_igpu = 0)] + with _mock_probe(rows, captured_env = captured): + LlamaCppBackend._get_gpu_free_memory_vulkan(binary) + assert "GGML_VK_VISIBLE_DEVICES" not in captured, captured + + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__, "-v"])) diff --git a/studio/install_llama_prebuilt.py b/studio/install_llama_prebuilt.py index bd8c90dc9c..70ff052c09 100644 --- a/studio/install_llama_prebuilt.py +++ b/studio/install_llama_prebuilt.py @@ -10,6 +10,7 @@ import errno import fnmatch import functools +import glob import hashlib import json import os @@ -262,6 +263,7 @@ class HostInfo: has_physical_nvidia: bool has_usable_nvidia: bool has_rocm: bool = False + has_intel_gpu: bool = False rocm_gfx_target: str | None = None # (major, minor) from platform.mac_ver(); None off macOS or if unparseable. # Skips a macos prebuilt whose minimum-OS exceeds this host. @@ -1495,6 +1497,21 @@ def direct_upstream_release_plan( install_kind = "windows-hip", ) ) + # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt. 
+ elif host.has_intel_gpu: + vulkan_asset = f"llama-{release_tag}-bin-win-vulkan-x64.zip" + vulkan_url = assets.get(vulkan_asset) + if vulkan_url: + attempts.append( + AssetChoice( + repo = repo, + tag = release_tag, + name = vulkan_asset, + url = vulkan_url, + source_label = "upstream", + install_kind = "windows-vulkan", + ) + ) cpu_asset = f"llama-{release_tag}-bin-win-cpu-x64.zip" cpu_url = assets.get(cpu_asset) if cpu_url: @@ -1555,6 +1572,21 @@ def direct_upstream_release_plan( ) ) elif host.is_linux and host.is_x86_64 and not host.has_usable_nvidia: + # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt. + if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_asset = f"llama-{release_tag}-bin-ubuntu-vulkan-x64.tar.gz" + vulkan_url = assets.get(vulkan_asset) + if vulkan_url: + attempts.append( + AssetChoice( + repo = repo, + tag = release_tag, + name = vulkan_asset, + url = vulkan_url, + source_label = "upstream", + install_kind = "linux-vulkan", + ) + ) if host.has_rocm: # Lemonade first, mirroring the Windows ROCm branch above, so a # ROCm host routed to ggml-org does not silently get the CPU build. @@ -3062,6 +3094,41 @@ def _resolve_exe(name: str) -> str | None: # Note: amdhip64.dll presence alone is NOT treated as GPU evidence # since the HIP SDK can be installed without an AMD GPU. + # Detect an Intel GPU; gates the Vulkan prebuilt. Linux reads the DRM + # sysfs vendor id (0x8086); Windows queries the WMI video controller list. + # Only probed when there is no usable NVIDIA and no ROCm GPU, since the + # Vulkan selection branches are gated the same way -- this keeps the probe + # (notably the Windows powershell call) off the NVIDIA/AMD path. 
+ has_intel_gpu = False + if not has_usable_nvidia and not has_rocm: + if is_linux: + for _vendor_file in glob.glob("/sys/class/drm/card*/device/vendor"): + try: + with open(_vendor_file) as _vf: + if _vf.read().strip().lower() == "0x8086": + has_intel_gpu = True + break + except OSError: + continue + elif is_windows: + _ps = shutil.which("powershell") or shutil.which("pwsh") + if _ps: + try: + _result = run_capture( + [ + _ps, + "-NoProfile", + "-Command", + "Get-CimInstance Win32_VideoController | " + "Select-Object -ExpandProperty Name", + ], + timeout = 15, + ) + if _result.returncode == 0 and "intel" in _result.stdout.lower(): + has_intel_gpu = True + except Exception: + pass + return HostInfo( system = system, machine = machine, @@ -3077,6 +3144,7 @@ def _resolve_exe(name: str) -> str | None: has_physical_nvidia = has_physical_nvidia, has_usable_nvidia = has_usable_nvidia, has_rocm = has_rocm, + has_intel_gpu = has_intel_gpu, rocm_gfx_target = rocm_gfx_target, macos_version = macos_version, ) @@ -4140,6 +4208,21 @@ def resolve_upstream_asset_choice( "falling back to source build with HIP support" ) + # Intel (or other non-NVIDIA/non-AMD) GPU: use the Vulkan prebuilt. 
+ if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_name = f"llama-{llama_tag}-bin-ubuntu-vulkan-x64.tar.gz" + if vulkan_name in upstream_assets: + log(f"Intel GPU detected -- using upstream Vulkan prebuilt {vulkan_name}") + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = vulkan_name, + url = upstream_assets[vulkan_name], + source_label = "upstream", + install_kind = "linux-vulkan", + ) + log("Intel GPU detected but no Vulkan prebuilt found -- falling back to CPU") + upstream_name = f"llama-{llama_tag}-bin-ubuntu-x64.tar.gz" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Linux CPU asset was not found") @@ -4180,6 +4263,23 @@ def resolve_upstream_asset_choice( ) log("AMD ROCm detected on Windows but no HIP prebuilt found -- falling back to CPU") + # Intel (or other non-NVIDIA/non-AMD) GPU on Windows: use Vulkan. + if host.has_intel_gpu and not host.has_usable_nvidia and not host.has_rocm: + vulkan_name = f"llama-{llama_tag}-bin-win-vulkan-x64.zip" + if vulkan_name in upstream_assets: + log( + f"Intel GPU detected on Windows -- using upstream Vulkan prebuilt {vulkan_name}" + ) + return AssetChoice( + repo = UPSTREAM_REPO, + tag = llama_tag, + name = vulkan_name, + url = upstream_assets[vulkan_name], + source_label = "upstream", + install_kind = "windows-vulkan", + ) + log("Intel GPU detected on Windows but no Vulkan prebuilt found -- falling back to CPU") + upstream_name = f"llama-{llama_tag}-bin-win-cpu-x64.zip" if upstream_name not in upstream_assets: raise PrebuiltFallback("upstream Windows CPU asset was not found") @@ -4692,6 +4792,7 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: "linux-arm64-cuda", "linux-rocm", "linux-arm64", + "linux-vulkan", }: return ["llama-server", "llama-quantize", "lib*.so*"] if choice.install_kind in {"macos-arm64", "macos-x64"}: @@ -4700,6 +4801,7 @@ def runtime_patterns_for_choice(choice: AssetChoice) -> list[str]: 
"windows-cpu", "windows-cuda", "windows-hip", + "windows-vulkan", "windows-rocm", "windows-arm64", }: @@ -5743,8 +5845,10 @@ def validate_server( "linux-cuda", "linux-arm64-cuda", "linux-rocm", + "linux-vulkan", "windows-cuda", "windows-hip", + "windows-vulkan", "windows-rocm", "macos-arm64", } @@ -6347,6 +6451,16 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: ["libmtmd.so*"], ["libggml-hip.so*"], ] + if choice.install_kind == "linux-vulkan": + return [ + ["libllama-common.so*"], + ["libllama.so*"], + ["libggml.so*"], + ["libggml-base.so*"], + ["libggml-cpu-*.so*"], + ["libmtmd.so*"], + ["libggml-vulkan.so*"], + ] if choice.install_kind in {"windows-cpu", "windows-arm64"}: return [["llama.dll"]] if choice.install_kind == "windows-cuda": @@ -6366,6 +6480,8 @@ def runtime_payload_health_groups(choice: AssetChoice) -> list[list[str]]: return groups if choice.install_kind in {"windows-hip", "windows-rocm"}: return [["llama.dll"], ["*hip*.dll"]] + if choice.install_kind == "windows-vulkan": + return [["llama.dll"], ["ggml-vulkan.dll"]] return [] @@ -6637,6 +6753,37 @@ def validate_prebuilt_attempts( raise PrebuiltFallback("no prebuilt bundle passed validation") +def force_vulkan_requested() -> bool: + """Whether UNSLOTH_FORCE_VULKAN opts this host into the Vulkan llama.cpp + prebuilt instead of its detected CUDA/ROCm backend -- e.g. so an AMD user + can run the Vulkan build for inference. Scoped to the llama.cpp backend: + the torch/training stack is installed separately and still sees the real + GPU. + """ + return os.environ.get("UNSLOTH_FORCE_VULKAN", "").strip().lower() in ( + "1", + "true", + "yes", + ) + + +def _vulkan_only_host(host: HostInfo) -> HostInfo: + """Rewrite ``host`` so the asset selectors take their Vulkan branch. + + That branch fires on ``has_intel_gpu and not nvidia and not rocm``, so the + CUDA/ROCm flags are cleared and the integrated-GPU flag is raised. 
The + synthetic integrated-GPU flag never leaves install planning -- it only + routes the llama.cpp prebuilt choice, not the torch/training stack. + """ + return dataclasses_replace( + host, + has_usable_nvidia = False, + has_physical_nvidia = False, + has_rocm = False, + has_intel_gpu = True, + ) + + def install_prebuilt( install_dir: Path, llama_tag: str, @@ -6654,6 +6801,22 @@ def install_prebuilt( override_rocm_gfx = override_rocm_gfx, force_cpu = force_cpu, ) + # UNSLOTH_FORCE_VULKAN installs the upstream ggml-org Vulkan prebuilt + # instead of the detected CUDA/ROCm backend. The unsloth published repo + # ships only CUDA/ROCm assets, hence UPSTREAM_REPO. + if force_vulkan_requested(): + if host.is_macos: + log( + "UNSLOTH_FORCE_VULKAN is set but ignored on macOS " + "(Metal is used; there is no Vulkan prebuilt)" + ) + else: + log( + "UNSLOTH_FORCE_VULKAN is set; installing the upstream Vulkan " + "llama.cpp prebuilt instead of the detected GPU backend" + ) + host = _vulkan_only_host(host) + published_repo = UPSTREAM_REPO choice: AssetChoice | None = None try: with install_lock(install_lock_path(install_dir)): @@ -6666,7 +6829,10 @@ def install_prebuilt( f"no existing llama.cpp install detected at {install_dir}; performing fresh prebuilt install" ) # Single resolver: linux-x64 takes the fast filename path internally, - # every other fork host reads the manifest. + # every other fork host reads the manifest. A forced-Vulkan host has + # already had published_repo pointed at UPSTREAM_REPO above, so the + # simple planner routes through direct_upstream_release_plan and + # carries the Vulkan asset branch. requested_tag, release_plans = resolve_simple_install_release_plans( llama_tag, host,