Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 104 additions & 0 deletions studio/backend/core/inference/_vulkan_probe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
"""Standalone free-VRAM probe for the bundled ggml Vulkan backend.

Run in a short-lived subprocess (``python _vulkan_probe.py <bindir>``) so the
Vulkan instance never lives in the long-running backend process. Loads the
bundled ggml Vulkan backend from ``<bindir>`` and prints one
``<idx>\\t<free_bytes>\\t<is_igpu>`` line per device to stdout. The indices
are ggml's own Vulkan device ordinals (the space GGML_VK_VISIBLE_DEVICES
expects), which need not match nvidia-smi order. ``is_igpu`` is ``1`` for an
integrated GPU (shared system RAM) and ``0`` otherwise, taken from ggml's own
device type so the reader needn't guess from VRAM-vs-RAM ratios.

Uses only the standard library so it stays runnable as a bare script without
importing the backend package.
"""

import ctypes
import os
import sys

# ggml_backend_dev_type enum (ggml-backend.h): CPU=0, GPU=1, IGPU=2, ...
# Mirrored by hand because this script reaches ggml through ctypes rather than
# the C header; _igpu_flags compares ggml_backend_dev_type() results against it.
# NOTE(review): presumably must be kept in sync with the bundled ggml's enum.
_GGML_BACKEND_DEVICE_TYPE_IGPU = 2


def _igpu_flags(base, lib, count: int) -> list[bool]:
    """Return per-device integrated-GPU flags via ggml's backend registry.

    The Vulkan reg enumerates devices in the same order as
    ``ggml_backend_vk_get_device_memory`` (ggml-vulkan builds each device
    context with ``ctx->device = i``), so reg index == device ordinal.

    Args:
        base: ctypes handle of the ggml base library; supplies
            ``ggml_backend_reg_dev_count``, ``ggml_backend_reg_dev_get`` and
            ``ggml_backend_dev_type``.
        lib: ctypes handle of the ggml Vulkan library; supplies
            ``ggml_backend_vk_reg``.
        count: number of devices the caller reports; the result has exactly
            this many entries.

    Returns:
        ``count`` booleans, True where ggml reports the IGPU device type.
        Detection is best-effort: a null registry, a null device handle or
        any exception leaves the affected entries False (flags read before a
        failure are kept), so the reader never over-caps a discrete card
        just because the type couldn't be read.
    """
    flags = [False] * count
    try:
        # Declare the C signatures before calling: ctypes assumes a C int
        # return type by default, which is too narrow for the pointer-sized
        # registry/device handles and the size_t device count.
        lib.ggml_backend_vk_reg.restype = ctypes.c_void_p
        lib.ggml_backend_vk_reg.argtypes = []
        base.ggml_backend_reg_dev_count.restype = ctypes.c_size_t
        base.ggml_backend_reg_dev_count.argtypes = [ctypes.c_void_p]
        base.ggml_backend_reg_dev_get.restype = ctypes.c_void_p
        base.ggml_backend_reg_dev_get.argtypes = [ctypes.c_void_p, ctypes.c_size_t]
        base.ggml_backend_dev_type.restype = ctypes.c_int
        base.ggml_backend_dev_type.argtypes = [ctypes.c_void_p]

        reg = lib.ggml_backend_vk_reg()
        if not reg:
            return flags
        dev_count = base.ggml_backend_reg_dev_count(reg)
        # Never index past either side: the caller's count and the
        # registry's count are obtained from different calls.
        for i in range(min(count, dev_count)):
            dev = base.ggml_backend_reg_dev_get(reg, i)
            if dev:
                flags[i] = (
                    base.ggml_backend_dev_type(dev) == _GGML_BACKEND_DEVICE_TYPE_IGPU
                )
    except Exception as e:
        # iGPU detection is best-effort: any failure (missing symbol,
        # registry call error) degrades to "discrete" so the memory
        # readings still get through instead of crashing the probe.
        # Report it on stderr only; stdout carries the per-device rows.
        print(f"iGPU detection failed, assuming discrete: {e!r}", file = sys.stderr)
    return flags


def main() -> int:
    """Probe every ggml Vulkan device and print one row per device.

    Reads the directory holding the bundled ggml libraries from
    ``sys.argv[1]``, loads the base and Vulkan libraries from it, and writes
    ``<idx>\\t<free_bytes>\\t<is_igpu>`` rows (newline-separated, no trailing
    newline) to stdout.

    Returns:
        Process exit status: 0 when the rows were written (or when no
        directory argument was given, in which case nothing is printed),
        1 when either library failed to load (the reason goes to stderr).
    """
    if len(sys.argv) < 2:
        return 0
    bindir = sys.argv[1]

    if sys.platform == "win32":
        base_name, vk_name = "ggml-base.dll", "ggml-vulkan.dll"
        try:
            # Best-effort: add bindir to the DLL search path. If this fails
            # the CDLL calls below still report any real load error.
            os.add_dll_directory(bindir)
        except Exception:
            pass
    else:
        base_name, vk_name = "libggml-base.so", "libggml-vulkan.so"

    try:
        # Base library first, then the Vulkan backend, both RTLD_GLOBAL.
        # NOTE(review): presumably so the Vulkan library resolves its
        # ggml-base symbols from the already-loaded handle - confirm.
        base = ctypes.CDLL(os.path.join(bindir, base_name), mode = ctypes.RTLD_GLOBAL)
        lib = ctypes.CDLL(os.path.join(bindir, vk_name), mode = ctypes.RTLD_GLOBAL)
    except OSError as e:
        print(f"ggml-vulkan load failed: {e}", file = sys.stderr)
        return 1

    lib.ggml_backend_vk_get_device_count.restype = ctypes.c_int
    lib.ggml_backend_vk_get_device_count.argtypes = []
    lib.ggml_backend_vk_get_device_memory.restype = None
    lib.ggml_backend_vk_get_device_memory.argtypes = [
        ctypes.c_int,
        ctypes.POINTER(ctypes.c_size_t),
        ctypes.POINTER(ctypes.c_size_t),
    ]

    count = lib.ggml_backend_vk_get_device_count()
    igpu = _igpu_flags(base, lib, count)
    rows = []
    for i in range(count):
        free, total = ctypes.c_size_t(0), ctypes.c_size_t(0)
        # total is a required out-param of the C call but unused: the reader
        # leaves a flat per-device margin, not a fraction of total.
        lib.ggml_backend_vk_get_device_memory(i, ctypes.byref(free), ctypes.byref(total))
        rows.append("%d\t%d\t%d" % (i, free.value, int(igpu[i])))
    sys.stdout.write("\n".join(rows))
    return 0


# Script entry point: the probe's return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
178 changes: 167 additions & 11 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,42 @@ def _backfill_usage_from_timings(usage, timings):
return out


def _vulkan_lib_filename() -> str:
    """Return the file name of the ggml Vulkan backend library.

    ``ggml-vulkan.dll`` on Windows, ``libggml-vulkan.so`` on every other
    platform.
    """
    if sys.platform == "win32":
        return "ggml-vulkan.dll"
    return "libggml-vulkan.so"


# Free system RAM (in MiB) to leave on an integrated GPU, mirroring llama.cpp's
# own auto-fit margin (llama-server --fit-target, default 1024 MiB per device).
# ggml reports an iGPU's "VRAM" as shared system RAM, so we hold back the same
# per-device margin --fit would rather than inventing a larger reserve.
# Subtracted from the probed free memory by _apply_igpu_host_reserve_mib.
_IGPU_HOST_RESERVE_MIB = 1024


def _apply_igpu_host_reserve_mib(free_mib: int, is_igpu: bool) -> int:
    """Hold back host headroom on an integrated (shared-memory) Vulkan GPU.

    For an integrated GPU ggml sums every memory heap (ggml-vulkan's
    ggml_backend_vk_get_device_memory), so the "VRAM" it reports as free is
    really free system RAM. Budgeting context/offload against all of it
    would crowd out the host and push it into swap or the OOM killer, so
    ``_IGPU_HOST_RESERVE_MIB`` - the same per-device margin llama.cpp's
    --fit uses - is subtracted, clamping at zero.

    ``is_igpu`` comes straight from ggml's device type; when it is False the
    budget is returned untouched. The result is never larger than
    ``free_mib`` for a non-negative input.
    """
    if is_igpu:
        usable_mib = free_mib - _IGPU_HOST_RESERVE_MIB
        return usable_mib if usable_mib > 0 else 0
    return free_mib


def _llama_lib_dir(binary: str) -> Path:
    """Return the real directory that contains the llama-server binary.

    The installer exposes llama-server as a top-level symlink
    (~/.unsloth/llama.cpp/llama-server) into build/bin/, where the ggml
    backend libs actually live. Symlinks are resolved first so callers
    looking for sibling libs (Vulkan detection, LD_LIBRARY_PATH, the probe's
    bindir) get that directory rather than the symlink's parent.
    """
    real_binary = Path(binary).resolve()
    return real_binary.parent


class LlamaCppBackend:
"""
Manages a llama-server subprocess for GGUF model inference.
Expand Down Expand Up @@ -1219,6 +1255,20 @@ def _get_gguf_size_bytes(model_path: str) -> int:

return total

@staticmethod
def _is_vulkan_backend(binary: Optional[str] = None) -> bool:
    """Report whether the installed llama.cpp build is the Vulkan one.

    Builds are single-backend, so finding the Vulkan ggml backend library
    in the llama-server library directory is sufficient. Used to keep the
    free-memory probe and the GPU pin in the same device-index space
    (ggml's Vulkan ordinals, not nvidia-smi order).

    Args:
        binary: path to llama-server; when falsy it is looked up with
            ``_find_llama_server_binary``.

    Returns:
        True if the Vulkan library file exists there; False otherwise,
        including when no llama-server binary can be found.
    """
    server_path = binary if binary else LlamaCppBackend._find_llama_server_binary()
    if server_path:
        vulkan_lib = _llama_lib_dir(server_path) / _vulkan_lib_filename()
        return vulkan_lib.is_file()
    return False

@staticmethod
def _amd_apu_wants_unified_memory() -> bool:
"""True only for AMD unified-memory APUs (gfx1150/gfx1151), where
Expand All @@ -1244,7 +1294,24 @@ def _amd_apu_wants_unified_memory() -> bool:
return False

@staticmethod
def _get_gpu_free_memory(binary: Optional[str] = None) -> list[tuple[int, int]]:
    """Query free memory per GPU across all supported backends.

    On a Vulkan build, the ggml Vulkan probe is authoritative so the
    returned indices are Vulkan ordinals (the space the GPU pin writes
    to ``GGML_VK_VISIBLE_DEVICES``). Otherwise ``nvidia-smi`` / torch
    cover NVIDIA + AMD ROCm.

    Args:
        binary: path to llama-server; when falsy it is looked up with
            ``_find_llama_server_binary``. Optional, so existing
            zero-argument callers keep working.

    Returns:
        List of (gpu_index, free_mib) sorted by index. Empty list if no
        supported GPU is reachable.
    """
    binary = binary or LlamaCppBackend._find_llama_server_binary()
    if LlamaCppBackend._is_vulkan_backend(binary):
        return LlamaCppBackend._get_gpu_free_memory_vulkan(binary)
    return LlamaCppBackend._get_gpu_free_memory_nvidia_torch()

@staticmethod
def _get_gpu_free_memory_nvidia_torch() -> list[tuple[int, int]]:
"""Query free memory per GPU.

Order:
Expand Down Expand Up @@ -1363,6 +1430,85 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
logger.debug(f"torch GPU probe failed: {e}")
return []

@staticmethod
def _get_gpu_free_memory_vulkan(binary: Optional[str] = None) -> list[tuple[int, int]]:
    """Query free VRAM per device via the bundled ggml Vulkan backend.

    Runs ``_vulkan_probe.py`` in a short-lived subprocess, which loads
    ``libggml-vulkan`` and calls ``ggml_backend_vk_get_device_memory`` for
    each device, so no Vulkan instance is created in this process.

    Args:
        binary: path to llama-server; when falsy it is looked up with
            ``_find_llama_server_binary``.

    Returns:
        List of (device_index, free_mib) sorted by index, where the index
        is ggml's own Vulkan device ordinal (the space
        ``GGML_VK_VISIBLE_DEVICES`` expects). Integrated GPUs leave a
        per-device host-RAM margin (see ``_apply_igpu_host_reserve_mib``).
        Returns [] when no Vulkan build is installed, the probe fails or
        times out, or no device is reachable.
    """
    binary = binary or LlamaCppBackend._find_llama_server_binary()
    if not binary:
        return []
    binary_dir = _llama_lib_dir(binary)
    if not (binary_dir / _vulkan_lib_filename()).is_file():
        return []

    env = child_env_without_native_path_secret()
    # Enumerate ggml's canonical, full device list. An inherited
    # GGML_VK_VISIBLE_DEVICES would renumber/restrict the ordinals, but
    # load_model writes its own pin in that same full space, so letting
    # the probe see a pre-existing mask would make the pin double-apply
    # and target the wrong device.
    env.pop("GGML_VK_VISIBLE_DEVICES", None)
    if sys.platform != "win32":
        # Let the loader resolve sibling ggml libs next to the binary.
        existing_ld = env.get("LD_LIBRARY_PATH", "")
        env["LD_LIBRARY_PATH"] = (
            f"{binary_dir}:{existing_ld}" if existing_ld else str(binary_dir)
        )
    probe_script = Path(__file__).with_name("_vulkan_probe.py")

    # Only the subprocess call sits inside the try: launch failures and
    # the timeout are the errors that should degrade to "no devices".
    try:
        result = subprocess.run(
            [sys.executable, str(probe_script), str(binary_dir)],
            capture_output = True,
            text = True,
            timeout = 15,
            env = env,
            **_windows_hidden_subprocess_kwargs(),
        )
    except Exception as e:
        logger.debug(f"vulkan GPU probe failed: {e}")
        return []
    if result.returncode != 0:
        logger.debug(
            f"vulkan GPU probe exited {result.returncode}: {result.stderr.strip()}"
        )
        return []

    gpus: list[tuple[int, int]] = []
    for line in result.stdout.strip().splitlines():
        # Each row is "<idx>\t<free_bytes>\t<is_igpu>"; skip anything else.
        parts = line.split("\t")
        if len(parts) != 3:
            continue
        try:
            idx = int(parts[0])
            free_mib = int(parts[1]) // (1024 * 1024)
        except ValueError:
            continue
        is_igpu = parts[2] == "1"
        capped = _apply_igpu_host_reserve_mib(free_mib, is_igpu)
        if capped < free_mib:
            logger.info(
                f"Vulkan device VK{idx} is an integrated GPU sharing system "
                f"RAM; reserving {free_mib - capped}MiB host headroom "
                f"({free_mib}->{capped}MiB usable)"
            )
        gpus.append((idx, capped))
    gpus.sort(key = lambda g: g[0])
    if gpus:
        logger.info(
            "Vulkan GPU memory detected: "
            + ", ".join(f"VK{idx}={free}MiB" for idx, free in gpus)
        )
    return gpus

# Skip the wait when the last kill is older than this; the GPU
# driver has already reclaimed the prior process's allocations.
_VRAM_SETTLE_WINDOW_S: float = 15.0
Expand Down Expand Up @@ -2724,6 +2870,7 @@ def load_model(
"Run setup.sh to build it, install llama.cpp, "
"or set LLAMA_SERVER_PATH environment variable."
)
is_vulkan_backend = self._is_vulkan_backend(binary)

# ── Phase 2: download (NO lock held, so cancel can proceed) ──
# Scope HF_HUB_OFFLINE to the download block only when DNS is
Expand Down Expand Up @@ -2791,7 +2938,7 @@ def load_model(
gpus: list[tuple[int, int]] = []
try:
model_size = self._get_gguf_size_bytes(model_path)
gpus = self._get_gpu_free_memory()
gpus = self._get_gpu_free_memory(binary)

# Resolve effective context: 0 means let llama-server use the
# model's native length. Only expand to a known native length
Expand Down Expand Up @@ -3187,7 +3334,7 @@ def load_model(
import sys

env = child_env_without_native_path_secret()
binary_dir = str(Path(binary).parent)
binary_dir = str(_llama_lib_dir(binary))

# AMD unified-memory APUs (gfx1150/gfx1151): let llama.cpp use
# shared system RAM. setdefault so a user value wins.
Expand Down Expand Up @@ -3288,14 +3435,23 @@ def load_model(
# the full HIP/ROCR set the parent inherited.
if gpu_indices is not None:
pinned = ",".join(str(i) for i in gpu_indices)
env["CUDA_VISIBLE_DEVICES"] = pinned
try:
import torch as _torch
if getattr(_torch.version, "hip", None) is not None:
env["HIP_VISIBLE_DEVICES"] = pinned
env["ROCR_VISIBLE_DEVICES"] = pinned
except Exception as e:
logger.debug("Failed to set ROCm visibility env vars for child: %s", e)
if is_vulkan_backend:
# gpu_indices are ggml Vulkan ordinals (see
# _get_gpu_free_memory); the Vulkan backend ignores
# CUDA_VISIBLE_DEVICES, so pin via its own mask.
env["GGML_VK_VISIBLE_DEVICES"] = pinned
else:
env["CUDA_VISIBLE_DEVICES"] = pinned
try:
import torch as _torch
if getattr(_torch.version, "hip", None) is not None:
env["HIP_VISIBLE_DEVICES"] = pinned
env["ROCR_VISIBLE_DEVICES"] = pinned
except Exception as e:
logger.debug(
"Failed to set ROCm visibility env vars for child: %s",
e,
)

# Defensive kill: if a concurrent load slipped past Phase 1
# (because its `self._process` was None at the time) and
Expand Down
Loading
Loading