Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,9 @@ def __init__(self):
self._is_audio: bool = False
self._audio_type: Optional[str] = None
self._audio_probed: bool = False
# Monotonic timestamp set in _kill_process; read by load_model
# to decide whether to wait for the VRAM reclaim to finish.
self._last_kill_monotonic: float = 0.0

self._kill_orphaned_servers()
atexit.register(self._cleanup)
Expand Down Expand Up @@ -1351,6 +1354,76 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
logger.debug(f"torch GPU probe failed: {e}")
return []

# Skip the wait when the last kill is older than this; the GPU
# driver has already reclaimed the prior process's allocations.
_VRAM_SETTLE_WINDOW_S: float = 15.0

@staticmethod
def _wait_for_vram_settle(
max_wait: float = 2.0,
interval: float = 0.25,
tolerance_mib: int = 256,
since_kill: float = 0.0,
) -> None:
"""Poll ``_get_gpu_free_memory`` until free VRAM stabilises.

The GPU driver reclaims a dead process's allocations
asynchronously, so sampling free memory in the kill-to-spawn
window reads artificially low and pushes ``_select_gpus`` /
``_fit_context_to_vram`` toward needless CPU offload -- on a
tight VRAM card this is the Apply-reload OOM that bare-shell
launches with the same flags never see.

Short-circuits on cold start (``since_kill`` zero) or stale
kill (older than ``_VRAM_SETTLE_WINDOW_S``); also on CPU-only
hosts (empty probe), probe exceptions, and GPU-set changes.
``max_wait`` is a wall-clock bound that includes probe time,
so a wedged ``nvidia-smi`` cannot extend the reload.
"""
now = time.monotonic()
if since_kill <= 0.0:
return
if now - since_kill > LlamaCppBackend._VRAM_SETTLE_WINDOW_S:
return
deadline = now + max_wait

def _probe_or_none():
if time.monotonic() >= deadline:
return None
try:
return LlamaCppBackend._get_gpu_free_memory()
Comment on lines +1391 to +1394

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Include probe runtime in the settle wait deadline

_wait_for_vram_settle only checks deadline before invoking _get_gpu_free_memory, so if that probe blocks (e.g., nvidia-smi hangs until its 10s timeout in _get_gpu_free_memory) the function can exceed max_wait by several seconds. This breaks the helper’s bounded-wait contract and can noticeably delay /load completion and cancellation responsiveness when the GPU probe is slow or wedged.

Useful? React with 👍 / 👎.

except Exception:
Comment on lines +1391 to +1395

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Enforce wait deadline during VRAM probe call

_wait_for_vram_settle checks the deadline only before calling _get_gpu_free_memory, but that probe can block for up to 10s (the nvidia-smi timeout in _get_gpu_free_memory). If a reload happens after a kill and the probe is slow/hung, this path can exceed max_wait by several seconds, delaying /load and cancellation handling despite the new helper’s contract that wall-clock wait is bounded by max_wait.

Useful? React with 👍 / 👎.

return None
Comment on lines +1395 to +1396

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Avoid using broad, silent exception handlers. While the underlying _get_gpu_free_memory method performs its own logging, it is best practice to log any unexpected exceptions caught here at a debug level to provide visibility into why a probe might have failed during the settle wait, especially if it leads to an early return.

Suggested change
except Exception:
return None
except Exception as e:
logger.debug(f"VRAM settle probe failed: {e}")
return None
References
  1. Avoid using broad, silent exception handlers like except Exception: pass. Instead, log the exception, even if at a debug level, to aid in future debugging.


prev = _probe_or_none()
if prev is None or not prev:
return
while time.monotonic() < deadline:
remaining = deadline - time.monotonic()
if remaining <= 0:
return
# Clip the nap so a near-zero ``max_wait`` is respected.
time.sleep(min(interval, remaining))
curr = _probe_or_none()
if curr is None or not curr or len(curr) != len(prev):
return
prev_map = dict(prev)
stable = True
for idx, free in curr:
if idx not in prev_map:
stable = False
break
prev_free = prev_map[idx]
# Adaptive: 2 % of the larger sample dominates the
# 256 MiB floor on large-VRAM cards.
per_gpu_tol = max(tolerance_mib, int(max(free, prev_free) * 0.02))
if abs(free - prev_free) >= per_gpu_tol:
stable = False
break
if stable:
return
prev = curr

# Free-VRAM fraction at which Studio pins the GPU directly instead
# of deferring to ``--fit on``. 5% headroom covers CUDA context +
# compute buffers; 0.90 was too conservative and dropped 91-94%
Expand Down Expand Up @@ -2631,6 +2704,12 @@ def load_model(
logger.info("Load cancelled after download phase")
return False

# Outside ``self._lock`` so /unload, /cancel, /status are
# not blocked. ``unload_model`` also records the kill, so
# the frontend /unload+/load Apply path engages the wait
# here even though no in-process kill happened.
self._wait_for_vram_settle(since_kill = self._last_kill_monotonic)

# ── Phase 3: start llama-server (under lock) ──────────────
with self._lock:
# Re-check cancel inside lock
Expand Down Expand Up @@ -3741,6 +3820,10 @@ def _kill_process(self):
# server's warm-up window cannot short-circuit against the
# previous server's health (#5401).
self._healthy = False
# Drives _wait_for_vram_settle in the next load_model;
# set in finally so both in-process and frontend
# /unload+/load Apply paths record the kill.
self._last_kill_monotonic = time.monotonic()
if self._stdout_thread is not None:
self._stdout_thread.join(timeout = 2)
self._stdout_thread = None
Expand Down
Loading
Loading