-
-
Notifications
You must be signed in to change notification settings - Fork 5.9k
studio: settle GPU VRAM after killing llama-server before the next reload #5693
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -687,6 +687,9 @@ def __init__(self): | |||||||||||
| self._is_audio: bool = False | ||||||||||||
| self._audio_type: Optional[str] = None | ||||||||||||
| self._audio_probed: bool = False | ||||||||||||
| # Monotonic timestamp set in _kill_process; read by load_model | ||||||||||||
| # to decide whether to wait for the VRAM reclaim to finish. | ||||||||||||
| self._last_kill_monotonic: float = 0.0 | ||||||||||||
|
|
||||||||||||
| self._kill_orphaned_servers() | ||||||||||||
| atexit.register(self._cleanup) | ||||||||||||
|
|
@@ -1351,6 +1354,76 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]: | |||||||||||
| logger.debug(f"torch GPU probe failed: {e}") | ||||||||||||
| return [] | ||||||||||||
|
|
||||||||||||
| # Skip the wait when the last kill is older than this; the GPU | ||||||||||||
| # driver has already reclaimed the prior process's allocations. | ||||||||||||
| _VRAM_SETTLE_WINDOW_S: float = 15.0 | ||||||||||||
|
|
||||||||||||
| @staticmethod | ||||||||||||
| def _wait_for_vram_settle( | ||||||||||||
| max_wait: float = 2.0, | ||||||||||||
| interval: float = 0.25, | ||||||||||||
| tolerance_mib: int = 256, | ||||||||||||
| since_kill: float = 0.0, | ||||||||||||
| ) -> None: | ||||||||||||
| """Poll ``_get_gpu_free_memory`` until free VRAM stabilises. | ||||||||||||
|
|
||||||||||||
| The GPU driver reclaims a dead process's allocations | ||||||||||||
| asynchronously, so sampling free memory in the kill-to-spawn | ||||||||||||
| window reads artificially low and pushes ``_select_gpus`` / | ||||||||||||
| ``_fit_context_to_vram`` toward needless CPU offload -- on a | ||||||||||||
| tight VRAM card this is the Apply-reload OOM that bare-shell | ||||||||||||
| launches with the same flags never see. | ||||||||||||
|
|
||||||||||||
| Short-circuits on cold start (``since_kill`` zero) or stale | ||||||||||||
| kill (older than ``_VRAM_SETTLE_WINDOW_S``); also on CPU-only | ||||||||||||
| hosts (empty probe), probe exceptions, and GPU-set changes. | ||||||||||||
| ``max_wait`` is a wall-clock bound that includes probe time, | ||||||||||||
| so a wedged ``nvidia-smi`` cannot extend the reload. | ||||||||||||
| """ | ||||||||||||
| now = time.monotonic() | ||||||||||||
| if since_kill <= 0.0: | ||||||||||||
| return | ||||||||||||
| if now - since_kill > LlamaCppBackend._VRAM_SETTLE_WINDOW_S: | ||||||||||||
| return | ||||||||||||
| deadline = now + max_wait | ||||||||||||
|
|
||||||||||||
| def _probe_or_none(): | ||||||||||||
| if time.monotonic() >= deadline: | ||||||||||||
| return None | ||||||||||||
| try: | ||||||||||||
| return LlamaCppBackend._get_gpu_free_memory() | ||||||||||||
| except Exception: | ||||||||||||
|
Comment on lines +1391 to +1395
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Useful? React with 👍 / 👎. |
||||||||||||
| return None | ||||||||||||
|
Comment on lines +1395 to +1396
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Avoid using broad, silent exception handlers. While the underlying…
Suggested change
References
|
||||||||||||
|
|
||||||||||||
| prev = _probe_or_none() | ||||||||||||
| if prev is None or not prev: | ||||||||||||
| return | ||||||||||||
| while time.monotonic() < deadline: | ||||||||||||
| remaining = deadline - time.monotonic() | ||||||||||||
| if remaining <= 0: | ||||||||||||
| return | ||||||||||||
| # Clip the nap so a near-zero ``max_wait`` is respected. | ||||||||||||
| time.sleep(min(interval, remaining)) | ||||||||||||
| curr = _probe_or_none() | ||||||||||||
| if curr is None or not curr or len(curr) != len(prev): | ||||||||||||
| return | ||||||||||||
| prev_map = dict(prev) | ||||||||||||
| stable = True | ||||||||||||
| for idx, free in curr: | ||||||||||||
| if idx not in prev_map: | ||||||||||||
| stable = False | ||||||||||||
| break | ||||||||||||
| prev_free = prev_map[idx] | ||||||||||||
| # Adaptive: 2 % of the larger sample dominates the | ||||||||||||
| # 256 MiB floor on large-VRAM cards. | ||||||||||||
| per_gpu_tol = max(tolerance_mib, int(max(free, prev_free) * 0.02)) | ||||||||||||
| if abs(free - prev_free) >= per_gpu_tol: | ||||||||||||
| stable = False | ||||||||||||
| break | ||||||||||||
| if stable: | ||||||||||||
| return | ||||||||||||
| prev = curr | ||||||||||||
|
|
||||||||||||
| # Free-VRAM fraction at which Studio pins the GPU directly instead | ||||||||||||
| # of deferring to ``--fit on``. 5% headroom covers CUDA context + | ||||||||||||
| # compute buffers; 0.90 was too conservative and dropped 91-94% | ||||||||||||
|
|
@@ -2631,6 +2704,12 @@ def load_model( | |||||||||||
| logger.info("Load cancelled after download phase") | ||||||||||||
| return False | ||||||||||||
|
|
||||||||||||
| # Outside ``self._lock`` so /unload, /cancel, /status are | ||||||||||||
| # not blocked. ``unload_model`` also records the kill, so | ||||||||||||
| # the frontend /unload+/load Apply path engages the wait | ||||||||||||
| # here even though no in-process kill happened. | ||||||||||||
| self._wait_for_vram_settle(since_kill = self._last_kill_monotonic) | ||||||||||||
|
|
||||||||||||
| # ── Phase 3: start llama-server (under lock) ────────────── | ||||||||||||
| with self._lock: | ||||||||||||
| # Re-check cancel inside lock | ||||||||||||
|
|
@@ -3741,6 +3820,10 @@ def _kill_process(self): | |||||||||||
| # server's warm-up window cannot short-circuit against the | ||||||||||||
| # previous server's health (#5401). | ||||||||||||
| self._healthy = False | ||||||||||||
| # Drives _wait_for_vram_settle in the next load_model; | ||||||||||||
| # set in finally so both in-process and frontend | ||||||||||||
| # /unload+/load Apply paths record the kill. | ||||||||||||
| self._last_kill_monotonic = time.monotonic() | ||||||||||||
| if self._stdout_thread is not None: | ||||||||||||
| self._stdout_thread.join(timeout = 2) | ||||||||||||
| self._stdout_thread = None | ||||||||||||
|
|
||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`_wait_for_vram_settle` only checks `deadline` before invoking `_get_gpu_free_memory`, so if that probe blocks (e.g., `nvidia-smi` hangs until its 10s timeout in `_get_gpu_free_memory`) the function can exceed `max_wait` by several seconds. This breaks the helper’s bounded-wait contract and can noticeably delay `/load` completion and cancellation responsiveness when the GPU probe is slow or wedged. Useful? React with 👍 / 👎.