unslothai · danielhanchen · May 22, 2026 · May 22, 2026 · May 22, 2026 · chatgpt-codex-connector
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -687,6 +687,9 @@ def __init__(self):
         self._is_audio: bool = False
         self._audio_type: Optional[str] = None
         self._audio_probed: bool = False
+        # Monotonic timestamp set in _kill_process; read by load_model
+        # to decide whether to wait for the VRAM reclaim to finish.
+        self._last_kill_monotonic: float = 0.0
 
         self._kill_orphaned_servers()
         atexit.register(self._cleanup)
@@ -1351,6 +1354,76 @@ def _get_gpu_free_memory() -> list[tuple[int, int]]:
             logger.debug(f"torch GPU probe failed: {e}")
             return []
 
+    # Skip the wait when the last kill is older than this; the GPU
+    # driver has already reclaimed the prior process's allocations.
+    _VRAM_SETTLE_WINDOW_S: float = 15.0
+
+    @staticmethod
+    def _wait_for_vram_settle(
+        max_wait: float = 2.0,
+        interval: float = 0.25,
+        tolerance_mib: int = 256,
+        since_kill: float = 0.0,
+    ) -> None:
+        """Poll ``_get_gpu_free_memory`` until free VRAM stabilises.
+
+        The GPU driver reclaims a dead process's allocations
+        asynchronously, so sampling free memory in the kill-to-spawn
+        window reads artificially low and pushes ``_select_gpus`` /
+        ``_fit_context_to_vram`` toward needless CPU offload -- on a
+        tight VRAM card this is the Apply-reload OOM that bare-shell
+        launches with the same flags never see.
+
+        Short-circuits on cold start (``since_kill`` zero) or stale
+        kill (older than ``_VRAM_SETTLE_WINDOW_S``); also on CPU-only
+        hosts (empty probe), probe exceptions, and GPU-set changes.
+        ``max_wait`` is a wall-clock bound that includes probe time,
+        so a wedged ``nvidia-smi`` cannot extend the reload.
+        """
+        now = time.monotonic()
+        if since_kill <= 0.0:
+            return
+        if now - since_kill > LlamaCppBackend._VRAM_SETTLE_WINDOW_S:
+            return
+        deadline = now + max_wait
+
+        def _probe_or_none():
+            if time.monotonic() >= deadline:
+                return None
+            try:
+                return LlamaCppBackend._get_gpu_free_memory()
+            except Exception:
+                return None
-            except Exception:
-                return None
+            except Exception as e:
+                logger.debug(f"VRAM settle probe failed: {e}")
+                return None
-            except Exception:
-                return None
+            except Exception as e:
+                logger.debug(f"VRAM settle probe failed: {e}")
+                return None
+
+        prev = _probe_or_none()
+        if prev is None or not prev:
+            return
+        while time.monotonic() < deadline:
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                return
+            # Clip the nap so a near-zero ``max_wait`` is respected.
+            time.sleep(min(interval, remaining))
+            curr = _probe_or_none()
+            if curr is None or not curr or len(curr) != len(prev):
+                return
+            prev_map = dict(prev)
+            stable = True
+            for idx, free in curr:
+                if idx not in prev_map:
+                    stable = False
+                    break
+                prev_free = prev_map[idx]
+                # Adaptive: 2 % of the larger sample dominates the
+                # 256 MiB floor on large-VRAM cards.
+                per_gpu_tol = max(tolerance_mib, int(max(free, prev_free) * 0.02))
+                if abs(free - prev_free) >= per_gpu_tol:
+                    stable = False
+                    break
+            if stable:
+                return
+            prev = curr
+
     # Free-VRAM fraction at which Studio pins the GPU directly instead
     # of deferring to ``--fit on``. 5% headroom covers CUDA context +
     # compute buffers; 0.90 was too conservative and dropped 91-94%
@@ -2631,6 +2704,12 @@ def load_model(
                 logger.info("Load cancelled after download phase")
                 return False
 
+            # Outside ``self._lock`` so /unload, /cancel, /status are
+            # not blocked. ``unload_model`` also records the kill, so
+            # the frontend /unload+/load Apply path engages the wait
+            # here even though no in-process kill happened.
+            self._wait_for_vram_settle(since_kill = self._last_kill_monotonic)
+
             # ── Phase 3: start llama-server (under lock) ──────────────
             with self._lock:
                 # Re-check cancel inside lock
@@ -3741,6 +3820,10 @@ def _kill_process(self):
             # server's warm-up window cannot short-circuit against the
             # previous server's health (#5401).
             self._healthy = False
+            # Drives _wait_for_vram_settle in the next load_model;
+            # set in finally so both in-process and frontend
+            # /unload+/load Apply paths record the kill.
+            self._last_kill_monotonic = time.monotonic()
             if self._stdout_thread is not None:
                 self._stdout_thread.join(timeout = 2)
                 self._stdout_thread = None