diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 134b6574b0..91ee1e4203 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1695,6 +1695,7 @@ def _fit_context_to_vram( kv_unified: bool = True, ctx_checkpoints: int = 0, kv_on_gpu: bool = True, + mtp_engaged: bool = False, ) -> int: """Return the largest context length that fits in GPU VRAM. @@ -1708,6 +1709,12 @@ def _fit_context_to_vram( the KV cache lives in CPU RAM and doesn't compete with weights for VRAM; the requested context is honored verbatim. The other keyword args mirror ``_estimate_kv_cache_bytes``. + + ``mtp_engaged`` reserves extra VRAM for the MTP draft model's + KV cache + compute graph buffers. llama.cpp's MTP path keeps a + secondary cache sized off the target's KV; on tight VRAM tiers + (e.g. 32 GB) auto-fit at native context would otherwise spill + and force llama-server into a slower partial-offload path. """ if not self._can_estimate_kv(): logger.debug( @@ -1728,7 +1735,9 @@ def _fit_context_to_vram( ctx_checkpoints = ctx_checkpoints, ) - budget_bytes = available_mib * 1024 * 1024 * 0.90 + # MTP needs a tighter budget; drop from 0.90 to 0.85. + budget_frac = 0.85 if mtp_engaged else 0.90 + budget_bytes = available_mib * 1024 * 1024 * budget_frac model_footprint = model_size_bytes # Check if requested context already fits @@ -2614,6 +2623,35 @@ def load_model( # GPU/VRAM-fit logic below may shrink this if hardware is limited. max_available_ctx = self._context_length or effective_ctx + # Will MTP engage on this load? If so, the auto-fit + # budget needs to reserve extra VRAM for the draft + # model's KV cache + compute graph. Mirrors the + # canonical-mode resolver in _build_speculative_flags: + # forced mtp / mtp+ngram always engage; auto only + # engages on an MTP GGUF >= 3B (sub-3B auto falls + # back to ngram-mod which doesn't need headroom); + # ngram / ngram-simple / off never engage MTP. + _mtp_canonical = _canonicalize_spec_mode(speculative_type) + _mtp_effective = _mtp_canonical or "auto" + _mtp_size_for_fit = _extract_model_size_b(model_identifier) + _mtp_sub_3b_for_fit = ( + _mtp_size_for_fit is not None and _mtp_size_for_fit < 3.0 + ) + _mtp_will_engage = bool( + not _extra_args_set_spec_type(extra_args) + and ( + _mtp_effective in ("mtp", "mtp+ngram") + or ( + _mtp_effective == "auto" + and ( + bool(self._nextn_predict_layers) + or _is_mtp_model_name(model_identifier, model_path) + ) + and not _mtp_sub_3b_for_fit + ) + ) + ) + # Auto-cap context to fit in GPU VRAM and select GPUs. # # Two policies depending on whether the user set n_ctx: @@ -2649,6 +2687,7 @@ def load_model( model_size, cache_type_kv, n_parallel = n_parallel, + mtp_engaged = _mtp_will_engage, ) kv = self._estimate_kv_cache_bytes( capped, cache_type_kv, n_parallel = n_parallel @@ -2700,6 +2739,7 @@ def load_model( model_size, cache_type_kv, n_parallel = n_parallel, + mtp_engaged = _mtp_will_engage, ) kv = self._estimate_kv_cache_bytes( capped, cache_type_kv, n_parallel = n_parallel diff --git a/studio/backend/tests/test_kv_cache_estimation.py b/studio/backend/tests/test_kv_cache_estimation.py index 29d87804ff..d52a58a25c 100644 --- a/studio/backend/tests/test_kv_cache_estimation.py +++ b/studio/backend/tests/test_kv_cache_estimation.py @@ -1558,6 +1558,33 @@ def test_fit_reduces_when_kv_on_gpu(self): ) assert fitted < 32_768 + def test_fit_mtp_engaged_returns_smaller_or_equal_context(self): + # MTP-engaged budget is 0.85 of available; non-MTP is 0.90. + # On a tight budget the MTP path must yield <= the non-MTP path. + b = self._gqa_backend() + common = dict( + requested_ctx = 32_768, + available_mib = 128, + model_size_bytes = 8 * 1024 * 1024, + cache_type_kv = "f16", + ) + baseline = b._fit_context_to_vram(**common) + mtp = b._fit_context_to_vram(**common, mtp_engaged = True) + assert mtp <= baseline + + def test_fit_mtp_engaged_unchanged_when_kv_off_gpu(self): + # kv_on_gpu=False short-circuits the fit; mtp_engaged is irrelevant. + b = self._gqa_backend() + fitted = b._fit_context_to_vram( + requested_ctx = 32_768, + available_mib = 1, + model_size_bytes = 100, + cache_type_kv = "f16", + kv_on_gpu = False, + mtp_engaged = True, + ) + assert fitted == 32_768 + def test_fit_threads_swa_full_through_estimator(self): # SWA model, generous budget; both should fit but cache size differs. b = self._swa_backend()