From 1c9dde85eff1b50c556bda098ae09670fd3a00a3 Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Mon, 18 May 2026 18:20:24 +0000 Subject: [PATCH 1/3] studio: reserve VRAM headroom for the MTP draft cache in auto-fit When MTP is going to engage on this load, _fit_context_to_vram now budgets 0.85 of available VRAM instead of 0.90, leaving room for llama.cpp's secondary MTP draft KV cache + compute graph buffers. Motivation: a user report on RTX 5090 (32 GB) showed Qwen3.6-27B-MTP-GGUF UD-Q4_K_XL at native auto-context running roughly half the speed of the same model with a slightly smaller context. The most parsimonious explanation is a VRAM cliff: at native context the target's KV already eats the 90% budget, then llama-server allocates the draft cache + draft graph on top and spills into a slower partial-offload path. Reducing the budget by 5 percentage points (0.90 -> 0.85 of available VRAM) on MTP loads is intended to avoid that spill without penalising non-MTP loads. On hardware with abundant VRAM (B200, etc.) the fit is unchanged because the requested context already fits in the tighter budget too. MTP detection mirrors the auto-promotion logic in load_model: the GGUF advertises nextn_predict_layers, or the model identifier / local path matches the -MTP marker, and the user has not explicitly opted out via speculative_type="off" or --spec-type extra args. Tests: two new cases in test_kv_cache_estimation.py verify that mtp_engaged=True yields a context less-than-or-equal-to the non-MTP path on a tight budget, and that kv_on_gpu=False still short-circuits regardless of mtp_engaged. 
--- studio/backend/core/inference/llama_cpp.py | 29 ++++++++++++++++++- .../backend/tests/test_kv_cache_estimation.py | 27 +++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 134b6574b0..2704d5e5de 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -1695,6 +1695,7 @@ def _fit_context_to_vram( kv_unified: bool = True, ctx_checkpoints: int = 0, kv_on_gpu: bool = True, + mtp_engaged: bool = False, ) -> int: """Return the largest context length that fits in GPU VRAM. @@ -1708,6 +1709,12 @@ def _fit_context_to_vram( the KV cache lives in CPU RAM and doesn't compete with weights for VRAM; the requested context is honored verbatim. The other keyword args mirror ``_estimate_kv_cache_bytes``. + + ``mtp_engaged`` reserves extra VRAM for the MTP draft model's + KV cache + compute graph buffers. llama.cpp's MTP path keeps a + secondary cache sized off the target's KV; on tight VRAM tiers + (e.g. 32 GB) auto-fit at native context would otherwise spill + and force llama-server into a slower partial-offload path. """ if not self._can_estimate_kv(): logger.debug( @@ -1728,7 +1735,9 @@ def _fit_context_to_vram( ctx_checkpoints = ctx_checkpoints, ) - budget_bytes = available_mib * 1024 * 1024 * 0.90 + # MTP needs a tighter budget; drop from 0.90 to 0.85. + budget_frac = 0.85 if mtp_engaged else 0.90 + budget_bytes = available_mib * 1024 * 1024 * budget_frac model_footprint = model_size_bytes # Check if requested context already fits @@ -2614,6 +2623,22 @@ def load_model( # GPU/VRAM-fit logic below may shrink this if hardware is limited. max_available_ctx = self._context_length or effective_ctx + # Will MTP engage on this load? If so, the auto-fit + # budget needs to reserve extra VRAM for the draft + # model's KV cache + compute graph. 
+ _normalized_spec = ( + speculative_type.lower().strip() + if speculative_type else None + ) + _mtp_will_engage = bool( + ( + bool(self._nextn_predict_layers) + or _is_mtp_model_name(model_identifier, model_path) + ) + and _normalized_spec not in {"off"} + and not _extra_args_set_spec_type(extra_args) + ) + # Auto-cap context to fit in GPU VRAM and select GPUs. # # Two policies depending on whether the user set n_ctx: @@ -2649,6 +2674,7 @@ def load_model( model_size, cache_type_kv, n_parallel = n_parallel, + mtp_engaged = _mtp_will_engage, ) kv = self._estimate_kv_cache_bytes( capped, cache_type_kv, n_parallel = n_parallel @@ -2700,6 +2726,7 @@ def load_model( model_size, cache_type_kv, n_parallel = n_parallel, + mtp_engaged = _mtp_will_engage, ) kv = self._estimate_kv_cache_bytes( capped, cache_type_kv, n_parallel = n_parallel diff --git a/studio/backend/tests/test_kv_cache_estimation.py b/studio/backend/tests/test_kv_cache_estimation.py index 29d87804ff..d52a58a25c 100644 --- a/studio/backend/tests/test_kv_cache_estimation.py +++ b/studio/backend/tests/test_kv_cache_estimation.py @@ -1558,6 +1558,33 @@ def test_fit_reduces_when_kv_on_gpu(self): ) assert fitted < 32_768 + def test_fit_mtp_engaged_returns_smaller_or_equal_context(self): + # MTP-engaged budget is 0.85 of available; non-MTP is 0.90. + # On a tight budget the MTP path must yield <= the non-MTP path. + b = self._gqa_backend() + common = dict( + requested_ctx = 32_768, + available_mib = 128, + model_size_bytes = 8 * 1024 * 1024, + cache_type_kv = "f16", + ) + baseline = b._fit_context_to_vram(**common) + mtp = b._fit_context_to_vram(**common, mtp_engaged = True) + assert mtp <= baseline + + def test_fit_mtp_engaged_unchanged_when_kv_off_gpu(self): + # kv_on_gpu=False short-circuits the fit; mtp_engaged is irrelevant. 
+ b = self._gqa_backend() + fitted = b._fit_context_to_vram( + requested_ctx = 32_768, + available_mib = 1, + model_size_bytes = 100, + cache_type_kv = "f16", + kv_on_gpu = False, + mtp_engaged = True, + ) + assert fitted == 32_768 + def test_fit_threads_swa_full_through_estimator(self): # SWA model, generous budget; both should fit but cache size differs. b = self._swa_backend() From bd4c35e27265c279229189d62302ba28f878867e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 18 May 2026 18:20:59 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- studio/backend/core/inference/llama_cpp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index 2704d5e5de..d38f21d616 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -2627,8 +2627,7 @@ def load_model( # budget needs to reserve extra VRAM for the draft # model's KV cache + compute graph. 
_normalized_spec = ( - speculative_type.lower().strip() - if speculative_type else None + speculative_type.lower().strip() if speculative_type else None ) _mtp_will_engage = bool( ( From acbb2fb5a2517a323b4c92901741e4f401ed19cc Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Tue, 19 May 2026 13:18:53 +0000 Subject: [PATCH 3/3] studio: gate _mtp_will_engage on canonical-mode resolver After PR #5582 introduced the 5-mode Speculative Decoding dropdown plus _canonicalize_spec_mode, the auto-fit MTP-engaged predicate becomes: * forced mtp / mtp+ngram -> always engage MTP (extra VRAM needed) * auto + MTP GGUF (>= 3B) -> engages MTP via auto-promotion * auto + MTP GGUF (sub-3B) -> falls back to ngram-mod (no extra VRAM) * ngram / ngram-simple / off -> never engage MTP * user --spec-type in extra_args -> resolver suppressed; no headroom The old gate triggered on "anything but off", so it over-reserved the 0.85 budget when the user explicitly picked Ngram (no MTP) or when Auto fell back to ngram-mod on a sub-3B MTP model. The 5% headroom cost was minor but unnecessary. Mirrors the same logic already encoded in _build_speculative_flags so the auto-fit budget and the actual emission agree on whether MTP is running. All 361 backend tests pass. --- studio/backend/core/inference/llama_cpp.py | 30 ++++++++++++++++------ 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py index d38f21d616..91ee1e4203 100644 --- a/studio/backend/core/inference/llama_cpp.py +++ b/studio/backend/core/inference/llama_cpp.py @@ -2625,17 +2625,31 @@ def load_model( # Will MTP engage on this load? If so, the auto-fit # budget needs to reserve extra VRAM for the draft - # model's KV cache + compute graph. - _normalized_spec = ( - speculative_type.lower().strip() if speculative_type else None + # model's KV cache + compute graph. 
Mirrors the + # canonical-mode resolver in _build_speculative_flags: + # forced mtp / mtp+ngram always engage; auto only + # engages on an MTP GGUF >= 3B (sub-3B auto falls + # back to ngram-mod which doesn't need headroom); + # ngram / ngram-simple / off never engage MTP. + _mtp_canonical = _canonicalize_spec_mode(speculative_type) + _mtp_effective = _mtp_canonical or "auto" + _mtp_size_for_fit = _extract_model_size_b(model_identifier) + _mtp_sub_3b_for_fit = ( + _mtp_size_for_fit is not None and _mtp_size_for_fit < 3.0 ) _mtp_will_engage = bool( - ( - bool(self._nextn_predict_layers) - or _is_mtp_model_name(model_identifier, model_path) + not _extra_args_set_spec_type(extra_args) + and ( + _mtp_effective in ("mtp", "mtp+ngram") + or ( + _mtp_effective == "auto" + and ( + bool(self._nextn_predict_layers) + or _is_mtp_model_name(model_identifier, model_path) + ) + and not _mtp_sub_3b_for_fit + ) ) - and _normalized_spec not in {"off"} - and not _extra_args_set_spec_type(extra_args) ) # Auto-cap context to fit in GPU VRAM and select GPUs.