From 1c9dde85eff1b50c556bda098ae09670fd3a00a3 Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Mon, 18 May 2026 18:20:24 +0000
Subject: [PATCH 1/3] studio: reserve VRAM headroom for the MTP draft cache in
 auto-fit

When MTP is going to engage on this load, _fit_context_to_vram now
budgets 0.85 of available VRAM instead of 0.90, leaving room for
llama.cpp's secondary MTP draft KV cache + compute graph buffers.

Motivation: a user report on RTX 5090 (32 GB) showed Qwen3.6-27B-MTP-GGUF
UD-Q4_K_XL at native auto-context running roughly half the speed of
the same model with a slightly smaller context. The most parsimonious
explanation is a VRAM cliff: at native context the target's KV
already eats the 90% budget, then llama-server allocates the draft
cache + draft graph on top and spills into a slower partial-offload
path. Lowering the budget fraction by 5 percentage points (0.90 -> 0.85)
on MTP loads is intended to avoid the spill without penalising non-MTP
loads. On hardware with abundant VRAM (B200, etc.) the fit is unchanged
because the requested context already fits in the tighter budget too.

MTP detection mirrors the auto-promotion logic in load_model. MTP is
treated as engaged when (a) either the GGUF advertises
nextn_predict_layers or the model identifier / local path matches the
-MTP marker, and (b) the user has not explicitly opted out via
speculative_type="off" or a --spec-type entry in the extra args.

Tests: two new cases in test_kv_cache_estimation.py verify that
mtp_engaged=True yields a context less-than-or-equal-to the
non-MTP path on a tight budget, and that kv_on_gpu=False still
short-circuits regardless of mtp_engaged.
---
 studio/backend/core/inference/llama_cpp.py    | 29 ++++++++++++++++++-
 .../backend/tests/test_kv_cache_estimation.py | 27 +++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 134b6574b0..2704d5e5de 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -1695,6 +1695,7 @@ def _fit_context_to_vram(
         kv_unified: bool = True,
         ctx_checkpoints: int = 0,
         kv_on_gpu: bool = True,
+        mtp_engaged: bool = False,
     ) -> int:
         """Return the largest context length that fits in GPU VRAM.
 
@@ -1708,6 +1709,12 @@ def _fit_context_to_vram(
         the KV cache lives in CPU RAM and doesn't compete with weights
         for VRAM; the requested context is honored verbatim. The other
         keyword args mirror ``_estimate_kv_cache_bytes``.
+
+        ``mtp_engaged`` reserves extra VRAM for the MTP draft model's
+        KV cache + compute graph buffers. llama.cpp's MTP path keeps a
+        secondary cache sized off the target's KV; on tight VRAM tiers
+        (e.g. 32 GB) auto-fit at native context would otherwise spill
+        and force llama-server into a slower partial-offload path.
         """
         if not self._can_estimate_kv():
             logger.debug(
@@ -1728,7 +1735,9 @@ def _fit_context_to_vram(
             ctx_checkpoints = ctx_checkpoints,
         )
 
-        budget_bytes = available_mib * 1024 * 1024 * 0.90
+        # MTP needs a tighter budget; drop from 0.90 to 0.85.
+        budget_frac = 0.85 if mtp_engaged else 0.90
+        budget_bytes = available_mib * 1024 * 1024 * budget_frac
         model_footprint = model_size_bytes
 
         # Check if requested context already fits
@@ -2614,6 +2623,22 @@ def load_model(
                     # GPU/VRAM-fit logic below may shrink this if hardware is limited.
                     max_available_ctx = self._context_length or effective_ctx
 
+                    # Will MTP engage on this load? If so, the auto-fit
+                    # budget needs to reserve extra VRAM for the draft
+                    # model's KV cache + compute graph.
+                    _normalized_spec = (
+                        speculative_type.lower().strip()
+                        if speculative_type else None
+                    )
+                    _mtp_will_engage = bool(
+                        (
+                            bool(self._nextn_predict_layers)
+                            or _is_mtp_model_name(model_identifier, model_path)
+                        )
+                        and _normalized_spec not in {"off"}
+                        and not _extra_args_set_spec_type(extra_args)
+                    )
+
                     # Auto-cap context to fit in GPU VRAM and select GPUs.
                     #
                     # Two policies depending on whether the user set n_ctx:
@@ -2649,6 +2674,7 @@ def load_model(
                                     model_size,
                                     cache_type_kv,
                                     n_parallel = n_parallel,
+                                    mtp_engaged = _mtp_will_engage,
                                 )
                                 kv = self._estimate_kv_cache_bytes(
                                     capped, cache_type_kv, n_parallel = n_parallel
@@ -2700,6 +2726,7 @@ def load_model(
                                     model_size,
                                     cache_type_kv,
                                     n_parallel = n_parallel,
+                                    mtp_engaged = _mtp_will_engage,
                                 )
                                 kv = self._estimate_kv_cache_bytes(
                                     capped, cache_type_kv, n_parallel = n_parallel
diff --git a/studio/backend/tests/test_kv_cache_estimation.py b/studio/backend/tests/test_kv_cache_estimation.py
index 29d87804ff..d52a58a25c 100644
--- a/studio/backend/tests/test_kv_cache_estimation.py
+++ b/studio/backend/tests/test_kv_cache_estimation.py
@@ -1558,6 +1558,33 @@ def test_fit_reduces_when_kv_on_gpu(self):
         )
         assert fitted < 32_768
 
+    def test_fit_mtp_engaged_returns_smaller_or_equal_context(self):
+        # MTP-engaged budget is 0.85 of available; non-MTP is 0.90.
+        # On a tight budget the MTP path must yield <= the non-MTP path.
+        b = self._gqa_backend()
+        common = dict(
+            requested_ctx = 32_768,
+            available_mib = 128,
+            model_size_bytes = 8 * 1024 * 1024,
+            cache_type_kv = "f16",
+        )
+        baseline = b._fit_context_to_vram(**common)
+        mtp = b._fit_context_to_vram(**common, mtp_engaged = True)
+        assert mtp <= baseline
+
+    def test_fit_mtp_engaged_unchanged_when_kv_off_gpu(self):
+        # kv_on_gpu=False short-circuits the fit; mtp_engaged is irrelevant.
+        b = self._gqa_backend()
+        fitted = b._fit_context_to_vram(
+            requested_ctx = 32_768,
+            available_mib = 1,
+            model_size_bytes = 100,
+            cache_type_kv = "f16",
+            kv_on_gpu = False,
+            mtp_engaged = True,
+        )
+        assert fitted == 32_768
+
     def test_fit_threads_swa_full_through_estimator(self):
         # SWA model, generous budget; both should fit but cache size differs.
         b = self._swa_backend()

From bd4c35e27265c279229189d62302ba28f878867e Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 18 May 2026 18:20:59 +0000
Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 studio/backend/core/inference/llama_cpp.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index 2704d5e5de..d38f21d616 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -2627,8 +2627,7 @@ def load_model(
                     # budget needs to reserve extra VRAM for the draft
                     # model's KV cache + compute graph.
                     _normalized_spec = (
-                        speculative_type.lower().strip()
-                        if speculative_type else None
+                        speculative_type.lower().strip() if speculative_type else None
                     )
                     _mtp_will_engage = bool(
                         (

From acbb2fb5a2517a323b4c92901741e4f401ed19cc Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Tue, 19 May 2026 13:18:53 +0000
Subject: [PATCH 3/3] studio: gate _mtp_will_engage on canonical-mode resolver

After PR #5582 introduced the 5-mode Speculative Decoding dropdown plus
_canonicalize_spec_mode, the auto-fit MTP-engaged predicate becomes:
  * forced mtp / mtp+ngram -> always engage MTP (extra VRAM needed)
  * auto + MTP GGUF (>= 3B or size unknown) -> engages MTP via auto-promotion
  * auto + MTP GGUF (sub-3B) -> falls back to ngram-mod (no extra VRAM)
  * ngram / ngram-simple / off -> never engage MTP
  * user --spec-type in extra_args -> resolver suppressed; no headroom

The old gate triggered on "anything but off", so it applied the tighter
0.85 budget (reserving headroom that was never used) when the user
explicitly picked Ngram (no MTP) or when Auto fell back to ngram-mod on
a sub-3B MTP model. The cost of those extra 5 percentage points of
headroom was minor but unnecessary.

Mirrors the same logic already encoded in _build_speculative_flags so
the auto-fit budget and the actual emission agree on whether MTP is
running.

All 361 backend tests pass.
---
 studio/backend/core/inference/llama_cpp.py | 30 ++++++++++++++++------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
index d38f21d616..91ee1e4203 100644
--- a/studio/backend/core/inference/llama_cpp.py
+++ b/studio/backend/core/inference/llama_cpp.py
@@ -2625,17 +2625,31 @@ def load_model(
 
                     # Will MTP engage on this load? If so, the auto-fit
                     # budget needs to reserve extra VRAM for the draft
-                    # model's KV cache + compute graph.
-                    _normalized_spec = (
-                        speculative_type.lower().strip() if speculative_type else None
+                    # model's KV cache + compute graph. Mirrors the
+                    # canonical-mode resolver in _build_speculative_flags:
+                    # forced mtp / mtp+ngram always engage; auto only
+                    # engages on an MTP GGUF not known to be sub-3B (sub-3B
+                    # auto falls back to ngram-mod, which needs no headroom);
+                    # ngram / ngram-simple / off never engage MTP.
+                    _mtp_canonical = _canonicalize_spec_mode(speculative_type)
+                    _mtp_effective = _mtp_canonical or "auto"
+                    _mtp_size_for_fit = _extract_model_size_b(model_identifier)
+                    _mtp_sub_3b_for_fit = (
+                        _mtp_size_for_fit is not None and _mtp_size_for_fit < 3.0
                     )
                     _mtp_will_engage = bool(
-                        (
-                            bool(self._nextn_predict_layers)
-                            or _is_mtp_model_name(model_identifier, model_path)
+                        not _extra_args_set_spec_type(extra_args)
+                        and (
+                            _mtp_effective in ("mtp", "mtp+ngram")
+                            or (
+                                _mtp_effective == "auto"
+                                and (
+                                    bool(self._nextn_predict_layers)
+                                    or _is_mtp_model_name(model_identifier, model_path)
+                                )
+                                and not _mtp_sub_3b_for_fit
+                            )
                         )
-                        and _normalized_spec not in {"off"}
-                        and not _extra_args_set_spec_type(extra_args)
                     )
 
                     # Auto-cap context to fit in GPU VRAM and select GPUs.