unslothai · danielhanchen · May 19, 2026 · May 18, 2026 · May 18, 2026 · May 19, 2026
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -1695,6 +1695,7 @@ def _fit_context_to_vram(
         kv_unified: bool = True,
         ctx_checkpoints: int = 0,
         kv_on_gpu: bool = True,
+        mtp_engaged: bool = False,
     ) -> int:
         """Return the largest context length that fits in GPU VRAM.
 
@@ -1708,6 +1709,12 @@ def _fit_context_to_vram(
         the KV cache lives in CPU RAM and doesn't compete with weights
         for VRAM; the requested context is honored verbatim. The other
         keyword args mirror ``_estimate_kv_cache_bytes``.
+
+        ``mtp_engaged`` reserves extra VRAM for the MTP draft model's
+        KV cache + compute graph buffers. llama.cpp's MTP path keeps a
+        secondary cache sized off the target's KV; on tight VRAM tiers
+        (e.g. 32 GB) auto-fit at native context would otherwise spill
+        and force llama-server into a slower partial-offload path.
         """
         if not self._can_estimate_kv():
             logger.debug(
@@ -1728,7 +1735,9 @@ def _fit_context_to_vram(
             ctx_checkpoints = ctx_checkpoints,
         )
 
-        budget_bytes = available_mib * 1024 * 1024 * 0.90
+        # MTP needs a tighter budget; drop from 0.90 to 0.85.
+        budget_frac = 0.85 if mtp_engaged else 0.90
+        budget_bytes = available_mib * 1024 * 1024 * budget_frac
         model_footprint = model_size_bytes
 
         # Check if requested context already fits
@@ -2614,6 +2623,35 @@ def load_model(
                     # GPU/VRAM-fit logic below may shrink this if hardware is limited.
                     max_available_ctx = self._context_length or effective_ctx
 
+                    # Will MTP engage on this load? If so, the auto-fit
+                    # budget needs to reserve extra VRAM for the draft
+                    # model's KV cache + compute graph. Mirrors the
+                    # canonical-mode resolver in _build_speculative_flags:
+                    # forced mtp / mtp+ngram always engage; auto only
+                    # engages on an MTP GGUF >= 3B (sub-3B auto falls
+                    # back to ngram-mod which doesn't need headroom);
+                    # ngram / ngram-simple / off never engage MTP.
+                    _mtp_canonical = _canonicalize_spec_mode(speculative_type)
+                    _mtp_effective = _mtp_canonical or "auto"
+                    _mtp_size_for_fit = _extract_model_size_b(model_identifier)
+                    _mtp_sub_3b_for_fit = (
+                        _mtp_size_for_fit is not None and _mtp_size_for_fit < 3.0
+                    )
+                    _mtp_will_engage = bool(
+                        not _extra_args_set_spec_type(extra_args)
+                        and (
+                            _mtp_effective in ("mtp", "mtp+ngram")
+                            or (
+                                _mtp_effective == "auto"
+                                and (
+                                    bool(self._nextn_predict_layers)
+                                    or _is_mtp_model_name(model_identifier, model_path)
+                                )
+                                and not _mtp_sub_3b_for_fit
+                            )
+                        )
+                    )
+
                     # Auto-cap context to fit in GPU VRAM and select GPUs.
                     #
                     # Two policies depending on whether the user set n_ctx:
@@ -2649,6 +2687,7 @@ def load_model(
                                     model_size,
                                     cache_type_kv,
                                     n_parallel = n_parallel,
+                                    mtp_engaged = _mtp_will_engage,
                                 )
                                 kv = self._estimate_kv_cache_bytes(
                                     capped, cache_type_kv, n_parallel = n_parallel
@@ -2700,6 +2739,7 @@ def load_model(
                                     model_size,
                                     cache_type_kv,
                                     n_parallel = n_parallel,
+                                    mtp_engaged = _mtp_will_engage,
                                 )
                                 kv = self._estimate_kv_cache_bytes(
                                     capped, cache_type_kv, n_parallel = n_parallel

@@ -1558,6 +1558,33 @@ def test_fit_reduces_when_kv_on_gpu(self):
         )
         assert fitted < 32_768
 
+    def test_fit_mtp_engaged_returns_smaller_or_equal_context(self):
+        # MTP-engaged budget is 0.85 of available; non-MTP is 0.90.
+        # On a tight budget the MTP path must yield <= the non-MTP path.
+        b = self._gqa_backend()
+        common = dict(
+            requested_ctx = 32_768,
+            available_mib = 128,
+            model_size_bytes = 8 * 1024 * 1024,
+            cache_type_kv = "f16",
+        )
+        baseline = b._fit_context_to_vram(**common)
+        mtp = b._fit_context_to_vram(**common, mtp_engaged = True)
+        assert mtp <= baseline
+
+    def test_fit_mtp_engaged_unchanged_when_kv_off_gpu(self):
+        # kv_on_gpu=False short-circuits the fit; mtp_engaged is irrelevant.
+        b = self._gqa_backend()
+        fitted = b._fit_context_to_vram(
+            requested_ctx = 32_768,
+            available_mib = 1,
+            model_size_bytes = 100,
+            cache_type_kv = "f16",
+            kv_on_gpu = False,
+            mtp_engaged = True,
+        )
+        assert fitted == 32_768
+
     def test_fit_threads_swa_full_through_estimator(self):
         # SWA model, generous budget; both should fit but cache size differs.
         b = self._swa_backend()