Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1695,6 +1695,7 @@ def _fit_context_to_vram(
kv_unified: bool = True,
ctx_checkpoints: int = 0,
kv_on_gpu: bool = True,
mtp_engaged: bool = False,
) -> int:
"""Return the largest context length that fits in GPU VRAM.

Expand All @@ -1708,6 +1709,12 @@ def _fit_context_to_vram(
the KV cache lives in CPU RAM and doesn't compete with weights
for VRAM; the requested context is honored verbatim. The other
keyword args mirror ``_estimate_kv_cache_bytes``.

``mtp_engaged`` reserves extra VRAM for the MTP draft model's
KV cache + compute graph buffers. llama.cpp's MTP path keeps a
secondary cache sized off the target's KV; on tight VRAM tiers
(e.g. 32 GB) auto-fit at native context would otherwise spill
and force llama-server into a slower partial-offload path.
"""
if not self._can_estimate_kv():
logger.debug(
Expand All @@ -1728,7 +1735,9 @@ def _fit_context_to_vram(
ctx_checkpoints = ctx_checkpoints,
)

budget_bytes = available_mib * 1024 * 1024 * 0.90
# MTP needs a tighter budget; drop from 0.90 to 0.85.
budget_frac = 0.85 if mtp_engaged else 0.90
budget_bytes = available_mib * 1024 * 1024 * budget_frac
model_footprint = model_size_bytes

# Check if requested context already fits
Expand Down Expand Up @@ -2614,6 +2623,35 @@ def load_model(
# GPU/VRAM-fit logic below may shrink this if hardware is limited.
max_available_ctx = self._context_length or effective_ctx

# Will MTP engage on this load? If so, the auto-fit
# budget needs to reserve extra VRAM for the draft
# model's KV cache + compute graph. Mirrors the
# canonical-mode resolver in _build_speculative_flags:
# forced mtp / mtp+ngram always engage; auto only
# engages on an MTP GGUF >= 3B (sub-3B auto falls
# back to ngram-mod which doesn't need headroom);
# ngram / ngram-simple / off never engage MTP.
_mtp_canonical = _canonicalize_spec_mode(speculative_type)
_mtp_effective = _mtp_canonical or "auto"
_mtp_size_for_fit = _extract_model_size_b(model_identifier)
_mtp_sub_3b_for_fit = (
_mtp_size_for_fit is not None and _mtp_size_for_fit < 3.0
)
_mtp_will_engage = bool(
not _extra_args_set_spec_type(extra_args)
and (
_mtp_effective in ("mtp", "mtp+ngram")
or (
_mtp_effective == "auto"
and (
bool(self._nextn_predict_layers)
or _is_mtp_model_name(model_identifier, model_path)
)
and not _mtp_sub_3b_for_fit
)
)
)

# Auto-cap context to fit in GPU VRAM and select GPUs.
#
# Two policies depending on whether the user set n_ctx:
Expand Down Expand Up @@ -2649,6 +2687,7 @@ def load_model(
model_size,
cache_type_kv,
n_parallel = n_parallel,
mtp_engaged = _mtp_will_engage,
)
kv = self._estimate_kv_cache_bytes(
capped, cache_type_kv, n_parallel = n_parallel
Expand Down Expand Up @@ -2700,6 +2739,7 @@ def load_model(
model_size,
cache_type_kv,
n_parallel = n_parallel,
mtp_engaged = _mtp_will_engage,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Enforce MTP headroom after fitting

When this MTP-aware fit is used in the auto-context path, _fit_context_to_vram can still return requested_ctx or the 4096 minimum even though model_size + kv is above the new 85% budget (for example, when the weights alone exceed the 85% budget, or when the 4096 floor does not fit). The caller then accepts that returned value against _GPU_PIN_VRAM_FRACTION (95%) and sets use_fit=False. As a result, MTP loads whose weights + target KV fall between 85% and 95% of free VRAM are still pinned without --fit and leave no room for the draft cache, so the spill this change is meant to avoid remains possible in that auto-context path.

Useful? React with 👍 / 👎.

)
kv = self._estimate_kv_cache_bytes(
capped, cache_type_kv, n_parallel = n_parallel
Expand Down
27 changes: 27 additions & 0 deletions studio/backend/tests/test_kv_cache_estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,33 @@ def test_fit_reduces_when_kv_on_gpu(self):
)
assert fitted < 32_768

def test_fit_mtp_engaged_returns_smaller_or_equal_context(self):
    """The MTP-engaged fit must never exceed the non-MTP fit.

    With ``mtp_engaged=True`` the fitter budgets 0.85 of the available
    VRAM instead of 0.90, so on a tight budget the resulting context
    has to be less than or equal to the baseline one.
    """
    backend = self._gqa_backend()
    fit_kwargs = {
        "requested_ctx": 32_768,
        "available_mib": 128,
        "model_size_bytes": 8 * 1024 * 1024,
        "cache_type_kv": "f16",
    }
    # Baseline first (default mtp_engaged), then the MTP-aware fit.
    ctx_without_mtp = backend._fit_context_to_vram(**fit_kwargs)
    ctx_with_mtp = backend._fit_context_to_vram(mtp_engaged = True, **fit_kwargs)
    assert ctx_with_mtp <= ctx_without_mtp

def test_fit_mtp_engaged_unchanged_when_kv_off_gpu(self):
    """With the KV cache off the GPU, ``mtp_engaged`` has no effect.

    ``kv_on_gpu=False`` short-circuits the fit, so the requested
    context comes back verbatim even with a 1 MiB budget and MTP on.
    """
    backend = self._gqa_backend()
    requested = 32_768
    result = backend._fit_context_to_vram(
        requested_ctx = requested,
        available_mib = 1,
        model_size_bytes = 100,
        cache_type_kv = "f16",
        kv_on_gpu = False,
        mtp_engaged = True,
    )
    assert result == requested

def test_fit_threads_swa_full_through_estimator(self):
# SWA model, generous budget; both should fit but cache size differs.
b = self._swa_backend()
Expand Down
Loading