Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
610 changes: 500 additions & 110 deletions studio/backend/core/inference/llama_cpp.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions studio/backend/core/inference/llama_server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ def is_managed_flag(flag: str) -> bool:
# MTP path (llama.cpp #22673).
"--spec-draft-n-max",
"--spec-draft-n-min",
"--spec-draft-p-min",
"--spec-draft-p-split",
"--spec-ngram-mod-n-match",
"--spec-ngram-mod-n-min",
"--spec-ngram-mod-n-max",
Expand Down
51 changes: 48 additions & 3 deletions studio/backend/models/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,28 @@ def normalize_blank_chat_template_override(
)
speculative_type: Optional[str] = Field(
None,
description = "Speculative decoding mode for GGUF models (e.g. 'ngram-simple', 'ngram-mod'). Ignored for non-GGUF and vision models.",
description = (
"Speculative decoding mode for GGUF models. Canonical values: "
"'auto' (platform-aware: MTP on MTP GGUFs, ngram-mod fallback "
"for sub-3B), 'mtp' (force draft-mtp only on both GPU and CPU), "
"'ngram' (force ngram-mod only), 'mtp+ngram' (force "
"ngram-mod+draft-mtp chain on both platforms), 'off' (disabled). "
"Legacy values 'default' (-> auto), 'draft-mtp' (-> mtp), "
"'ngram-mod' (-> ngram), and 'ngram-simple' (kept as-is) are "
"still accepted. Ignored for non-GGUF and vision models."
),
)
spec_draft_n_max: Optional[int] = Field(
None,
ge = 1,
le = 16,
description = (
"Max draft tokens per step for MTP speculative decoding "
"(--spec-draft-n-max). Defaults to 2 on GPU and 3 on CPU/Mac "
"when unset (upstream-bench sweet spot for dense Qwen3.6 MTP "
"quants). Only applied when speculative_type resolves to "
"'mtp' or 'mtp+ngram'."
),
)
llama_extra_args: Optional[List[str]] = Field(
None,
Expand Down Expand Up @@ -218,7 +239,19 @@ class LoadResponse(BaseModel):
)
speculative_type: Optional[str] = Field(
None,
description = "Active speculative decoding mode (e.g. 'ngram-simple', 'ngram-mod'), or None if disabled",
description = (
"Canonical UI-facing requested speculative decoding mode "
"('auto' / 'mtp' / 'ngram' / 'mtp+ngram' / 'off' / "
"'ngram-simple'), round-tripped from the original LoadRequest "
"via _canonicalize_spec_mode. None when no model is loaded."
),
)
spec_draft_n_max: Optional[int] = Field(
None,
description = (
"Active --spec-draft-n-max for MTP speculative decoding, or "
"None when the platform default is in effect."
),
)


Expand Down Expand Up @@ -340,7 +373,19 @@ class InferenceStatusResponse(BaseModel):
)
speculative_type: Optional[str] = Field(
None,
description = "Active speculative decoding mode (e.g. 'ngram-simple', 'ngram-mod'), or None if disabled",
description = (
"Canonical UI-facing requested speculative decoding mode "
"('auto' / 'mtp' / 'ngram' / 'mtp+ngram' / 'off' / "
"'ngram-simple'), round-tripped from the original LoadRequest. "
"None when no model is loaded."
),
)
spec_draft_n_max: Optional[int] = Field(
None,
description = (
"Active --spec-draft-n-max for MTP speculative decoding, or "
"None when the platform default is in effect."
),
)
llama_cpp_supports_mtp: bool = Field(
True,
Expand Down
31 changes: 23 additions & 8 deletions studio/backend/routes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def _friendly_error(exc: Exception) -> str:
LlamaCppBackend,
_DEFAULT_MAX_TOKENS_FLOOR,
_DEFAULT_T_MAX_PREDICT_MS,
_canonicalize_spec_mode,
_hf_offline_if_dns_dead,
detect_reasoning_flags,
)
Expand All @@ -143,6 +144,7 @@ def _friendly_error(exc: Exception) -> str:
LlamaCppBackend,
_DEFAULT_MAX_TOKENS_FLOOR,
_DEFAULT_T_MAX_PREDICT_MS,
_canonicalize_spec_mode,
_hf_offline_if_dns_dead,
detect_reasoning_flags,
)
Expand Down Expand Up @@ -441,12 +443,17 @@ def _request_matches_loaded_settings(
# spec on ``not is_vision``), so treat the request as ``off`` against
# the backend's ``None`` to avoid forcing a redundant reload.
if llama_backend.is_vision:
req_spec = "off"
req_mode = "off"
Comment on lines 445 to +446

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Compare the recorded vision spec mode

For vision GGUFs loaded with the default Auto setting, `llama_cpp.py` no longer suppresses speculative decoding for vision models (it records `requested_spec_mode = "auto"` and explicitly says there is no vision gate), but this route still rewrites every vision request to `"off"`. In that common scenario `_request_matches_loaded_settings` always returns false (`"off" != "auto"`), so re-applying or reloading the same already-loaded vision model needlessly restarts llama-server instead of taking the fast path.

Useful? React with 👍 / 👎.

else:
req_spec = _normalise_settings_str(request.speculative_type) or "off"
backend_spec = _normalise_settings_str(llama_backend.speculative_type) or "off"
if req_spec != backend_spec:
req_mode = _canonicalize_spec_mode(request.speculative_type) or "auto"
backend_mode = llama_backend.requested_spec_mode or "auto"
if req_mode != backend_mode:
return False
# spec_draft_n_max only matters when an MTP variant is engaged; None
# means "platform default" and matches whatever the backend chose.
if backend_mode in ("mtp", "mtp+ngram") and request.spec_draft_n_max is not None:
if int(request.spec_draft_n_max) != (llama_backend.spec_draft_n_max or 0):
return False
Comment on lines +454 to +456

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Reload Auto-promoted MTP when draft tokens change

When the same MTP GGUF is already loaded in `auto`, a change to `spec_draft_n_max` made through the load API is ignored here: `backend_mode` remains `"auto"`, so the draft-token comparison never runs. `_build_speculative_flags()` does apply `spec_draft_n_max` when Auto resolves to MTP, so a request like `speculative_type: "auto", spec_draft_n_max: 5` should restart llama-server instead of reusing the existing default `--spec-draft-n-max`.

Useful? React with 👍 / 👎.

if (request.chat_template_override or None) != (
llama_backend.chat_template_override or None
):
Expand Down Expand Up @@ -584,7 +591,8 @@ async def load_model(
reasoning_always_on = llama_backend.reasoning_always_on,
supports_preserve_thinking = llama_backend.supports_preserve_thinking,
chat_template = llama_backend.chat_template,
speculative_type = llama_backend.speculative_type,
speculative_type = llama_backend.requested_spec_mode,
spec_draft_n_max = llama_backend.spec_draft_n_max,
)
else:
if (
Expand Down Expand Up @@ -724,7 +732,10 @@ async def load_model(
llama_backend.extra_args,
strip_context = "max_seq_length" in fields_set,
strip_cache = "cache_type_kv" in fields_set,
strip_spec = "speculative_type" in fields_set,
strip_spec = (
"speculative_type" in fields_set
or "spec_draft_n_max" in fields_set
),
strip_template = "chat_template_override" in fields_set,
)
try:
Expand Down Expand Up @@ -765,6 +776,7 @@ async def load_model(
chat_template_override = request.chat_template_override,
cache_type_kv = request.cache_type_kv,
speculative_type = request.speculative_type,
spec_draft_n_max = request.spec_draft_n_max,
n_parallel = _n_parallel,
extra_args = extra_llama_args,
)
Expand All @@ -788,6 +800,7 @@ async def load_model(
chat_template_override = request.chat_template_override,
cache_type_kv = request.cache_type_kv,
speculative_type = request.speculative_type,
spec_draft_n_max = request.spec_draft_n_max,
n_parallel = _n_parallel,
extra_args = extra_llama_args,
)
Expand Down Expand Up @@ -846,7 +859,8 @@ async def load_model(
supports_tools = llama_backend.supports_tools,
cache_type_kv = llama_backend.cache_type_kv,
chat_template = llama_backend.chat_template,
speculative_type = llama_backend.speculative_type,
speculative_type = llama_backend.requested_spec_mode,
spec_draft_n_max = llama_backend.spec_draft_n_max,
)

# ── Standard path: load via Unsloth/transformers ──────────
Expand Down Expand Up @@ -1345,7 +1359,8 @@ async def get_status(
native_context_length = llama_backend.native_context_length,
cache_type_kv = llama_backend.cache_type_kv,
chat_template_override = llama_backend.chat_template_override,
speculative_type = llama_backend.speculative_type,
speculative_type = llama_backend.requested_spec_mode,
spec_draft_n_max = llama_backend.spec_draft_n_max,
llama_cpp_supports_mtp = _supports_mtp,
llama_cpp_prebuilt_stale = _stale,
llama_cpp_installed_tag = _installed_tag,
Expand Down
1 change: 1 addition & 0 deletions studio/backend/tests/test_gguf_reload_inheritance.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def _loaded_backend(**overrides):
backend._requested_n_ctx = 8192
backend._cache_type_kv = None
backend._speculative_type = None
backend._requested_spec_mode = "auto"
backend._chat_template_override = None
backend._is_vision = False
backend._extra_args = None
Expand Down
Loading
Loading