Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 18 additions & 13 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,8 +471,9 @@ def _is_mtp_model_name(


def _extra_args_set_spec_type(extra_args: Optional[Iterable[str]]) -> bool:
"""User passed --spec-type / --spec-default? llama-server accumulates
repeated --spec-type, so we suppress auto-emit when this is true."""
"""User passed --spec-type / --spec-default? llama-server takes a
single --spec-type (comma-separated to chain), so suppress
auto-emit when this is true."""
if not extra_args:
return False
for raw in extra_args:
Expand Down Expand Up @@ -2631,10 +2632,10 @@ def load_model(
# Qwen3-235B offloaded | 12 t/s | 21 t/s | 1.8x
# gpt-oss-120b repeat (92% accept)| 181 t/s | 814 t/s | 4.5x
#
# Params from llama.cpp docs (docs/speculative.md):
# --spec-ngram-size-n 24 (small n not recommended)
# --draft-min 48 --draft-max 64 (MoEs need long drafts;
# dense models can reduce these)
# Params from llama.cpp server README:
# --spec-ngram-mod-n-match 24 (lookup length)
# --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
# (MoEs need long drafts; dense models can reduce these)
# ref: https://github.com/ggml-org/llama.cpp/blob/master/docs/speculative.md
# ref: https://github.com/ggml-org/llama.cpp/pull/19164
# ref: https://github.com/ggml-org/llama.cpp/pull/18471
Expand Down Expand Up @@ -2692,20 +2693,22 @@ def load_model(
]
)
else:
# CPU/Mac: chain ngram-mod + MTP in one
# comma-separated --spec-type (not repeated).
# ngram-mod knobs match llama.cpp defaults
# (n-match 24, n-min 48, n-max 64).
cmd.extend(
[
"--spec-type",
mtp_token,
f"ngram-mod,{mtp_token}",
"--spec-draft-n-max",
"3",
"--spec-type",
"ngram-mod",
"--spec-ngram-mod-n-match",
"24",
"--spec-ngram-mod-n-min",
"48",
"--spec-ngram-mod-n-max",
"6",
"64",
]
)
self._speculative_type = "draft-mtp"
Expand All @@ -2715,13 +2718,15 @@ def load_model(
elif normalized_spec in _valid_spec_types:
cmd.extend(["--spec-type", normalized_spec])
if normalized_spec == "ngram-mod":
# llama.cpp defaults; legacy --spec-ngram-size-n
# / --draft-{min,max} were removed for ngram-mod.
cmd.extend(
[
"--spec-ngram-size-n",
"--spec-ngram-mod-n-match",
"24",
"--draft-min",
"--spec-ngram-mod-n-min",
"48",
"--draft-max",
"--spec-ngram-mod-n-max",
"64",
]
)
Expand Down
6 changes: 2 additions & 4 deletions studio/backend/tests/test_llama_server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,17 +47,15 @@
["--spec-type", "draft-mtp", "--spec-draft-n-max", "6"],
[
"--spec-type",
"draft-mtp",
"ngram-mod,draft-mtp",
"--spec-draft-n-max",
"3",
"--spec-type",
"ngram-mod",
"--spec-ngram-mod-n-match",
"24",
"--spec-ngram-mod-n-min",
"48",
"--spec-ngram-mod-n-max",
"6",
"64",
],
# Reasoning controls
["--reasoning-format", "deepseek"],
Expand Down
Loading