unslothai · danielhanchen · May 19, 2026 · May 18, 2026 · May 18, 2026
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -471,8 +471,9 @@ def _is_mtp_model_name(
 
 
 def _extra_args_set_spec_type(extra_args: Optional[Iterable[str]]) -> bool:
-    """User passed --spec-type / --spec-default? llama-server accumulates
-    repeated --spec-type, so we suppress auto-emit when this is true."""
+    """Return True if the user passed --spec-type / --spec-default.
+    llama-server takes a single --spec-type (comma-separated to
+    chain), so the caller skips its auto-emit when this is true."""
     if not extra_args:
         return False
     for raw in extra_args:
@@ -2631,10 +2632,10 @@ def load_model(
                 #   Qwen3-235B offloaded            |  12 t/s |  21 t/s | 1.8x
                 #   gpt-oss-120b repeat (92% accept)| 181 t/s | 814 t/s | 4.5x
                 #
-                # Params from llama.cpp docs (docs/speculative.md):
-                #   --spec-ngram-size-n 24  (small n not recommended)
-                #   --draft-min 48 --draft-max 64 (MoEs need long drafts;
-                #     dense models can reduce these)
+                # Params from llama.cpp server README:
+                #   --spec-ngram-mod-n-match 24 (lookup length)
+                #   --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
+                #   (MoEs need long drafts; dense models can reduce these)
                 # ref: https://github.com/ggml-org/llama.cpp/blob/master/docs/speculative.md
                 # ref: https://github.com/ggml-org/llama.cpp/pull/19164
                 # ref: https://github.com/ggml-org/llama.cpp/pull/18471
@@ -2692,20 +2693,22 @@ def load_model(
                                     ]
                                 )
                             else:
+                                # CPU/Mac: chain ngram-mod + MTP in one
+                                # comma-separated --spec-type (not repeated).
+                                # ngram-mod knobs match llama.cpp defaults
+                                # (n-match 24, n-min 48, n-max 64).
                                 cmd.extend(
                                     [
                                         "--spec-type",
-                                        mtp_token,
+                                        f"ngram-mod,{mtp_token}",
                                         "--spec-draft-n-max",
                                         "3",
-                                        "--spec-type",
-                                        "ngram-mod",
                                         "--spec-ngram-mod-n-match",
                                         "24",
                                         "--spec-ngram-mod-n-min",
                                         "48",
                                         "--spec-ngram-mod-n-max",
-                                        "6",
+                                        "64",
                                     ]
                                 )
                             self._speculative_type = "draft-mtp"
@@ -2715,13 +2718,15 @@ def load_model(
                     elif normalized_spec in _valid_spec_types:
                         cmd.extend(["--spec-type", normalized_spec])
                         if normalized_spec == "ngram-mod":
+                            # llama.cpp defaults; legacy --spec-ngram-size-n
+                            # / --draft-{min,max} were removed for ngram-mod.
                             cmd.extend(
                                 [
-                                    "--spec-ngram-size-n",
+                                    "--spec-ngram-mod-n-match",
                                     "24",
-                                    "--draft-min",
+                                    "--spec-ngram-mod-n-min",
                                     "48",
-                                    "--draft-max",
+                                    "--spec-ngram-mod-n-max",
                                     "64",
                                 ]
                             )

@@ -47,17 +47,15 @@
         ["--spec-type", "draft-mtp", "--spec-draft-n-max", "6"],
         [
             "--spec-type",
-            "draft-mtp",
+            "ngram-mod,draft-mtp",
             "--spec-draft-n-max",
             "3",
-            "--spec-type",
-            "ngram-mod",
             "--spec-ngram-mod-n-match",
             "24",
             "--spec-ngram-mod-n-min",
             "48",
             "--spec-ngram-mod-n-max",
-            "6",
+            "64",
         ],
         # Reasoning controls
         ["--reasoning-format", "deepseek"],