unslothai · danielhanchen · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026 · Apr 3, 2026
diff --git a/studio/backend/core/inference/llama_cpp.py b/studio/backend/core/inference/llama_cpp.py
@@ -109,6 +109,7 @@ def __init__(self):
         self._supports_tools: bool = False
         self._cache_type_kv: Optional[str] = None
         self._reasoning_default: bool = True
+        self._speculative_type: Optional[str] = None
         # KV-cache estimation fields (populated by _read_gguf_metadata)
         self._n_layers: Optional[int] = None
         self._n_kv_heads: Optional[int] = None
@@ -198,6 +199,10 @@ def supports_tools(self) -> bool:
     def cache_type_kv(self) -> Optional[str]:
         return self._cache_type_kv
 
+    @property
+    def speculative_type(self) -> Optional[str]:
+        return self._speculative_type  # active --spec-type mode; None when off
+
     # ── Binary discovery ──────────────────────────────────────────
 
     @staticmethod
@@ -1055,6 +1060,7 @@ def load_model(
         n_ctx: int = 4096,
         chat_template_override: Optional[str] = None,
         cache_type_kv: Optional[str] = None,
+        speculative_type: Optional[str] = None,
         n_threads: Optional[int] = None,
         n_gpu_layers: Optional[int] = None,  # Accepted for caller compat, unused
     ) -> bool:
@@ -1315,6 +1321,46 @@ def load_model(
             else:
                 self._cache_type_kv = None
 
+            # Speculative decoding (n-gram self-speculation, zero VRAM cost)
+            # ngram-mod: ~16 MB shared hash pool, constant memory/complexity,
+            # variable draft lengths.  Helps most when the model repeats
+            # existing text (code refactoring, summarization, reasoning).
+            # For general chat with low repetition, overhead is ~5 ms.
+            #
+            # Benchmarks from llama.cpp PRs #18471, #19164:
+            #   Scenario                        | Without | With    | Speedup
+            #   gpt-oss-120b code refactor      | 181 t/s | 446 t/s | 2.5x
+            #   Qwen3-235B offloaded            |  12 t/s |  21 t/s | 1.8x
+            #   gpt-oss-120b repeat (92% accept)| 181 t/s | 814 t/s | 4.5x
+            #
+            # Params from llama.cpp docs (docs/speculative.md):
+            #   --spec-ngram-size-n 24  (small n not recommended)
+            #   --draft-min 48 --draft-max 64 (MoEs need long drafts;
+            #     dense models can reduce these)
+            # ref: https://github.com/ggml-org/llama.cpp/blob/master/docs/speculative.md
+            # ref: https://github.com/ggml-org/llama.cpp/pull/19164
+            # ref: https://github.com/ggml-org/llama.cpp/pull/18471
+            # Start from "disabled": an unset or unrecognised mode, or a
+            # vision model, must leave the backend reporting no speculation.
+            self._speculative_type = None
+            _valid_spec_types = {"ngram-simple", "ngram-mod"}
+            spec_enabled = speculative_type in _valid_spec_types and not is_vision
+            if spec_enabled:
+                spec_args = ["--spec-type", speculative_type]
+                if speculative_type == "ngram-mod":
+                    # Tuning flags apply to ngram-mod only (values per the
+                    # llama.cpp speculative docs referenced above).
+                    spec_args += [
+                        "--spec-ngram-size-n",
+                        "24",
+                        "--draft-min",
+                        "48",
+                        "--draft-max",
+                        "64",
+                    ]
+                cmd.extend(spec_args)
+                self._speculative_type = speculative_type
+
             # Apply custom chat template override if provided
             if chat_template_override:
                 import tempfile
@@ -1553,6 +1599,7 @@ def unload_model(self) -> bool:
             self._reasoning_always_on = False
             self._supports_tools = False
             self._cache_type_kv = None
+            self._speculative_type = None
             self._n_layers = None
             self._n_kv_heads = None
             self._n_heads = None

diff --git a/studio/backend/models/inference.py b/studio/backend/models/inference.py
@@ -48,6 +48,10 @@ class LoadRequest(BaseModel):
         None,
         description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries. Not supported for GGUF models.",
     )
+    speculative_type: Optional[str] = Field(
+        None,
+        description = "Speculative decoding mode for GGUF models (e.g. 'ngram-simple', 'ngram-mod'). Ignored for non-GGUF and vision models.",
+    )
 
 
 class UnloadRequest(BaseModel):
@@ -163,6 +167,10 @@ class LoadResponse(BaseModel):
         None,
         description = "Jinja2 chat template string (from GGUF metadata or tokenizer)",
     )
+    speculative_type: Optional[str] = Field(
+        None,
+        description = "Active speculative decoding mode (e.g. 'ngram-simple', 'ngram-mod'), or None if disabled",
+    )
 
 
 class UnloadResponse(BaseModel):
@@ -225,6 +233,10 @@ class InferenceStatusResponse(BaseModel):
         None,
         description = "Model's native context length from GGUF metadata (not capped by VRAM)",
     )
+    speculative_type: Optional[str] = Field(
+        None,
+        description = "Active speculative decoding mode (e.g. 'ngram-simple', 'ngram-mod'), or None if disabled",
+    )
 
 
 # =====================================================================

diff --git a/studio/backend/routes/inference.py b/studio/backend/routes/inference.py
@@ -179,6 +179,7 @@ async def load_model(
                     supports_reasoning = llama_backend.supports_reasoning,
                     reasoning_always_on = llama_backend.reasoning_always_on,
                     chat_template = llama_backend.chat_template,
+                    speculative_type = llama_backend.speculative_type,
                 )
         else:
             if (
@@ -263,6 +264,7 @@ async def load_model(
                     n_ctx = request.max_seq_length,
                     chat_template_override = request.chat_template_override,
                     cache_type_kv = request.cache_type_kv,
+                    speculative_type = request.speculative_type,
                 )
             else:
                 # Local mode: llama-server loads via -m <path>
@@ -275,6 +277,7 @@ async def load_model(
                     n_ctx = request.max_seq_length,
                     chat_template_override = request.chat_template_override,
                     cache_type_kv = request.cache_type_kv,
+                    speculative_type = request.speculative_type,
                 )
 
             if not success:
@@ -317,6 +320,7 @@ async def load_model(
                 supports_tools = llama_backend.supports_tools,
                 cache_type_kv = llama_backend.cache_type_kv,
                 chat_template = llama_backend.chat_template,
+                speculative_type = llama_backend.speculative_type,
             )
 
         # ── Standard path: load via Unsloth/transformers ──────────
@@ -652,6 +656,7 @@ async def get_status(
                 context_length = llama_backend.context_length,
                 max_context_length = llama_backend.max_context_length,
                 native_context_length = llama_backend.native_context_length,
+                speculative_type = llama_backend.speculative_type,
             )
 
         # Otherwise, report Unsloth backend status

diff --git a/studio/frontend/src/features/chat/chat-settings-sheet.tsx b/studio/frontend/src/features/chat/chat-settings-sheet.tsx
@@ -280,6 +280,15 @@ export function ChatSettingsPanel({
 }: ChatSettingsPanelProps) {
   const isMobile = useIsMobile();
   const isGguf = useChatRuntimeStore((s) => s.activeGgufVariant) != null;
+  const speculativeType = useChatRuntimeStore((s) => s.speculativeType);
+  const setSpeculativeType = useChatRuntimeStore((s) => s.setSpeculativeType);
+  const loadedSpeculativeType = useChatRuntimeStore(
+    (s) => s.loadedSpeculativeType,
+  );
+  const currentModels = useChatRuntimeStore((s) => s.models);
+  const currentCheckpoint = params.checkpoint;
+  const currentModelIsVision =
+    currentModels.find((m) => m.id === currentCheckpoint)?.isVision ?? false;
   const ggufContextLength = useChatRuntimeStore((s) => s.ggufContextLength);
   const ggufMaxContextLength = useChatRuntimeStore(
     (s) => s.ggufMaxContextLength,
@@ -299,7 +308,8 @@ export function ChatSettingsPanel({
   const ctxMaxValue = ggufNativeContextLength ?? ggufContextLength ?? null;
   const kvDirty = kvCacheDtype !== loadedKvCacheDtype;
   const ctxDirty = customContextLength !== null;
-  const modelSettingsDirty = kvDirty || ctxDirty;
+  const specDirty = speculativeType !== loadedSpeculativeType;
+  const modelSettingsDirty = kvDirty || ctxDirty || specDirty;
   const [customPresets, setCustomPresets] = useState<Preset[]>(() =>
     loadSavedCustomPresets(),
   );
@@ -580,6 +590,32 @@ export function ChatSettingsPanel({
                     </SelectContent>
                   </Select>
                 </div>
+                {!currentModelIsVision && (
+                  <div className="flex items-center justify-between gap-3">
+                    <div className="min-w-0">
+                      <div className="text-xs font-medium">
+                        Speculative Decoding
+                      </div>
+                      <div className="text-[11px] text-muted-foreground">
+                        Speed up generation with no VRAM cost.
+                      </div>
+                    </div>
+                    <Select
+                      value={speculativeType ?? "off"}
+                      onValueChange={(v) => {
+                        setSpeculativeType(v === "off" ? null : v);
+                      }}
+                    >
+                      <SelectTrigger className="h-7 w-[120px] text-xs">
+                        <SelectValue />
+                      </SelectTrigger>
+                      <SelectContent>
+                        <SelectItem value="ngram-mod">On</SelectItem>
+                        <SelectItem value="off">Off</SelectItem>
+                      </SelectContent>
+                    </Select>
+                  </div>
+                )}
                 {modelSettingsDirty && (
                   <div className="flex flex-wrap gap-1.5 pt-1">
                     <button
@@ -594,6 +630,7 @@ export function ChatSettingsPanel({
                       onClick={() => {
                         setCustomContextLength(null);
                         setKvCacheDtype(loadedKvCacheDtype);
+                        setSpeculativeType(loadedSpeculativeType);
                       }}
                       className="rounded-md border px-2.5 py-1 text-[11px] font-medium text-muted-foreground transition-colors hover:bg-accent"
                     >

diff --git a/studio/frontend/src/features/chat/hooks/use-chat-model-runtime.ts b/studio/frontend/src/features/chat/hooks/use-chat-model-runtime.ts
@@ -250,13 +250,16 @@ export function useChatModelRuntime() {
         const ggufNativeContextLength = statusRes.is_gguf
           ? (statusRes.native_context_length ?? null)
           : null;
+        const currentSpecType = statusRes.speculative_type ?? null;
         useChatRuntimeStore.setState({
           supportsReasoning,
           reasoningAlwaysOn,
           supportsTools,
           ggufContextLength: currentGgufContextLength,
           ggufMaxContextLength,
           ggufNativeContextLength,
+          speculativeType: currentSpecType,
+          loadedSpeculativeType: currentSpecType,
         });
 
         // Set reasoning default for Qwen3.5 small models
@@ -393,7 +396,7 @@ export function useChatModelRuntime() {
               previousWasUnloaded = true;
             }
 
-            const { chatTemplateOverride, kvCacheDtype, customContextLength, ggufContextLength } = useChatRuntimeStore.getState();
+            const { chatTemplateOverride, kvCacheDtype, customContextLength, ggufContextLength, speculativeType } = useChatRuntimeStore.getState();
             // GGUF: use custom context length, or 0 = model's native context
             // Non-GGUF: use the Max Seq Length slider value
             const effectiveMaxSeqLength = customContextLength != null
@@ -409,6 +412,7 @@ export function useChatModelRuntime() {
               trust_remote_code: paramsBeforeLoad.trustRemoteCode ?? false,
               chat_template_override: chatTemplateOverride,
               cache_type_kv: kvCacheDtype,
+              speculative_type: speculativeType,
             });
 
             // If cancelled while loading, don't update UI to show
@@ -431,6 +435,7 @@ export function useChatModelRuntime() {
               }
             }
             const loadedKv = loadResponse.cache_type_kv ?? null;
+            const loadedSpec = loadResponse.speculative_type ?? null;
             const nativeCtx = loadResponse.is_gguf
               ? (loadResponse.context_length ?? 131072)
               : null;
@@ -457,6 +462,8 @@ export function useChatModelRuntime() {
               codeToolsEnabled: loadResponse.supports_tools ?? false,
               kvCacheDtype: loadedKv,
               loadedKvCacheDtype: loadedKv,
+              speculativeType: loadedSpec,
+              loadedSpeculativeType: loadedSpec,
               customContextLength: keepCustomCtx,
               defaultChatTemplate: loadResponse.chat_template ?? null,
               chatTemplateOverride: null,

diff --git a/studio/frontend/src/features/chat/stores/chat-runtime-store.ts b/studio/frontend/src/features/chat/stores/chat-runtime-store.ts
@@ -165,6 +165,8 @@ type ChatRuntimeStore = {
   toolCallTimeout: number;
   kvCacheDtype: string | null;
   loadedKvCacheDtype: string | null;
+  speculativeType: string | null;
+  loadedSpeculativeType: string | null;
   customContextLength: number | null;
   defaultChatTemplate: string | null;
   chatTemplateOverride: string | null;
@@ -198,6 +200,7 @@ type ChatRuntimeStore = {
   setMaxToolCallsPerMessage: (value: number) => void;
   setToolCallTimeout: (value: number) => void;
   setKvCacheDtype: (dtype: string | null) => void;
+  setSpeculativeType: (type: string | null) => void;
   setCustomContextLength: (v: number | null) => void;
   setChatTemplateOverride: (template: string | null) => void;
   setPendingAudio: (base64: string, name: string) => void;
@@ -230,6 +233,8 @@ export const useChatRuntimeStore = create<ChatRuntimeStore>((set) => ({
   toolCallTimeout: loadInt(TOOL_CALL_TIMEOUT_KEY, 5),
   kvCacheDtype: null,
   loadedKvCacheDtype: null,
+  speculativeType: "ngram-mod",
+  loadedSpeculativeType: null,
   customContextLength: null,
   defaultChatTemplate: null,
   chatTemplateOverride: null,
@@ -302,6 +307,8 @@ export const useChatRuntimeStore = create<ChatRuntimeStore>((set) => ({
       toolStatus: null,
       kvCacheDtype: null,
       loadedKvCacheDtype: null,
+      speculativeType: "ngram-mod",
+      loadedSpeculativeType: null,
       customContextLength: null,
       defaultChatTemplate: null,
       chatTemplateOverride: null,
@@ -327,6 +334,7 @@ export const useChatRuntimeStore = create<ChatRuntimeStore>((set) => ({
       return { toolCallTimeout };
     }),
   setKvCacheDtype: (kvCacheDtype) => set({ kvCacheDtype }),
+  setSpeculativeType: (speculativeType) => set({ speculativeType }),
   setCustomContextLength: (customContextLength) => set({ customContextLength }),
   setChatTemplateOverride: (chatTemplateOverride) => set({ chatTemplateOverride }),
   setPendingAudio: (base64, name) =>

diff --git a/studio/frontend/src/features/chat/types/api.ts b/studio/frontend/src/features/chat/types/api.ts
@@ -41,6 +41,7 @@ export interface LoadModelRequest {
   trust_remote_code?: boolean;
   chat_template_override?: string | null;
   cache_type_kv?: string | null;
+  speculative_type?: string | null;
 }
 
 export interface ValidateModelResponse {
@@ -93,6 +94,7 @@ export interface LoadModelResponse {
   supports_tools?: boolean;
   cache_type_kv?: string | null;
   chat_template?: string | null;
+  speculative_type?: string | null;
 }
 
 export interface UnloadModelRequest {
@@ -123,6 +125,7 @@ export interface InferenceStatusResponse {
   context_length?: number | null;
   max_context_length?: number | null;
   native_context_length?: number | null;
+  speculative_type?: string | null;
 }
 
 export interface AudioGenerationResponse {