Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions studio/backend/core/inference/llama_cpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ def __init__(self):
self._supports_tools: bool = False
self._cache_type_kv: Optional[str] = None
self._reasoning_default: bool = True
self._speculative_type: Optional[str] = None
# KV-cache estimation fields (populated by _read_gguf_metadata)
self._n_layers: Optional[int] = None
self._n_kv_heads: Optional[int] = None
Expand Down Expand Up @@ -198,6 +199,10 @@ def supports_tools(self) -> bool:
def cache_type_kv(self) -> Optional[str]:
return self._cache_type_kv

@property
def speculative_type(self) -> Optional[str]:
# Speculative decoding mode currently recorded on this backend: the value
# load_model applied, or None when it was not enabled (unsupported value,
# vision model, nothing requested) or after unload_model reset it.
return self._speculative_type

# ── Binary discovery ──────────────────────────────────────────

@staticmethod
Expand Down Expand Up @@ -1055,6 +1060,7 @@ def load_model(
n_ctx: int = 4096,
chat_template_override: Optional[str] = None,
cache_type_kv: Optional[str] = None,
speculative_type: Optional[str] = None,
n_threads: Optional[int] = None,
n_gpu_layers: Optional[int] = None, # Accepted for caller compat, unused
) -> bool:
Expand Down Expand Up @@ -1315,6 +1321,46 @@ def load_model(
else:
self._cache_type_kv = None

# Speculative decoding (n-gram self-speculation, zero VRAM cost)
# ngram-mod: ~16 MB shared hash pool, constant memory/complexity,
# variable draft lengths. Helps most when the model repeats
# existing text (code refactoring, summarization, reasoning).
# For general chat with low repetition, overhead is ~5 ms.
#
# Benchmarks from llama.cpp PRs #18471, #19164:
# Scenario | Without | With | Speedup
# gpt-oss-120b code refactor | 181 t/s | 446 t/s | 2.5x
# Qwen3-235B offloaded | 12 t/s | 21 t/s | 1.8x
# gpt-oss-120b repeat (92% accept)| 181 t/s | 814 t/s | 4.5x
#
# Params from llama.cpp docs (docs/speculative.md):
# --spec-ngram-size-n 24 (small n not recommended)
# --draft-min 48 --draft-max 64 (MoEs need long drafts;
# dense models can reduce these)
# ref: https://github.com/ggml-org/llama.cpp/blob/master/docs/speculative.md
# ref: https://github.com/ggml-org/llama.cpp/pull/19164
# ref: https://github.com/ggml-org/llama.cpp/pull/18471
# Only the n-gram modes listed here are ever forwarded to the command line.
_valid_spec_types = {"ngram-simple", "ngram-mod"}
# Any other value (None, empty string, unrecognised name) falls through to
# the final else: no flag is added and no error is raised, so the request
# is silently treated as "speculation off".
if speculative_type and speculative_type in _valid_spec_types:
if not is_vision: # spec decoding disabled for vision models
cmd.extend(["--spec-type", speculative_type])
# ngram-mod additionally gets a fixed n-gram size and draft-length bounds.
if speculative_type == "ngram-mod":
cmd.extend(
[
"--spec-ngram-size-n",
"24",
"--draft-min",
"48",
"--draft-max",
"64",
]
)
# Record the mode that was actually applied; this is the value returned
# by the speculative_type property.
self._speculative_type = speculative_type
else:
self._speculative_type = None
else:
self._speculative_type = None

# Apply custom chat template override if provided
if chat_template_override:
import tempfile
Expand Down Expand Up @@ -1553,6 +1599,7 @@ def unload_model(self) -> bool:
self._reasoning_always_on = False
self._supports_tools = False
self._cache_type_kv = None
self._speculative_type = None
self._n_layers = None
self._n_kv_heads = None
self._n_heads = None
Expand Down
12 changes: 12 additions & 0 deletions studio/backend/models/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ class LoadRequest(BaseModel):
None,
description = "Physical GPU indices to use, for example [0, 1]. Omit or pass [] to use automatic selection. Explicit gpu_ids are unsupported when the parent CUDA_VISIBLE_DEVICES uses UUID/MIG entries. Not supported for GGUF models.",
)
speculative_type: Optional[str] = Field(
None,
description = "Speculative decoding mode for GGUF models (e.g. 'ngram-simple', 'ngram-mod'). Ignored for non-GGUF and vision models.",
)


class UnloadRequest(BaseModel):
Expand Down Expand Up @@ -163,6 +167,10 @@ class LoadResponse(BaseModel):
None,
description = "Jinja2 chat template string (from GGUF metadata or tokenizer)",
)
speculative_type: Optional[str] = Field(
None,
description = "Active speculative decoding mode (e.g. 'ngram-simple', 'ngram-mod'), or None if disabled",
)


class UnloadResponse(BaseModel):
Expand Down Expand Up @@ -225,6 +233,10 @@ class InferenceStatusResponse(BaseModel):
None,
description = "Model's native context length from GGUF metadata (not capped by VRAM)",
)
speculative_type: Optional[str] = Field(
None,
description = "Active speculative decoding mode (e.g. 'ngram-simple', 'ngram-mod'), or None if disabled",
)


# =====================================================================
Expand Down
5 changes: 5 additions & 0 deletions studio/backend/routes/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ async def load_model(
supports_reasoning = llama_backend.supports_reasoning,
reasoning_always_on = llama_backend.reasoning_always_on,
chat_template = llama_backend.chat_template,
speculative_type = llama_backend.speculative_type,
)
else:
if (
Expand Down Expand Up @@ -263,6 +264,7 @@ async def load_model(
n_ctx = request.max_seq_length,
chat_template_override = request.chat_template_override,
cache_type_kv = request.cache_type_kv,
speculative_type = request.speculative_type,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Apply default speculative mode when field is omitted

Forwarding request.speculative_type directly means omitted input is treated as None (because LoadRequest.speculative_type defaults to None), so GGUF loads from callers that still omit this field (e.g. chat-adapter.ts auto-load paths and shared-composer.tsx compare loads) silently start with speculative decoding off. That makes behavior depend on which UI path triggered the load, despite this feature being intended to be on by default for GGUF models.

Useful? React with 👍 / 👎.

)
else:
# Local mode: llama-server loads via -m <path>
Expand All @@ -275,6 +277,7 @@ async def load_model(
n_ctx = request.max_seq_length,
chat_template_override = request.chat_template_override,
cache_type_kv = request.cache_type_kv,
speculative_type = request.speculative_type,
)

if not success:
Expand Down Expand Up @@ -317,6 +320,7 @@ async def load_model(
supports_tools = llama_backend.supports_tools,
cache_type_kv = llama_backend.cache_type_kv,
chat_template = llama_backend.chat_template,
speculative_type = llama_backend.speculative_type,
)

# ── Standard path: load via Unsloth/transformers ──────────
Expand Down Expand Up @@ -652,6 +656,7 @@ async def get_status(
context_length = llama_backend.context_length,
max_context_length = llama_backend.max_context_length,
native_context_length = llama_backend.native_context_length,
speculative_type = llama_backend.speculative_type,
)

# Otherwise, report Unsloth backend status
Expand Down
39 changes: 38 additions & 1 deletion studio/frontend/src/features/chat/chat-settings-sheet.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,15 @@ export function ChatSettingsPanel({
}: ChatSettingsPanelProps) {
const isMobile = useIsMobile();
const isGguf = useChatRuntimeStore((s) => s.activeGgufVariant) != null;
const speculativeType = useChatRuntimeStore((s) => s.speculativeType);
const setSpeculativeType = useChatRuntimeStore((s) => s.setSpeculativeType);
const loadedSpeculativeType = useChatRuntimeStore(
(s) => s.loadedSpeculativeType,
);
const currentModels = useChatRuntimeStore((s) => s.models);
const currentCheckpoint = params.checkpoint;
const currentModelIsVision =
currentModels.find((m) => m.id === currentCheckpoint)?.isVision ?? false;
const ggufContextLength = useChatRuntimeStore((s) => s.ggufContextLength);
const ggufMaxContextLength = useChatRuntimeStore(
(s) => s.ggufMaxContextLength,
Expand All @@ -299,7 +308,8 @@ export function ChatSettingsPanel({
const ctxMaxValue = ggufNativeContextLength ?? ggufContextLength ?? null;
const kvDirty = kvCacheDtype !== loadedKvCacheDtype;
const ctxDirty = customContextLength !== null;
const modelSettingsDirty = kvDirty || ctxDirty;
const specDirty = speculativeType !== loadedSpeculativeType;
const modelSettingsDirty = kvDirty || ctxDirty || specDirty;
const [customPresets, setCustomPresets] = useState<Preset[]>(() =>
loadSavedCustomPresets(),
);
Expand Down Expand Up @@ -580,6 +590,32 @@ export function ChatSettingsPanel({
</SelectContent>
</Select>
</div>
{!currentModelIsVision && (
<div className="flex items-center justify-between gap-3">
<div className="min-w-0">
<div className="text-xs font-medium">
Speculative Decoding
</div>
<div className="text-[11px] text-muted-foreground">
Speed up generation with no VRAM cost.
</div>
</div>
<Select
value={speculativeType ?? "off"}
onValueChange={(v) => {
setSpeculativeType(v === "off" ? null : v);
}}
>
<SelectTrigger className="h-7 w-[120px] text-xs">
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectItem value="ngram-mod">On</SelectItem>
<SelectItem value="off">Off</SelectItem>
</SelectContent>
</Select>
</div>
)}
{modelSettingsDirty && (
<div className="flex flex-wrap gap-1.5 pt-1">
<button
Expand All @@ -594,6 +630,7 @@ export function ChatSettingsPanel({
onClick={() => {
setCustomContextLength(null);
setKvCacheDtype(loadedKvCacheDtype);
setSpeculativeType(loadedSpeculativeType);
}}
className="rounded-md border px-2.5 py-1 text-[11px] font-medium text-muted-foreground transition-colors hover:bg-accent"
>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,13 +250,16 @@ export function useChatModelRuntime() {
const ggufNativeContextLength = statusRes.is_gguf
? (statusRes.native_context_length ?? null)
: null;
const currentSpecType = statusRes.speculative_type ?? null;
useChatRuntimeStore.setState({
supportsReasoning,
reasoningAlwaysOn,
supportsTools,
ggufContextLength: currentGgufContextLength,
ggufMaxContextLength,
ggufNativeContextLength,
speculativeType: currentSpecType,
loadedSpeculativeType: currentSpecType,
});

// Set reasoning default for Qwen3.5 small models
Expand Down Expand Up @@ -393,7 +396,7 @@ export function useChatModelRuntime() {
previousWasUnloaded = true;
}

const { chatTemplateOverride, kvCacheDtype, customContextLength, ggufContextLength } = useChatRuntimeStore.getState();
const { chatTemplateOverride, kvCacheDtype, customContextLength, ggufContextLength, speculativeType } = useChatRuntimeStore.getState();
// GGUF: use custom context length, or 0 = model's native context
// Non-GGUF: use the Max Seq Length slider value
const effectiveMaxSeqLength = customContextLength != null
Expand All @@ -409,6 +412,7 @@ export function useChatModelRuntime() {
trust_remote_code: paramsBeforeLoad.trustRemoteCode ?? false,
chat_template_override: chatTemplateOverride,
cache_type_kv: kvCacheDtype,
speculative_type: speculativeType,
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Propagate speculative setting in rollback load request

The new speculative_type is only attached to the primary loadModel request path, so when a model switch fails and performLoad() falls back to reloading the previous checkpoint, that rollback request still omits this field. In that failure scenario, GGUF rollback loads come back with speculative decoding disabled (null) even if the previous model had it enabled, so a transient load error silently changes runtime behavior after recovery.

Useful? React with 👍 / 👎.

});

// If cancelled while loading, don't update UI to show
Expand All @@ -431,6 +435,7 @@ export function useChatModelRuntime() {
}
}
const loadedKv = loadResponse.cache_type_kv ?? null;
const loadedSpec = loadResponse.speculative_type ?? null;
const nativeCtx = loadResponse.is_gguf
? (loadResponse.context_length ?? 131072)
: null;
Expand All @@ -457,6 +462,8 @@ export function useChatModelRuntime() {
codeToolsEnabled: loadResponse.supports_tools ?? false,
kvCacheDtype: loadedKv,
loadedKvCacheDtype: loadedKv,
speculativeType: loadedSpec,
loadedSpeculativeType: loadedSpec,
Comment on lines +465 to +466
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve speculative default across non-GGUF loads

This assignment unconditionally overwrites the store with loadResponse.speculative_type, but non-GGUF (and vision GGUF) load responses return null/missing for that field. After loading one of those models, speculativeType becomes null, and the next GGUF load reuses that value in the request, so speculative decoding is silently disabled even though the feature is intended to default to On. Restrict this overwrite to GGUF models (or keep the previous/default value for non-GGUF responses).

Useful? React with 👍 / 👎.

customContextLength: keepCustomCtx,
defaultChatTemplate: loadResponse.chat_template ?? null,
chatTemplateOverride: null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ type ChatRuntimeStore = {
toolCallTimeout: number;
kvCacheDtype: string | null;
loadedKvCacheDtype: string | null;
speculativeType: string | null;
loadedSpeculativeType: string | null;
customContextLength: number | null;
defaultChatTemplate: string | null;
chatTemplateOverride: string | null;
Expand Down Expand Up @@ -198,6 +200,7 @@ type ChatRuntimeStore = {
setMaxToolCallsPerMessage: (value: number) => void;
setToolCallTimeout: (value: number) => void;
setKvCacheDtype: (dtype: string | null) => void;
setSpeculativeType: (type: string | null) => void;
setCustomContextLength: (v: number | null) => void;
setChatTemplateOverride: (template: string | null) => void;
setPendingAudio: (base64: string, name: string) => void;
Expand Down Expand Up @@ -230,6 +233,8 @@ export const useChatRuntimeStore = create<ChatRuntimeStore>((set) => ({
toolCallTimeout: loadInt(TOOL_CALL_TIMEOUT_KEY, 5),
kvCacheDtype: null,
loadedKvCacheDtype: null,
speculativeType: "ngram-mod",
loadedSpeculativeType: null,
customContextLength: null,
defaultChatTemplate: null,
chatTemplateOverride: null,
Expand Down Expand Up @@ -302,6 +307,8 @@ export const useChatRuntimeStore = create<ChatRuntimeStore>((set) => ({
toolStatus: null,
kvCacheDtype: null,
loadedKvCacheDtype: null,
speculativeType: "ngram-mod",
loadedSpeculativeType: null,
customContextLength: null,
defaultChatTemplate: null,
chatTemplateOverride: null,
Expand All @@ -327,6 +334,7 @@ export const useChatRuntimeStore = create<ChatRuntimeStore>((set) => ({
return { toolCallTimeout };
}),
setKvCacheDtype: (kvCacheDtype) => set({ kvCacheDtype }),
setSpeculativeType: (speculativeType) => set({ speculativeType }),
setCustomContextLength: (customContextLength) => set({ customContextLength }),
setChatTemplateOverride: (chatTemplateOverride) => set({ chatTemplateOverride }),
setPendingAudio: (base64, name) =>
Expand Down
3 changes: 3 additions & 0 deletions studio/frontend/src/features/chat/types/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ export interface LoadModelRequest {
trust_remote_code?: boolean;
chat_template_override?: string | null;
cache_type_kv?: string | null;
speculative_type?: string | null;
}

export interface ValidateModelResponse {
Expand Down Expand Up @@ -93,6 +94,7 @@ export interface LoadModelResponse {
supports_tools?: boolean;
cache_type_kv?: string | null;
chat_template?: string | null;
speculative_type?: string | null;
}

export interface UnloadModelRequest {
Expand Down Expand Up @@ -123,6 +125,7 @@ export interface InferenceStatusResponse {
context_length?: number | null;
max_context_length?: number | null;
native_context_length?: number | null;
speculative_type?: string | null;
}

export interface AudioGenerationResponse {
Expand Down
Loading