-
-
Notifications
You must be signed in to change notification settings - Fork 5.9k
studio: add speculative decoding support (ngram-mod, on by default) #4836
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8c46f53
a77a8ac
67dae36
082987a
66797d1
1e8f4ac
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -250,13 +250,16 @@ export function useChatModelRuntime() { | |
| const ggufNativeContextLength = statusRes.is_gguf | ||
| ? (statusRes.native_context_length ?? null) | ||
| : null; | ||
| const currentSpecType = statusRes.speculative_type ?? null; | ||
| useChatRuntimeStore.setState({ | ||
| supportsReasoning, | ||
| reasoningAlwaysOn, | ||
| supportsTools, | ||
| ggufContextLength: currentGgufContextLength, | ||
| ggufMaxContextLength, | ||
| ggufNativeContextLength, | ||
| speculativeType: currentSpecType, | ||
| loadedSpeculativeType: currentSpecType, | ||
| }); | ||
|
|
||
| // Set reasoning default for Qwen3.5 small models | ||
|
|
@@ -393,7 +396,7 @@ export function useChatModelRuntime() { | |
| previousWasUnloaded = true; | ||
| } | ||
|
|
||
| const { chatTemplateOverride, kvCacheDtype, customContextLength, ggufContextLength } = useChatRuntimeStore.getState(); | ||
| const { chatTemplateOverride, kvCacheDtype, customContextLength, ggufContextLength, speculativeType } = useChatRuntimeStore.getState(); | ||
| // GGUF: use custom context length, or 0 = model's native context | ||
| // Non-GGUF: use the Max Seq Length slider value | ||
| const effectiveMaxSeqLength = customContextLength != null | ||
|
|
@@ -409,6 +412,7 @@ export function useChatModelRuntime() { | |
| trust_remote_code: paramsBeforeLoad.trustRemoteCode ?? false, | ||
| chat_template_override: chatTemplateOverride, | ||
| cache_type_kv: kvCacheDtype, | ||
| speculative_type: speculativeType, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
The new Useful? React with 👍 / 👎. |
||
| }); | ||
|
|
||
| // If cancelled while loading, don't update UI to show | ||
|
|
@@ -431,6 +435,7 @@ export function useChatModelRuntime() { | |
| } | ||
| } | ||
| const loadedKv = loadResponse.cache_type_kv ?? null; | ||
| const loadedSpec = loadResponse.speculative_type ?? null; | ||
| const nativeCtx = loadResponse.is_gguf | ||
| ? (loadResponse.context_length ?? 131072) | ||
| : null; | ||
|
|
@@ -457,6 +462,8 @@ export function useChatModelRuntime() { | |
| codeToolsEnabled: loadResponse.supports_tools ?? false, | ||
| kvCacheDtype: loadedKv, | ||
| loadedKvCacheDtype: loadedKv, | ||
| speculativeType: loadedSpec, | ||
| loadedSpeculativeType: loadedSpec, | ||
|
Comment on lines
+465
to
+466
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This assignment unconditionally overwrites the store with Useful? React with 👍 / 👎. |
||
| customContextLength: keepCustomCtx, | ||
| defaultChatTemplate: loadResponse.chat_template ?? null, | ||
| chatTemplateOverride: null, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Forwarding `request.speculative_type` directly means omitted input is treated as `None` (because `LoadRequest.speculative_type` defaults to `None`), so GGUF loads from callers that still omit this field (e.g. `chat-adapter.ts` auto-load paths and `shared-composer.tsx` compare loads) silently start with speculative decoding off. That makes behavior depend on which UI path triggered the load, despite this feature being intended to be on by default for GGUF models.

Useful? React with 👍 / 👎.