Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 116 additions & 116 deletions tools/server/public/bundle.js

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,10 @@ json oaicompat_chat_params_parse(
inputs.use_jinja = opt.use_jinja;
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
const bool continue_final_message = json_value(body, "continue_final_message", false);
if (continue_final_message && inputs.add_generation_prompt) {
throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true.");
}
inputs.reasoning_format = opt.reasoning_format;
if (body.contains("reasoning_format")) {
inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
Expand Down Expand Up @@ -1071,7 +1075,10 @@ json oaicompat_chat_params_parse(

// if the assistant message appears at the end of list, we do not add end-of-turn token
// for ex. this can be useful to modify the reasoning process in reasoning models
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
// continue_final_message is the explicit opt-in alias from the vLLM/transformers API;
// it is equivalent to the prefill_assistant heuristic
bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"
&& (continue_final_message || opt.prefill_assistant);
common_chat_msg last_message;
if (prefill_assistant_message) {
last_message = inputs.messages.back();
Expand Down
39 changes: 39 additions & 0 deletions tools/server/tests/unit/test_chat_completion.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,45 @@ def test_chat_template_assistant_prefill(prefill, re_prefill):
assert res.body["__verbose"]["prompt"] == f"<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}"


def test_chat_template_continue_final_message_vllm_compat():
    """Verify that ``continue_final_message`` continues the trailing assistant turn.

    The request sets ``add_generation_prompt`` to false and
    ``continue_final_message`` to true. The rendered llama3 prompt must then
    end with the assistant's partial content ("Whill") with no end-of-turn
    token appended after it.
    """
    global server
    server.chat_template = "llama3"
    # presumably debug mode is what makes the response carry "__verbose" -- confirm
    server.debug = True
    server.start()

    # NOTE(review): if the server-side prefill_assistant heuristic is enabled by
    # default, this exact prompt may be produced even without the explicit flag;
    # confirm that this test really exercises the continue_final_message path.
    conversation = [
        {"role": "system", "content": "Book"},
        {"role": "user", "content": "What is the best book"},
        {"role": "assistant", "content": "Whill"},
    ]
    payload = {
        "max_tokens": 8,
        "add_generation_prompt": False,
        "continue_final_message": True,
        "messages": conversation,
    }
    expected_prompt = "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill"

    res = server.make_request("POST", "/chat/completions", data=payload)

    assert res.status_code == 200
    assert "__verbose" in res.body
    assert res.body["__verbose"]["prompt"] == expected_prompt


def test_chat_template_continue_final_message_mutual_exclusion():
    """Reject requests that enable both generation-prompt flags at once.

    Sending ``add_generation_prompt`` and ``continue_final_message`` both set
    to true must yield an HTTP 400 response.
    """
    global server
    server.chat_template = "llama3"
    server.start()

    # Both flags are deliberately true: this is the conflicting combination
    # the server is expected to refuse.
    conflicting_payload = {
        "max_tokens": 8,
        "add_generation_prompt": True,
        "continue_final_message": True,
        "messages": [
            {"role": "user", "content": "Hi"},
            {"role": "assistant", "content": "Hello"},
        ],
    }

    res = server.make_request("POST", "/chat/completions", data=conflicting_payload)

    assert res.status_code == 400


def test_apply_chat_template():
global server
server.chat_template = "command-r"
Expand Down
8 changes: 7 additions & 1 deletion tools/server/webui/src/lib/services/chat.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ export class ChatService {
timings_per_token,
// Config options
disableReasoningParsing,
excludeReasoningFromContext
excludeReasoningFromContext,
continueFinalMessage
} = options;

const normalizedMessages: ApiChatMessageData[] = messages
Expand Down Expand Up @@ -209,6 +210,11 @@ export class ChatService {
? ReasoningFormat.NONE
: ReasoningFormat.AUTO;

if (continueFinalMessage) {
requestBody.continue_final_message = true;
requestBody.add_generation_prompt = false;
}

if (temperature !== undefined) requestBody.temperature = temperature;
if (max_tokens !== undefined) {
// Set max_tokens to -1 (infinite) when explicitly configured as 0 or null
Expand Down
1 change: 1 addition & 0 deletions tools/server/webui/src/lib/stores/chat.svelte.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1301,6 +1301,7 @@ class ChatStore {
contextWithContinue,
{
...this.getApiOptions(),
continueFinalMessage: true,
onChunk: (chunk: string) => {
appendedContent += chunk;
hasReceivedContent = true;
Expand Down
3 changes: 3 additions & 0 deletions tools/server/webui/src/lib/types/api.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest {
// Custom parameters (JSON string)
custom?: Record<string, unknown>;
timings_per_token?: boolean;
// Continuation control (vLLM compat)
add_generation_prompt?: boolean;
continue_final_message?: boolean;
}

export interface ApiChatCompletionToolCallFunctionDelta {
Expand Down
2 changes: 2 additions & 0 deletions tools/server/webui/src/lib/types/settings.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ export interface SettingsChatServiceOptions {
// Custom parameters
custom?: string;
timings_per_token?: boolean;
// Continuation control (vLLM compat), opt in to the explicit continue final message flag
continueFinalMessage?: boolean;
// Callbacks
onChunk?: (chunk: string) => void;
onReasoningChunk?: (chunk: string) => void;
Expand Down
Loading