ggml-org · ServeurpersoCom · May 13, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
diff --git a/tools/server/public/bundle.js b/tools/server/public/bundle.js
@@ -1040,6 +1040,10 @@ json oaicompat_chat_params_parse(
     inputs.use_jinja             = opt.use_jinja;
     inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", caps["supports_parallel_tool_calls"]);
     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
+    const bool continue_final_message = json_value(body, "continue_final_message", false);
+    if (continue_final_message && inputs.add_generation_prompt) {
+        throw std::invalid_argument("Cannot set both add_generation_prompt and continue_final_message to true.");
+    }
     inputs.reasoning_format      = opt.reasoning_format;
     if (body.contains("reasoning_format")) {
         inputs.reasoning_format = common_reasoning_format_from_name(body.at("reasoning_format").get<std::string>());
@@ -1071,7 +1075,10 @@ json oaicompat_chat_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
+    // continue_final_message is the explicit opt-in flag from the vLLM/transformers API: it enables the same
+    // prefill as opt.prefill_assistant, and only takes effect when the last message is an assistant message
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"
+        && (continue_final_message || opt.prefill_assistant);
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();

@@ -178,6 +178,45 @@ def test_chat_template_assistant_prefill(prefill, re_prefill):
     assert res.body["__verbose"]["prompt"] == f"<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{re_prefill}"
 
 
+def test_chat_template_continue_final_message_vllm_compat():
+    """continue_final_message is the explicit vLLM/transformers-style request flag for assistant prefill.
+    The rendered prompt must end with the trailing assistant message left open (no <|eot_id|> after it)."""
+    global server
+    server.chat_template = "llama3"
+    server.debug = True
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "add_generation_prompt": False,  # must be False: the server rejects both flags set to true
+        "continue_final_message": True,
+        "messages": [
+            {"role": "system", "content": "Book"},
+            {"role": "user", "content": "What is the best book"},
+            {"role": "assistant", "content": "Whill"},  # partial assistant text expected at the end of the prompt
+        ]
+    })
+    assert res.status_code == 200
+    assert "__verbose" in res.body
+    assert res.body["__verbose"]["prompt"] == "<s> <|start_header_id|>system<|end_header_id|>\n\nBook<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the best book<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nWhill"
+
+
+def test_chat_template_continue_final_message_mutual_exclusion():
+    """A request that sets both add_generation_prompt and continue_final_message to true must get HTTP 400."""
+    global server
+    server.chat_template = "llama3"
+    server.start()
+    res = server.make_request("POST", "/chat/completions", data={
+        "max_tokens": 8,
+        "add_generation_prompt": True,  # conflicts with continue_final_message below
+        "continue_final_message": True,
+        "messages": [
+            {"role": "user", "content": "Hi"},
+            {"role": "assistant", "content": "Hello"},
+        ]
+    })
+    assert res.status_code == 400
+
+
 def test_apply_chat_template():
     global server
     server.chat_template = "command-r"

diff --git a/tools/server/webui/src/lib/services/chat.service.ts b/tools/server/webui/src/lib/services/chat.service.ts
@@ -130,7 +130,8 @@ export class ChatService {
 			timings_per_token,
 			// Config options
 			disableReasoningParsing,
-			excludeReasoningFromContext
+			excludeReasoningFromContext,
+			continueFinalMessage
 		} = options;
 
 		const normalizedMessages: ApiChatMessageData[] = messages
@@ -209,6 +210,11 @@ export class ChatService {
 			? ReasoningFormat.NONE
 			: ReasoningFormat.AUTO;
 
+		if (continueFinalMessage) {
+			requestBody.continue_final_message = true;
+			requestBody.add_generation_prompt = false;
+		}
+
 		if (temperature !== undefined) requestBody.temperature = temperature;
 		if (max_tokens !== undefined) {
 			// Set max_tokens to -1 (infinite) when explicitly configured as 0 or null

diff --git a/tools/server/webui/src/lib/stores/chat.svelte.ts b/tools/server/webui/src/lib/stores/chat.svelte.ts
@@ -1301,6 +1301,7 @@ class ChatStore {
 				contextWithContinue,
 				{
 					...this.getApiOptions(),
+					continueFinalMessage: true,
 					onChunk: (chunk: string) => {
 						appendedContent += chunk;
 						hasReceivedContent = true;

diff --git a/tools/server/webui/src/lib/types/api.d.ts b/tools/server/webui/src/lib/types/api.d.ts
@@ -239,6 +239,9 @@ export interface ApiChatCompletionRequest {
 	// Custom parameters (JSON string)
 	custom?: Record<string, unknown>;
 	timings_per_token?: boolean;
+	// Continuation control (vLLM compat)
+	add_generation_prompt?: boolean;
+	continue_final_message?: boolean;
 }
 
 export interface ApiChatCompletionToolCallFunctionDelta {

diff --git a/tools/server/webui/src/lib/types/settings.d.ts b/tools/server/webui/src/lib/types/settings.d.ts
@@ -92,6 +92,8 @@ export interface SettingsChatServiceOptions {
 	// Custom parameters
 	custom?: string;
 	timings_per_token?: boolean;
+	// Continuation control (vLLM compat): when true, the request sends continue_final_message and disables add_generation_prompt
+	continueFinalMessage?: boolean;
 	// Callbacks
 	onChunk?: (chunk: string) => void;
 	onReasoningChunk?: (chunk: string) => void;