From d734294240bff152e8c2c6f02135f9c091a6ec26 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 29 May 2026 21:31:11 +0200 Subject: [PATCH 1/3] server: in SSE mode, send HTTP headers when slot starts --- tools/server/server-context.cpp | 20 +++++++++++++++----- tools/server/server-task.cpp | 3 +++ tools/server/server-task.h | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ae9e0bf60d8..bfe3443c1de 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1734,7 +1734,7 @@ struct server_context_impl { return true; } - void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { + void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress, bool is_begin = false) { auto res = std::make_unique(); res->id = slot.task->id; @@ -1746,6 +1746,9 @@ struct server_context_impl { res->progress.cache = slot.n_prompt_tokens_cache; res->progress.processed = slot.prompt.tokens.size(); res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; + } + if (is_begin) { + res->is_begin = true; } else { res->content = tkn.text_to_send; res->tokens = { tkn.tok }; @@ -2828,10 +2831,15 @@ struct server_context_impl { slot.prompt.tokens.keep_first(n_past); - // send initial 0% progress update if needed // this is to signal the client that the request has started processing - if (slot.task->params.stream && slot.task->params.return_progress) { - send_partial_response(slot, {}, true); + if (slot.task->params.stream) { + if (slot.task->params.return_progress) { + // send initial 0% progress update if needed + send_partial_response(slot, {}, true); + } else { + // otherwise, for streaming without progress, signal HTTP to send the headers (i.e. 200 status) + send_partial_response(slot, {}, false, true); + } } } @@ -3745,7 +3753,9 @@ std::unique_ptr server_routes::handle_completions_impl( // next responses are streamed // to be sent immediately json first_result_json = first_result->to_json(); - if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { + if (first_result_json == nullptr) { + res->data = ""; // simply send HTTP headers and status code + } else if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) { res->data = format_anthropic_sse(first_result_json); } else if (res_type == TASK_RESPONSE_TYPE_OAI_RESP) { res->data = format_oai_resp_sse(first_result_json); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index abc00c82bdb..ff80be6ccba 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -1422,6 +1422,9 @@ void server_task_result_cmpl_partial::update(task_result_state & state) { json server_task_result_cmpl_partial::to_json() { GGML_ASSERT(is_updated && "update() must be called before to_json()"); + if (is_begin) { + return nullptr; // simply signal to HTTP handler to send the headers and status code + } switch (res_type) { case TASK_RESPONSE_TYPE_NONE: return to_json_non_oaicompat(); diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 60e216e7927..61ecf90414c 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -418,6 +418,7 @@ struct server_task_result_cmpl_partial : server_task_result { bool post_sampling_probs; bool is_progress = false; + bool is_begin = false; // whether to send 200 status to HTTP client (begin of SSE stream) completion_token_output prob_output; result_timings timings; result_prompt_progress progress; From b9d69e18e1f16eea36df9c1352505d10b279620c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 29 May 2026 21:33:11 +0200 Subject: [PATCH 2/3] ref to pr --- tools/server/server-task.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 61ecf90414c..7c5868d0dea 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -419,6 +419,7 @@ struct server_task_result_cmpl_partial : server_task_result { bool post_sampling_probs; bool is_progress = false; bool is_begin = false; // whether to send 200 status to HTTP client (begin of SSE stream) + // ref: https://github.com/ggml-org/llama.cpp/pull/23884 completion_token_output prob_output; result_timings timings; result_prompt_progress progress; From d1236c8a1af12c20539d1688375a607902a034c6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 29 May 2026 21:46:29 +0200 Subject: [PATCH 3/3] stream should be false by default --- tools/server/server-task.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 7c5868d0dea..d47dc690cff 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -47,7 +47,7 @@ enum stop_type { }; struct task_params { - bool stream = true; + bool stream = false; bool include_usage = false; bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt bool return_tokens = false;