From 29f1d50e1beb6afaf4165c9d6c8b11677c1f5bca Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Sep 2025 11:24:10 +0700 Subject: [PATCH 1/7] server : implement `return_progress` --- tools/server/README.md | 2 + tools/server/server.cpp | 83 ++++++++++++++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index b0527f3cbea28..a36325ea3648e 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -512,6 +512,8 @@ These words will not be included in the completion, so make sure to add them to `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`return_progress`: Include prompt processing progress in `stream` mode. The progress will be contained inside `prompt_progress` with 3 values: `total`, `cache` and `processed`. The overall progress is `processed/total`, while the actual timed progress is `(processed-cache)/(total-cache)`. Default: `false` + `post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. `response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. Note that fields with a slash will be unnested; for example, `generation_settings/n_predict` will move the field `n_predict` from the `generation_settings` object to the root of the response and give it a new name. diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 73fc43bada543..ba165dc9efcb9 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -110,9 +110,10 @@ static bool server_task_type_need_logits(server_task_type task_type) { } struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - bool return_tokens = false; + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool return_tokens = false; + bool return_progress = false; int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half @@ -307,11 +308,11 @@ struct server_task { // enabling this will output extra debug information in the HTTP responses from the server params.verbose = params_base.verbosity > 9; - params.timings_per_token = json_value(data, "timings_per_token", false); params.stream = json_value(data, "stream", false); params.cache_prompt = json_value(data, "cache_prompt", true); params.return_tokens = json_value(data, "return_tokens", false); + params.return_progress = json_value(data, "return_progress", false); params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); params.n_indent = json_value(data, "n_indent", defaults.n_indent); params.n_keep = json_value(data, "n_keep", defaults.n_keep); @@ -644,6 +645,20 @@ struct result_timings { } }; +struct result_prompt_progress { + int32_t total = 0; + int32_t cache = 0; + int32_t processed = 0; + + json to_json() const { + return json { + {"total", total}, + {"cache", cache}, + {"processed", processed}, + }; + } +}; + struct server_task_result { int id = -1; int id_slot = -1; @@ -999,8 +1014,10 @@ struct server_task_result_cmpl_partial : server_task_result { int32_t n_prompt_tokens; bool 
post_sampling_probs; + bool is_progress = false; completion_token_output prob_output; result_timings timings; + result_prompt_progress progress; // OAI-compat fields bool verbose = false; @@ -1045,6 +1062,9 @@ struct server_task_result_cmpl_partial : server_task_result { if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } + if (is_progress) { + res.push_back({"prompt_progress", progress.to_json()}); + } if (!prob_output.probs.empty()) { res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); } @@ -1082,6 +1102,9 @@ struct server_task_result_cmpl_partial : server_task_result { if (timings.prompt_n >= 0) { res.push_back({"timings", timings.to_json()}); } + if (is_progress) { + res.push_back({"prompt_progress", progress.to_json()}); + } return res; } @@ -1109,7 +1132,7 @@ struct server_task_result_cmpl_partial : server_task_result { }); }; // We have to send an initial update to conform to openai behavior - if (first) { + if (first || is_progress) { add_delta({ {"role", "assistant"}, {"content", nullptr}, @@ -1121,16 +1144,20 @@ struct server_task_result_cmpl_partial : server_task_result { } if (!deltas.empty()) { - GGML_ASSERT(deltas[deltas.size() - 1].at("choices").size() >= 1); + auto & last_json = deltas[deltas.size() - 1]; + GGML_ASSERT(last_json.at("choices").size() >= 1); if (prob_output.probs.size() > 0) { - deltas[deltas.size() - 1].at("choices").at(0)["logprobs"] = json { + last_json.at("choices").at(0)["logprobs"] = json { {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, }; } if (timings.prompt_n >= 0) { - deltas[deltas.size() - 1].push_back({"timings", timings.to_json()}); + last_json.push_back({"timings", timings.to_json()}); + } + if (is_progress) { + last_json.push_back({"prompt_progress", progress.to_json()}); } } @@ -1404,6 +1431,7 @@ struct server_slot { // n_prompt_tokens may not be equal to prompt_tokens.size(), because prompt maybe truncated int32_t n_prompt_tokens = 0; + int32_t n_prompt_tokens_cache = 0; int32_t n_prompt_tokens_processed = 0; // input prompt tokens @@ -1456,7 +1484,9 @@ struct server_slot { void reset() { SLT_DBG(*this, "%s", "\n"); - n_prompt_tokens = 0; + n_prompt_tokens = 0; + n_prompt_tokens_cache = 0; + last_nl_pos = 0; generated_text = ""; has_new_line = false; @@ -2520,7 +2550,7 @@ struct server_context { slot.add_token(result); if (slot.params.stream) { - send_partial_response(slot, result); + send_partial_response(slot, result, false); } } @@ -2712,13 +2742,23 @@ struct server_context { return true; } - void send_partial_response(server_slot & slot, const completion_token_output & tkn) { + void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) { auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; - res->content = tkn.text_to_send; - res->tokens = { tkn.tok }; + res->id = slot.id_task; + res->index = slot.index; + + if (is_progress) { + res->is_progress = true; + res->progress.total = slot.n_prompt_tokens; + res->progress.cache = slot.n_prompt_tokens_cache; + res->progress.processed = slot.cache_tokens.size(); + } else { + res->content = tkn.text_to_send; + res->tokens = { tkn.tok }; + + slot.update_chat_msg(res->oaicompat_msg_diffs); + } res->n_decoded = slot.n_decoded; res->n_prompt_tokens = slot.n_prompt_tokens; @@ -2729,8 +2769,6 @@ struct server_context { res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = 
slot.params.oaicompat_cmpl_id; - slot.update_chat_msg(res->oaicompat_msg_diffs); - // populate res.probs_output if (slot.params.sampling.n_probs > 0) { res->prob_output = tkn; // copy the token probs @@ -3557,6 +3595,7 @@ struct server_context { slot.n_past--; } + slot.n_prompt_tokens_cache = slot.n_past; slot.n_prompt_tokens_processed = 0; } @@ -3573,7 +3612,8 @@ struct server_context { llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); // there is no common part left - slot.n_past = 0; + slot.n_past = 0; + slot.n_prompt_tokens_cache = 0; } SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past); @@ -3767,6 +3807,13 @@ struct server_context { n_batch = llama_n_batch(ctx); for (auto & slot : slots) { + // optionally send prompt processing progress + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.params.stream && slot.params.return_progress) { + send_partial_response(slot, {}, true); + } + } + if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { continue; // continue loop of slots } From 4404ad86e5ea347d324df0070a542a7fb052baf9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Sep 2025 13:49:53 +0700 Subject: [PATCH 2/7] add timings.cache_n --- tools/server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index ba165dc9efcb9..a24443a65e03a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -609,6 +609,8 @@ struct server_task { }; struct result_timings { + int32_t cache_n = -1; + int32_t prompt_n = -1; double prompt_ms; double prompt_per_token_ms; @@ -625,6 +627,8 @@ struct result_timings { json to_json() const { json base = { + {"cache_n", cache_n}, + {"prompt_n", prompt_n}, {"prompt_ms", prompt_ms}, {"prompt_per_token_ms", prompt_per_token_ms}, @@ -1577,6 +1581,8 @@ struct server_slot { result_timings get_timings() const { result_timings timings; + timings.cache_n = n_prompt_tokens_cache; + timings.prompt_n = n_prompt_tokens_processed; timings.prompt_ms = t_prompt_processing; timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; From e166a550eafdbc229013c6cbd844004066fa457d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Sep 2025 13:55:01 +0700 Subject: [PATCH 3/7] add progress.time_ms --- tools/server/server.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a24443a65e03a..d345bfb646b24 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -653,12 +653,14 @@ struct result_prompt_progress { int32_t total = 0; int32_t cache = 0; int32_t processed = 0; + int64_t time_ms = 0; json to_json() const { return json { {"total", total}, {"cache", cache}, {"processed", processed}, + {"time_ms", time_ms}, }; } }; @@ -2759,6 +2761,7 @@ struct server_context { res->progress.total = slot.n_prompt_tokens; res->progress.cache = slot.n_prompt_tokens_cache; res->progress.processed = slot.cache_tokens.size(); + res->progress.time_ms = ggml_time_ms() - (slot.t_start_process_prompt / 1000); } else { res->content = tkn.text_to_send; res->tokens = { tkn.tok }; From f4213ccda8730b97f491d3e7f8988dd5467b43e6 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Sep 2025 14:09:49 +0700 Subject: [PATCH 4/7] add test --- .../server/tests/unit/test_chat_completion.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/server/tests/unit/test_chat_completion.py 
b/tools/server/tests/unit/test_chat_completion.py index 22dbfdd9b4a89..49ca709c84efe 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -402,3 +402,49 @@ def test_context_size_exceeded(): assert server.n_ctx is not None assert server.n_slots is not None assert res.body["error"]["n_ctx"] == server.n_ctx // server.n_slots + + +@pytest.mark.parametrize( + "n_batch,batch_count,reuse_cache", + [ + (64, 4, False), + (64, 1, True), + ] +) +def test_return_progress(n_batch, batch_count, reuse_cache): + global server + server.n_batch = n_batch + server.start() + def make_cmpl_request(): + return server.make_stream_request("POST", "/chat/completions", data={ + "max_tokens": 10, + "messages": [ + {"role": "user", "content": "This is a test" * 100}, + ], + "stream": True, + "return_progress": True, + }) + if reuse_cache: + # make a first request to populate the cache + res0 = make_cmpl_request() + for _ in res0: + pass # discard the output + + res = make_cmpl_request() + last_progress = None + total_batch_count = 0 + for data in res: + cur_progress = data.get("prompt_progress", None) + if cur_progress is None: + continue + if last_progress is not None: + assert cur_progress["total"] == last_progress["total"] + assert cur_progress["cache"] == last_progress["cache"] + assert cur_progress["processed"] > last_progress["processed"] + total_batch_count += 1 + last_progress = cur_progress + + assert last_progress is not None + assert last_progress["total"] > 0 + assert last_progress["processed"] == last_progress["total"] + assert total_batch_count == batch_count From ebcef9104f4d586b2c3e5124b2551f4b120b6e34 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Sep 2025 17:23:20 +0700 Subject: [PATCH 5/7] fix test for chat/completions --- tools/server/tests/unit/test_chat_completion.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index 49ca709c84efe..53421d1b57351 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -407,13 +407,15 @@ def test_context_size_exceeded(): @pytest.mark.parametrize( "n_batch,batch_count,reuse_cache", [ - (64, 4, False), + (64, 15, False), (64, 1, True), ] ) def test_return_progress(n_batch, batch_count, reuse_cache): global server server.n_batch = n_batch + server.n_ctx = 2048 + server.n_slots = 1 server.start() def make_cmpl_request(): return server.make_stream_request("POST", "/chat/completions", data={ "max_tokens": 10, "messages": [ {"role": "user", "content": "This is a test" * 100}, ], "stream": True, "return_progress": True, }) if reuse_cache: # make a first request to populate the cache res0 = make_cmpl_request() for _ in res0: pass # discard the output res = make_cmpl_request() last_progress = None total_batch_count = 0 for data in res: cur_progress = data.get("prompt_progress", None) if cur_progress is None: continue if last_progress is not None: assert cur_progress["total"] == last_progress["total"] assert cur_progress["cache"] == last_progress["cache"] assert cur_progress["processed"] > last_progress["processed"] - total_batch_count += 1 + total_batch_count += 1 last_progress = cur_progress assert last_progress is not None assert last_progress["total"] > 0 assert last_progress["processed"] == last_progress["total"] assert total_batch_count == batch_count From b6ac24c67050f4efd722329f5bf29c96ba69450e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 6 Sep 2025 17:36:37 +0700 Subject: [PATCH 6/7] readme: add docs on timings --- tools/server/README.md | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index a36325ea3648e..73b4cc6f03a28 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1278,6 +1278,34 @@ curl http://localhost:8080/v1/chat/completions \ **See our [Function
calling](../../docs/function-calling.md) docs** for more details, supported native tool call styles (generic tool call style is used as fallback) / examples of use. +*Timings and context usage* + +The response contains a `timings` object, for example: + +```js +{ + "choices": [], + "created": 1757141666, + "id": "chatcmpl-ecQULm0WqPrftUqjPZO1CFYeDjGZNbDu", + // ... + "timings": { + "cache_n": 236, // number of prompt tokens reused from cache + "prompt_n": 1, // number of prompt tokens being processed + "prompt_ms": 30.958, + "prompt_per_token_ms": 30.958, + "prompt_per_second": 32.301828283480845, + "predicted_n": 35, // number of predicted tokens + "predicted_ms": 661.064, + "predicted_per_token_ms": 18.887542857142858, + "predicted_per_second": 52.94494935437416 + } +} +``` + +This provides information on the performance of the server. It also allows calculating the current context usage. + +The total number of tokens in context is equal to `prompt_n + cache_n + predicted_n`. + ### POST `/v1/embeddings`: OpenAI-compatible embeddings API This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. From 053dc6b3808ede5ba600aea2eed8014ae023f7da Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 6 Sep 2025 12:43:43 +0200 Subject: [PATCH 7/7] use ggml_time_us Co-authored-by: Georgi Gerganov --- tools/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index d345bfb646b24..70aa18756465f 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2761,7 +2761,7 @@ struct server_context { res->progress.total = slot.n_prompt_tokens; res->progress.cache = slot.n_prompt_tokens_cache; res->progress.processed = slot.cache_tokens.size(); - res->progress.time_ms = ggml_time_ms() - (slot.t_start_process_prompt / 1000); + res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt) / 1000; } else { res->content = tkn.text_to_send; res->tokens = { tkn.tok };
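
Reviewer note (not part of the patches): the sketch below shows one way to exercise `return_progress` by hand against a server built from this branch. It assumes the default listen address `http://localhost:8080`; the `prompt_progress` object with its `total`/`cache`/`processed` fields and the `(processed-cache)/(total-cache)` ratio come from the README entry added in PATCH 1/7, while the `requests` dependency and the `data: ` SSE framing are the usual way to consume the OpenAI-compatible streaming endpoint.

```python
# Minimal sketch (not part of the patch series): stream a chat completion with
# return_progress enabled and print prompt-processing progress as it arrives.
# Assumes a llama.cpp server built from this branch listening on localhost:8080.
import json
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "This is a test " * 100}],
        "max_tokens": 32,
        "stream": True,
        "return_progress": True,
    },
    stream=True,
)

for raw in resp.iter_lines():
    if not raw.startswith(b"data: "):
        continue  # skip empty keep-alive lines
    payload = raw[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)

    progress = chunk.get("prompt_progress")
    if progress is not None:
        total, cache, processed = progress["total"], progress["cache"], progress["processed"]
        # overall progress counts cached tokens; timed progress counts only the
        # tokens actually evaluated for this request (see the README entry above)
        overall = processed / total if total else 1.0
        timed = (processed - cache) / (total - cache) if total > cache else 1.0
        print(f"prompt: {processed}/{total} tokens ({overall:.0%} overall, {timed:.0%} timed)")
    else:
        choices = chunk.get("choices") or []
        if choices:
            delta = choices[0].get("delta", {}).get("content") or ""
            print(delta, end="", flush=True)
print()
```

With `n_batch` small relative to the prompt length, several progress chunks should arrive before the first content token, which is what the new test asserts.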
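Along the same lines, the `cache_n` field added in PATCH 2/7 lets a client estimate context usage from an ordinary non-streaming response, as described in the README hunk of PATCH 6/7. A minimal sketch under the same assumptions (local server, default endpoint; the presence of `timings` in the response follows the README example above):

```python
# Minimal sketch (not part of the patch series): estimate context usage from
# the timings object of a non-streaming response, using the fields documented
# in the README hunk above. Assumes the same local server as the sketch above.
import requests

res = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Hello"}],
        "max_tokens": 16,
    },
).json()

t = res["timings"]
used = t["cache_n"] + t["prompt_n"] + t["predicted_n"]
print(f'{used} tokens in context '
      f'({t["cache_n"]} cached + {t["prompt_n"]} processed + {t["predicted_n"]} predicted)')
```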