From 7c9d49b40939388891083367afc9e056537f9169 Mon Sep 17 00:00:00 2001 From: Sun Yongyue Date: Thu, 4 Jun 2026 18:06:47 +0800 Subject: [PATCH 1/2] server: avoid unnecessary checkpoint restore when new tokens are present The pos_min_thold calculation unconditionally subtracts 1 to ensure at least one token is evaluated for logits when no new tokens exist. However, when the request contains new tokens beyond the cached prefix, this -1 is overly conservative and may trigger an unnecessary checkpoint restore. Conditionally apply the -1 only when n_past >= task.n_tokens() (no new tokens), avoiding redundant KV state restoration when there is actual work to do. --- tools/server/server-context.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 28b1158c7769..c9fde7be1558 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2781,9 +2781,10 @@ struct server_context_impl { } llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); + const bool has_new_tokens = (n_past < slot.task->n_tokens()); // the largest pos_min required for a checkpoint to be useful - const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); + const auto pos_min_thold = std::max(0, pos_next - n_swa - (has_new_tokens ? 0 : 1)); if (n_past > 0 && n_past <= slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); From 24632d545064bb12ecd140438b2c9d6feaabccb9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 4 Jun 2026 15:39:18 +0300 Subject: [PATCH 2/2] cont : add ref --- tools/server/server-context.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index c9fde7be1558..28f738c3feb2 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2781,6 +2781,8 @@ struct server_context_impl { } llama_pos pos_next = slot.prompt.tokens.pos_next(n_past); + + // ref: https://github.com/ggml-org/llama.cpp/pull/24110 const bool has_new_tokens = (n_past < slot.task->n_tokens()); // the largest pos_min required for a checkpoint to be useful