ggml-org · ggerganov · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -2781,9 +2781,10 @@ struct server_context_impl {
                             }
 
                             llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
+                            const bool has_new_tokens = (n_past < slot.task->n_tokens());
 
                             // the largest pos_min required for a checkpoint to be useful
-                            const auto pos_min_thold = std::max(0, pos_next - n_swa - 1);
+                            const auto pos_min_thold = std::max(0, pos_next - n_swa - (has_new_tokens ? 0 : 1));
 
                             if (n_past > 0 && n_past <= slot.prompt.n_tokens()) {
                                 const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);