From f54684271436285ef3177ec7f3fc46bc856b17bc Mon Sep 17 00:00:00 2001 From: o7si Date: Fri, 26 Dec 2025 21:54:17 +0800 Subject: [PATCH 1/2] server : fix crash when seq_rm fails for hybrid/recurrent models --- tools/server/server-context.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 68a5fd8ab08d..7e2dd335f56c 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2220,7 +2220,8 @@ struct server_context_impl { if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); - clear_slot(slot); + llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); + slot.prompt.tokens.clear(); // there is no common part left slot.n_prompt_tokens_cache = 0; From 0e8829f178339814431775ee78f3a996e1d93113 Mon Sep 17 00:00:00 2001 From: o7si Date: Fri, 26 Dec 2025 22:43:06 +0800 Subject: [PATCH 2/2] server : add allow_processing param to clear_slot --- tools/server/server-context.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 7e2dd335f56c..2a293d8e2130 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -948,8 +948,10 @@ struct server_context_impl { return ret; } - void clear_slot(server_slot & slot) const { - GGML_ASSERT(!slot.is_processing()); + void clear_slot(server_slot & slot, bool allow_processing = false) const { + if (!allow_processing) { + GGML_ASSERT(!slot.is_processing()); + } SLT_WRN(slot, "clearing slot with %zu tokens\n", slot.prompt.tokens.size()); @@ -2220,8 +2222,7 @@ struct server_context_impl { if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); - llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); - slot.prompt.tokens.clear(); + clear_slot(slot, /*allow_processing=*/true); // there is no common part left slot.n_prompt_tokens_cache = 0;