diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index ae9e0bf60d8..4c2ce88568f 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -152,6 +152,7 @@ struct server_slot { } prompt.tokens.clear(); + prompt.score = 1; } std::vector lora; diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index abc00c82bdb..7a73f616a2d 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -2048,6 +2048,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t /*.drft =*/ std::move(state_data_dft), }, /*.checkpoints =*/ prompt.checkpoints, + /*.score =*/ prompt.score, }); return &states.back(); @@ -2086,6 +2087,9 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok if (it_best != states.end()) { SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best); + // reward cache hit: bump the score by one, saturating at 4 + it_best->score = std::min((uint8_t)(it_best->score + 1), (uint8_t)4); + { auto & data = it_best->data.main; @@ -2129,16 +2133,41 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok } void server_prompt_cache::update() { - if (limit_size > 0) { - // always keep at least one state, regardless of the limits - while (states.size() > 1 && size() > limit_size) { - if (states.empty()) { - break; + // second-chance eviction: the front entry is evicted once its score is <= 1, otherwise its score is decremented and it is rotated to the back + auto evict_one = [this]() { + if (states.size() <= 1) { + return; + } + + // hard iteration cap (5 loop iterations per cached entry) as a safety net: when it is hit, the front entry is evicted regardless of its score + const size_t max_iter = states.size() * 5; + size_t iter = 0; + + while (states.size() > 1) { + if (iter++ >= max_iter) { + SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0)); + states.pop_front(); + return; } - SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f 
MiB)\n", states.front().size() / (1024.0 * 1024.0)); + if (states.front().score <= 1) { + // score has decayed, safe to evict + SRV_WRN(" - cache limit reached, evicting unused/decayed entry (size = %.3f MiB)\n", + states.front().size() / (1024.0 * 1024.0)); + states.pop_front(); + return; + } - states.pop_front(); + // second chance: decay score and rotate to back + states.front().score--; + states.splice(states.end(), states, states.begin()); + } + }; + + if (limit_size > 0) { + // always keep at least one state, regardless of the limits + while (states.size() > 1 && size() > limit_size) { + evict_one(); } } @@ -2150,14 +2179,7 @@ void server_prompt_cache::update() { if (limit_tokens > 0) { while (states.size() > 1 && n_tokens() > limit_tokens_cur) { - if (states.empty()) { - break; - } - - SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n", - limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0)); - - states.pop_front(); + evict_one(); } } diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 60e216e7927..373aa4f66de 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -580,6 +580,9 @@ struct server_prompt { std::list checkpoints; + // second-chance score for cache eviction, 1 = fresh/decayed, higher = more retained + uint8_t score = 1; + size_t size() const { size_t res = 0;