Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ struct server_slot {
}

prompt.tokens.clear();
prompt.score = 1;
}

std::vector<common_adapter_lora_info> lora;
Expand Down
52 changes: 37 additions & 15 deletions tools/server/server-task.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,7 @@ server_prompt * server_prompt_cache::alloc(const server_prompt & prompt, size_t
/*.drft =*/ std::move(state_data_dft),
},
/*.checkpoints =*/ prompt.checkpoints,
/*.score =*/ prompt.score,
});

return &states.back();
Expand Down Expand Up @@ -2086,6 +2087,9 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
if (it_best != states.end()) {
SRV_INF(" - found better prompt with f_keep = %.3f, sim = %.3f\n", f_keep_best, sim_best);

// reward cache hit
it_best->score = std::min((uint8_t)(it_best->score + 1), (uint8_t)4);

{
auto & data = it_best->data.main;

Expand Down Expand Up @@ -2129,16 +2133,41 @@ bool server_prompt_cache::load(server_prompt & prompt, const server_tokens & tok
}

void server_prompt_cache::update() {
if (limit_size > 0) {
// always keep at least one state, regardless of the limits
while (states.size() > 1 && size() > limit_size) {
if (states.empty()) {
break;
// second-chance eviction: decay score and rotate to back, evict when score <= 1
auto evict_one = [this]() {
if (states.size() <= 1) {
return;
}

// defensive iteration cap: scores decay by one per rotation, so the loop normally ends well before this; if the cap is ever hit, the front entry is evicted unconditionally
const size_t max_iter = states.size() * 5;
size_t iter = 0;

while (states.size() > 1) {
if (iter++ >= max_iter) {
SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
states.pop_front();
return;
}

SRV_WRN(" - cache size limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
if (states.front().score <= 1) {
// score has decayed, safe to evict
SRV_WRN(" - cache limit reached, evicting unused/decayed entry (size = %.3f MiB)\n",
states.front().size() / (1024.0 * 1024.0));
states.pop_front();
return;
}

states.pop_front();
// second chance: decay score and rotate to back
states.front().score--;
states.splice(states.end(), states, states.begin());
}
};

if (limit_size > 0) {
// always keep at least one state, regardless of the limits
while (states.size() > 1 && size() > limit_size) {
evict_one();
}
}

Expand All @@ -2150,14 +2179,7 @@ void server_prompt_cache::update() {

if (limit_tokens > 0) {
while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
if (states.empty()) {
break;
}

SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));

states.pop_front();
evict_one();
}
}

Expand Down
3 changes: 3 additions & 0 deletions tools/server/server-task.h
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,9 @@ struct server_prompt {

std::list<common_prompt_checkpoint> checkpoints;

// second-chance score for cache eviction, 1 = fresh/decayed, higher = more retained
uint8_t score = 1;

size_t size() const {
size_t res = 0;

Expand Down