Skip to content

Commit bc07349

Browse files
authored
server : dynamic token limit for prompt cache (#16560)
* server : dynamic token limit for prompt cache
* cont : print estimated token limit
1 parent e60f241 commit bc07349

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

tools/server/server.cpp

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1585,23 +1585,31 @@ struct server_prompt_cache {
15851585
}
15861586
}
15871587

1588+
// average size per token
1589+
const float size_per_token = std::max<float>(1.0f, float(size()) / (std::max<size_t>(1, n_tokens())));
1590+
1591+
// dynamically increase the token limit if it can fit in the memory limit
1592+
const size_t limit_tokens_cur = limit_size > 0 ? std::max<size_t>(limit_tokens, limit_size/size_per_token) : limit_tokens;
1593+
15881594
if (limit_tokens > 0) {
1589-
while (states.size() > 1 && n_tokens() > limit_tokens) {
1595+
while (states.size() > 1 && n_tokens() > limit_tokens_cur) {
15901596
if (states.empty()) {
15911597
break;
15921598
}
15931599

1594-
SRV_WRN(" - cache token limit reached, removing oldest entry (size = %.3f MiB)\n", states.front().size() / (1024.0 * 1024.0));
1600+
SRV_WRN(" - cache token limit (%zu, est: %zu) reached, removing oldest entry (size = %.3f MiB)\n",
1601+
limit_tokens, limit_tokens_cur, states.front().size() / (1024.0 * 1024.0));
15951602

15961603
states.pop_front();
15971604
}
15981605
}
15991606

1600-
SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens)\n",
1601-
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens);
1607+
SRV_WRN(" - cache state: %zu prompts, %.3f MiB (limits: %.3f MiB, %zu tokens, %zu est)\n",
1608+
states.size(), size() / (1024.0 * 1024.0), limit_size / (1024.0 * 1024.0), limit_tokens, limit_tokens_cur);
16021609

16031610
for (const auto & state : states) {
1604-
SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n", (const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
1611+
SRV_WRN(" - prompt %p: %7d tokens, checkpoints: %2zu, %9.3f MiB\n",
1612+
(const void *)&state, state.n_tokens(), state.checkpoints.size(), state.size() / (1024.0 * 1024.0));
16051613
}
16061614
}
16071615
};

0 commit comments

Comments
 (0)