Commit 7ebe7f7

server : use slot context size instead of training context size
1 parent e776168 commit 7ebe7f7
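
Background, as implied by the diff below: llama_model_n_ctx_train() returns the model's training-time context length, which need not match the context actually allocated to a server slot, so the guard against EOS-less infinite generation now stops at slot.n_ctx, the point at which the slot's context is genuinely full.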

2 files changed: +4 -6 lines

tools/server/server.cpp

Lines changed: 3 additions & 5 deletions
```diff
@@ -2946,17 +2946,15 @@ struct server_context {
             SLT_DBG(slot, "%s", "stopped by EOS\n");
         }

-        const auto n_ctx_train = llama_model_n_ctx_train(model);
-
-        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= n_ctx_train) {
+        if (slot.task->params.n_predict < 1 && slot.n_prompt_tokens() + slot.n_decoded >= slot.n_ctx) {
            slot.truncated      = true;
            slot.stop           = STOP_TYPE_LIMIT;
            slot.has_next_token = false; // stop prediction

            SLT_WRN(slot,
                    "n_predict (%d) is set for infinite generation. "
-                   "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
-                   slot.task->params.n_predict, n_ctx_train);
+                   "Limiting generated tokens to slot.n_ctx (%d) to avoid EOS-less generation infinite loop\n",
+                   slot.task->params.n_predict, slot.n_ctx);
        }

        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());
```
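
The effect of the change, restated: when a request asks for unbounded generation (n_predict < 1), the server now cuts off once the prompt plus the tokens generated so far fill the slot's own context, slot.n_ctx, instead of the model's training context. Below is a minimal standalone sketch of that condition; SlotState and should_stop_infinite_generation are hypothetical stand-ins for the server's slot machinery, and only the field names and the comparison itself are taken from the diff above.

```cpp
#include <cstdio>

// Hypothetical stand-in for the server slot fields touched by the diff.
struct SlotState {
    int n_ctx;           // context size available to this slot (the new bound)
    int n_prompt_tokens; // tokens consumed by the prompt
    int n_decoded;       // tokens generated so far
    int n_predict;       // requested generation length; < 1 means "infinite"
};

// True when an infinite-generation request has filled the slot's context
// and must be cut off to avoid an EOS-less infinite loop.
static bool should_stop_infinite_generation(const SlotState & slot) {
    return slot.n_predict < 1 &&
           slot.n_prompt_tokens + slot.n_decoded >= slot.n_ctx;
}

int main() {
    // Mirrors the updated test case: 8-token prompt, 256-token slot context.
    SlotState slot = { /*n_ctx=*/256, /*n_prompt_tokens=*/8, /*n_decoded=*/0, /*n_predict=*/-1 };
    while (!should_stop_infinite_generation(slot)) {
        slot.n_decoded++; // pretend one token was decoded
    }
    std::printf("stopped after %d generated tokens\n", slot.n_decoded); // prints 248
    return 0;
}
```

With the values from the updated test (256-token slot context, 8-token prompt), the loop stops after exactly 248 generated tokens, which is the new expectation in test_ctx_shift.py.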

tools/server/tests/unit/test_ctx_shift.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -45,7 +45,7 @@ def test_ctx_shift_enabled():


 @pytest.mark.parametrize("n_predict,n_token_output,truncated", [
     (64, 64, False),
-    (-1, 120, True),
+    (-1, 248, True),  # 8 tokens prompt + 248 tokens generated = 256 tokens total
 ])
 def test_ctx_shift_disabled_short_prompt(n_predict: int, n_token_output: int, truncated: bool):
     global server
```
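
The new expected output length follows directly from the changed bound: assuming the test server runs with a 256-token context, as the inline comment implies, generation with n_predict = -1 halts once prompt plus generated tokens reach slot.n_ctx, so an 8-token prompt leaves room for exactly 256 - 8 = 248 tokens, and the response is reported as truncated.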
