Skip to content

Commit b537c24

Browse files
committed
[https://nvbugs/5474169][bug] Adjust the dummy requests' max sequence length
using the KV cache manager. Signed-off-by: Hui Gao <[email protected]>
1 parent 050db0e commit b537c24

File tree

2 files changed

+11
-3
lines changed

2 files changed

+11
-3
lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ def __init__(self, *, executor_config: ExecutorConfig,
5050
self._draft_model_engine = draft_model_engine
5151
self._mapping = mapping
5252
self._max_kv_tokens_in = self._executor_config.kv_cache_config.max_tokens
53-
self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
54-
1)
53+
self._dummy_reqs = None
54+
self._max_seq_len = net_max_seq_len
5555

5656
@staticmethod
5757
def _get_cache_size_per_token(model_config: ModelConfig,
@@ -168,6 +168,10 @@ def _get_token_num_for_estimation(self) -> int:
168168
if spec_cfg is not None:
169169
num_extra_tokens_per_seq += spec_cfg.max_draft_len
170170
num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
171+
172+
if self._dummy_reqs is None:
173+
self._dummy_reqs = self._create_dummy_context_requests(
174+
self._max_seq_len - 1)
171175
for req in self._dummy_reqs:
172176
num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
173177
# Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
@@ -381,6 +385,10 @@ def _create_kv_cache_manager(
381385
if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
382386
executor_config.max_seq_len = kv_cache_manager.max_seq_len
383387

388+
# When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
389+
if kv_cache_manager != None:
390+
self._max_seq_len = kv_cache_manager.max_seq_len
391+
384392
return kv_cache_manager
385393

386394
def build_managers(self, resources: Dict) -> None:

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -485,7 +485,7 @@ def calculate_max_num_blocks(self,
485485
if kv_cache_config.free_gpu_memory_fraction is not None:
486486
max_tokens = min(kv_cache_config.max_tokens, max_tokens)
487487
logger.warning(
488-
f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {kv_cache_config.max_tokens}, respectively). The smaller value will be used.'
488+
f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {max_tokens} with free memory {free_mem / (1 << 32)} of total memory {total_mem / (1<<32)}, respectively). The smaller value will be used.'
489489
)
490490
else:
491491
max_tokens = kv_cache_config.max_tokens

0 commit comments

Comments (0)