Commit 42b9f10

HuiGao-NV authored and dominicshanshan committed
[https://nvbugs/5474169][fix]Adjust max seq len for kvcache for memory estimation (NVIDIA#7391)
Signed-off-by: Hui Gao <[email protected]>
1 parent 4d33f07 commit 42b9f10

File tree

2 files changed: +11 -3 lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 10 additions & 2 deletions
@@ -52,9 +52,9 @@ def __init__(self, *, executor_config: ExecutorConfig,
         self._draft_model_engine = draft_model_engine
         self._mapping = mapping
         self._max_kv_tokens_in = self._executor_config.kv_cache_config.max_tokens
-        self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
-                                                               1)
         self._kv_connector_manager = kv_connector_manager
+        self._dummy_reqs = None
+        self._max_seq_len = net_max_seq_len
 
     @staticmethod
     def _get_cache_size_per_token(model_config: ModelConfig,
@@ -177,6 +177,10 @@ def _get_token_num_for_estimation(self) -> int:
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_draft_len
             num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
+
+        if self._dummy_reqs is None:
+            self._dummy_reqs = self._create_dummy_context_requests(
+                max(1, self._max_seq_len - 1))
         for req in self._dummy_reqs:
             num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
             # Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
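Note: dummy request creation is now deferred to the first call of _get_token_num_for_estimation, so it uses whatever value self._max_seq_len holds at estimation time, and max(1, ...) keeps the request length positive. Below is a minimal, runnable sketch of the per-request rounding done by this loop; DummyRequest and token_num_for_estimation are hypothetical stand-ins, only the arithmetic mirrors the diff.

from dataclasses import dataclass

@dataclass
class DummyRequest:
    # Hypothetical stand-in for the real request type; only the field
    # read by the estimation loop is modelled.
    input_token_ids: list

def token_num_for_estimation(dummy_reqs, num_extra_tokens_per_seq, tokens_per_block):
    total = 0
    for req in dummy_reqs:
        num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
        # Requests cannot share KV cache blocks: round each request up to a
        # whole number of blocks (ceil division, as in the loop above).
        total += -(-num_req_tokens // tokens_per_block) * tokens_per_block
    return total

# One request of 100 tokens plus 3 extra tokens = 103 tokens; with 32-token
# blocks that rounds up to 4 blocks, i.e. 128 token slots.
assert token_num_for_estimation([DummyRequest(list(range(100)))], 3, 32) == 128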
@@ -466,6 +470,10 @@ def _create_kv_cache_manager(
         if model_engine.kv_cache_manager_key == ResourceManagerType.KV_CACHE_MANAGER:
             executor_config.max_seq_len = kv_cache_manager.max_seq_len
 
+        # When SWA is enabled, max_seq_len is updated inside kv_cache_manager.
+        if kv_cache_manager is not None:
+            self._max_seq_len = kv_cache_manager.max_seq_len
+
         return kv_cache_manager
 
     def build_managers(self,
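Taken together, the hunks in this file fix an ordering problem: the dummy requests used for KV cache memory estimation were previously built in the constructor from net_max_seq_len, before the KV cache manager had a chance to lower max_seq_len (which it does when SWA, i.e. sliding window attention, is enabled). A toy illustration of the new flow, with the enclosing class abbreviated and all names and values invented for the example:

class Creator:
    # Toy model (hypothetical names/values) of the creator's new flow.
    def __init__(self, net_max_seq_len: int):
        self._dummy_reqs = None              # deferred, as in the first hunk
        self._max_seq_len = net_max_seq_len

    def create_kv_cache_manager(self, manager_max_seq_len: int):
        # Stand-in for the real _create_kv_cache_manager: the KV cache
        # manager may settle on a smaller max_seq_len (e.g. under SWA),
        # and the creator now copies that value back.
        self._max_seq_len = manager_max_seq_len

    def dummy_request_len(self) -> int:
        # Length used when the dummy requests are finally built.
        return max(1, self._max_seq_len - 1)

creator = Creator(net_max_seq_len=131072)
creator.create_kv_cache_manager(manager_max_seq_len=8192)  # invented SWA cap
assert creator.dummy_request_len() == 8191  # estimation sees the adjusted value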

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 1 addition & 1 deletion
@@ -582,7 +582,7 @@ def calculate_max_num_blocks(self,
         if kv_cache_config.free_gpu_memory_fraction is not None:
             max_tokens = min(kv_cache_config.max_tokens, max_tokens)
             logger.warning(
-                f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {kv_cache_config.max_tokens}, respectively). The smaller value will be used.'
+                f'Both free_gpu_memory_fraction and max_tokens are set (to {free_mem_fraction} and {max_tokens} with free memory {free_mem / (1 << 32)} of total memory {total_mem / (1<<32)}, respectively). The smaller value will be used.'
             )
         else:
             max_tokens = kv_cache_config.max_tokens
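The second file only widens the warning emitted when both free_gpu_memory_fraction and max_tokens are set: it now reports the clamped max_tokens rather than the raw config value, plus the free and total device memory divided by 1 << 32 (i.e. in units of 4 GiB). A hypothetical standalone sketch of the branch, runnable as-is; resolve_max_tokens and all arguments are invented names:

import logging

logging.basicConfig()
logger = logging.getLogger('kv_cache')

def resolve_max_tokens(cfg_max_tokens, mem_derived_max_tokens,
                       free_mem_fraction, free_mem, total_mem):
    # free_mem and total_mem are in bytes; 1 << 32 scales them to units
    # of 4 GiB, matching the divisor in the new log message.
    if free_mem_fraction is not None:
        max_tokens = min(cfg_max_tokens, mem_derived_max_tokens)
        logger.warning(
            f'Both free_gpu_memory_fraction and max_tokens are set (to '
            f'{free_mem_fraction} and {max_tokens} with free memory '
            f'{free_mem / (1 << 32)} of total memory {total_mem / (1 << 32)}, '
            f'respectively). The smaller value will be used.')
        return max_tokens
    return cfg_max_tokens

# 8 GiB free of 16 GiB total -> logs "... free memory 2.0 of total memory 4.0 ..."
resolve_max_tokens(cfg_max_tokens=65536, mem_derived_max_tokens=40960,
                   free_mem_fraction=0.9, free_mem=8 << 30, total_mem=16 << 30)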
