Commit 6e93c1b

revert: revert the change for prepare estimation:
- we shouldn't use max_seq_len as the kv config max_tokens: the estimation pass doesn't need that many tokens, and it makes preparation more prone to OOM, especially for long sequences.

Signed-off-by: qixiang-99 <[email protected]>
1 parent 33cc07d commit 6e93c1b

File tree

1 file changed: +2 −6 lines


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 2 additions & 6 deletions
```diff
@@ -193,12 +193,8 @@ def try_prepare_estimation(self) -> bool:
         estimating_kv_cache = False
         if 'cp_type' not in self._mapping.cp_config:
             estimating_kv_cache = True
-            max_attention_window_from_config = self._executor_config.kv_cache_config.max_attention_window
-            max_seq_len = max(
-                max_attention_window_from_config
-            ) if max_attention_window_from_config is not None else self._executor_config.max_seq_len
-            self._executor_config.kv_cache_config.max_tokens = max(
-                self._get_token_num_for_estimation(), max_seq_len)
+            self._executor_config.kv_cache_config.max_tokens = self._get_token_num_for_estimation(
+            )
         return estimating_kv_cache

     def configure_kv_cache_capacity(self, py_executor: PyExecutor) -> None:
```
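The rationale behind the revert can be sketched with a minimal, self-contained example. The function names and numbers below are illustrative stand-ins, not the actual TensorRT-LLM code: the point is only that clamping the KV-cache token budget up to `max_seq_len` inflates the pool allocated for the estimation pass, which is what risks OOM on long-context configs.

```python
# Sketch of the reverted logic vs. the restored one (all values hypothetical).

def kv_tokens_before(token_estimate: int, max_seq_len: int) -> int:
    # Reverted behavior: never size the KV cache below max_seq_len,
    # even though the estimation pass does not need that many tokens.
    return max(token_estimate, max_seq_len)

def kv_tokens_after(token_estimate: int) -> int:
    # Restored behavior: size the KV cache from the estimation pass alone.
    return token_estimate

token_estimate = 4_096    # tokens the estimation pass actually needs
max_seq_len = 1_048_576   # a long-context model

print(kv_tokens_before(token_estimate, max_seq_len))  # 1048576: huge pool, OOM risk
print(kv_tokens_after(token_estimate))                # 4096: only what estimation needs
```

For a 1M-token context the old path allocated a KV pool roughly 256× larger than the estimation pass required, which is exactly the "preparation OOM" the commit message describes.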
