Commit 3a96d75

[https://nvbugs/5527956][fix] AutoDeploy: fix IMA due to outdated metadata (#8002)
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent 2e5850c commit 3a96d75

File tree: 6 files changed, +51 −39 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 23 additions & 10 deletions
@@ -19,6 +19,7 @@
 from torch.fx import Node
 
 from ...._utils import nvtx_range
+from ..utils.logger import ad_logger
 
 DynamicShape = Dict[int, Dim]  # indicating the dynamic shape in tensor dimension
 DynamicShapeCallback = Callable[[], DynamicShape]
@@ -122,22 +123,28 @@ def __init__(
         # see https://github.com/NVIDIA/TensorRT-LLM/issues/4504
         max_seq_len_adjusted = self.max_seq_len + 1
 
-        if max_num_tokens is None or max_num_tokens < 1:
-            self.max_num_tokens = self.max_batch_size * max_seq_len_adjusted
-        else:
-            self.max_num_tokens = max_num_tokens
+        # if the provided max_num_tokens is less than the max_batch_size * max_seq_len_adjusted,
+        # we use the provided max_num_tokens. If max_num_tokens provided is more, we still use
+        # max_batch_size * max_seq_len_adjusted since the extra tokens cannot be used.
+        self.max_num_tokens = self.max_batch_size * max_seq_len_adjusted
+        if max_num_tokens is not None and max_num_tokens > 0:
+            self.max_num_tokens = min(self.max_num_tokens, max_num_tokens)
 
-        # if the provided max_num_tokens is less than the max_batch_size * max_seq_len,
-        # we use the provided max_num_tokens to calculate the number of pages
-        total_tokens = min(self.max_num_tokens, self.max_batch_size * max_seq_len_adjusted)
         # Num pages can not be less than max_batch_size.
         self._num_pages = max(
             self.max_batch_size,
-            (total_tokens) // self.page_size + (total_tokens % self.page_size > 0),
+            (self.max_num_tokens) // self.page_size  # floored number of pages
+            + (self.max_num_tokens % self.page_size > 0) * self.max_batch_size,  # +1 per sequence
        )
         # sanity check
         assert self.num_pages >= self.max_batch_size, "num_pages can't be less than max_batch_size"
 
+        # log parameters
+        ad_logger.info(
+            f"[SequenceInfo:] {self.max_seq_len=}, {self.max_batch_size=}, {self.page_size=}, "
+            f"{self.max_num_tokens=} (inferred), {max_num_tokens=} (provided), {self.num_pages=}"
+        )
+
         # indicator if extra args are activated that are needed for cached attention backends
         self._is_cached_attn = False
 
@@ -572,6 +579,12 @@ def _store_arg(
         # pin the memory on the host
         tnsr_host = torch.tensor(tnsr_like, dtype=tnsr_device.dtype, pin_memory=True)
 
+        # check for available space
+        assert tnsr_device.numel() >= tnsr_host.numel(), (
+            f"device tensor {name} is too small, available: {tnsr_device.numel()}, "
+            f"required: {tnsr_host.numel()}"
+        )
+
         # reset/copy to the device in a non-blocking fashion
         if reset:
             tnsr_device.zero_()
@@ -632,8 +645,8 @@ def nest_sequences(
         cache_loc, pages_per_seq = self._get_cache_locations_and_pages_per_sequence(
             page_assignments
         )
-        self._store_arg("cache_loc", cache_loc)
-        self._store_arg("pages_per_seq", pages_per_seq)
+        self._store_arg("cache_loc", cache_loc, reset=True)
+        self._store_arg("pages_per_seq", pages_per_seq, reset=True)
 
         ### UPDATE MAIN INPUTS #####################################################################
         # set new input_ids and make sure to flatten it
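For reference, a minimal standalone sketch of the updated page-count heuristic in SequenceInfo, using made-up numbers (max_batch_size, max_seq_len, page_size, and the provided token cap below are hypothetical, not taken from any real config):

# Sketch of the new max_num_tokens / num_pages arithmetic; values are illustrative only.
max_batch_size = 4
max_seq_len = 128
page_size = 64
provided_max_num_tokens = 300  # user-provided cap; may also be None or <= 0

max_seq_len_adjusted = max_seq_len + 1  # see TensorRT-LLM issue #4504
max_num_tokens = max_batch_size * max_seq_len_adjusted
if provided_max_num_tokens is not None and provided_max_num_tokens > 0:
    # never exceed what the batch can actually consume
    max_num_tokens = min(max_num_tokens, provided_max_num_tokens)

num_pages = max(
    max_batch_size,
    max_num_tokens // page_size  # floored number of full pages
    + (max_num_tokens % page_size > 0) * max_batch_size,  # +1 partial page per sequence
)
print(max_num_tokens, num_pages)  # 300, 8

Compared to the previous global round-up, the remainder term now reserves one extra page per sequence, so every sequence in the batch can hold its own partially filled final page.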

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 0 additions & 3 deletions
@@ -89,9 +89,6 @@ def build_from_config(cls, ad_config: AutoDeployConfig):
         attn_page_size = ad_config.attn_page_size
         max_num_tokens = ad_config.max_num_tokens
         max_beam_width = ad_config.max_beam_width
-        ad_logger.info(
-            f"{max_seq_len=}, {max_batch_size=}, {attn_page_size=}, {max_num_tokens=}, {max_beam_width=}"
-        )
 
         # update device to contain the current default device if it's in cuda
         device = torch.device(ad_config.device)

tensorrt_llm/_torch/auto_deploy/shim/interface.py

Lines changed: 3 additions & 1 deletion
@@ -73,7 +73,9 @@ def resize_cache(self, new_num_pages: int):
         self.info.num_pages = new_num_pages
         for name, cache in self._caches.items():
             # We assume cache is a tensor of shape (max_batch_size, page_size, n_heads, head_dim)
-            if "cache" in name:
+            # TODO: cache resize should ideally be handled via a callback to the AttentionDescriptor
+            # to avoid hard-coding any assumptions about the cache shape or its "pagedness"
+            if "k_cache" in name or "v_cache" in name:
                 current_shape = cache.shape
                 new_shape = (new_num_pages, *current_shape[1:])
                 cache.resize_(new_shape)
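The tighter name filter means only k/v caches get their leading page dimension resized; other buffers that merely contain the substring "cache" are left untouched. A minimal sketch of that behavior in isolation (cache names and shapes below are hypothetical):

import torch

# Hypothetical paged caches keyed by name; the leading dim is the number of pages.
caches = {
    "k_cache_0": torch.zeros(8, 64, 8, 128),
    "v_cache_0": torch.zeros(8, 64, 8, 128),
    "rope_cache": torch.zeros(4096, 128),  # must NOT be resized by the page logic
}

new_num_pages = 16
for name, cache in caches.items():
    if "k_cache" in name or "v_cache" in name:
        # keep the per-page layout, only change the leading page dimension
        cache.resize_((new_num_pages, *cache.shape[1:]))

print(caches["k_cache_0"].shape)   # torch.Size([16, 64, 8, 128])
print(caches["rope_cache"].shape)  # unchanged: torch.Size([4096, 128])

Note that resize_ does not initialize newly allocated elements; the TODO in the diff points toward eventually delegating this to the AttentionDescriptor instead of matching on names.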

tensorrt_llm/_torch/auto_deploy/transform/library/cleanup_input_constraints.py

Lines changed: 5 additions & 1 deletion
@@ -43,7 +43,11 @@ def _apply(
                 raise TypeError(f"Unexpected type {type(s)} in symbolic shape.")
 
         # update the max constraint for each vr
-        max_total = math.prod(vr.upper for vr in vrs)
+        # NOTE: this is more a heuristic anyway than a strict constraint. We just want to make sure
+        # that this never gets triggered. So we multiply by 1000 to be safe. Not that it has to
+        # be a symint (not an int) --> so that's why we use a heuristic based on the existing
+        # symint values instead of just using e.g. max_num_tokens...
+        max_total = math.prod(vr.upper for vr in vrs) * 1000
         for vr in vrs:
             object.__setattr__(vr, "upper", max_total)
 
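A toy illustration of the relaxed upper bound, assuming vrs is a list of value-range objects with an upper field; the real ranges come from torch's symbolic-shape machinery and are frozen, which is why object.__setattr__ is used. The FakeValueRange class below is only a stand-in for illustration:

import math
from dataclasses import dataclass

@dataclass(frozen=True)
class FakeValueRange:
    """Stand-in for the symbolic-shape value range used in the real pass."""
    lower: int
    upper: int

# e.g. ranges for the (batch, seq_len) symbols of the flattened input
vrs = [FakeValueRange(1, 8), FakeValueRange(1, 1024)]

# relaxed upper bound: product of the existing uppers, padded by 1000x so the
# constraint acts as a safety net rather than a tight limit
max_total = math.prod(vr.upper for vr in vrs) * 1000

for vr in vrs:
    # the real ranges are frozen, hence object.__setattr__
    object.__setattr__(vr, "upper", max_total)

print(vrs)  # both uppers are now 8 * 1024 * 1000 = 8192000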

tensorrt_llm/_torch/auto_deploy/transform/library/kvcache.py

Lines changed: 20 additions & 23 deletions
@@ -12,7 +12,6 @@
 from ...models.factory import ModelFactory
 from ...shim.interface import CachedSequenceInterface
 from ...transformations._graph import add_graph_input
-from ...utils.logger import ad_logger
 from ...utils.node_utils import get_all_input_output_nodes, is_op
 from ..interface import (
     BaseTransform,
@@ -280,34 +279,32 @@ def _get_mem_info_in_mb():
                 skipped=True, num_matches=0, is_clean=True, has_valid_shapes=True
             )
 
-        try:
-            # Let's run a forward pass to get the memory usage
-            cm.info.set_max_num_tokens_sample()
-            free_mem_pre, _ = _get_mem_info_in_mb()
-            self._log_info(f"Free memory before forward pass (MB): {free_mem_pre}")
+        # TODO: the manual PyTorch workflow respects max_num_tokens if set and does _NOT_ resize
+        # the cache in this case. Should we do the same here?
 
-            self._run_forward(gm, cm)
+        # Let's run a forward pass to get the memory usage
+        cm.info.set_max_num_tokens_sample()
+        free_mem_pre, _ = _get_mem_info_in_mb()
+        self._log_info(f"Free memory before forward pass (MB): {free_mem_pre}")
 
-            free_mem_post, _ = _get_mem_info_in_mb()
-            self._log_info(f"Free memory after forward pass (MB): {free_mem_post}")
+        self._run_forward(gm, cm)
 
-            memory_for_forward_pass = free_mem_pre - free_mem_post
-            self._log_info(f"Memory for forward pass (MB): {memory_for_forward_pass}")
+        free_mem_post, _ = _get_mem_info_in_mb()
+        self._log_info(f"Free memory after forward pass (MB): {free_mem_post}")
 
-            new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size
-            new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages))
+        memory_for_forward_pass = free_mem_pre - free_mem_post
+        self._log_info(f"Memory for forward pass (MB): {memory_for_forward_pass}")
 
-            # Need to sync all the GPUs
-            gathered_num_pages = [None] * get_world_size()
-            all_gather_object(gathered_num_pages, new_num_pages)
-            new_num_pages = min(gathered_num_pages)
-            self._log_info(f"After all_gather - new_num_pages: {new_num_pages}")
+        new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size
+        new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages))
 
-            cm.resize_cache(new_num_pages)
-        except Exception as e:
-            ad_logger.warning(
-                f"Error encountered while resizing kv cache: {e}.\nSkipping cache resize."
-            )
+        # Need to sync all the GPUs
+        gathered_num_pages = [None] * get_world_size()
+        all_gather_object(gathered_num_pages, new_num_pages)
+        new_num_pages = min(gathered_num_pages)
+        self._log_info(f"After all_gather - new_num_pages: {new_num_pages}")
+
+        cm.resize_cache(new_num_pages)
 
         # Free memory
         torch.cuda.empty_cache()
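This change drops the broad try/except around the resize, so failures now surface instead of being silently skipped. Stripped of the graph and distributed plumbing, the page-count arithmetic looks roughly like the sketch below; all numbers are hypothetical, and in the real pass new_num_pages is additionally reduced to the minimum across ranks via all_gather_object:

# Illustrative numbers only; quantities are in MB unless noted otherwise.
free_mem_post = 40_000            # free GPU memory after the trial forward pass (MB)
free_mem_ratio = 0.9              # fraction of remaining free memory handed to the KV cache
current_num_pages = 256
current_cache_size = 2 * 1024**3  # bytes currently held by the bootstrap cache

bytes_per_page = current_cache_size // current_num_pages
new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size
new_num_pages = int(new_cache_size // bytes_per_page)

print(bytes_per_page, new_num_pages)  # 8388608 bytes/page -> 4756 pages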

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 1 deletion
@@ -327,7 +327,6 @@ test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/55233
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5434320)
 examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5528070)
-accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype SKIP (https://nvbugs/5527956)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5509024)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/5481198)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency] SKIP (https://nvbugs/5481198)
