@@ -3,6 +3,8 @@
 import subprocess
 import sys
 import unittest
+from typing import NamedTuple, Tuple
+from unittest.mock import patch
 
 import numpy as np
 import torch
@@ -13,11 +15,13 @@
 from tensorrt_llm._torch.pyexecutor.resource_manager import (KVCacheManager,
                                                              PeftCacheConfig,
                                                              PeftCacheManager)
+from tensorrt_llm.bindings import LayerType
 from tensorrt_llm.bindings import ModelConfig as ModelConfigCpp
 from tensorrt_llm.bindings import executor as tllm
 from tensorrt_llm.bindings.internal.batch_manager import \
     PeftTaskNotCachedException
 from tensorrt_llm.lora_helper import LoraConfig
+from tensorrt_llm.mapping import Mapping
 
 DataType = tensorrt_llm.bindings.DataType
 LoraModule = tensorrt_llm.bindings.LoraModule
@@ -544,6 +548,148 @@ def test_adjust_window_sizes_for_vswa(self):
                          f"Actual: {adjusted_max_attention_window_vec}\n"
                          f"Expected: {expected_max_attention_window_vec}")
 
+    @staticmethod
+    def _create_model_config_for_kv_cache_manager() -> ModelConfigCpp:
+        """
+        Create a simple model config for the KVCacheManager tests.
+        """
+
+        model_config_params = {
+            "vocab_size": 0,
+            "num_layers": 4,
+            "num_attention_layers": 4,
+            "num_rnn_layers": 0,
+            "num_heads": 64,
+            "hidden_size": 64,
+            "data_type": DataType.HALF
+        }
+        num_kv_heads = 8
+
+        model_config = ModelConfigCpp(**model_config_params)
+        model_config.layer_types = [LayerType.ATTENTION
+                                    ] * model_config.num_attention_layers()
+        model_config.set_num_kv_heads(num_kv_heads)
+
+        return model_config
+
+    @staticmethod
+    def _create_kv_cache_config_for_kv_cache_manager(
+            params: dict) -> tllm.KvCacheConfig:
+        """
+        Create a KV cache config for the KVCacheManager tests.
+        """
+        return tllm.KvCacheConfig(**params)
+
+    def test_calculate_max_num_blocks_from_cpp(self):
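+        """
+        Check the pool sizing done by calculate_max_num_blocks_from_cpp during
+        KVCacheManager construction: an explicit max_gpu_total_bytes /
+        host_cache_size should be used as-is, otherwise the primary pool falls
+        back to free_gpu_memory_fraction of the free device memory and the
+        secondary (host) pool defaults to 0.
+        """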
+        # Construct a minimal mapping (single-rank, no TP/PP)
+        mapping = Mapping(world_size=1, tp_size=1, pp_size=1)
+
+        # Construct model config
+        model_config = TestResourceManager._create_model_config_for_kv_cache_manager(
+        )
+
+        # Construct KV cache config
+        free_gpu_memory_fraction = 0.1
+        max_attention_window = [64, 128]
+        max_gpu_total_bytes = 32 * 1024 * 1024  # 32MB
+        enable_block_reuse = False
+        host_cache_size = 32 * 1024 * 1024  # 32MB
+
+        # Mock torch.cuda.mem_get_info so the test sees fixed free/total memory
+        fixed_free_mem = 128 * 1024 * 1024  # 128MB
+        fixed_total_mem = 256 * 1024 * 1024  # 256MB
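+        # Patching torch.cuda.mem_get_info with these fixed values keeps the
+        # fraction-based expectation in case 2 deterministic, independent of
+        # the GPU the test actually runs on.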
+
+        class MemTestCase(NamedTuple):
+            case_name: str
+            kv_cache_config_params: dict
+            expected_memory_bytes: Tuple[
+                int,
+                int]  # (primary_pool_memory_bytes, secondary_pool_memory_bytes)
+
+        test_cases = [
+            # Case 1:
+            # max_gpu_total_bytes is set, so it takes precedence over
+            # free_gpu_memory_fraction; host_cache_size is set, so it is used
+            # for the secondary (host) pool.
+            MemTestCase(
+                case_name="max_gpu_total_bytes is set, host_cache_size is set",
+                kv_cache_config_params={
+                    "max_attention_window": max_attention_window,
+                    "free_gpu_memory_fraction": free_gpu_memory_fraction,
+                    "max_gpu_total_bytes": max_gpu_total_bytes,
+                    "enable_block_reuse": enable_block_reuse,
+                    "host_cache_size": host_cache_size,
+                },
+                expected_memory_bytes=(max_gpu_total_bytes, host_cache_size),
+            ),
+
+            # Case 2:
+            # max_gpu_total_bytes is not set, so free_gpu_memory_fraction is
+            # used; host_cache_size is not set, so the secondary pool is 0.
+            MemTestCase(
+                case_name=
+                "max_gpu_total_bytes is not set, host_cache_size is not set",
+                kv_cache_config_params={
+                    "max_attention_window": max_attention_window,
+                    "free_gpu_memory_fraction": free_gpu_memory_fraction,
+                    "enable_block_reuse": enable_block_reuse,
+                },
+                # NOTE: use np.float32 to avoid a precision mismatch between
+                # Python floats (double) and the C++ binding (float); the
+                # expected value is worked out right after this list.
+                expected_memory_bytes=(int(
+                    fixed_free_mem * np.float32(free_gpu_memory_fraction)), 0),
+            ),
+        ]
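+
+        # With the mocked values, case 2 expects
+        # int(134217728 * np.float32(0.1)) == 13421773 bytes (~10% of the
+        # 128 MiB of mocked free GPU memory) for the primary pool and 0 bytes
+        # for the secondary pool.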
+
+        tokens_per_block = 32
+        model_config.tokens_per_block = tokens_per_block
+        max_seq_len = max(max_attention_window)
+        max_batch_size = 1
+        max_beam_width = 1
+
+        for case_name, kv_cache_config_params, expected_memory_bytes in test_cases:
+            with self.subTest(case=case_name):
+                kv_cache_config = TestResourceManager._create_kv_cache_config_for_kv_cache_manager(
+                    kv_cache_config_params)
+                with patch('torch.cuda.mem_get_info',
+                           return_value=(fixed_free_mem, fixed_total_mem)):
+                    # Create a real KVCacheManager; it runs
+                    # calculate_max_num_blocks_from_cpp in its __init__
+                    manager = KVCacheManager(
+                        kv_cache_config=kv_cache_config,
+                        kv_cache_type=tensorrt_llm.bindings.internal.
+                        batch_manager.CacheType.SELF,
+                        num_layers=model_config.num_attention_layers(),
+                        num_kv_heads=model_config.num_kv_heads(
+                            0
+                        ),  # NOTE: assumes the same number of KV heads for all layers
+                        head_dim=model_config.head_size,
+                        tokens_per_block=tokens_per_block,
+                        max_seq_len=max_seq_len,
+                        max_batch_size=max_batch_size,
+                        mapping=mapping,
+                        dtype=model_config.data_type,
+                        model_config=model_config,
+                        max_beam_width=max_beam_width,
+                    )
+                try:
+                    expected_primary, expected_secondary = expected_memory_bytes
+                    self.assertEqual(
+                        manager._primary_pool_memory_bytes,
+                        expected_primary,
+                        f"Test case '{case_name}' failed.\n"
+                        f"Expected primary pool memory bytes: {expected_primary}\n"
+                        f"Actual primary pool memory bytes: {manager._primary_pool_memory_bytes}"
+                    )
+                    self.assertEqual(
+                        manager._secondary_pool_memory_bytes,
+                        expected_secondary,
+                        f"Test case '{case_name}' failed.\n"
+                        f"Expected secondary pool memory bytes: {expected_secondary}\n"
+                        f"Actual secondary pool memory bytes: {manager._secondary_pool_memory_bytes}"
+                    )
+                except Exception as e:
+                    self.fail(f"Test case '{case_name}' failed: {e}")
+                finally:
+                    manager.shutdown()
+
 
 if __name__ == "__main__":
     unittest.main()