
Commit 9fecb35

fix:
- expose `num_free_blocks_per_window_size` via kv_cache_stats.
- with `num_free_blocks_per_window_size`, update `get_num_free_blocks()` and `get_num_available_tokens()`

Signed-off-by: qixiang-99 <[email protected]>
1 parent c16f048 commit 9fecb35

File tree

4 files changed (+32, -6 lines)


cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 8 additions & 0 deletions
@@ -180,6 +180,8 @@ struct KvCacheStats
     SizeType32 missedBlocks;
     // Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
     float cacheHitRate;
+    // Number of free blocks for every configured attention-window size.
+    std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize;
 };

 // Basic building block of a paged KV cache - a single
@@ -1454,6 +1456,11 @@ class KVCacheManager : public BaseKVCacheManager
         return mBlockManager.getNumMissedBlocks();
     }

+    [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const
+    {
+        return mBlockManager.getNumFreeBlocksPerWindowSize();
+    }
+
     [[nodiscard]] KvCacheStats getKvCacheStats() const override
     {
         KvCacheStats kvCacheStats;
@@ -1468,6 +1475,7 @@ class KVCacheManager : public BaseKVCacheManager
         kvCacheStats.cacheHitRate = kvCacheStats.reusedBlocks == 0 ? 0
             : static_cast<float>(kvCacheStats.reusedBlocks)
                 / static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
+        kvCacheStats.numFreeBlocksPerWindowSize = getNumFreeBlocksPerWindowSize();
         return kvCacheStats;
     }
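For context, VSWA (Variable Sliding Window Attention) configures more than one attention-window size, and the new numFreeBlocksPerWindowSize field reports the current free-block count for each configured window size. A minimal Python sketch of the shape this map takes once surfaced as a stat; the window sizes and counts below are hypothetical, not taken from this commit:

# Hypothetical snapshot of the per-window free-block map,
# keyed by attention-window size (in tokens).
num_free_blocks_per_window_size = {
    512: 120,    # free blocks backing the 512-token sliding-window layers
    32768: 45,   # free blocks backing the 32768-token window layers
}

# The most constrained window size bounds what can still be allocated.
min_free_blocks = min(num_free_blocks_per_window_size.values())  # 45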

cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp

Lines changed: 2 additions & 1 deletion
@@ -298,7 +298,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
         .def_readwrite("alloc_new_blocks", &tbk::KvCacheStats::allocNewBlocks)
         .def_readwrite("reused_blocks", &tbk::KvCacheStats::reusedBlocks)
         .def_readwrite("missed_blocks", &tbk::KvCacheStats::missedBlocks)
-        .def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate);
+        .def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
+        .def_readwrite("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize);

     py::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
         .def(py::init<>())
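With the binding above, the per-window map is readable from Python next to the existing counters. A minimal sketch, assuming an already-constructed KVCacheManager binding object (named kv_cache_manager here for illustration):

stats = kv_cache_manager.get_kv_cache_stats()

# Existing aggregate counter.
print(stats.free_num_blocks)

# New per-window map added by this commit: {window_size: free_blocks}.
for window_size, free_blocks in stats.num_free_blocks_per_window_size.items():
    print(f"window {window_size}: {free_blocks} free blocks")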

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 20 additions & 5 deletions
@@ -193,10 +193,10 @@ def __init__(
             else 0)

         # Determine if this is VSWA (Variable Sliding Window Attention)
-        is_vswa = len(self.max_attention_window_vec) > 1
+        self.is_vswa = len(self.max_attention_window_vec) > 1

         # Calculate blocks per window using appropriate method
-        if is_vswa:
+        if self.is_vswa:
             # VSWA case: use C++ implementation for variable window sizes
             # model config check
             if model_config is None:
@@ -523,14 +523,29 @@ def get_batch_cache_indices(
         return result

     def get_num_free_blocks(self) -> int:
-        return self.impl.get_kv_cache_stats().free_num_blocks
+        if self.is_vswa:
+            logger.info(
+                f"For VSWA case, we return the minimum of the number of free blocks for each window size: {self.impl.get_kv_cache_stats().num_free_blocks_per_window_size}"
+            )
+            return min(self.impl.get_kv_cache_stats().
+                       num_free_blocks_per_window_size.values())
+        else:
+            return self.impl.get_kv_cache_stats().free_num_blocks

     def get_num_kv_blocks(self, num_tokens: int) -> int:
         return (num_tokens + self.tokens_per_block - 1) // self.tokens_per_block

     def get_num_available_tokens(self, max_num_draft_tokens: int = 0) -> int:
-        return (self.get_num_free_blocks() * self.tokens_per_block -
-                self.num_extra_kv_tokens - max_num_draft_tokens)
+        if self.max_attention_window_vec and len(
+                self.max_attention_window_vec) > 1:
+            # VSWA case: the available tokens should be the minimum of the available tokens for each window size
+            min_free_blocks = min(self.impl.get_kv_cache_stats().
+                                  num_free_blocks_per_window_size.values())
+            res = min_free_blocks * self.tokens_per_block - self.num_extra_kv_tokens - max_num_draft_tokens
+        else:
+            res = (self.get_num_free_blocks() * self.tokens_per_block -
+                   self.num_extra_kv_tokens - max_num_draft_tokens)
+        return res

     def get_buffers(self, layer_idx: int) -> Optional[torch.Tensor]:
         layer_offset = self.layer_offsets[layer_idx]
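To make the VSWA branch of get_num_available_tokens concrete, a small worked sketch; the block counts, tokens_per_block value, and draft-token budget below are hypothetical:

# Hypothetical inputs.
num_free_blocks_per_window_size = {512: 80, 32768: 30}
tokens_per_block = 64
num_extra_kv_tokens = 0
max_num_draft_tokens = 4

# VSWA path: the most constrained window size determines the usable blocks.
min_free_blocks = min(num_free_blocks_per_window_size.values())  # 30

# Convert blocks to tokens and subtract the reserved budgets.
available_tokens = (min_free_blocks * tokens_per_block
                    - num_extra_kv_tokens - max_num_draft_tokens)
print(available_tokens)  # 30 * 64 - 0 - 4 = 1916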

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ l0_h100:
   - unittest/disaggregated/test_router.py
   - unittest/disaggregated/test_remoteDictionary.py
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_chunked_prefill
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)
