vllm-project · ywang96 · Apr 30, 2026 · Apr 29, 2026 · Apr 29, 2026
@@ -432,3 +432,52 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
         )
         == 15
     )
+
+
+def test_predictor_matches_allocator_blocks_calculation_with_admission_cap():
+    """In forward steps, `get_num_blocks_to_allocate` must return exactly what
+    `allocate_new_blocks` will pull; otherwise `block_pool.get_new_blocks`
+    raises `ValueError: Cannot get N free blocks from the pool`.
+    """
+    block_size = 2
+    sliding_window = 8  # 8 tokens / block_size 2 = 4-block live window
+    cap = sliding_window // block_size  # admission cap in blocks (4 here)
+
+    spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=sliding_window,
+    )
+    block_pool = BlockPool(
+        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
+    )
+    manager = SlidingWindowManager(
+        spec,
+        block_pool=block_pool,
+        enable_caching=False,
+        kv_cache_group_id=0,
+        max_admission_blocks_per_request=cap,  # cap is set on the manager
+    )
+
+    request_id = "req"
+    total_computed = 0  # passed as total_computed_tokens; updated per step
+    # Walk the request through successive forward steps; at each one the count
+    # from `get_num_blocks_to_allocate` must equal what `allocate_new_blocks` pulls.
+    for num_tokens in (4, 8, 12, 16):  # 12 and 16 exceed the 8-token window
+        predicted = manager.get_num_blocks_to_allocate(
+            request_id=request_id,
+            num_tokens=num_tokens,
+            new_computed_blocks=[],
+            total_computed_tokens=total_computed,
+            num_tokens_main_model=num_tokens,
+        )  # apply_admission_cap is not passed, so its default applies
+        new_blocks = manager.allocate_new_blocks(
+            request_id, num_tokens=num_tokens, num_tokens_main_model=num_tokens
+        )
+        assert predicted == len(new_blocks), (
+            f"num_tokens={num_tokens}: predictor returned {predicted} "
+            f"but allocator pulled {len(new_blocks)}"
+        )
+        total_computed = num_tokens  # fed to the next step's prediction
@@ -85,6 +85,7 @@ def get_num_blocks_to_allocate(
         num_encoder_tokens: int,
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -101,6 +102,10 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka target
                 model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, apply the recycling-aware
+                per-request admission cap (SWA / chunked-local). Set only by
+                the full-sequence admission gate; per-step allocation must
+                leave it False so the predictor matches `allocate_new_blocks`.
 
         Returns:
             The number of blocks to allocate.
@@ -111,7 +116,12 @@ def get_num_blocks_to_allocate(
                 # For cross-attention, we issue a single static allocation
                 # of blocks based on the number of encoder input tokens.
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_encoder_tokens, [], 0, num_encoder_tokens
+                    request_id,
+                    num_encoder_tokens,
+                    [],
+                    0,
+                    num_encoder_tokens,
+                    apply_admission_cap=apply_admission_cap,
                 )
             else:
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
@@ -120,6 +130,7 @@ def get_num_blocks_to_allocate(
                     new_computed_blocks[i],
                     total_computed_tokens,
                     num_tokens_main_model,
+                    apply_admission_cap=apply_admission_cap,
                 )
         return num_blocks_to_allocate
 

@@ -257,6 +257,7 @@ def can_fit_full_sequence(
             num_encoder_tokens=num_encoder_tokens,
             total_computed_tokens=total_computed_tokens,
             num_tokens_main_model=full_num_tokens,
+            apply_admission_cap=True,
         )
 
         return num_blocks_to_allocate <= self.block_pool.get_num_free_blocks()

@@ -92,6 +92,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -107,13 +108,16 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka target
                 model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, clamp `num_required_blocks` by
+                `_max_admission_blocks_per_request` for recycling-aware specs
+                (SWA, chunked-local).
 
         Returns:
             The number of blocks to allocate.
         """
 
         num_required_blocks = cdiv(num_tokens, self.block_size)
-        if self._max_admission_blocks_per_request is not None:
+        if apply_admission_cap and self._max_admission_blocks_per_request is not None:
             # Recycling-aware specs (SWA, chunked-local) cap the per-request
             # reservation here so admission matches the startup pool sizer
             # (`SlidingWindowSpec.max_admission_blocks_per_request` / its
@@ -893,6 +897,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if (
@@ -917,6 +922,7 @@ def get_num_blocks_to_allocate(
                 new_computed_blocks,
                 total_computed_tokens,
                 num_tokens_main_model,
+                apply_admission_cap=apply_admission_cap,
             )
         else:
             # We don't allocate blocks for lookahead tokens in align mode, because if