diff --git a/tests/v1/core/test_single_type_kv_cache_manager.py b/tests/v1/core/test_single_type_kv_cache_manager.py
index 08fda7593e28..f59830dcd741 100644
--- a/tests/v1/core/test_single_type_kv_cache_manager.py
+++ b/tests/v1/core/test_single_type_kv_cache_manager.py
@@ -432,3 +432,52 @@ def test_chunked_local_attention_get_num_blocks_to_allocate():
         )
         == 15
     )
+
+
+def test_predictor_matches_allocator_blocks_calculation_with_admission_cap():
+    """In forward steps, `get_num_blocks_to_allocate` must return exactly what
+    `allocate_new_blocks` will pull; otherwise `block_pool.get_new_blocks`
+    raises `ValueError: Cannot get N free blocks from the pool`.
+    """
+    block_size = 2
+    sliding_window = 8  # 4-block live window
+    cap = sliding_window // block_size
+
+    spec = SlidingWindowSpec(
+        block_size=block_size,
+        num_kv_heads=1,
+        head_size=1,
+        dtype=torch.float32,
+        sliding_window=sliding_window,
+    )
+    block_pool = BlockPool(
+        num_gpu_blocks=100, enable_caching=True, hash_block_size=block_size
+    )
+    manager = SlidingWindowManager(
+        spec,
+        block_pool=block_pool,
+        enable_caching=False,
+        kv_cache_group_id=0,
+        max_admission_blocks_per_request=cap,
+    )
+
+    request_id = "req"
+    total_computed = 0
+    # Walk through request forward steps. Check num_blocks returned by
+    # `get_num_blocks_to_allocate` matches what `allocate_new_blocks` pulls.
+    for num_tokens in (4, 8, 12, 16):
+        predicted = manager.get_num_blocks_to_allocate(
+            request_id=request_id,
+            num_tokens=num_tokens,
+            new_computed_blocks=[],
+            total_computed_tokens=total_computed,
+            num_tokens_main_model=num_tokens,
+        )
+        new_blocks = manager.allocate_new_blocks(
+            request_id, num_tokens=num_tokens, num_tokens_main_model=num_tokens
+        )
+        assert predicted == len(new_blocks), (
+            f"num_tokens={num_tokens}: predictor returned {predicted} "
+            f"but allocator pulled {len(new_blocks)}"
+        )
+        total_computed = num_tokens
diff --git a/vllm/v1/core/kv_cache_coordinator.py b/vllm/v1/core/kv_cache_coordinator.py
index d53666f0d460..65993e804153 100644
--- a/vllm/v1/core/kv_cache_coordinator.py
+++ b/vllm/v1/core/kv_cache_coordinator.py
@@ -85,6 +85,7 @@ def get_num_blocks_to_allocate(
         num_encoder_tokens: int,
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -101,6 +102,10 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka
                 target model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, apply the recycling-aware
+                per-request admission cap (SWA / chunked-local). Set only by
+                the full-sequence admission gate; per-step allocation must
+                leave it False so the predictor matches `allocate_new_blocks`.

         Returns:
             The number of blocks to allocate.
@@ -111,7 +116,12 @@ def get_num_blocks_to_allocate(
                 # For cross-attention, we issue a single static allocation
                 # of blocks based on the number of encoder input tokens.
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
-                    request_id, num_encoder_tokens, [], 0, num_encoder_tokens
+                    request_id,
+                    num_encoder_tokens,
+                    [],
+                    0,
+                    num_encoder_tokens,
+                    apply_admission_cap=apply_admission_cap,
                 )
             else:
                 num_blocks_to_allocate += manager.get_num_blocks_to_allocate(
@@ -120,6 +130,7 @@ def get_num_blocks_to_allocate(
                     new_computed_blocks[i],
                     total_computed_tokens,
                     num_tokens_main_model,
+                    apply_admission_cap=apply_admission_cap,
                 )
         return num_blocks_to_allocate

diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index 83aa26bd96f0..ad82e5a1d5c5 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -257,6 +257,7 @@ def can_fit_full_sequence(
             num_encoder_tokens=num_encoder_tokens,
             total_computed_tokens=total_computed_tokens,
             num_tokens_main_model=full_num_tokens,
+            apply_admission_cap=True,
         )
         return num_blocks_to_allocate <= self.block_pool.get_num_free_blocks()

diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
index 0aa08f35801f..e8d3a6f75688 100644
--- a/vllm/v1/core/single_type_kv_cache_manager.py
+++ b/vllm/v1/core/single_type_kv_cache_manager.py
@@ -92,6 +92,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         """
         Get the number of blocks needed to be allocated for the request.
@@ -107,13 +108,16 @@ def get_num_blocks_to_allocate(
             num_tokens_main_model: The number of tokens for the main model (aka
                 target model in spec decode). w/o spec decode, it is num_tokens;
                 with spec decode, it is num_tokens - num_lookahead_tokens.
+            apply_admission_cap: If True, clamp `num_required_blocks` by
+                `_max_admission_blocks_per_request` for recycling-aware specs
+                (SWA, chunked-local).

         Returns:
             The number of blocks to allocate.
         """
         num_required_blocks = cdiv(num_tokens, self.block_size)

-        if self._max_admission_blocks_per_request is not None:
+        if apply_admission_cap and self._max_admission_blocks_per_request is not None:
             # Recycling-aware specs (SWA, chunked-local) cap the per-request
             # reservation here so admission matches the startup pool sizer
             # (`SlidingWindowSpec.max_admission_blocks_per_request` / its
@@ -893,6 +897,7 @@ def get_num_blocks_to_allocate(
         new_computed_blocks: Sequence[KVCacheBlock],
         total_computed_tokens: int,
         num_tokens_main_model: int,
+        apply_admission_cap: bool = False,
     ) -> int:
         assert isinstance(self.kv_cache_spec, MambaSpec)
         if (
@@ -917,6 +922,7 @@ def get_num_blocks_to_allocate(
                 new_computed_blocks,
                 total_computed_tokens,
                 num_tokens_main_model,
+                apply_admission_cap=apply_admission_cap,
             )
         else:
             # We don't allocate blocks for lookahead tokens in align mode, because if
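
The intended call pattern is that only the full-sequence admission gate (`can_fit_full_sequence`) opts into the cap, while per-step allocation leaves `apply_admission_cap=False` so the predictor stays in lockstep with `allocate_new_blocks`. Below is a minimal sketch of that split, reusing the `manager` and `cap` set up in the test above; the exact numbers in the comments assume a fresh request with no blocks allocated yet, and the request id is hypothetical.

# Sketch: per-step prediction vs. full-sequence admission check, assuming the
# SlidingWindowManager from the test (block_size=2, sliding_window=8, cap=4).
per_step = manager.get_num_blocks_to_allocate(
    request_id="req-gate",
    num_tokens=16,
    new_computed_blocks=[],
    total_computed_tokens=0,
    num_tokens_main_model=16,
)  # per-step path: no cap, 16 tokens at block_size 2 require 8 blocks

admission = manager.get_num_blocks_to_allocate(
    request_id="req-gate",
    num_tokens=16,
    new_computed_blocks=[],
    total_computed_tokens=0,
    num_tokens_main_model=16,
    apply_admission_cap=True,
)  # admission path: clamped by the 4-block recycling-aware cap

assert admission <= cap <= per_step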