Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions tests/v1/core/test_kv_cache_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1447,7 +1447,10 @@ def test_allocate_with_lookahead():

# Test case 1: Requires additional lookahead tokens
kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
kv_cache_config=config,
max_model_len=100,
scheduler_block_size=block_size,
hash_block_size=block_size,
)
blocks = kv_cache_manager.allocate_slots(
request,
Expand All @@ -1458,7 +1461,10 @@ def test_allocate_with_lookahead():

# Test case 2: With precomputed blocks
kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
kv_cache_config=config,
max_model_len=100,
scheduler_block_size=block_size,
hash_block_size=block_size,
)
# required_blocks = ceil((3 + 2) / 4) = 2
blocks = kv_cache_manager.allocate_slots(
Expand All @@ -1471,7 +1477,10 @@ def test_allocate_with_lookahead():
# Test case 3: With precomputed blocks
# required_blocks = ceil((3 + 4) / 4) = 2
kv_cache_manager = KVCacheManager(
kv_cache_config=config, max_model_len=100, hash_block_size=block_size
kv_cache_config=config,
max_model_len=100,
scheduler_block_size=block_size,
hash_block_size=block_size,
)
blocks = kv_cache_manager.allocate_slots(
request,
Expand Down
73 changes: 43 additions & 30 deletions tests/v1/core/test_prefix_caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import copy
from collections.abc import Callable
from math import lcm

import pytest
import torch
Expand Down Expand Up @@ -92,6 +93,18 @@ def make_request(
)


def make_kv_cache_manager(kv_cache_config: KVCacheConfig, **kwargs) -> KVCacheManager:
    """Construct a ``KVCacheManager`` with a default ``scheduler_block_size``.

    If the caller does not supply ``scheduler_block_size`` in ``kwargs``, it is
    filled in with the least common multiple of the ``block_size`` of every
    KV cache group in ``kv_cache_config``. This mirrors
    ``resolve_kv_cache_block_sizes`` for the non-context-parallel case used by
    these tests, so individual call sites do not need to pass the value.

    Args:
        kv_cache_config: Config whose ``kv_cache_groups`` provide the per-group
            block sizes; it is also forwarded as the first positional argument.
        **kwargs: Keyword arguments forwarded to ``KVCacheManager``. An
            explicitly passed ``scheduler_block_size`` is left untouched.

    Returns:
        The newly constructed ``KVCacheManager``.
    """
    group_block_sizes = [
        group.kv_cache_spec.block_size for group in kv_cache_config.kv_cache_groups
    ]
    # The LCM is computed unconditionally, matching the eager evaluation of
    # the default value; it is only stored when the caller gave no override.
    derived_block_size = lcm(*group_block_sizes)
    if "scheduler_block_size" not in kwargs:
        kwargs["scheduler_block_size"] = derived_block_size
    return KVCacheManager(kv_cache_config, **kwargs)


def make_kv_cache_config(block_size: int, num_blocks: int) -> KVCacheConfig:
return KVCacheConfig(
num_blocks=num_blocks,
Expand Down Expand Up @@ -208,7 +221,7 @@ def make_kv_cache_config_three_types(
@pytest.mark.parametrize("hash_fn", [sha256, sha256_cbor])
def test_prefill(hash_fn):
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -331,7 +344,7 @@ def test_prefill(hash_fn):

def test_prefill_hybrid_model():
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config_hybrid_model(block_size, 21, 2),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -500,7 +513,7 @@ def test_prefill_hybrid_model():
def test_prefill_hybrid_model_eagle():
block_size = 16
kv_cache_config = make_kv_cache_config_hybrid_model(block_size, 31, 3)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -837,7 +850,7 @@ def test_prefill_hybrid_model_combinations(spec_types: list[str]):
num_blocks = 10 * num_groups

kv_cache_config = _make_hybrid_kv_cache_config(block_size, num_blocks, spec_types)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -912,7 +925,7 @@ def test_prefill_hybrid_model_combinations_eagle(
num_blocks = 10 * num_groups

kv_cache_config = _make_hybrid_kv_cache_config(block_size, num_blocks, spec_types)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -984,7 +997,7 @@ def test_prefill_hybrid_model_mamba_align():
kv_cache_config = _make_hybrid_kv_cache_config(
block_size, num_blocks, ["full", "mamba_align"]
)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1017,7 +1030,7 @@ def test_prefill_plp():
3. Schedule plp request; no hit should occur; validate blocks
"""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1125,7 +1138,7 @@ def test_prefill_plp():

def test_decode():
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1188,7 +1201,7 @@ def test_decode():

def test_evict():
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1247,7 +1260,7 @@ def test_hash_block_correct_reuse():
its hash metadata should be correctly reset.
"""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(16, 2),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1288,7 +1301,7 @@ def test_computed_blocks_not_evicted():
for a request if there are any other free blocks.
"""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 3),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1347,7 +1360,7 @@ def test_basic_prefix_caching_disabled():
This tests that the prefix caching is disabled.
"""
block_size = 4
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 5),
max_model_len=8192,
enable_caching=False,
Expand Down Expand Up @@ -1531,7 +1544,7 @@ def test_mm_prefix_caching():
"""

block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1639,7 +1652,7 @@ def test_cache_key_salting():
is separated cache as expected.
"""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1721,7 +1734,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():
the computed blocks should not be touched.
"""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1794,7 +1807,7 @@ def test_prefill_not_enough_free_blocks_with_computed_blocks():

def test_reset_prefix_cache():
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1835,7 +1848,7 @@ def test_reset_prefix_cache():
def test_prefix_cache_stats_disabled():
"""Test that prefix_cache_stats is None when log_stats is False."""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, 11),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -1915,7 +1928,7 @@ def test_kv_cache_events(blocks_to_cache: int):
# Should see a single block stored event with a blocks_to_cache number of
# block hashes
# take_events should reset the kv_event_queue
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, num_blocks),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2043,7 +2056,7 @@ def test_kv_cache_events_with_lora(blocks_to_cache: int):
num_blocks = blocks_to_cache + 1

# Create KVCacheManager with events enabled
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, num_blocks),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2101,7 +2114,7 @@ def test_block_stored_event_group_idx(group_id: int):
block_size = 4
num_tokens = block_size * 2

manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config_three_types(block_size, num_blocks=5),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2161,7 +2174,7 @@ def test_block_stored_event_group_idx_multiple_groups():
block_size = 4
num_tokens = block_size * 2

manager = KVCacheManager(
manager = make_kv_cache_manager(
KVCacheConfig(
num_blocks=5,
kv_cache_tensors=[],
Expand Down Expand Up @@ -2238,7 +2251,7 @@ def test_block_stored_event_group_idx_multiple_groups():
def test_block_stored_event_group_idx_out_of_bounds(monkeypatch):
"""Out-of-range group_idx events are returned without metadata annotation."""
block_size = 4
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, num_blocks=5),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2328,7 +2341,7 @@ def test_eagle_enabled_removes_last_block():
"""Verify Eagle does NOT remove blocks when request
length is divisible by block size."""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, num_blocks=10),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2361,7 +2374,7 @@ def test_eagle_enabled_removes_last_block():
def test_eagle_with_partial_blocks():
"""Test Eagle behavior with requests containing partial blocks."""
block_size = 16
manager = KVCacheManager(
manager = make_kv_cache_manager(
make_kv_cache_config(block_size, num_blocks=10),
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2397,7 +2410,7 @@ def test_eagle_with_sliding_window():
dtype=torch.float32,
sliding_window=block_size,
)
manager = KVCacheManager(
manager = make_kv_cache_manager(
KVCacheConfig(
num_blocks=10,
kv_cache_tensors=[],
Expand Down Expand Up @@ -2482,7 +2495,7 @@ def test_different_block_size():
),
],
)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config=kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2565,7 +2578,7 @@ def test_hybrid_cache_blocks_swa_tail_window_only():
),
],
)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config=kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2633,7 +2646,7 @@ def test_hybrid_cache_blocks_clamped_to_lcm():
),
],
)
manager = KVCacheManager(
manager = make_kv_cache_manager(
kv_cache_config=kv_cache_config,
max_model_len=8192,
enable_caching=True,
Expand Down Expand Up @@ -2781,7 +2794,7 @@ def test_can_fit_full_sequence_swa_cap_admits_long_prompt():
],
)

manager = KVCacheManager(
manager = make_kv_cache_manager(
config,
max_model_len=max_model_len,
max_num_batched_tokens=max_num_batched_tokens,
Expand Down Expand Up @@ -2837,7 +2850,7 @@ def test_can_fit_full_sequence_full_attention_still_gates_oversized():
],
)

manager = KVCacheManager(
manager = make_kv_cache_manager(
config,
max_model_len=max_model_len,
max_num_batched_tokens=max_num_batched_tokens,
Expand Down
3 changes: 3 additions & 0 deletions tests/v1/core/test_single_type_kv_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def get_sliding_window_manager(sliding_window_spec, block_pool, enable_caching=T
block_pool=block_pool,
enable_caching=enable_caching,
kv_cache_group_id=0,
scheduler_block_size=sliding_window_spec.block_size,
max_admission_blocks_per_request=10**9,
)

Expand All @@ -40,6 +41,7 @@ def get_chunked_local_attention_manager(
block_pool=block_pool,
enable_caching=enable_caching,
kv_cache_group_id=0,
scheduler_block_size=chunked_local_attention_spec.block_size,
max_admission_blocks_per_request=10**9,
)

Expand Down Expand Up @@ -458,6 +460,7 @@ def test_predictor_matches_allocator_blocks_calculation_with_admission_cap():
block_pool=block_pool,
enable_caching=False,
kv_cache_group_id=0,
scheduler_block_size=spec.block_size,
max_admission_blocks_per_request=cap,
)

Expand Down
8 changes: 7 additions & 1 deletion tests/v1/kv_connector/unit/offloading_connector/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from vllm.v1.core.kv_cache_utils import (
get_request_block_hasher,
init_none_hash,
resolve_kv_cache_block_sizes,
)
from vllm.v1.core.sched.async_scheduler import AsyncScheduler
from vllm.v1.core.sched.scheduler import Scheduler
Expand Down Expand Up @@ -230,13 +231,18 @@ def __init__(
vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks
self.num_kv_groups = len(kv_cache_config.kv_cache_groups)

scheduler_block_size, hash_block_size = resolve_kv_cache_block_sizes(
kv_cache_config, vllm_config
)

scheduler_cls = AsyncScheduler if async_scheduling else Scheduler
self.scheduler = scheduler_cls(
vllm_config=vllm_config,
kv_cache_config=kv_cache_config,
log_stats=True,
structured_output_manager=StructuredOutputManager(vllm_config),
block_size=block_size,
block_size=scheduler_block_size,
hash_block_size=hash_block_size,
)

self.worker_connector = OffloadingConnector(
Expand Down
Loading
Loading