Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b2cc84f
Add hybrid offload planning scaffolding
Mar 22, 2026
f3e33eb
Add partial GPU block transfer metadata
Mar 22, 2026
1b5b6e1
Add per-group offload hashing
Mar 22, 2026
0a80bdb
Add per-group hybrid offload accounting
Mar 22, 2026
4cabdf1
Relax hybrid planner hash-size assumptions
Mar 22, 2026
7093036
Emit hybrid partial GPU transfer specs
Mar 22, 2026
4ae026f
Validate hybrid chunk sizing inputs
Mar 22, 2026
bd18159
Log offloading worker job completion
Mar 22, 2026
f8e3999
Tolerate compatible offload metadata on preemption
Mar 23, 2026
6bc5f32
Allow hybrid planner fallback for unsplittable groups
Mar 23, 2026
91cd4b3
Offset hybrid hash/store chunks for late full blocks
Mar 23, 2026
7eba4f5
Load first hybrid chunk from group start
Mar 23, 2026
049348a
Add hybrid offload validation and handle_preemptions compatibility
Mar 23, 2026
a524b28
Add SupportsHMA to LMCacheConnectorV1 for hybrid model support
Mar 23, 2026
870f9d9
Fix hybrid offload multi-hash block ordering bug
Mar 23, 2026
d9334a0
Add MultiConnector HMA support, load backpressure, and metrics safety
Mar 23, 2026
71f19a2
Add weighted load selection to MultiConnector
Mar 23, 2026
f29d199
Fix MultiConnector handle_preemptions for stock vLLM signature
Mar 23, 2026
f7b1e29
Add per-connector Prometheus metrics to MultiConnector
Mar 23, 2026
b8d51ad
Add per-connector Prometheus metrics to MultiConnector via stats pipe…
Mar 23, 2026
f59a957
Cache computed planner properties in HybridOffloadPlanner.__post_init__
Mar 23, 2026
9d10998
Materialise block hashes once in update_state_after_alloc
Mar 23, 2026
203bca4
Cache HybridChunkBlockHashList per request in OffloadingConnectorSche…
Mar 23, 2026
3d41022
Cache combined chunk hashes in HybridChunkBlockHashList
Mar 23, 2026
d10af16
Validate hybrid block/unit divisibility once at init
Mar 23, 2026
1803c52
Add test for HybridChunkBlockHashList chunk hash caching
Mar 23, 2026
161f06f
Remove unverified-external-KV assertion in mamba block alignment
Mar 23, 2026
0dfbcd0
offloading: remove dead _ensure_transfer_supported, downgrade hot-pat…
Mar 23, 2026
b29af8d
Fix MultiConnector deferred resolution, load fallback, and mamba alig…
malaiwah Mar 26, 2026
c229a73
Handle disagreeing KV group prefix lengths gracefully
malaiwah Mar 26, 2026
fcd2f7d
Fix crash on group-disagree: zero external tokens when falling back
malaiwah Mar 26, 2026
7d788f1
Add load timeout for external KV cache transfers
malaiwah Mar 26, 2026
0072b0b
Fix pre-commit failures: typos, line length, mypy type errors
malaiwah Mar 27, 2026
77160a8
fix(rebase): remove spurious imports and undefined args from merge re…
malaiwah Mar 29, 2026
5809bae
Fix mangled test utils.py from rebase conflict resolver
malaiwah Mar 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions tests/v1/kv_offload/test_cpu_gpu_expand.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np

from vllm.v1.kv_offload.worker.cpu_gpu import expand_block_ids


def test_expand_block_ids_full_blocks():
    """Expand whole blocks: each id yields ``block_size_factor`` slots."""
    block_ids = np.array([0, 1, 3], dtype=np.int64)
    # With a factor of 4, block b covers slots 4*b .. 4*b + 3.
    expected = np.array([0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15], dtype=np.int64)
    expanded = np.empty(12, dtype=np.int64)

    expand_block_ids(block_ids, block_size_factor=4, output=expanded)

    np.testing.assert_array_equal(expanded, expected)


def test_expand_block_ids_partial_ranges():
    """Expand only a sub-range of each block via offsets and counts."""
    starts = np.array([2, 0], dtype=np.int64)
    lengths = np.array([3, 3], dtype=np.int64)
    expanded = np.empty(6, dtype=np.int64)

    expand_block_ids(
        np.array([0, 1], dtype=np.int64),
        block_size_factor=8,
        output=expanded,
        block_offsets=starts,
        block_counts=lengths,
    )

    # Block 0 contributes 0*8 + 2 .. 4; block 1 contributes 1*8 + 0 .. 10.
    expected = np.array([2, 3, 4, 8, 9, 10], dtype=np.int64)
    np.testing.assert_array_equal(expanded, expected)


def test_expand_block_ids_partial_ranges_can_repeat_same_block():
    """The same block id may appear twice with different sub-ranges."""
    starts = np.array([0, 4], dtype=np.int64)
    lengths = np.array([2, 2], dtype=np.int64)
    expanded = np.empty(4, dtype=np.int64)

    expand_block_ids(
        np.array([0, 0], dtype=np.int64),
        block_size_factor=8,
        output=expanded,
        block_offsets=starts,
        block_counts=lengths,
    )

    # Two disjoint two-slot windows of block 0: [0, 1] and [4, 5].
    expected = np.array([0, 1, 4, 5], dtype=np.int64)
    np.testing.assert_array_equal(expanded, expected)
53 changes: 53 additions & 0 deletions tests/v1/kv_offload/test_cpu_gpu_mapping.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import numpy as np

from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
from vllm.v1.kv_offload.worker.cpu_gpu import build_transfer_indices


def test_build_transfer_indices_whole_blocks_preserves_legacy_skip_behavior():
    """Map one CPU block (factor 4) onto three GPU blocks (factor 1).

    CPU block 7 spans sub-blocks 28..31; the expected mapping pairs only
    29..31 with GPU blocks 3..5, i.e. the leading sub-block is skipped.
    """
    cpu_side = CPULoadStoreSpec([7])
    gpu_side = GPULoadStoreSpec([3, 4, 5], group_sizes=(3,))
    expected = np.array([[29, 3], [30, 4], [31, 5]], dtype=np.int64)

    pairs = build_transfer_indices(
        cpu_side,
        gpu_side,
        src_block_size_factor=4,
        dst_block_size_factor=1,
    )

    np.testing.assert_array_equal(pairs, expected)


def test_build_transfer_indices_supports_partial_gpu_ranges():
    """Both endpoints are GPU specs restricted to partial block ranges."""
    source = GPULoadStoreSpec(
        [0, 1],
        group_sizes=(2,),
        block_offsets=[2, 0],
        block_counts=[3, 3],
    )
    target = GPULoadStoreSpec(
        [5, 6],
        group_sizes=(2,),
        block_offsets=[1, 4],
        block_counts=[3, 3],
    )
    # Source slots: 0*8+2.. and 1*8+0..; target slots: 5*8+1.. and 6*8+4..
    expected = np.array(
        [[2, 41], [3, 42], [4, 43], [8, 52], [9, 53], [10, 54]],
        dtype=np.int64,
    )

    pairs = build_transfer_indices(
        source,
        target,
        src_block_size_factor=8,
        dst_block_size_factor=8,
    )

    np.testing.assert_array_equal(pairs, expected)
94 changes: 94 additions & 0 deletions tests/v1/kv_offload/test_hashing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from vllm import SamplingParams
from vllm.utils.hashing import sha256
from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
from vllm.v1.kv_offload.hashing import HybridChunkBlockHashList, RequestBlockHashList
from vllm.v1.request import Request


def make_request(num_tokens: int, block_size: int = 16) -> Request:
    """Build a ``Request`` whose prompt is the token ids ``0..num_tokens-1``.

    The request is hashed with sha256 at ``block_size`` granularity and is
    limited to a single generated token.
    """
    init_none_hash(sha256)

    params = SamplingParams(max_tokens=1)
    params.update_from_generation_config({}, eos_token_id=100)

    prompt = list(range(num_tokens))
    hasher = get_request_block_hasher(block_size, sha256)
    return Request(
        request_id="r0",
        prompt_token_ids=prompt,
        sampling_params=params,
        pooling_params=None,
        block_hasher=hasher,
    )


def test_request_block_hash_list_matches_request_hashes_when_sizes_match():
    """With equal block sizes the list equals the request's own hashes."""
    req = make_request(64, block_size=16)

    recomputed = list(RequestBlockHashList(req, 16, sha256))

    assert recomputed == req.block_hashes


def test_request_block_hash_list_supports_arbitrary_block_sizes():
    """A 16384-token view over a request hashed at 1056 yields 4 hashes."""
    req = make_request(65536, block_size=1056)
    hashes = RequestBlockHashList(req, 16384, sha256)

    # 65536 / 16384 == 4 blocks.
    count = len(hashes)
    assert count == 4

    first = hashes[0]
    second = hashes[1]
    assert first != second


def test_hybrid_chunk_block_hash_list_uses_per_group_granularity():
    """Mixed group block sizes still produce one hash per logical chunk."""
    req = make_request(65536, block_size=1056)
    chunk_hashes = HybridChunkBlockHashList(
        req,
        group_block_sizes=(16384, 16384, 16384, 1056),
        logical_chunk_size=16384,
        hash_function=sha256,
    )

    count = len(chunk_hashes)
    assert count == 4

    first = chunk_hashes[0]
    second = chunk_hashes[1]
    assert first != second


def test_hybrid_chunk_block_hash_list_caches_chunk_hashes():
    """Accessing the same index twice should return the cached value."""
    req = make_request(65536, block_size=1056)
    chunk_hashes = HybridChunkBlockHashList(
        req,
        group_block_sizes=(16384, 1056),
        logical_chunk_size=16384,
        hash_function=sha256,
    )

    # Read the private cache through the attribute on every call rather than
    # holding a reference to it, in case the object rebinds it.
    cached_count = lambda: len(chunk_hashes._chunk_hashes)  # noqa: E731

    # Nothing is cached before the first lookup.
    assert cached_count() == 0

    # The first lookup of chunk 0 fills exactly one cache slot.
    first = chunk_hashes[0]
    assert cached_count() == 1
    assert chunk_hashes._chunk_hashes[0] == first

    # Looking up chunk 1 adds a second slot.
    second = chunk_hashes[1]
    assert cached_count() == 2

    # Repeated lookups return the same values ...
    assert chunk_hashes[0] == first
    assert chunk_hashes[1] == second

    # ... and leave the cache size unchanged.
    assert cached_count() == 2


def test_hybrid_chunk_block_hash_list_skips_leading_unhashable_chunks():
    """A 50000-token group pushes the first hashable chunk index to 3."""
    req = make_request(100000, block_size=1056)
    chunk_hashes = HybridChunkBlockHashList(
        req,
        group_block_sizes=(50000, 16384, 1056),
        logical_chunk_size=16384,
        hash_function=sha256,
    )

    first_idx = chunk_hashes.first_hashable_chunk_idx
    assert first_idx == 3

    remaining = len(chunk_hashes)
    assert remaining == 3
123 changes: 123 additions & 0 deletions tests/v1/kv_offload/test_planner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm.v1.kv_offload.planner import HybridOffloadPlanner


def test_fixed_chunk_marks_large_groups_as_partial():
    """Groups larger than the fixed chunk are offloaded in chunk-sized units."""
    config = dict(
        hash_block_size=16,
        gpu_block_sizes=(65536, 65536, 65536, 1056),
        fixed_chunk_size=16384,
    )
    plan = HybridOffloadPlanner(**config)

    unit_sizes = plan.offload_unit_sizes
    assert unit_sizes == (16384, 16384, 16384, 1056)

    partial_flags = plan.requires_partial_group_offload
    assert partial_flags == (True, True, True, False)

    # 16384 / 16 == 1024 and 1056 / 16 == 66.
    hash_factors = plan.group_hash_factors
    assert hash_factors == (1024, 1024, 1024, 66)


def test_fixed_chunk_rejects_non_positive_size():
    """A zero ``fixed_chunk_size`` must raise ``ValueError``."""
    bad_config = dict(
        hash_block_size=16,
        gpu_block_sizes=(65536, 1056),
        fixed_chunk_size=0,
    )

    with pytest.raises(ValueError, match="must be positive"):
        HybridOffloadPlanner(**bad_config)


def test_fixed_chunk_rejects_smaller_than_hash_block_size():
    """A chunk size (1024) below the hash block size (1056) must raise."""
    bad_config = dict(
        hash_block_size=1056,
        gpu_block_sizes=(65536, 1056),
        fixed_chunk_size=1024,
    )

    with pytest.raises(ValueError, match="greater than or equal to hash_block_size"):
        HybridOffloadPlanner(**bad_config)


def test_fixed_chunk_leaves_indivisible_large_groups_unsplit():
    """A 50000-token group (not a multiple of 16384) keeps its full size."""
    config = dict(
        hash_block_size=16,
        gpu_block_sizes=(65536, 50000, 1056),
        fixed_chunk_size=16384,
    )
    plan = HybridOffloadPlanner(**config)

    unit_sizes = plan.offload_unit_sizes
    assert unit_sizes == (16384, 50000, 1056)

    partial_flags = plan.requires_partial_group_offload
    assert partial_flags == (True, False, False)

    first_idx = plan.first_hashable_chunk_idx
    assert first_idx == 3

    # No chunk is counted until the unsplit 50000-token group is covered.
    chunk_counts = ((16_384, 0), (50_000, 1))
    for num_tokens, chunks in chunk_counts:
        assert plan.chunk_count_for_tokens(num_tokens) == chunks


def test_storable_prefix_uses_common_fully_covered_units():
    """Check ``storable_prefix_tokens`` at four token counts.

    The expected values are multiples of 1056 or 16384, e.g.
    15840 == 15 * 1056 and 32736 == 31 * 1056.
    """
    plan = HybridOffloadPlanner(
        hash_block_size=16,
        gpu_block_sizes=(65536, 65536, 65536, 1056),
        fixed_chunk_size=16384,
    )

    expectations = (
        (10_000, 0),
        (16_384, 15_840),
        (20_000, 16_384),
        (33_000, 32_736),
    )
    for num_tokens, storable in expectations:
        assert plan.storable_prefix_tokens(num_tokens) == storable


def test_loadable_prefix_reconciles_existing_group_coverage():
    """The loadable prefix is bounded by the least-covered group."""
    plan = HybridOffloadPlanner(
        hash_block_size=16,
        gpu_block_sizes=(65536, 65536, 65536, 1056),
        fixed_chunk_size=16384,
    )

    expectations = (
        ((16384, 16384, 16384, 15840), 15840),
        ((32768, 32768, 32768, 32736), 32736),
        # One group with zero coverage leaves nothing loadable.
        ((16384, 0, 16384, 15840), 0),
    )
    for per_group_coverage, loadable in expectations:
        assert plan.loadable_prefix_tokens(per_group_coverage) == loadable


def test_planner_reports_partial_group_requirement():
    """One oversized group is enough to set the aggregate partial flag."""
    config = dict(
        hash_block_size=16,
        gpu_block_sizes=(65536, 1056),
        fixed_chunk_size=16384,
    )
    plan = HybridOffloadPlanner(**config)

    needs_partial = plan.requires_partial_group_offload_any
    assert needs_partial is True


def test_planner_allows_engine_hash_size_to_differ_from_hybrid_chunk():
    """A hash block size of 1056 does not divide the 16384-token chunk.

    The planner is still expected to build, reporting ``None`` hash factors
    for the chunked groups and a factor of 1 for the 1056-token group.
    """
    config = dict(
        hash_block_size=1056,
        gpu_block_sizes=(65536, 65536, 65536, 1056),
        fixed_chunk_size=16384,
    )
    plan = HybridOffloadPlanner(**config)

    unit_sizes = plan.offload_unit_sizes
    assert unit_sizes == (16384, 16384, 16384, 1056)

    hash_factors = plan.group_hash_factors
    assert hash_factors == (None, None, None, 1)

    # 15840 == 15 * 1056.
    one_chunk_prefix = plan.chunk_prefix_tokens(1)
    assert one_chunk_prefix == 15840


def test_chunk_prefix_tokens_uses_common_covered_prefix():
    """Chunk counts map to prefixes that are multiples of 1056 tokens."""
    plan = HybridOffloadPlanner(
        hash_block_size=16,
        gpu_block_sizes=(65536, 65536, 65536, 1056),
        fixed_chunk_size=16384,
    )

    # 15840 == 15 * 1056, 32736 == 31 * 1056, 65472 == 62 * 1056.
    expectations = ((0, 0), (1, 15840), (2, 32736), (4, 65472))
    for num_chunks, prefix_tokens in expectations:
        assert plan.chunk_prefix_tokens(num_chunks) == prefix_tokens


def test_chunk_count_for_tokens_inverts_common_prefix_boundaries():
    """The chunk count steps up exactly at 15840 and 32736 tokens."""
    plan = HybridOffloadPlanner(
        hash_block_size=16,
        gpu_block_sizes=(65536, 65536, 65536, 1056),
        fixed_chunk_size=16384,
    )

    # Each boundary is probed one token below and exactly at the threshold.
    expectations = (
        (0, 0),
        (15839, 0),
        (15840, 1),
        (32735, 1),
        (32736, 2),
    )
    for num_tokens, chunks in expectations:
        assert plan.chunk_count_for_tokens(num_tokens) == chunks
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
KVConnectorBase_V1,
KVConnectorMetadata,
KVConnectorRole,
SupportsHMA,
)
from vllm.logger import init_logger
from vllm.v1.attention.backend import AttentionMetadata
Expand Down Expand Up @@ -69,7 +70,7 @@ def __repr__(self) -> str:
return f"<LMCacheKVEvents events={self.get_all_events()}>"


class LMCacheConnectorV1(KVConnectorBase_V1):
class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
@classmethod
def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
"""
Expand Down Expand Up @@ -339,6 +340,18 @@ def request_finished(
"""
return self._lmcache_engine.request_finished(request, block_ids)

def request_finished_all_groups(
self,
request: "Request",
block_ids: tuple[list[int], ...],
) -> tuple[bool, dict[str, Any] | None]:
"""
Called when a request has finished for all kv cache groups.
Flatten per-group block IDs and delegate to request_finished.
"""
flat_block_ids = [bid for group_ids in block_ids for bid in group_ids]
return self._lmcache_engine.request_finished(request, flat_block_ids)

def take_events(self) -> Iterable["KVCacheEvent"]:
"""
Take the KV cache events from the connector.
Expand Down
Loading
Loading