vllm-project · malaiwah · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
@@ -0,0 +1,51 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import numpy as np
+
+from vllm.v1.kv_offload.worker.cpu_gpu import expand_block_ids
+
+
+def test_expand_block_ids_full_blocks():
+    """Expand ids 0, 1 and 3 with factor 4 into four consecutive ids each."""
+    block_ids = np.array([0, 1, 3], dtype=np.int64)
+    expanded = np.empty(12, dtype=np.int64)
+    expand_block_ids(block_ids, block_size_factor=4, output=expanded)
+
+    expected = np.array(
+        [0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15],
+        dtype=np.int64,
+    )
+    # The result is read back from the caller-supplied buffer.
+    np.testing.assert_array_equal(expanded, expected)
+
+
+def test_expand_block_ids_partial_ranges():
+    """Check that offsets and counts select a sub-range inside each block."""
+    offsets = np.array([2, 0], dtype=np.int64)
+    counts = np.array([3, 3], dtype=np.int64)
+    expanded = np.empty(6, dtype=np.int64)
+    expand_block_ids(
+        np.array([0, 1], dtype=np.int64),
+        block_size_factor=8,
+        output=expanded,
+        block_offsets=offsets,
+        block_counts=counts,
+    )
+
+    # Block 0 contributes ids 2..4 and block 1 contributes ids 8..10.
+    expected = np.array([2, 3, 4, 8, 9, 10], dtype=np.int64)
+    np.testing.assert_array_equal(expanded, expected)
+
+
+def test_expand_block_ids_partial_ranges_can_repeat_same_block():
+    """Check that one block id may appear twice with different sub-ranges."""
+    offsets = np.array([0, 4], dtype=np.int64)
+    counts = np.array([2, 2], dtype=np.int64)
+    expanded = np.empty(4, dtype=np.int64)
+    expand_block_ids(
+        np.array([0, 0], dtype=np.int64),
+        block_size_factor=8,
+        output=expanded,
+        block_offsets=offsets,
+        block_counts=counts,
+    )
+
+    # Both entries refer to block 0: ids 0..1 first, then ids 4..5.
+    expected = np.array([0, 1, 4, 5], dtype=np.int64)
+    np.testing.assert_array_equal(expanded, expected)
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import numpy as np
+
+from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
+from vllm.v1.kv_offload.worker.cpu_gpu import build_transfer_indices
+
+
+def test_build_transfer_indices_whole_blocks_preserves_legacy_skip_behavior():
+    """Map CPU block 7 (factor 4) onto GPU blocks 3, 4 and 5 (factor 1).
+
+    Only three rows are expected even though the source factor is 4.
+    """
+    pairs = build_transfer_indices(
+        CPULoadStoreSpec([7]),
+        GPULoadStoreSpec([3, 4, 5], group_sizes=(3,)),
+        src_block_size_factor=4,
+        dst_block_size_factor=1,
+    )
+
+    # First column holds ids 29..31, second column the GPU ids 3..5.
+    expected = np.array([[29, 3], [30, 4], [31, 5]], dtype=np.int64)
+    np.testing.assert_array_equal(pairs, expected)
+
+
+def test_build_transfer_indices_supports_partial_gpu_ranges():
+    """Build a GPU-to-GPU mapping where both specs carry offsets and counts."""
+    factor = 8
+    source = GPULoadStoreSpec(
+        [0, 1],
+        group_sizes=(2,),
+        block_offsets=[2, 0],
+        block_counts=[3, 3],
+    )
+    target = GPULoadStoreSpec(
+        [5, 6],
+        group_sizes=(2,),
+        block_offsets=[1, 4],
+        block_counts=[3, 3],
+    )
+
+    pairs = build_transfer_indices(
+        source,
+        target,
+        src_block_size_factor=factor,
+        dst_block_size_factor=factor,
+    )
+
+    # Three rows per block pair: (0 -> 5) first, then (1 -> 6).
+    expected = np.array(
+        [[2, 41], [3, 42], [4, 43], [8, 52], [9, 53], [10, 54]],
+        dtype=np.int64,
+    )
+    np.testing.assert_array_equal(pairs, expected)
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from vllm import SamplingParams
+from vllm.utils.hashing import sha256
+from vllm.v1.core.kv_cache_utils import get_request_block_hasher, init_none_hash
+from vllm.v1.kv_offload.hashing import HybridChunkBlockHashList, RequestBlockHashList
+from vllm.v1.request import Request
+
+
+def make_request(num_tokens: int, block_size: int = 16) -> Request:
+    """Build request "r0" whose prompt is the token ids 0..num_tokens-1.
+
+    ``init_none_hash(sha256)`` is called first, and the request's block hasher
+    is created from ``block_size`` and ``sha256``.
+    """
+    init_none_hash(sha256)
+
+    params = SamplingParams(max_tokens=1)
+    params.update_from_generation_config({}, eos_token_id=100)
+
+    prompt = list(range(num_tokens))
+    hasher = get_request_block_hasher(block_size, sha256)
+    return Request(
+        request_id="r0",
+        prompt_token_ids=prompt,
+        sampling_params=params,
+        pooling_params=None,
+        block_hasher=hasher,
+    )
+
+
+def test_request_block_hash_list_matches_request_hashes_when_sizes_match():
+    """With equal block sizes the list must equal ``request.block_hashes``."""
+    block_size = 16
+    request = make_request(64, block_size=block_size)
+    hash_list = RequestBlockHashList(request, block_size, sha256)
+
+    assert list(hash_list) == request.block_hashes
+
+
+def test_request_block_hash_list_supports_arbitrary_block_sizes():
+    """Hash 65536 tokens in 16384-token blocks on a request hashed at 1056."""
+    request = make_request(65536, block_size=1056)
+    hashes = RequestBlockHashList(request, 16384, sha256)
+
+    # 65536 tokens split into 16384-token blocks give four entries.
+    assert len(hashes) == 4
+    # The first two entries must not be identical.
+    assert hashes[0] != hashes[1]
+
+
+def test_hybrid_chunk_block_hash_list_uses_per_group_granularity():
+    """Expect four chunk hashes for 65536 tokens, the first two differing."""
+    group_sizes = (16384, 16384, 16384, 1056)
+    request = make_request(65536, block_size=1056)
+    chunk_hashes = HybridChunkBlockHashList(
+        request,
+        group_block_sizes=group_sizes,
+        logical_chunk_size=16384,
+        hash_function=sha256,
+    )
+
+    assert len(chunk_hashes) == 4
+    assert chunk_hashes[0] != chunk_hashes[1]
+
+
+def test_hybrid_chunk_block_hash_list_caches_chunk_hashes():
+    """Repeated index access must return the value stored on first access."""
+    request = make_request(65536, block_size=1056)
+    chunk_hashes = HybridChunkBlockHashList(
+        request,
+        group_block_sizes=(16384, 1056),
+        logical_chunk_size=16384,
+        hash_function=sha256,
+    )
+
+    def cached_count() -> int:
+        # Re-read the attribute on every call instead of aliasing it once.
+        return len(chunk_hashes._chunk_hashes)
+
+    # Nothing is stored before the first lookup.
+    assert cached_count() == 0
+
+    # First lookup of chunk 0 stores exactly one entry holding that hash.
+    first = chunk_hashes[0]
+    assert cached_count() == 1
+    assert chunk_hashes._chunk_hashes[0] == first
+
+    # First lookup of chunk 1 adds a second entry.
+    second = chunk_hashes[1]
+    assert cached_count() == 2
+
+    # Looking both chunks up again yields the same values...
+    assert chunk_hashes[0] == first
+    assert chunk_hashes[1] == second
+
+    # ...and leaves the number of stored entries unchanged.
+    assert cached_count() == 2
+
+
+def test_hybrid_chunk_block_hash_list_skips_leading_unhashable_chunks():
+    """With a 50000-token group, the first hashable chunk index must be 3."""
+    group_sizes = (50000, 16384, 1056)
+    request = make_request(100000, block_size=1056)
+    chunk_hashes = HybridChunkBlockHashList(
+        request,
+        group_block_sizes=group_sizes,
+        logical_chunk_size=16384,
+        hash_function=sha256,
+    )
+
+    assert chunk_hashes.first_hashable_chunk_idx == 3
+    # The list length for the 100000-token request is expected to be 3.
+    assert len(chunk_hashes) == 3
@@ -0,0 +1,123 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.v1.kv_offload.planner import HybridOffloadPlanner
+
+
+def test_fixed_chunk_marks_large_groups_as_partial():
+    """65536-token groups get 16384-token units; the 1056 group stays whole."""
+    gpu_sizes = (65536, 65536, 65536, 1056)
+    chunk_size = 16384
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=gpu_sizes,
+        fixed_chunk_size=chunk_size,
+    )
+
+    assert planner.offload_unit_sizes == (16384, 16384, 16384, 1056)
+    assert planner.requires_partial_group_offload == (True, True, True, False)
+    # Expected factors equal unit size / 16: 16384 / 16 = 1024, 1056 / 16 = 66.
+    assert planner.group_hash_factors == (1024, 1024, 1024, 66)
+
+
+def test_fixed_chunk_rejects_non_positive_size():
+    """Constructing the planner with ``fixed_chunk_size=0`` must raise."""
+    expected_error = pytest.raises(ValueError, match="must be positive")
+    with expected_error:
+        HybridOffloadPlanner(
+            hash_block_size=16,
+            gpu_block_sizes=(65536, 1056),
+            fixed_chunk_size=0,
+        )
+
+
+def test_fixed_chunk_rejects_smaller_than_hash_block_size():
+    """A chunk size of 1024 below ``hash_block_size=1056`` must raise."""
+    expected_error = pytest.raises(
+        ValueError,
+        match="greater than or equal to hash_block_size",
+    )
+    with expected_error:
+        HybridOffloadPlanner(
+            hash_block_size=1056,
+            gpu_block_sizes=(65536, 1056),
+            fixed_chunk_size=1024,
+        )
+
+
+def test_fixed_chunk_leaves_indivisible_large_groups_unsplit():
+    """A 50000-token group, which 16384 does not divide, keeps its full size."""
+    gpu_sizes = (65536, 50000, 1056)
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=gpu_sizes,
+        fixed_chunk_size=16384,
+    )
+
+    # Only the 65536-token group is expected to use 16384-token units.
+    assert planner.offload_unit_sizes == (16384, 50000, 1056)
+    assert planner.requires_partial_group_offload == (True, False, False)
+    assert planner.first_hashable_chunk_idx == 3
+
+    # Expected chunk counts: 0 at 16384 tokens, 1 at 50000 tokens.
+    for num_tokens, expected_chunks in ((16_384, 0), (50_000, 1)):
+        assert planner.chunk_count_for_tokens(num_tokens) == expected_chunks
+
+
+def test_storable_prefix_uses_common_fully_covered_units():
+    """Check storable prefix lengths for several token counts."""
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=(65536, 65536, 65536, 1056),
+        fixed_chunk_size=16384,
+    )
+
+    # (token count, expected prefix); 15840 = 15 * 1056, 32736 = 31 * 1056.
+    cases = (
+        (10_000, 0),
+        (16_384, 15_840),
+        (20_000, 16_384),
+        (33_000, 32_736),
+    )
+    for num_tokens, expected_prefix in cases:
+        assert planner.storable_prefix_tokens(num_tokens) == expected_prefix
+
+
+def test_loadable_prefix_reconciles_existing_group_coverage():
+    """Check the loadable prefix for three per-group coverage tuples."""
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=(65536, 65536, 65536, 1056),
+        fixed_chunk_size=16384,
+    )
+
+    # In each case the expected value equals the smallest entry of the tuple.
+    cases = (
+        ((16384, 16384, 16384, 15840), 15840),
+        ((32768, 32768, 32768, 32736), 32736),
+        ((16384, 0, 16384, 15840), 0),
+    )
+    for group_coverage, expected_prefix in cases:
+        assert planner.loadable_prefix_tokens(group_coverage) == expected_prefix
+
+
+def test_planner_reports_partial_group_requirement():
+    """``requires_partial_group_offload_any`` must be ``True`` for this setup."""
+    gpu_sizes = (65536, 1056)
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=gpu_sizes,
+        fixed_chunk_size=16384,
+    )
+
+    # Identity check: the attribute must be the bool ``True`` itself.
+    assert planner.requires_partial_group_offload_any is True
+
+
+def test_planner_allows_engine_hash_size_to_differ_from_hybrid_chunk():
+    """Use ``hash_block_size=1056`` together with a 16384-token chunk size."""
+    gpu_sizes = (65536, 65536, 65536, 1056)
+    planner = HybridOffloadPlanner(
+        hash_block_size=1056,
+        gpu_block_sizes=gpu_sizes,
+        fixed_chunk_size=16384,
+    )
+
+    assert planner.offload_unit_sizes == (16384, 16384, 16384, 1056)
+    # The 16384-token groups are expected to report ``None`` as their factor.
+    assert planner.group_hash_factors == (None, None, None, 1)
+    # 15840 = 15 * 1056.
+    assert planner.chunk_prefix_tokens(1) == 15840
+
+
+def test_chunk_prefix_tokens_uses_common_covered_prefix():
+    """Check the token prefix reported for 0, 1, 2 and 4 chunks."""
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=(65536, 65536, 65536, 1056),
+        fixed_chunk_size=16384,
+    )
+
+    # Each expected value is the largest multiple of 1056 that does not
+    # exceed ``num_chunks * 16384`` (15 * 1056, 31 * 1056, 62 * 1056).
+    cases = ((0, 0), (1, 15840), (2, 32736), (4, 65472))
+    for num_chunks, expected_tokens in cases:
+        assert planner.chunk_prefix_tokens(num_chunks) == expected_tokens
+
+
+def test_chunk_count_for_tokens_inverts_common_prefix_boundaries():
+    """Check chunk counts just below and exactly at 15840 and 32736 tokens."""
+    planner = HybridOffloadPlanner(
+        hash_block_size=16,
+        gpu_block_sizes=(65536, 65536, 65536, 1056),
+        fixed_chunk_size=16384,
+    )
+
+    # (token count, expected chunk count); the count steps up at 15840, 32736.
+    cases = (
+        (0, 0),
+        (15839, 0),
+        (15840, 1),
+        (32735, 1),
+        (32736, 2),
+    )
+    for num_tokens, expected_chunks in cases:
+        assert planner.chunk_count_for_tokens(num_tokens) == expected_chunks
@@ -16,6 +16,7 @@
     KVConnectorBase_V1,
     KVConnectorMetadata,
     KVConnectorRole,
+    SupportsHMA,
 )
 from vllm.logger import init_logger
 from vllm.v1.attention.backend import AttentionMetadata
@@ -69,7 +70,7 @@ def __repr__(self) -> str:
         return f"<LMCacheKVEvents events={self.get_all_events()}>"
 
 
-class LMCacheConnectorV1(KVConnectorBase_V1):
+class LMCacheConnectorV1(KVConnectorBase_V1, SupportsHMA):
     @classmethod
     def requires_piecewise_for_cudagraph(cls, extra_config: dict[str, Any]) -> bool:
         """
@@ -339,6 +340,18 @@ def request_finished(
         """
         return self._lmcache_engine.request_finished(request, block_ids)
 
+    def request_finished_all_groups(
+        self,
+        request: "Request",
+        block_ids: tuple[list[int], ...],
+    ) -> tuple[bool, dict[str, Any] | None]:
+        """
+        Handle a request that has finished for all kv cache groups.
+
+        The per-group block ID lists are concatenated in group order and the
+        resulting flat list is passed, together with the request, to the
+        LMCache engine's ``request_finished``; its result is returned as is.
+        """
+        merged_ids: list[int] = []
+        for per_group_ids in block_ids:
+            merged_ids.extend(per_group_ids)
+        return self._lmcache_engine.request_finished(request, merged_ids)
+
     def take_events(self) -> Iterable["KVCacheEvent"]:
         """
         Take the KV cache events from the connector.