diff --git a/tests/unit_tests/kv_offload/test_offloading_connector.py b/tests/unit_tests/kv_offload/test_offloading_connector.py index b3eb1bc0e0..a570adf404 100644 --- a/tests/unit_tests/kv_offload/test_offloading_connector.py +++ b/tests/unit_tests/kv_offload/test_offloading_connector.py @@ -27,7 +27,7 @@ init_none_hash, ) from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.kv_cache_interface import KVCacheConfig +from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheConfig, KVCacheGroupSpec from vllm.v1.kv_offload.abstract import ( LoadStoreSpec, OffloadingEvent, @@ -160,8 +160,24 @@ def __init__(self, offloaded_block_size: int, gpu_block_size: int, num_gpu_block }, ) + kv_cache_config = KVCacheConfig( + num_blocks=num_gpu_blocks, + kv_cache_tensors=[], + kv_cache_groups=[ + KVCacheGroupSpec( + ["layer"], + FullAttentionSpec( + block_size=gpu_block_size, + num_kv_heads=1, + head_size=1, + dtype=torch.float32, + ), + ) + ], + ) + self.scheduler: Scheduler = create_scheduler(vllm_config, num_blocks=num_gpu_blocks) - self.worker_connector = OffloadingConnector(vllm_config, KVConnectorRole.WORKER) + self.worker_connector = OffloadingConnector(vllm_config, KVConnectorRole.WORKER, kv_cache_config) # register worker kv_caches to enable OffloadingWorker creations self.worker_connector.register_cross_layers_kv_cache( diff --git a/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py b/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py index c764305afc..60a94a2d68 100644 --- a/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py +++ b/vllm_gaudi/v1/kv_offload/worker/cpu_hpu.py @@ -360,10 +360,12 @@ def get_handlers( attn_backends: dict[str, type[AttentionBackend]], ) -> Iterator[tuple[type[LoadStoreSpec], type[LoadStoreSpec], OffloadingHandler]]: if not self._handlers: + assert len(self.gpu_block_size) == 1 + gpu_block_size = self.gpu_block_size[0] self._handlers = CpuGpuOffloadingHandlers( attn_backends=attn_backends, - gpu_block_size=self.gpu_block_size, - cpu_block_size=self.offloaded_block_size, + gpu_block_size=gpu_block_size, + cpu_block_size=gpu_block_size * self.block_size_factor, num_cpu_blocks=self.num_blocks, gpu_caches=kv_caches, )