diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 5287cd9c1bfb..36b5a4d6d86f 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -1,5 +1,8 @@
 from typing import Dict, FrozenSet, List, Optional, Tuple
 
+import torch
+import torch.distributed
+
 from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
                                         DeviceAwareBlockAllocator)
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
@@ -318,7 +321,18 @@ def get_common_computed_block_ids(
             device = Device.GPU
         return self._allocators[device].get_common_computed_block_ids(
             computed_seq_block_ids)
-
+
+    def get_kv_tensor_from_block_id(self, block_id: int) -> torch.Tensor:
+        """Return a (1, 1) int64 CUDA tensor holding ``block_id``."""
+        # NOTE(review): assumes one GPU per rank with CUDA device index equal
+        # to the global distributed rank -- verify for multi-node setups.
+        if torch.distributed.is_initialized():
+            dev = torch.device(f"cuda:{torch.distributed.get_rank()}")
+        else:
+            dev = torch.device("cuda:0")
+        return torch.tensor([block_id], device=dev,
+                            dtype=torch.int64).view(-1, 1)
+
     @property
     def all_block_ids(self) -> FrozenSet[int]:
         return frozenset(self._block_ids_to_allocator.keys())
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index ab39832bc1f6..4890833c80f5 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -1,6 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
 
+import torch
+
 from vllm.utils import Device
 
 BlockId = int
@@ -254,6 +256,10 @@ def get_common_computed_block_ids(
             self, computed_seq_block_ids: List[List[int]]) -> List[int]:
         pass
 
+    @abstractmethod
+    def get_kv_tensor_from_block_id(self, block_id: int) -> torch.Tensor:
+        pass
+
     @abstractmethod
     def get_num_blocks_touched(self,
                                blocks: List[Block],
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index b48ea1b19b82..2fae4682845b 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -4,6 +4,8 @@
 from typing import Sequence as GenericSequence
 from typing import Tuple
 
+import torch
+
 from vllm.core.block.block_table import BlockTable
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
 from vllm.core.block.interfaces import Block
@@ -315,6 +317,19 @@ def get_common_computed_block_ids(
         # NOTE(sang): This assumes seq_block_ids doesn't contain any None.
         return self.block_allocator.get_common_computed_block_ids(
             computed_seq_block_ids)  # type: ignore
+
+    def get_kv_cache_from_seq(self, seq_group: SequenceGroup) -> torch.Tensor:
+        """Return the KV tensor for the first GPU block of the group's
+        first RUNNING sequence."""
+        abs_block_ids = self.get_block_table(
+            seq_group.get_seqs(status=SequenceStatus.RUNNING)[0])
+        rel_block_id = self.block_allocator.get_physical_block_id(
+            Device.GPU, abs_block_ids[0])
+        return self.block_allocator.get_kv_tensor_from_block_id(rel_block_id)
+
+    def get_kv_cache_from_block(self, seq_group: SequenceGroup) -> torch.Tensor:
+        """Alias for :meth:`get_kv_cache_from_seq` (kept for API compat)."""
+        return self.get_kv_cache_from_seq(seq_group)
 
     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
         if parent_seq.seq_id not in self.block_tables: