Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion vllm/core/block/cpu_gpu_block_allocator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import Dict, FrozenSet, List, Optional, Tuple

import torch
import torch.distributed

from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
Expand Down Expand Up @@ -318,7 +321,14 @@ def get_common_computed_block_ids(
device = Device.GPU
return self._allocators[device].get_common_computed_block_ids(
computed_seq_block_ids)


def get_kv_tensor_from_block_id(self, block_id: int) -> torch.Tensor:
    """Return a ``(1, 1)`` int64 tensor holding *block_id*.

    Renamed from ``get_kv_cache_from_block_id`` and given ``self`` so the
    signature matches the ``DeviceAwareBlockAllocator`` abstract method
    and the call in ``BlockManagerV2.get_kv_cache_from_seq``.

    Device placement: the CUDA device for the current distributed rank
    when a process group is initialized, otherwise ``cuda:0``; falls back
    to CPU when CUDA is unavailable so CPU-only hosts do not crash.

    NOTE(review): this returns a tensor *containing the block id*, not
    the KV-cache contents — confirm downstream consumers expect the id
    rather than the cache data.

    Args:
        block_id: Physical block id to wrap in a tensor.

    Returns:
        A ``(1, 1)`` ``torch.int64`` tensor containing ``block_id``.
    """
    if torch.cuda.is_available():
        if torch.distributed.is_initialized():
            # One rank per GPU is assumed here — TODO confirm mapping.
            device = torch.device(f"cuda:{torch.distributed.get_rank()}")
        else:
            device = torch.device("cuda:0")
    else:
        # Robustness: keep the method usable on CUDA-less machines.
        device = torch.device("cpu")
    return torch.tensor([block_id], device=device,
                        dtype=torch.int64).view(-1, 1)

@property
def all_block_ids(self) -> FrozenSet[int]:
return frozenset(self._block_ids_to_allocator.keys())
Expand Down
6 changes: 6 additions & 0 deletions vllm/core/block/interfaces.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from abc import ABC, abstractmethod
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple

import torch

from vllm.utils import Device

BlockId = int
Expand Down Expand Up @@ -254,6 +256,10 @@ def get_common_computed_block_ids(
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
pass

@abstractmethod
def get_kv_tensor_from_block_id(self, block_id: int) -> torch.Tensor:
    """Return the KV tensor associated with *block_id*.

    Abstract: concrete allocators must implement this. ``self`` added so
    the declared signature matches how callers invoke it as an instance
    method (``allocator.get_kv_tensor_from_block_id(block_id)``).

    Args:
        block_id: Physical block id to resolve.

    Returns:
        A ``torch.Tensor`` for the block — exact shape/contents are
        implementation-defined (presumably the block's KV data or its
        id; confirm against concrete implementations).
    """
    pass

@abstractmethod
def get_num_blocks_touched(self,
blocks: List[Block],
Expand Down
16 changes: 14 additions & 2 deletions vllm/core/block_manager_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import Sequence as GenericSequence
from typing import Tuple

import torch

from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.block.interfaces import Block
Expand Down Expand Up @@ -119,8 +121,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
block_size=self.block_size,
)

if seq_group.is_encoder_decoder():
num_required_blocks += BlockTable.get_num_required_blocks(
if seq_group.is_encoder_decoder():num_required_blocks += BlockTable.get_num_required_blocks(
seq_group.get_encoder_seq().get_token_ids(),
block_size=self.block_size,
)
Expand Down Expand Up @@ -315,6 +316,17 @@ def get_common_computed_block_ids(
# NOTE(sang): This assumes seq_block_ids doesn't contain any None.
return self.block_allocator.get_common_computed_block_ids(
computed_seq_block_ids) # type: ignore

def get_kv_cache_from_seq(self, seq_group: SequenceGroup) -> torch.Tensor:
    """Return the KV tensor for the first block of *seq_group*.

    Looks up the block table of the first RUNNING sequence in the group,
    translates its first absolute block id into a GPU-relative physical
    block id, and asks the allocator for the corresponding KV tensor.
    """
    running_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
    absolute_ids = self.get_block_table(running_seqs[0])
    physical_id = self.block_allocator.get_physical_block_id(
        Device.GPU, absolute_ids[0])
    return self.block_allocator.get_kv_tensor_from_block_id(physical_id)

def get_kv_cache_from_block(self, seq_group: SequenceGroup) -> torch.Tensor:
    # Thin alias for get_kv_cache_from_seq. NOTE(review): despite the
    # "from_block" name it takes a SequenceGroup, not a block — presumably
    # kept for API symmetry; confirm against callers before renaming.
    return self.get_kv_cache_from_seq(seq_group)

def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
if parent_seq.seq_id not in self.block_tables:
Expand Down