Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion vllm/core/block/cpu_gpu_block_allocator.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from typing import Dict, FrozenSet, List, Optional, Tuple

import torch
import torch.distributed

from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
Expand Down Expand Up @@ -318,7 +321,14 @@ def get_common_computed_block_ids(
device = Device.GPU
return self._allocators[device].get_common_computed_block_ids(
computed_seq_block_ids)


def get_kv_tensor_from_block_id(self, block_id: int) -> torch.Tensor:
    """Return a ``(1, 1)`` int64 tensor holding *block_id*.

    Renamed from ``get_kv_cache_from_block_id`` and given ``self`` so the
    signature matches the ``DeviceAwareBlockAllocator`` abstract method
    and the call in ``BlockManagerV2.get_kv_cache_from_seq``.

    Device placement: the CUDA device for the current distributed rank
    when a process group is initialized, otherwise ``cuda:0``; falls back
    to CPU when CUDA is unavailable so CPU-only hosts do not crash.

    NOTE(review): this returns a tensor *containing the block id*, not
    the KV-cache contents — confirm downstream consumers expect the id
    rather than the cache data.

    Args:
        block_id: Physical block id to wrap in a tensor.

    Returns:
        A ``(1, 1)`` ``torch.int64`` tensor containing ``block_id``.
    """
    if torch.cuda.is_available():
        if torch.distributed.is_initialized():
            # One rank per GPU is assumed here — TODO confirm mapping.
            device = torch.device(f"cuda:{torch.distributed.get_rank()}")
        else:
            device = torch.device("cuda:0")
    else:
        # Robustness: keep the method usable on CUDA-less machines.
        device = torch.device("cpu")
    return torch.tensor([block_id], device=device,
                        dtype=torch.int64).view(-1, 1)

@property
def all_block_ids(self) -> FrozenSet[int]:
return frozenset(self._block_ids_to_allocator.keys())
Expand Down
6 changes: 6 additions & 0 deletions vllm/core/block/interfaces.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from abc import ABC, abstractmethod
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple

import torch

from vllm.utils import Device

BlockId = int
Expand Down Expand Up @@ -254,6 +256,10 @@ def get_common_computed_block_ids(
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
pass

@abstractmethod
def get_kv_tensor_from_block_id(self, block_id: int) -> torch.Tensor:
    """Return the KV tensor associated with *block_id*.

    Abstract: concrete allocators must implement this. ``self`` added so
    the declared signature matches how callers invoke it as an instance
    method (``allocator.get_kv_tensor_from_block_id(block_id)``).

    Args:
        block_id: Physical block id to resolve.

    Returns:
        A ``torch.Tensor`` for the block — exact shape/contents are
        implementation-defined (presumably the block's KV data or its
        id; confirm against concrete implementations).
    """
    pass

@abstractmethod
def get_num_blocks_touched(self,
blocks: List[Block],
Expand Down
16 changes: 14 additions & 2 deletions vllm/core/block_manager_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import Sequence as GenericSequence
from typing import Tuple

import torch

from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.block.interfaces import Block
Expand Down Expand Up @@ -119,8 +121,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
block_size=self.block_size,
)

if seq_group.is_encoder_decoder():
num_required_blocks += BlockTable.get_num_required_blocks(
if seq_group.is_encoder_decoder():num_required_blocks += BlockTable.get_num_required_blocks(
seq_group.get_encoder_seq().get_token_ids(),
block_size=self.block_size,
)
Expand Down Expand Up @@ -315,6 +316,17 @@ def get_common_computed_block_ids(
# NOTE(sang): This assumes seq_block_ids doesn't contain any None.
return self.block_allocator.get_common_computed_block_ids(
computed_seq_block_ids) # type: ignore

def get_kv_cache_from_seq(self, seq_group: SequenceGroup) -> torch.Tensor:
    """Return the KV tensor for the first block of *seq_group*.

    Looks up the block table of the first RUNNING sequence in the group,
    translates its first absolute block id into a GPU-relative physical
    block id, and asks the allocator for the corresponding KV tensor.
    """
    running_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
    absolute_ids = self.get_block_table(running_seqs[0])
    physical_id = self.block_allocator.get_physical_block_id(
        Device.GPU, absolute_ids[0])
    return self.block_allocator.get_kv_tensor_from_block_id(physical_id)

def get_kv_cache_from_block(self, seq_group: SequenceGroup) -> torch.Tensor:
    # Thin alias for get_kv_cache_from_seq. NOTE(review): despite the
    # "from_block" name it takes a SequenceGroup, not a block — presumably
    # kept for API symmetry; confirm against callers before renaming.
    return self.get_kv_cache_from_seq(seq_group)

def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
if parent_seq.seq_id not in self.block_tables:
Expand Down