From 182d4a8b5ecb86221554275e7569c4884187e1ac Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 3 Apr 2024 22:07:04 +0000 Subject: [PATCH 01/32] feat: support swap in/out for block manager v2 --- vllm/core/block/block_table.py | 30 ++++++++- vllm/core/block/cpu_gpu_block_allocator.py | 5 +- vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block_manager_v2.py | 74 ++++++++++++++++++++-- 6 files changed, 105 insertions(+), 7 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ba061bbc4fbc..e87f035ca57b 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -64,6 +64,28 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: sequence of token IDs. """ return cdiv(len(token_ids), block_size) + + def append_by_blocks(self, + token_ids: List[int], + device: Device = Device.GPU) -> Block: + """Allocates memory blocks for storing the given sequence of token IDs. + + This method allocates the required number of blocks to store the given + sequence of token IDs. + + Args: + token_ids (List[int]): The sequence of token IDs to be stored. + device (Device, optional): The device on which the blocks should be + allocated. Defaults to Device.GPU. + """ + block = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + self._blocks.append(block) + self._num_full_slots += len(token_ids) + return block + + def allocate(self, token_ids: List[int], @@ -78,8 +100,8 @@ def allocate(self, device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. """ - assert not self._is_allocated assert token_ids + assert not self._is_allocated self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, device=device) @@ -248,7 +270,11 @@ def _get_all_token_ids(self) -> List[int]: @property def _is_allocated(self) -> bool: return self._blocks is not None - + + @property + def _num_touched_blocks(self) -> int: + return len(self._blocks) + @property def _num_empty_slots(self) -> int: assert self._is_allocated diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 3135e194c593..a44845729a13 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,5 +1,5 @@ +from __future__ import annotations from typing import Dict, List, Optional - from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator @@ -189,6 +189,9 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: # CoW only supported on GPU device = Device.GPU return self._allocators[device].clear_copy_on_writes() + + def increase_ref_count(self, device: Device, block_id: int) -> None: + return self._allocators[device].refcounter().incr(block_id) def mark_blocks_as_computed(self) -> None: # Prefix caching only supported on GPU. 
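A minimal illustrative sketch, not part of the patch series: the block-id convention that the swap_in/swap_out mapping added to block_manager_v2.py below relies on. The CpuGpuBlockAllocator hands out GPU block ids first and CPU block ids after them (the two ranges never overlap), so a CPU block id must be shifted back by the total number of GPU blocks before it can serve as an index into the CPU cache. The numbers here are made up for illustration.

    num_gpu_blocks, num_cpu_blocks = 4, 4
    # GPU ids come first, CPU ids continue after them, e.g. [0..3] and [4..7].
    gpu_block_ids = list(range(num_gpu_blocks))
    cpu_block_ids = list(range(num_gpu_blocks, num_gpu_blocks + num_cpu_blocks))
    assert set(gpu_block_ids).isdisjoint(cpu_block_ids)

    # A swap-in mapping built by the block manager (CPU block id -> GPU block id)
    # is translated into CPU-cache-relative indices before being returned.
    swap_in_mapping = {5: 2, 6: 3}
    block_number_mapping = {
        cpu_id - num_gpu_blocks: gpu_id
        for cpu_id, gpu_id in swap_in_mapping.items()
    }
    assert block_number_mapping == {1: 2, 2: 3}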
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 9f466566f096..eb8c2b73edf0 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,3 +1,4 @@ +from __future__ import annotations from abc import ABC, abstractmethod, abstractproperty from typing import Dict, List, Optional, Protocol diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index f8e9265bb2d6..35539945b942 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,3 +1,4 @@ +from __future__ import annotations from typing import Dict, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 6aa75a8abb80..56efdf0165c8 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,4 +1,5 @@ """Token blocks.""" +from __future__ import annotations from itertools import takewhile from os.path import commonprefix from typing import Dict, Iterable, List, Optional diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 813e71ad883b..01e81896aca8 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -1,6 +1,7 @@ """A block manager that manages token blocks.""" from typing import Dict, List, Optional +from vllm.core.block.interfaces import Block from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.core.interfaces import AllocStatus, BlockSpaceManager @@ -227,17 +228,82 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: - return False + """ + We go through all sequence in seq group to get their number of blocks + touched and sum them up to see whether there is enough memory to swap in + """ + num_touched_blocks = 0 + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += ( + block_table.get_num_blocks_touched_by_append_slots( + token_ids=seq.get_token_ids(), + num_lookahead_slots=num_lookahead_slots, + )) + num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) + return num_free_blocks - num_touched_blocks >= self.watermark_blocks + def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - raise NotImplementedError + mapping: Dict[Block, Block] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + new_block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + block_table = self.block_tables[seq.seq_id] + + for cpu_block in block_table: + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + self.block_allocator.increase_ref_count(Device.GPU, gpu_block.block_id()) + else: + gpu_block = new_block_table.append_by_blocks(token_ids=cpu_block.token_ids(), device=Device.GPU) + mapping[cpu_block] = gpu_block + self.block_allocator.free(cpu_block) + self.block_tables[seq.seq_id] = new_block_table + + # NOTE: since the memory operation in physical blocks need the relative position + # of CPU block to its starting address, here we need to shift the block id of cpu + # block back to its relative position within CPU cache + block_number_mapping = { + cpu_block.block_id() - self.num_total_gpu_blocks: gpu_block.block_id() + for cpu_block, 
gpu_block in mapping.items() + } + return block_number_mapping + def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return False + num_touched_blocks = 0 + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + num_touched_blocks += self.block_tables[seq.seq_id]._num_touched_blocks() + return num_touched_blocks <= self.block_allocator.get_num_free_blocks(Device.CPU) def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - raise NotImplementedError + mapping: Dict[Block, Block] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + new_block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + block_table = self.block_tables[seq.seq_id] + + for gpu_block in block_table: + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + self.block_allocator.increase_ref_count(Device.CPU, cpu_block.block_id()) + else: + cpu_block = new_block_table.append_by_blocks(token_ids=gpu_block.token_ids(), device=Device.CPU) + mapping[gpu_block] = cpu_block + self.block_allocator.free(cpu_block) + self.block_tables[seq.seq_id] = new_block_table + + block_number_mapping = { + cpu_block.block_number - self.num_total_gpu_blocks: gpu_block.block_number + for cpu_block, gpu_block in mapping.items() + } + return block_number_mapping def get_num_free_gpu_blocks(self) -> int: return self.block_allocator.get_num_free_blocks(Device.GPU) From b6b4b8fa72e9fedc27d4272102c8030bec96a466 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 3 Apr 2024 22:15:55 +0000 Subject: [PATCH 02/32] fix: linter --- vllm/core/block/block_table.py | 16 ++++------ vllm/core/block/cpu_gpu_block_allocator.py | 4 ++- vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block_manager_v2.py | 37 +++++++++++++--------- 6 files changed, 35 insertions(+), 25 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index e87f035ca57b..91ee218fa616 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -64,10 +64,10 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: sequence of token IDs. """ return cdiv(len(token_ids), block_size) - + def append_by_blocks(self, - token_ids: List[int], - device: Device = Device.GPU) -> Block: + token_ids: List[int], + device: Device = Device.GPU) -> Block: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -79,13 +79,11 @@ def append_by_blocks(self, allocated. Defaults to Device.GPU. 
""" block = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) + token_ids=token_ids, + device=device) self._blocks.append(block) self._num_full_slots += len(token_ids) return block - - def allocate(self, token_ids: List[int], @@ -270,11 +268,11 @@ def _get_all_token_ids(self) -> List[int]: @property def _is_allocated(self) -> bool: return self._blocks is not None - + @property def _num_touched_blocks(self) -> int: return len(self._blocks) - + @property def _num_empty_slots(self) -> int: assert self._is_allocated diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index a44845729a13..8d9188286de8 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,5 +1,7 @@ from __future__ import annotations + from typing import Dict, List, Optional + from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator @@ -189,7 +191,7 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: # CoW only supported on GPU device = Device.GPU return self._allocators[device].clear_copy_on_writes() - + def increase_ref_count(self, device: Device, block_id: int) -> None: return self._allocators[device].refcounter().incr(block_id) diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index eb8c2b73edf0..d463f7b09131 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,4 +1,5 @@ from __future__ import annotations + from abc import ABC, abstractmethod, abstractproperty from typing import Dict, List, Optional, Protocol diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 35539945b942..323fb85707ab 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,4 +1,5 @@ from __future__ import annotations + from typing import Dict, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 56efdf0165c8..2232564381ea 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,5 +1,6 @@ """Token blocks.""" from __future__ import annotations + from itertools import takewhile from os.path import commonprefix from typing import Dict, Iterable, List, Optional diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 01e81896aca8..05a0d1110700 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -1,9 +1,9 @@ """A block manager that manages token blocks.""" from typing import Dict, List, Optional -from vllm.core.block.interfaces import Block from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator +from vllm.core.block.interfaces import Block from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -242,7 +242,6 @@ def can_swap_in(self, seq_group: SequenceGroup, )) num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) return num_free_blocks - num_touched_blocks >= self.watermark_blocks - def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: @@ -257,28 +256,33 @@ def swap_in(self, seq_group: SequenceGroup, for cpu_block in 
block_table: if cpu_block in mapping: gpu_block = mapping[cpu_block] - self.block_allocator.increase_ref_count(Device.GPU, gpu_block.block_id()) + self.block_allocator.increase_ref_count( + Device.GPU, gpu_block.block_id()) else: - gpu_block = new_block_table.append_by_blocks(token_ids=cpu_block.token_ids(), device=Device.GPU) + gpu_block = new_block_table.append_by_blocks( + token_ids=cpu_block.token_ids(), device=Device.GPU) mapping[cpu_block] = gpu_block self.block_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table - # NOTE: since the memory operation in physical blocks need the relative position - # of CPU block to its starting address, here we need to shift the block id of cpu - # block back to its relative position within CPU cache + # NOTE: since the memory operation in physical blocks need the + # relative position of CPU block to its starting address, here + # we need to shift the block id of cpu block back to its relative + # position within CPU cache. block_number_mapping = { - cpu_block.block_id() - self.num_total_gpu_blocks: gpu_block.block_id() + cpu_block.block_id() - self.num_total_gpu_blocks: + gpu_block.block_id() for cpu_block, gpu_block in mapping.items() } return block_number_mapping - def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - num_touched_blocks += self.block_tables[seq.seq_id]._num_touched_blocks() - return num_touched_blocks <= self.block_allocator.get_num_free_blocks(Device.CPU) + num_touched_blocks += self.block_tables[ + seq.seq_id]._num_touched_blocks() + return num_touched_blocks <= self.block_allocator.get_num_free_blocks( + Device.CPU) def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} @@ -292,15 +296,18 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: for gpu_block in block_table: if gpu_block in mapping: cpu_block = mapping[gpu_block] - self.block_allocator.increase_ref_count(Device.CPU, cpu_block.block_id()) + self.block_allocator.increase_ref_count( + Device.CPU, cpu_block.block_id()) else: - cpu_block = new_block_table.append_by_blocks(token_ids=gpu_block.token_ids(), device=Device.CPU) + cpu_block = new_block_table.append_by_blocks( + token_ids=gpu_block.token_ids(), device=Device.CPU) mapping[gpu_block] = cpu_block - self.block_allocator.free(cpu_block) + self.block_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { - cpu_block.block_number - self.num_total_gpu_blocks: gpu_block.block_number + cpu_block.block_number - self.num_total_gpu_blocks: + gpu_block.block_number for cpu_block, gpu_block in mapping.items() } return block_number_mapping From 938d10e638ea72b6aba9f220abe4a10b8b8e475a Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Thu, 4 Apr 2024 02:44:09 +0000 Subject: [PATCH 03/32] fix: fix some bugs and add test --- tests/core/block/test_block_manager_v2.py | 48 ++++++++++++++++++++++- vllm/core/block/block_table.py | 33 ++++++++++------ vllm/core/block/naive_block.py | 3 -- vllm/core/block_manager_v2.py | 24 ++++++------ 4 files changed, 80 insertions(+), 28 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 1e8e4ccdfb15..f1ee17224c7d 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -5,7 +5,7 @@ from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from 
..utils import create_seq_group +from ..utils import create_dummy_prompt, create_seq_group @pytest.mark.parametrize("block_size", [16]) @@ -101,3 +101,49 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, range(prompt_len + num_slots_to_append + num_lookahead_slots)), block_size)) - len(chunk_list(list(range(prompt_len)), block_size)) assert num_consumed_blocks == expected_consumed_blocks + + +@pytest.mark.parametrize("block_size", [8]) +@pytest.mark.parametrize("num_cpu_blocks", [4]) +@pytest.mark.parametrize("num_gpu_blocks", [4]) +@pytest.mark.parametrize("num_lookahead_slots", [2]) +def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots): + block_manager = BlockSpaceManagerV2(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) + prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + prompt.status = SequenceStatus.RUNNING + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap seq group from GPU -> CPU. + gpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + prompt.status = SequenceStatus.SWAPPED + + # Swap seq group from CPU -> GPU. + cpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_in(seq_group, num_lookahead_slots) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group, num_lookahead_slots) + adjusted_cpu_blocks = [block - num_gpu_blocks for block in cpu_blocks] + assert list(mapping.keys()) == adjusted_cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 91ee218fa616..a8fd6665c977 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -65,23 +65,34 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: """ return cdiv(len(token_ids), block_size) - def append_by_blocks(self, - token_ids: List[int], - device: Device = Device.GPU) -> Block: - """Allocates memory blocks for storing the given sequence of token IDs. + def get_blocks(self) -> Optional[List[Block]]: + return self._blocks - This method allocates the required number of blocks to store the given - sequence of token IDs. + def append_by_block(self, + token_ids: List[int], + device: Device = Device.GPU) -> Block: + """Allocates memory block for storing the given sequence + of token IDs and append it back to the block list. 
+ + This method allocates a block to store the given + sequence of token IDs append it back to the block list. Args: token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. """ - block = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - self._blocks.append(block) + blocks = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + # Note: whenever we call append_by_block because of swapping, the tokens + # must fit in a block + assert len(blocks) <= 1 + block = blocks[0] + if not self._is_allocated: + self._blocks = blocks + else: + self._blocks.append(block) self._num_full_slots += len(token_ids) return block @@ -98,8 +109,8 @@ def allocate(self, device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. """ - assert token_ids assert not self._is_allocated + assert token_ids self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, device=device) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 323fb85707ab..2cb6739de7d4 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -92,9 +92,6 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: def free(self, block: Block) -> None: self._free_block_id(block.block_id) - # Mark the block as having no allocation. - block.block_id = None - def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 05a0d1110700..9385a7a9ce87 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -253,14 +253,14 @@ def swap_in(self, seq_group: SequenceGroup, ) block_table = self.block_tables[seq.seq_id] - for cpu_block in block_table: + for cpu_block in block_table.get_blocks(): if cpu_block in mapping: gpu_block = mapping[cpu_block] self.block_allocator.increase_ref_count( Device.GPU, gpu_block.block_id()) else: - gpu_block = new_block_table.append_by_blocks( - token_ids=cpu_block.token_ids(), device=Device.GPU) + gpu_block = new_block_table.append_by_block( + token_ids=cpu_block.token_ids, device=Device.GPU) mapping[cpu_block] = gpu_block self.block_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table @@ -270,8 +270,7 @@ def swap_in(self, seq_group: SequenceGroup, # we need to shift the block id of cpu block back to its relative # position within CPU cache. 
block_number_mapping = { - cpu_block.block_id() - self.num_total_gpu_blocks: - gpu_block.block_id() + cpu_block.block_id - self.num_total_gpu_blocks: gpu_block.block_id for cpu_block, gpu_block in mapping.items() } return block_number_mapping @@ -280,7 +279,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): num_touched_blocks += self.block_tables[ - seq.seq_id]._num_touched_blocks() + seq.seq_id]._num_touched_blocks return num_touched_blocks <= self.block_allocator.get_num_free_blocks( Device.CPU) @@ -293,22 +292,21 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: ) block_table = self.block_tables[seq.seq_id] - for gpu_block in block_table: + for gpu_block in block_table.get_blocks(): if gpu_block in mapping: cpu_block = mapping[gpu_block] self.block_allocator.increase_ref_count( Device.CPU, cpu_block.block_id()) else: - cpu_block = new_block_table.append_by_blocks( - token_ids=gpu_block.token_ids(), device=Device.CPU) + cpu_block = new_block_table.append_by_block( + token_ids=gpu_block.token_ids, device=Device.CPU) mapping[gpu_block] = cpu_block - self.block_allocator.free(cpu_block) + self.block_allocator.free(gpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { - cpu_block.block_number - self.num_total_gpu_blocks: - gpu_block.block_number - for cpu_block, gpu_block in mapping.items() + gpu_block.block_id: cpu_block.block_id - self.num_total_gpu_blocks + for gpu_block, cpu_block in mapping.items() } return block_number_mapping From 9181552465f165b5f2fac45b3c128be4efc02bcb Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Thu, 4 Apr 2024 07:34:18 +0000 Subject: [PATCH 04/32] fix: address comment --- vllm/core/block/block_table.py | 51 ++++++---------- vllm/core/block/cpu_gpu_block_allocator.py | 68 +++++++++++++++++++--- vllm/core/block/naive_block.py | 3 + vllm/core/block/prefix_caching_block.py | 3 + vllm/core/block_manager_v2.py | 44 ++++---------- 5 files changed, 93 insertions(+), 76 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a8fd6665c977..ae7df17f4089 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -68,37 +68,10 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: def get_blocks(self) -> Optional[List[Block]]: return self._blocks - def append_by_block(self, - token_ids: List[int], - device: Device = Device.GPU) -> Block: - """Allocates memory block for storing the given sequence - of token IDs and append it back to the block list. - - This method allocates a block to store the given - sequence of token IDs append it back to the block list. - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - device (Device, optional): The device on which the blocks should be - allocated. Defaults to Device.GPU. 
- """ - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - # Note: whenever we call append_by_block because of swapping, the tokens - # must fit in a block - assert len(blocks) <= 1 - block = blocks[0] - if not self._is_allocated: - self._blocks = blocks - else: - self._blocks.append(block) - self._num_full_slots += len(token_ids) - return block - def allocate(self, token_ids: List[int], - device: Device = Device.GPU) -> None: + device: Device = Device.GPU, + by_block: bool = False) -> Optional[Block]: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -108,13 +81,23 @@ def allocate(self, token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. + by_block (bool, optional): whether we are allocate block by block. + Set to True when doing cache swapping. Defaults to False. """ - assert not self._is_allocated + assert not self._is_allocated or by_block assert token_ids - self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - self._num_full_slots = len(token_ids) + blocks = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + self._num_full_slots += len(token_ids) + if not (by_block and self._is_allocated): + self._blocks = blocks + else: + # Note: whenever we call allocate with by_block set to True, + # because of swapping, the tokens must fit in a block + assert len(blocks) == 1 + self._blocks.append(blocks[0]) + return blocks[0] def append_token_ids(self, token_ids: List[int], diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8d9188286de8..753629f6f406 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -2,10 +2,12 @@ from typing import Dict, List, Optional +from vllm.core.block.block_table import BlockTable from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator +from vllm.sequence import Sequence from vllm.utils import Device @@ -90,18 +92,17 @@ def create( return CpuGpuBlockAllocator( cpu_block_allocator=cpu_allocator, gpu_block_allocator=gpu_allocator, + block_size=block_size, ) - def __init__( - self, - cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator, - ): + def __init__(self, cpu_block_allocator: BlockAllocator, + gpu_block_allocator: BlockAllocator, block_size: int): assert not ( cpu_block_allocator.all_block_ids & gpu_block_allocator.all_block_ids ), "cpu and gpu block allocators can't have intersection of block ids" + self._block_size = block_size self._allocators = { Device.CPU: cpu_block_allocator, Device.GPU: gpu_block_allocator, @@ -145,6 +146,16 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) + def reference(self, block: Block) -> None: + """Notify the device aware allocator there is new sequence reference + the given block. + + Args: + block (Block): The block to be referenced. 
+ """ + allocator = self._block_ids_to_allocator[block.block_id] + return allocator.reference(block) + def free(self, block: Block) -> None: """Frees the memory occupied by the given block. @@ -192,9 +203,6 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: device = Device.GPU return self._allocators[device].clear_copy_on_writes() - def increase_ref_count(self, device: Device, block_id: int) -> None: - return self._allocators[device].refcounter().incr(block_id) - def mark_blocks_as_computed(self) -> None: # Prefix caching only supported on GPU. device = Device.GPU @@ -209,3 +217,47 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) + + def get_seq_swap_out_block_mapping( + self, seq: Sequence, block_table: BlockTable, + mapping: Dict[Block, Block]) -> BlockTable: + # The swap out logic for a sequence, the mapping dict will be updated + # and the new block table for swapped out sequence is returned. + new_block_table = BlockTable( + block_size=self._block_size, + block_allocator=self, + ) + for gpu_block in block_table.get_blocks(): + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + self.reference(cpu_block) + else: + cpu_block = new_block_table.allocate( + token_ids=gpu_block.token_ids, + device=Device.CPU, + by_block=True) + mapping[gpu_block] = cpu_block + self.free(gpu_block) + return new_block_table + + def get_seq_swap_in_block_mapping( + self, seq: Sequence, block_table: BlockTable, + mapping: Dict[Block, Block]) -> BlockTable: + # The swap in logic for a sequence, the mapping dict will be updated + # and the new block table for swapped in sequence is returned. + new_block_table = BlockTable( + block_size=self._block_size, + block_allocator=self, + ) + for cpu_block in block_table.get_blocks(): + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + self.reference(gpu_block) + else: + gpu_block = new_block_table.allocate( + token_ids=cpu_block.token_ids, + device=Device.GPU, + by_block=True) + mapping[cpu_block] = gpu_block + self.free(cpu_block) + return new_block_table diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 2cb6739de7d4..ad0b3a451434 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -92,6 +92,9 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: def free(self, block: Block) -> None: self._free_block_id(block.block_id) + def reference(self, block: Block) -> None: + self._refcounter.incr(block.block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 2232564381ea..d95f4a92ea2a 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -199,6 +199,9 @@ def _free_block_id_for_block(self, block_id: BlockId, assert block.content_hash in self._cached_blocks self._unused_cached_blocks[block.content_hash] = block_id + def reference(self, block: Block) -> None: + self._refcounter.incr(block.block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. 
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 9385a7a9ce87..c5a221473117 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -247,23 +247,11 @@ def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - new_block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) block_table = self.block_tables[seq.seq_id] - - for cpu_block in block_table.get_blocks(): - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - self.block_allocator.increase_ref_count( - Device.GPU, gpu_block.block_id()) - else: - gpu_block = new_block_table.append_by_block( - token_ids=cpu_block.token_ids, device=Device.GPU) - mapping[cpu_block] = gpu_block - self.block_allocator.free(cpu_block) - self.block_tables[seq.seq_id] = new_block_table + self.block_tables[ + seq. + seq_id] = self.block_allocator.get_seq_swap_in_block_mapping( + seq, block_table, mapping) # NOTE: since the memory operation in physical blocks need the # relative position of CPU block to its starting address, here @@ -278,31 +266,19 @@ def swap_in(self, seq_group: SequenceGroup, def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - num_touched_blocks += self.block_tables[ - seq.seq_id]._num_touched_blocks + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += block_table._num_touched_blocks return num_touched_blocks <= self.block_allocator.get_num_free_blocks( Device.CPU) def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - new_block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) block_table = self.block_tables[seq.seq_id] - - for gpu_block in block_table.get_blocks(): - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - self.block_allocator.increase_ref_count( - Device.CPU, cpu_block.block_id()) - else: - cpu_block = new_block_table.append_by_block( - token_ids=gpu_block.token_ids, device=Device.CPU) - mapping[gpu_block] = cpu_block - self.block_allocator.free(gpu_block) - self.block_tables[seq.seq_id] = new_block_table + self.block_tables[ + seq. 
+ seq_id] = self.block_allocator.get_seq_swap_out_block_mapping( + seq, block_table, mapping) block_number_mapping = { gpu_block.block_id: cpu_block.block_id - self.num_total_gpu_blocks From e9a907f1d3008f2ce65549aea2727dafba3f3968 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 00:33:12 +0000 Subject: [PATCH 05/32] fix: reduce overestimate for can_swap_in --- vllm/core/block/block_table.py | 36 ++++++++++++++++++++++ vllm/core/block/cpu_gpu_block_allocator.py | 15 +++++++++ vllm/core/block/interfaces.py | 10 ++++++ vllm/core/block/naive_block.py | 27 ++++++++++++++++ vllm/core/block/prefix_caching_block.py | 27 ++++++++++++++-- vllm/core/block_manager_v2.py | 28 ++++++++++++++--- 6 files changed, 136 insertions(+), 7 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ae7df17f4089..3a1a28713f64 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -307,3 +307,39 @@ def _chunk_token_blocks_for_append( token_blocks = [token_ids[:first_chunk_size]] + chunk_list( token_ids[first_chunk_size:], self._block_size) return token_blocks + + def get_num_cache_blocks_touched_by_swapping_in(self, token_ids: List[int], + num_lookahead_slots: int, + device: Device) -> int: + """Determine how many blocks will be "touched" by swapping in the + token ids. + + This is required for the scheduler to determine whether a sequence can + be swapped in. + """ + all_token_ids = token_ids + [-1] * num_lookahead_slots + token_blocks = self._chunk_token_blocks_for_append(all_token_ids) + prev_block = None + num_blocks_touched = 0 + for token_block in token_blocks: + block = self.block_allocator.mock_mutable(prev_block, token_block, + device) + if not block.prefix_caching_allocator.is_block_cached(block): + num_blocks_touched += 1 + prev_block = block + return num_blocks_touched + + def get_num_naive_blocks_touched_by_swapping_in(self, token_ids: List[int], + num_lookahead_slots: int, + total_touched_blocks: int, + block_set: set) -> None: + num_blocks_touched = self.get_num_blocks_touched_by_append_slots( + token_ids, num_lookahead_slots) + blocks = self.get_blocks() + if num_blocks_touched > len(blocks): + total_touched_blocks += 1 + for block in blocks: + if not block.is_full: + total_touched_blocks += 1 + else: + block_set.add(block.block_id) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 753629f6f406..8101f1e0df46 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -146,6 +146,21 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) + def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], + device: Device) -> Block: + """Mock a new mutable block, linked to the previous block, to help with + content hash calculation. + + Args: + prev_block (Optional[Block]): The previous block in the sequence. If + None, then the block to be allocated is the first block in the + sequence. + + Returns: + Block: The newly allocated mutable block. + """ + return self._allocators[device].mock_mutable(prev_block, token_ids) + def reference(self, block: Block) -> None: """Notify the device aware allocator there is new sequence reference the given block. 
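A small sketch of the estimation idea behind mock_mutable above, which can_swap_in uses later in this patch when prefix caching is enabled: build throwaway blocks only to obtain their content hashes, and count a block as "touched" only if its hash is not already cached, so the scheduler can size a swap without allocating anything. The hash below is a stand-in for the real prefix-caching content hash.

    from typing import Iterable, List, Optional, Set

    def estimate_blocks_touched(token_blocks: Iterable[List[int]],
                                cached_hashes: Set[int]) -> int:
        touched = 0
        prev_hash: Optional[int] = None
        for block_tokens in token_blocks:
            # Stand-in content hash: chains the previous block's hash, the way a
            # prefix-caching hash depends on everything before the block.
            content_hash = hash((prev_hash, tuple(block_tokens)))
            if content_hash not in cached_hashes:
                touched += 1              # a new physical block would be needed
            prev_hash = content_hash
        return touched

    blocks = [[1, 2, 3, 4], [5, 6, 7, 8]]
    assert estimate_blocks_touched(blocks, set()) == 2   # nothing cached yet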
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index d463f7b09131..f6e20ff11a13 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -57,6 +57,11 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int]) -> Block: pass + @abstractmethod + def mock_mutable(self, prev_block: Optional[Block], + token_ids: List[int]) -> Block: + pass + @abstractmethod def free(self, block: Block) -> None: pass @@ -102,6 +107,11 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> Block: pass + @abstractmethod + def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], + device: Device) -> Block: + pass + @abstractmethod def get_num_free_blocks(self, device: Device) -> int: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index ad0b3a451434..5f529bdd6e27 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -89,6 +89,33 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: allocator=self, ) + def mock_mutable( + self, + prev_block: Optional[Block], + token_ids: List[int], + ) -> Block: + """Mock a new mutable block, linked to the previous block, to help with + content hash calculation. + + Args: + prev_block (Optional[Block]): The previous block in the sequence. If + None, then the block to be allocated is the first block in the + sequence. + + Returns: + Block: The newly allocated mutable block. + """ + + # NOTE: we use -1 as block_id for mock block + block_id = -1 + return self._create_block( + prev_block=prev_block, + token_ids=token_ids, + block_id=block_id, + block_size=self._block_size, + allocator=self, + ) + def free(self, block: Block) -> None: self._free_block_id(block.block_id) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index d95f4a92ea2a..1a5a812e13dd 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -164,6 +164,24 @@ def allocate_mutable(self, prev_block: Block) -> Block: # No block available in hashless allocator, nor in unused cache blocks. raise BlockAllocator.NoFreeBlocksError() + def mock_mutable( + self, + prev_block: Optional[Block], + token_ids: List[int], + ) -> Block: + """Mock a new mutable block, linked to the previous block, to help with + content hash calculation. + + Args: + prev_block (Optional[Block]): The previous block in the sequence. If + None, then the block to be allocated is the first block in the + sequence. + + Returns: + Block: The newly allocated mutable block. + """ + return self._hashless_allocator.mock_mutable(prev_block, token_ids) + def _incr_refcount_cached_block(self, content_hash: int, block_id: BlockId) -> None: refcount = self._refcounter.incr(block_id) @@ -234,8 +252,8 @@ def fork(self, last_block: Block) -> List[Block]: return forked_blocks def get_num_free_blocks(self) -> int: - # The number of free blocks is the number of hashless free blocks - # plus the number of hashful blocks that are unused. + # The number of free blocks is the number of hashless free + # blocks plus the number of hashful blocks that are unused. 
return self._hashless_allocator.get_num_free_blocks() + len( self._unused_cached_blocks) @@ -243,6 +261,11 @@ def get_num_free_blocks(self) -> int: def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids + def is_block_cached(self, block: "PrefixCachingBlock") -> bool: + if block.content_hash not in self._cached_blocks: + return True + return False + def promote_to_immutable_block(self, block: "PrefixCachingBlock") -> BlockId: """Once a mutable block is full, it can be promoted to an immutable diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c5a221473117..7dee8349c2f6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -233,13 +233,31 @@ def can_swap_in(self, seq_group: SequenceGroup, touched and sum them up to see whether there is enough memory to swap in """ num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += ( - block_table.get_num_blocks_touched_by_append_slots( + + if self.enable_caching: + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += ( + block_table.get_num_cache_blocks_touched_by_swapping_in( + token_ids=seq.get_token_ids(), + num_lookahead_slots=num_lookahead_slots, + device=Device.GPU)) + else: + # NOTE: for naive block, we go though all the sequence to collect + # a set of immutable block id, and accumulate number of isolated + # blocks (mutable ones and single block caused by lookahead). We + # sum them up at the end to get the final num_touched_blocks + # num_touched_blocks swap in op. + block_set = set() + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + block_table.get_num_naive_blocks_touched_by_swapping_in( token_ids=seq.get_token_ids(), num_lookahead_slots=num_lookahead_slots, - )) + total_touched_blocks=num_touched_blocks, + block_set=block_set) + num_touched_blocks += len(block_set) + num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) return num_free_blocks - num_touched_blocks >= self.watermark_blocks From dcff0e1c76661970eccad95e507b9287614a02dc Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 00:45:17 +0000 Subject: [PATCH 06/32] fix: reuse similar logic in can_swap_in to reduce overestimation in can_swap_out --- vllm/core/block/block_table.py | 18 +++++++++--------- vllm/core/block_manager_v2.py | 27 ++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 3a1a28713f64..095afcee01b1 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -308,14 +308,14 @@ def _chunk_token_blocks_for_append( token_ids[first_chunk_size:], self._block_size) return token_blocks - def get_num_cache_blocks_touched_by_swapping_in(self, token_ids: List[int], - num_lookahead_slots: int, - device: Device) -> int: - """Determine how many blocks will be "touched" by swapping in the + def get_num_cache_blocks_touched_by_swapping(self, token_ids: List[int], + num_lookahead_slots: int, + device: Device) -> int: + """Determine how many blocks will be "touched" by swapping in/out the token ids. This is required for the scheduler to determine whether a sequence can - be swapped in. + be swapped in/out. 
""" all_token_ids = token_ids + [-1] * num_lookahead_slots token_blocks = self._chunk_token_blocks_for_append(all_token_ids) @@ -329,10 +329,10 @@ def get_num_cache_blocks_touched_by_swapping_in(self, token_ids: List[int], prev_block = block return num_blocks_touched - def get_num_naive_blocks_touched_by_swapping_in(self, token_ids: List[int], - num_lookahead_slots: int, - total_touched_blocks: int, - block_set: set) -> None: + def get_num_naive_blocks_touched_by_swapping(self, token_ids: List[int], + num_lookahead_slots: int, + total_touched_blocks: int, + block_set: set) -> None: num_blocks_touched = self.get_num_blocks_touched_by_append_slots( token_ids, num_lookahead_slots) blocks = self.get_blocks() diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 7dee8349c2f6..89a52730d5d6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -238,7 +238,7 @@ def can_swap_in(self, seq_group: SequenceGroup, for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): block_table = self.block_tables[seq.seq_id] num_touched_blocks += ( - block_table.get_num_cache_blocks_touched_by_swapping_in( + block_table.get_num_cache_blocks_touched_by_swapping( token_ids=seq.get_token_ids(), num_lookahead_slots=num_lookahead_slots, device=Device.GPU)) @@ -251,7 +251,7 @@ def can_swap_in(self, seq_group: SequenceGroup, block_set = set() for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): block_table = self.block_tables[seq.seq_id] - block_table.get_num_naive_blocks_touched_by_swapping_in( + block_table.get_num_naive_blocks_touched_by_swapping( token_ids=seq.get_token_ids(), num_lookahead_slots=num_lookahead_slots, total_touched_blocks=num_touched_blocks, @@ -283,9 +283,26 @@ def swap_in(self, seq_group: SequenceGroup, def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += block_table._num_touched_blocks + + if self.enable_caching: + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += ( + block_table.get_num_cache_blocks_touched_by_swapping( + token_ids=seq.get_token_ids(), + num_lookahead_slots=0, + device=Device.CPU)) + else: + block_set = set() + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + block_table.get_num_naive_blocks_touched_by_swapping( + token_ids=seq.get_token_ids(), + num_lookahead_slots=0, + total_touched_blocks=num_touched_blocks, + block_set=block_set) + num_touched_blocks += len(block_set) + return num_touched_blocks <= self.block_allocator.get_num_free_blocks( Device.CPU) From 205dda1ec050189cbbfd697f065cd1b500178444 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 08:48:47 +0000 Subject: [PATCH 07/32] fix: refactor swap in/out logic --- tests/core/block/test_block_manager_v2.py | 13 +-- vllm/core/block/block_table.py | 11 ++ vllm/core/block/cpu_gpu_block_allocator.py | 115 ++++++++++++--------- vllm/core/block/naive_block.py | 5 +- vllm/core/block/prefix_caching_block.py | 4 +- vllm/core/block_manager_v2.py | 25 ++--- 6 files changed, 99 insertions(+), 74 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index f1ee17224c7d..67c0789f03b6 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -107,11 
+107,14 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) @pytest.mark.parametrize("num_lookahead_slots", [2]) -def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots): +@pytest.mark.parametrize("enable_caching", [False]) +def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, + enable_caching): block_manager = BlockSpaceManagerV2(block_size, num_cpu_blocks, num_gpu_blocks, - watermark=0) + watermark=0, + enable_caching=enable_caching) prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) @@ -136,14 +139,12 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots): prompt.status = SequenceStatus.SWAPPED # Swap seq group from CPU -> GPU. - cpu_blocks = block_manager.get_block_table(prompt) assert block_manager.can_swap_in(seq_group, num_lookahead_slots) before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() mapping = block_manager.swap_in(seq_group, num_lookahead_slots) - adjusted_cpu_blocks = [block - num_gpu_blocks for block in cpu_blocks] - assert list(mapping.keys()) == adjusted_cpu_blocks + cpu_blocks = block_manager.get_block_table(prompt) + assert list(mapping.keys()) == [cpu_blocks[0]] after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 095afcee01b1..a15fcd0a752b 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -193,6 +193,17 @@ def free(self) -> None: self._allocator.free(block) self._blocks = None + def swap(self, destination_device: Device) -> "BlockTable": + new_block_table = BlockTable( + block_size=self._block_size, + block_allocator=self._allocator, + ) + for src_block in self.get_blocks(): + self._allocator.update_seq_swap_out_block_mapping( + src_block, new_block_table, destination_device) + self._allocator.free(src_block) + return new_block_table + @property def physical_block_ids(self) -> List[int]: """Returns a list of physical block indices for the blocks in the diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8101f1e0df46..8f00515dfb3a 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -7,7 +7,6 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.sequence import Sequence from vllm.utils import Device @@ -92,23 +91,22 @@ def create( return CpuGpuBlockAllocator( cpu_block_allocator=cpu_allocator, gpu_block_allocator=gpu_allocator, - block_size=block_size, ) def __init__(self, cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator, block_size: int): + gpu_block_allocator: BlockAllocator): assert not ( cpu_block_allocator.all_block_ids & gpu_block_allocator.all_block_ids ), "cpu and gpu block allocators can't have intersection of block ids" - self._block_size = block_size self._allocators = { Device.CPU: cpu_block_allocator, Device.GPU: gpu_block_allocator, } 
self._block_ids_to_allocator = {} + self._swap_mapping = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: self._block_ids_to_allocator[block_id] = allocator @@ -161,15 +159,15 @@ def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], """ return self._allocators[device].mock_mutable(prev_block, token_ids) - def reference(self, block: Block) -> None: + def reference(self, block_id: int) -> None: """Notify the device aware allocator there is new sequence reference the given block. Args: block (Block): The block to be referenced. """ - allocator = self._block_ids_to_allocator[block.block_id] - return allocator.reference(block) + allocator = self._block_ids_to_allocator[block_id] + return allocator.reference(block_id) def free(self, block: Block) -> None: """Frees the memory occupied by the given block. @@ -233,46 +231,63 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) - def get_seq_swap_out_block_mapping( - self, seq: Sequence, block_table: BlockTable, - mapping: Dict[Block, Block]) -> BlockTable: - # The swap out logic for a sequence, the mapping dict will be updated - # and the new block table for swapped out sequence is returned. - new_block_table = BlockTable( - block_size=self._block_size, - block_allocator=self, - ) - for gpu_block in block_table.get_blocks(): - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - self.reference(cpu_block) - else: - cpu_block = new_block_table.allocate( - token_ids=gpu_block.token_ids, - device=Device.CPU, - by_block=True) - mapping[gpu_block] = cpu_block - self.free(gpu_block) - return new_block_table - - def get_seq_swap_in_block_mapping( - self, seq: Sequence, block_table: BlockTable, - mapping: Dict[Block, Block]) -> BlockTable: - # The swap in logic for a sequence, the mapping dict will be updated - # and the new block table for swapped in sequence is returned. - new_block_table = BlockTable( - block_size=self._block_size, - block_allocator=self, - ) - for cpu_block in block_table.get_blocks(): - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - self.reference(gpu_block) - else: - gpu_block = new_block_table.allocate( - token_ids=cpu_block.token_ids, - device=Device.GPU, - by_block=True) - mapping[cpu_block] = gpu_block - self.free(cpu_block) - return new_block_table + def update_seq_swap_out_block_mapping(self, block: Block, + block_table: BlockTable, + destination_device: Device) -> None: + if block.block_id in self._swap_mapping: + dest_block_id = self._swap_mapping[block.block_id] + self.reference(dest_block_id) + else: + dest_block = block_table.allocate(token_ids=block.token_ids, + device=destination_device, + by_block=True) + self._swap_mapping[block.block_id] = dest_block.block_id + + def get_and_reset_swaps(self) -> dict: + mapping = self._swap_mapping.copy() + self._swap_mapping.clear() + return mapping + + # def get_seq_swap_out_block_mapping( + # self, seq: Sequence, block_table: BlockTable, + # mapping: Dict[Block, Block]) -> BlockTable: + # # The swap out logic for a sequence, the mapping dict will be updated + # # and the new block table for swapped out sequence is returned. 
+ # new_block_table = BlockTable( + # block_size=self._block_size, + # block_allocator=self, + # ) + # for src_block in block_table.get_blocks(): + # if src_block in mapping: + # cpu_block = mapping[src_block] + # self.reference(cpu_block) + # else: + # cpu_block = new_block_table.allocate( + # token_ids=src_block.token_ids, + # device=Device.CPU, + # by_block=True) + # mapping[src_block] = cpu_block + # self.free(src_block) + # return new_block_table + + # def get_seq_swap_in_block_mapping( + # self, seq: Sequence, block_table: BlockTable, + # mapping: Dict[Block, Block]) -> BlockTable: + # # The swap in logic for a sequence, the mapping dict will be updated + # # and the new block table for swapped in sequence is returned. + # new_block_table = BlockTable( + # block_size=self._block_size, + # block_allocator=self, + # ) + # for cpu_block in block_table.get_blocks(): + # if cpu_block in mapping: + # gpu_block = mapping[cpu_block] + # self.reference(gpu_block) + # else: + # gpu_block = new_block_table.allocate( + # token_ids=cpu_block.token_ids, + # device=Device.GPU, + # by_block=True) + # mapping[cpu_block] = gpu_block + # self.free(cpu_block) + # return new_block_table diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 5f529bdd6e27..60aa145b00b4 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -118,9 +118,10 @@ def mock_mutable( def free(self, block: Block) -> None: self._free_block_id(block.block_id) + block.block_id = None - def reference(self, block: Block) -> None: - self._refcounter.incr(block.block_id) + def reference(self, block_id: int) -> None: + self._refcounter.incr(block_id) def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1a5a812e13dd..b3ade6b23d3f 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -217,8 +217,8 @@ def _free_block_id_for_block(self, block_id: BlockId, assert block.content_hash in self._cached_blocks self._unused_cached_blocks[block.content_hash] = block_id - def reference(self, block: Block) -> None: - self._refcounter.incr(block.block_id) + def reference(self, block_id: int) -> None: + self._refcounter.incr(block_id) def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 89a52730d5d6..7a330079887f 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -176,7 +176,6 @@ def append_slots( token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), num_lookahead_slots=num_lookahead_slots, ) - # Return any new copy-on-writes. new_cows = self.block_allocator.clear_copy_on_writes() return new_cows @@ -263,21 +262,20 @@ def can_swap_in(self, seq_group: SequenceGroup, def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): block_table = self.block_tables[seq.seq_id] - self.block_tables[ - seq. 
- seq_id] = self.block_allocator.get_seq_swap_in_block_mapping( - seq, block_table, mapping) + new_block_table = block_table.swap(destination_device=Device.GPU) + self.block_tables[seq.seq_id] = new_block_table + self.append_slots(seq=seq, num_lookahead_slots=num_lookahead_slots) # NOTE: since the memory operation in physical blocks need the # relative position of CPU block to its starting address, here # we need to shift the block id of cpu block back to its relative # position within CPU cache. + mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - cpu_block.block_id - self.num_total_gpu_blocks: gpu_block.block_id - for cpu_block, gpu_block in mapping.items() + cpu_block_id - self.num_total_gpu_blocks: gpu_block_id + for cpu_block_id, gpu_block_id in mapping.items() } return block_number_mapping @@ -310,14 +308,13 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): block_table = self.block_tables[seq.seq_id] - self.block_tables[ - seq. - seq_id] = self.block_allocator.get_seq_swap_out_block_mapping( - seq, block_table, mapping) + new_block_table = block_table.swap(destination_device=Device.CPU) + self.block_tables[seq.seq_id] = new_block_table + mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - gpu_block.block_id: cpu_block.block_id - self.num_total_gpu_blocks - for gpu_block, cpu_block in mapping.items() + gpu_block_id: cpu_block_id - self.num_total_gpu_blocks + for gpu_block_id, cpu_block_id in mapping.items() } return block_number_mapping From 3bb125c223ea72e58862b49a890b064e800f4d82 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 08:58:14 +0000 Subject: [PATCH 08/32] misc: remove useless code --- vllm/core/block/cpu_gpu_block_allocator.py | 44 ---------------------- 1 file changed, 44 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8f00515dfb3a..1c9bc0edc368 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -247,47 +247,3 @@ def get_and_reset_swaps(self) -> dict: mapping = self._swap_mapping.copy() self._swap_mapping.clear() return mapping - - # def get_seq_swap_out_block_mapping( - # self, seq: Sequence, block_table: BlockTable, - # mapping: Dict[Block, Block]) -> BlockTable: - # # The swap out logic for a sequence, the mapping dict will be updated - # # and the new block table for swapped out sequence is returned. - # new_block_table = BlockTable( - # block_size=self._block_size, - # block_allocator=self, - # ) - # for src_block in block_table.get_blocks(): - # if src_block in mapping: - # cpu_block = mapping[src_block] - # self.reference(cpu_block) - # else: - # cpu_block = new_block_table.allocate( - # token_ids=src_block.token_ids, - # device=Device.CPU, - # by_block=True) - # mapping[src_block] = cpu_block - # self.free(src_block) - # return new_block_table - - # def get_seq_swap_in_block_mapping( - # self, seq: Sequence, block_table: BlockTable, - # mapping: Dict[Block, Block]) -> BlockTable: - # # The swap in logic for a sequence, the mapping dict will be updated - # # and the new block table for swapped in sequence is returned. 
- # new_block_table = BlockTable( - # block_size=self._block_size, - # block_allocator=self, - # ) - # for cpu_block in block_table.get_blocks(): - # if cpu_block in mapping: - # gpu_block = mapping[cpu_block] - # self.reference(gpu_block) - # else: - # gpu_block = new_block_table.allocate( - # token_ids=cpu_block.token_ids, - # device=Device.GPU, - # by_block=True) - # mapping[cpu_block] = gpu_block - # self.free(cpu_block) - # return new_block_table From 403a9bd4bada9666eec4c83ffeb7fa5e4b1ffbc6 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 02:40:13 +0000 Subject: [PATCH 09/32] fix: refactor can_swap_in/out --- vllm/core/block/block_table.py | 38 +-------- vllm/core/block/cpu_gpu_block_allocator.py | 8 ++ vllm/core/block/naive_block.py | 34 +++++++++ vllm/core/block/prefix_caching_block.py | 29 +++++++ vllm/core/block_manager_v2.py | 89 +++++++--------------- 5 files changed, 101 insertions(+), 97 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a15fcd0a752b..796b69280b4c 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -82,7 +82,7 @@ def allocate(self, device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. by_block (bool, optional): whether we are allocate block by block. - Set to True when doing cache swapping. Defaults to False. + Set to True when doing cache swapping. Default to False. """ assert not self._is_allocated or by_block assert token_ids @@ -318,39 +318,3 @@ def _chunk_token_blocks_for_append( token_blocks = [token_ids[:first_chunk_size]] + chunk_list( token_ids[first_chunk_size:], self._block_size) return token_blocks - - def get_num_cache_blocks_touched_by_swapping(self, token_ids: List[int], - num_lookahead_slots: int, - device: Device) -> int: - """Determine how many blocks will be "touched" by swapping in/out the - token ids. - - This is required for the scheduler to determine whether a sequence can - be swapped in/out. 
- """ - all_token_ids = token_ids + [-1] * num_lookahead_slots - token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - prev_block = None - num_blocks_touched = 0 - for token_block in token_blocks: - block = self.block_allocator.mock_mutable(prev_block, token_block, - device) - if not block.prefix_caching_allocator.is_block_cached(block): - num_blocks_touched += 1 - prev_block = block - return num_blocks_touched - - def get_num_naive_blocks_touched_by_swapping(self, token_ids: List[int], - num_lookahead_slots: int, - total_touched_blocks: int, - block_set: set) -> None: - num_blocks_touched = self.get_num_blocks_touched_by_append_slots( - token_ids, num_lookahead_slots) - blocks = self.get_blocks() - if num_blocks_touched > len(blocks): - total_touched_blocks += 1 - for block in blocks: - if not block.is_full: - total_touched_blocks += 1 - else: - block_set.add(block.block_id) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 1c9bc0edc368..8bb27f2da484 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -204,6 +204,14 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() + def can_swap(self, + blocks: List[Block], + device: Device, + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + return self._allocators[device].can_swap(blocks, num_lookahead_slots, + watermark_blocks) + def clear_copy_on_writes(self) -> Dict[int, List[int]]: """Clears the copy-on-write (CoW) state and returns the mapping of source to destination block IDs. diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 60aa145b00b4..422b130c1194 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -221,6 +221,40 @@ def get_common_computed_block_ids( """ return [] + def can_swap(self, + blocks: List[Block], + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + """Determine can we swap in/out the given blocks from certain sequence + group with the provided num_lookahead_slots. + + Args: + blocks (List[Block]): The potential blocks to swap. + num_lookahead_slots (int): number of lookahead slots (0 for swap + out). + + Returns: + bool: whether the allocator has capacity to accept the swap + with given blocks and num_lookahead_slots. + """ + # NOTE: for naive block, we use set to eliminate common blocks among + # seqs, also we compare the empty slots in the mutable blocks with + # lookahead slots to get the number of unique new block that are + # needed. 
+ old_block_set = set() + new_block_count = 0 + for block in blocks: + if not block.is_full and num_lookahead_slots != 0: + if block.num_empty_slots >= num_lookahead_slots: + new_block_count += 1 + else: + new_block_count += 2 + else: + old_block_set.add(block.block_id) + num_touched_blocks = new_block_count + len(old_block_set) + return self.get_num_free_blocks( + ) - num_touched_blocks >= watermark_blocks + class NaiveBlock(Block): """An implementation of the Block class that does not support prefix diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index b3ade6b23d3f..70b3775ffdd6 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -345,6 +345,35 @@ def get_common_computed_block_ids( ] return commonprefix([ids for ids in ids_list if ids != []]) + def can_swap(self, + blocks: List[Block], + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + """Determine can we swap in/out the given blocks from certain sequence + group with the provided num_lookahead_slots. + + Args: + blocks (List[Block]): The potential blocks to swap. + num_lookahead_slots (int): number of lookahead slots (0 for + swap out). + + Returns: + bool: whether the allocator has capacity to accept the swap + with given blocks and num_lookahead_slots. + """ + num_touched_blocks = 0 + for block in blocks: + if not block.is_full: + if block.num_empty_slots >= num_lookahead_slots: + num_touched_blocks += 1 + else: + num_touched_blocks += 2 + else: + if not self.is_block_cached(block): + num_touched_blocks += 1 + return self.get_num_free_blocks( + ) - num_touched_blocks >= watermark_blocks + class PrefixCachingBlock(Block): """A block implementation that supports prefix caching. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 7a330079887f..c4b414cb1f0b 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -1,4 +1,5 @@ """A block manager that manages token blocks.""" +from itertools import chain from typing import Dict, List, Optional from vllm.core.block.block_table import BlockTable @@ -225,41 +226,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.fork() - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """ - We go through all sequence in seq group to get their number of blocks - touched and sum them up to see whether there is enough memory to swap in - """ - num_touched_blocks = 0 - - if self.enable_caching: - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += ( - block_table.get_num_cache_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=num_lookahead_slots, - device=Device.GPU)) - else: - # NOTE: for naive block, we go though all the sequence to collect - # a set of immutable block id, and accumulate number of isolated - # blocks (mutable ones and single block caused by lookahead). We - # sum them up at the end to get the final num_touched_blocks - # num_touched_blocks swap in op. 
- block_set = set() - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - block_table.get_num_naive_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=num_lookahead_slots, - total_touched_blocks=num_touched_blocks, - block_set=block_set) - num_touched_blocks += len(block_set) - - num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) - return num_free_blocks - num_touched_blocks >= self.watermark_blocks - def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): @@ -279,31 +245,6 @@ def swap_in(self, seq_group: SequenceGroup, } return block_number_mapping - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - num_touched_blocks = 0 - - if self.enable_caching: - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += ( - block_table.get_num_cache_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=0, - device=Device.CPU)) - else: - block_set = set() - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - block_table.get_num_naive_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=0, - total_touched_blocks=num_touched_blocks, - block_set=block_set) - num_touched_blocks += len(block_set) - - return num_touched_blocks <= self.block_allocator.get_num_free_blocks( - Device.CPU) - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): @@ -323,3 +264,31 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.block_allocator.get_num_free_blocks(Device.CPU) + + def can_swap_in(self, seq_group: SequenceGroup, + num_lookahead_slots: int) -> bool: + return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, + num_lookahead_slots, self.watermark_blocks) + + def can_swap_out(self, seq_group: SequenceGroup) -> bool: + return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + + def _can_swap(self, + seq_group: SequenceGroup, + device: Device, + status: SequenceStatus, + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + blocks = self._get_blocks_for_swap(seq_group, status) + return self.block_allocator.can_swap(blocks, device, + num_lookahead_slots, + watermark_blocks) + + def _get_blocks_for_swap(self, seq_group: SequenceGroup, + status: SequenceStatus) -> List[Block]: + blocks: Dict[int, List[Block]] = {} + for seq in seq_group.get_seqs(status=status): + block_table = self.block_tables[seq.seq_id] + blocks[seq.seq_id] = block_table.get_blocks() + combined_blocks = list(chain(*blocks.values())) + return combined_blocks From 3237d633a36c914bdcea86bdb907bd12748476c4 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 02:43:04 +0000 Subject: [PATCH 10/32] fix: remove unused code --- vllm/core/block/cpu_gpu_block_allocator.py | 15 ------------ vllm/core/block/interfaces.py | 10 -------- vllm/core/block/naive_block.py | 27 ---------------------- vllm/core/block/prefix_caching_block.py | 18 --------------- 4 files changed, 70 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8bb27f2da484..4289de2c52ae 100644 --- 
a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -144,21 +144,6 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) - def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], - device: Device) -> Block: - """Mock a new mutable block, linked to the previous block, to help with - content hash calculation. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - return self._allocators[device].mock_mutable(prev_block, token_ids) - def reference(self, block_id: int) -> None: """Notify the device aware allocator there is new sequence reference the given block. diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index f6e20ff11a13..d463f7b09131 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -57,11 +57,6 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int]) -> Block: pass - @abstractmethod - def mock_mutable(self, prev_block: Optional[Block], - token_ids: List[int]) -> Block: - pass - @abstractmethod def free(self, block: Block) -> None: pass @@ -107,11 +102,6 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> Block: pass - @abstractmethod - def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], - device: Device) -> Block: - pass - @abstractmethod def get_num_free_blocks(self, device: Device) -> int: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 422b130c1194..fec19d1be404 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -89,33 +89,6 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: allocator=self, ) - def mock_mutable( - self, - prev_block: Optional[Block], - token_ids: List[int], - ) -> Block: - """Mock a new mutable block, linked to the previous block, to help with - content hash calculation. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - - # NOTE: we use -1 as block_id for mock block - block_id = -1 - return self._create_block( - prev_block=prev_block, - token_ids=token_ids, - block_id=block_id, - block_size=self._block_size, - allocator=self, - ) - def free(self, block: Block) -> None: self._free_block_id(block.block_id) block.block_id = None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 70b3775ffdd6..060a2691e82a 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -164,24 +164,6 @@ def allocate_mutable(self, prev_block: Block) -> Block: # No block available in hashless allocator, nor in unused cache blocks. raise BlockAllocator.NoFreeBlocksError() - def mock_mutable( - self, - prev_block: Optional[Block], - token_ids: List[int], - ) -> Block: - """Mock a new mutable block, linked to the previous block, to help with - content hash calculation. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. 
- """ - return self._hashless_allocator.mock_mutable(prev_block, token_ids) - def _incr_refcount_cached_block(self, content_hash: int, block_id: BlockId) -> None: refcount = self._refcounter.incr(block_id) From 413124794d7bc7f97e1bb9db720bf7f031b7b380 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 02:44:31 +0000 Subject: [PATCH 11/32] fix: remove unused code --- vllm/core/block/block_table.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 796b69280b4c..f271ddf86950 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -274,10 +274,6 @@ def _get_all_token_ids(self) -> List[int]: def _is_allocated(self) -> bool: return self._blocks is not None - @property - def _num_touched_blocks(self) -> int: - return len(self._blocks) - @property def _num_empty_slots(self) -> int: assert self._is_allocated From 0067ddff1c00fa9f8cff2aea21c22f784713ae5c Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 04:52:14 +0000 Subject: [PATCH 12/32] fix: refactor swap in/out oprations --- vllm/core/block/block_table.py | 11 ---- vllm/core/block/common.py | 1 - vllm/core/block/cpu_gpu_block_allocator.py | 34 ++++------ vllm/core/block/naive_block.py | 14 +++++ vllm/core/block/prefix_caching_block.py | 14 +++++ vllm/core/block_manager_v2.py | 72 +++++++++++----------- 6 files changed, 76 insertions(+), 70 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index f271ddf86950..a237390b5aa8 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -193,17 +193,6 @@ def free(self) -> None: self._allocator.free(block) self._blocks = None - def swap(self, destination_device: Device) -> "BlockTable": - new_block_table = BlockTable( - block_size=self._block_size, - block_allocator=self._allocator, - ) - for src_block in self.get_blocks(): - self._allocator.update_seq_swap_out_block_mapping( - src_block, new_block_table, destination_device) - self._allocator.free(src_block) - return new_block_table - @property def physical_block_ids(self) -> List[int]: """Returns a list of physical block indices for the blocks in the diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 50c70533c4fb..d9f07321950f 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -129,7 +129,6 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: assert refcount != 0 if refcount > 1: src_block_id = block_id - # Decrement refcount of the old block. self._allocator.free(block) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 4289de2c52ae..742c5bfc835e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -2,7 +2,6 @@ from typing import Dict, List, Optional -from vllm.core.block.block_table import BlockTable from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator @@ -144,16 +143,6 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) - def reference(self, block_id: int) -> None: - """Notify the device aware allocator there is new sequence reference - the given block. - - Args: - block (Block): The block to be referenced. 
- """ - allocator = self._block_ids_to_allocator[block_id] - return allocator.reference(block_id) - def free(self, block: Block) -> None: """Frees the memory occupied by the given block. @@ -189,6 +178,17 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() + def swap(self, blocks: List[Block], source_device: Device, + dest_device: Device) -> None: + source_block_ids = [block.block_id for block in blocks] + self._allocators[source_device].swap_out(blocks) + self._allocators[dest_device].swap_in(blocks) + dest_block_ids = [block.block_id for block in blocks] + self._swap_mapping = { + src: dest + for src, dest in zip(source_block_ids, dest_block_ids) + } + def can_swap(self, blocks: List[Block], device: Device, @@ -224,18 +224,6 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) - def update_seq_swap_out_block_mapping(self, block: Block, - block_table: BlockTable, - destination_device: Device) -> None: - if block.block_id in self._swap_mapping: - dest_block_id = self._swap_mapping[block.block_id] - self.reference(dest_block_id) - else: - dest_block = block_table.allocate(token_ids=block.token_ids, - device=destination_device, - by_block=True) - self._swap_mapping[block.block_id] = dest_block.block_id - def get_and_reset_swaps(self) -> dict: mapping = self._swap_mapping.copy() self._swap_mapping.clear() diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index fec19d1be404..90d708c6e40e 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -228,6 +228,20 @@ def can_swap(self, return self.get_num_free_blocks( ) - num_touched_blocks >= watermark_blocks + def swap_out(self, blocks: List[Block]) -> None: + for block in blocks: + self.free(block) + + def swap_in(self, blocks: List[Block]) -> None: + for block in blocks: + if block.is_full: + alloc = self.allocate_immutable(block.prev_block, + block.token_ids) + else: + alloc = self.allocate_mutable(block.prev_block) + alloc.append_token_ids(block.token_ids) + block.block_id = alloc.block_id + class NaiveBlock(Block): """An implementation of the Block class that does not support prefix diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 060a2691e82a..f9a809dae1e1 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -356,6 +356,20 @@ def can_swap(self, return self.get_num_free_blocks( ) - num_touched_blocks >= watermark_blocks + def swap_out(self, blocks: List[Block]) -> None: + for block in blocks: + self.free(block) + + def swap_in(self, blocks: List[Block]) -> None: + for block in blocks: + if block.is_full: + alloc = self.allocate_immutable(block.prev_block, + block.token_ids) + else: + alloc = self.allocate_mutable(block.prev_block) + alloc.append_token_ids(block.token_ids) + block.block_id = alloc.block_id + class PrefixCachingBlock(Block): """A block implementation that supports prefix caching. 
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c4b414cb1f0b..b3a22fe17089 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -226,39 +226,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.fork() - def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> Dict[int, int]: - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - new_block_table = block_table.swap(destination_device=Device.GPU) - self.block_tables[seq.seq_id] = new_block_table - self.append_slots(seq=seq, num_lookahead_slots=num_lookahead_slots) - - # NOTE: since the memory operation in physical blocks need the - # relative position of CPU block to its starting address, here - # we need to shift the block id of cpu block back to its relative - # position within CPU cache. - mapping = self.block_allocator.get_and_reset_swaps() - block_number_mapping = { - cpu_block_id - self.num_total_gpu_blocks: gpu_block_id - for cpu_block_id, gpu_block_id in mapping.items() - } - return block_number_mapping - - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - mapping: Dict[Block, Block] = {} - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - new_block_table = block_table.swap(destination_device=Device.CPU) - self.block_tables[seq.seq_id] = new_block_table - - mapping = self.block_allocator.get_and_reset_swaps() - block_number_mapping = { - gpu_block_id: cpu_block_id - self.num_total_gpu_blocks - for gpu_block_id, cpu_block_id in mapping.items() - } - return block_number_mapping - def get_num_free_gpu_blocks(self) -> int: return self.block_allocator.get_num_free_blocks(Device.GPU) @@ -273,6 +240,39 @@ def can_swap_in(self, seq_group: SequenceGroup, def can_swap_out(self, seq_group: SequenceGroup) -> bool: return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + def swap_in(self, + sequence_group: SequenceGroup, + num_lookahead_slots: int = 0) -> Dict[int, int]: + blocks = self._get_blocks_for_swap(sequence_group, + SequenceStatus.SWAPPED, + num_lookahead_slots) + self.block_allocator.swap(blocks=blocks, + source_device=Device.CPU, + dest_device=Device.GPU) + # NOTE: Once the BlockManagerV1 implementation is deleted, we can + # move this get_and_reset_swaps call outside of swap_in/swap_out. + # Then the scheduler can make calls to get all swaps and all + # copy-on-writes for the batch. 
+ mapping = self.block_allocator.get_and_reset_swaps() + block_number_mapping = { + cpu_block_id - self.num_total_gpu_blocks: gpu_block_id + for cpu_block_id, gpu_block_id in mapping.items() + } + return block_number_mapping + + def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: + blocks = self._get_blocks_for_swap(sequence_group, + SequenceStatus.RUNNING) + self.block_allocator.swap(blocks=blocks, + source_device=Device.GPU, + dest_device=Device.CPU) + mapping = self.block_allocator.get_and_reset_swaps() + block_number_mapping = { + gpu_block_id: cpu_block_id - self.num_total_gpu_blocks + for gpu_block_id, cpu_block_id in mapping.items() + } + return block_number_mapping + def _can_swap(self, seq_group: SequenceGroup, device: Device, @@ -284,8 +284,10 @@ def _can_swap(self, num_lookahead_slots, watermark_blocks) - def _get_blocks_for_swap(self, seq_group: SequenceGroup, - status: SequenceStatus) -> List[Block]: + def _get_blocks_for_swap(self, + seq_group: SequenceGroup, + status: SequenceStatus, + num_lookahead_slots: int = 0) -> List[Block]: blocks: Dict[int, List[Block]] = {} for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] From b8aee85fe71d5f84798362403a67c0f6829d78cf Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 04:56:47 +0000 Subject: [PATCH 13/32] fix --- vllm/core/block/naive_block.py | 3 --- vllm/core/block/prefix_caching_block.py | 3 --- vllm/core/block_manager_v2.py | 18 +++++++++--------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 90d708c6e40e..f0b75a49abd2 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -93,9 +93,6 @@ def free(self, block: Block) -> None: self._free_block_id(block.block_id) block.block_id = None - def reference(self, block_id: int) -> None: - self._refcounter.incr(block_id) - def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index f9a809dae1e1..4f1bd004e846 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -199,9 +199,6 @@ def _free_block_id_for_block(self, block_id: BlockId, assert block.content_hash in self._cached_blocks self._unused_cached_blocks[block.content_hash] = block_id - def reference(self, block_id: int) -> None: - self._refcounter.incr(block_id) - def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. 
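
At this point in the series, GPU and CPU blocks share one absolute id space inside the device-aware allocator (GPU ids first, then CPU ids), while the cache engine expects ids relative to each device's own cache; that is why the swap_in/swap_out hunks above shift CPU ids by num_total_gpu_blocks. A small sketch of that translation, assuming a 4-block GPU cache; the helper names swap_in_block_number_mapping and swap_out_block_number_mapping are illustrative, not part of the patch.

from typing import Dict

NUM_TOTAL_GPU_BLOCKS = 4  # assumed GPU cache size for this sketch


def swap_in_block_number_mapping(swaps: Dict[int, int]) -> Dict[int, int]:
    # swaps maps absolute CPU block ids to absolute GPU block ids, as
    # returned by get_and_reset_swaps(); the cache engine wants the CPU
    # side expressed relative to the start of the CPU cache.
    return {
        cpu_id - NUM_TOTAL_GPU_BLOCKS: gpu_id
        for cpu_id, gpu_id in swaps.items()
    }


def swap_out_block_number_mapping(swaps: Dict[int, int]) -> Dict[int, int]:
    # Mirror image for swap out: GPU ids are already cache-relative,
    # so only the CPU ids are shifted back.
    return {
        gpu_id: cpu_id - NUM_TOTAL_GPU_BLOCKS
        for gpu_id, cpu_id in swaps.items()
    }


# Absolute CPU block 5 is the second block of the CPU cache when there
# are 4 GPU blocks, so a swap-in into GPU block 1 becomes {1: 1}.
assert swap_in_block_number_mapping({5: 1}) == {1: 1}
assert swap_out_block_number_mapping({1: 5}) == {1: 1}

Later in the series (PATCH 15/32) this arithmetic moves into the allocators as get_device_related_block_id, which keeps the absolute-to-relative translation next to the id bookkeeping instead of in the block manager.
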
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b3a22fe17089..095ab13ec2ba 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -226,20 +226,11 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.fork() - def get_num_free_gpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.GPU) - - def get_num_free_cpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.CPU) - def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots, self.watermark_blocks) - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) - def swap_in(self, sequence_group: SequenceGroup, num_lookahead_slots: int = 0) -> Dict[int, int]: @@ -260,6 +251,9 @@ def swap_in(self, } return block_number_mapping + def can_swap_out(self, seq_group: SequenceGroup) -> bool: + return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) @@ -273,6 +267,12 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: } return block_number_mapping + def get_num_free_gpu_blocks(self) -> int: + return self.block_allocator.get_num_free_blocks(Device.GPU) + + def get_num_free_cpu_blocks(self) -> int: + return self.block_allocator.get_num_free_blocks(Device.CPU) + def _can_swap(self, seq_group: SequenceGroup, device: Device, From cba0f62e65716794e635a912c217a5d211adc9e4 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 04:59:20 +0000 Subject: [PATCH 14/32] fix --- vllm/core/block/block_table.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a237390b5aa8..30fd3050f0b4 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -70,8 +70,7 @@ def get_blocks(self) -> Optional[List[Block]]: def allocate(self, token_ids: List[int], - device: Device = Device.GPU, - by_block: bool = False) -> Optional[Block]: + device: Device = Device.GPU) -> None: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -81,23 +80,13 @@ def allocate(self, token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. - by_block (bool, optional): whether we are allocate block by block. - Set to True when doing cache swapping. Default to False. 
""" - assert not self._is_allocated or by_block assert token_ids - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - self._num_full_slots += len(token_ids) - if not (by_block and self._is_allocated): - self._blocks = blocks - else: - # Note: whenever we call allocate with by_block set to True, - # because of swapping, the tokens must fit in a block - assert len(blocks) == 1 - self._blocks.append(blocks[0]) - return blocks[0] + assert not self._is_allocated + self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + self._num_full_slots = len(token_ids) def append_token_ids(self, token_ids: List[int], From 0430758205644020226cd80dcc519a7c4dc64fb7 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Tue, 30 Apr 2024 18:53:37 +0000 Subject: [PATCH 15/32] doc: adding docstring --- tests/core/block/test_block_manager_v2.py | 7 +- vllm/core/block/block_table.py | 7 +- vllm/core/block/cpu_gpu_block_allocator.py | 59 ++++++++++-- vllm/core/block/naive_block.py | 32 +++--- vllm/core/block/prefix_caching_block.py | 39 ++++++-- vllm/core/block_manager_v1.py | 7 +- vllm/core/block_manager_v2.py | 107 +++++++++++++++++---- 7 files changed, 202 insertions(+), 56 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 67c0789f03b6..bbba018a3f92 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -106,10 +106,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("block_size", [8]) @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) -@pytest.mark.parametrize("num_lookahead_slots", [2]) +@pytest.mark.parametrize("num_lookahead_slots", [0, 2]) @pytest.mark.parametrize("enable_caching", [False]) def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching): + """Verify blocks number on src/desc device is correct after swapping in/out + sequence group (not missing or extra blocks). 
+ """ block_manager = BlockSpaceManagerV2(block_size, num_cpu_blocks, num_gpu_blocks, @@ -142,7 +145,7 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, assert block_manager.can_swap_in(seq_group, num_lookahead_slots) before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_in(seq_group, num_lookahead_slots) + mapping = block_manager.swap_in(seq_group) cpu_blocks = block_manager.get_block_table(prompt) assert list(mapping.keys()) == [cpu_blocks[0]] after_cpu_blocks = block_manager.get_num_free_cpu_blocks() diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 30fd3050f0b4..9813e6882f0e 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -65,9 +65,6 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: """ return cdiv(len(token_ids), block_size) - def get_blocks(self) -> Optional[List[Block]]: - return self._blocks - def allocate(self, token_ids: List[int], device: Device = Device.GPU) -> None: @@ -252,6 +249,10 @@ def _get_all_token_ids(self) -> List[int]: def _is_allocated(self) -> bool: return self._blocks is not None + @property + def blocks(self) -> Optional[List[Block]]: + return self._blocks + @property def _num_empty_slots(self) -> int: assert self._is_allocated diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 742c5bfc835e..be5a20e0ac15 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -178,8 +178,32 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() + def get_device_related_block_id(self, device: Device, + absolute_id: int) -> int: + """Returns the relative block id on certain device given the absolute + block id. + + Args: + device (Device): The device for which to query relative block id. + absolute_id (int): The absolute block id for the block in + whole allocator. + + Returns: + int: The relative block id on certain device. + """ + return self._allocators[device].get_device_related_block_id( + absolute_id) + def swap(self, blocks: List[Block], source_device: Device, dest_device: Device) -> None: + """Execute the swap for the given blocks from source_device + on to dest_device, and save the swap mapping. + + Args: + blocks: List of blocks to be swapped. + source_device (Device): Device to swap the 'blocks' from. + dest_device (Device): Device to swap the 'blocks' to. + """ source_block_ids = [block.block_id for block in blocks] self._allocators[source_device].swap_out(blocks) self._allocators[dest_device].swap_in(blocks) @@ -189,13 +213,25 @@ def swap(self, blocks: List[Block], source_device: Device, for src, dest in zip(source_block_ids, dest_block_ids) } - def can_swap(self, - blocks: List[Block], - device: Device, - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: - return self._allocators[device].can_swap(blocks, num_lookahead_slots, - watermark_blocks) + def get_num_blocks_touched(self, + blocks: List[Block], + device: Device, + num_lookahead_slots: int = 0) -> int: + """Returns the number of blocks that will be touched by + swapping in/out the given blocks on to the 'device'. + + Args: + blocks: List of blocks to be swapped. + device (Device): Device to swap the 'blocks' on. 
+ num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + int: the number of blocks that will be touched by + swapping in/out the given blocks on to the 'device'. + """ + return self._allocators[device].get_num_blocks_touched( + blocks, num_lookahead_slots) def clear_copy_on_writes(self) -> Dict[int, List[int]]: """Clears the copy-on-write (CoW) state and returns the mapping of @@ -224,7 +260,14 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) - def get_and_reset_swaps(self) -> dict: + def get_and_reset_swaps(self) -> dict[int, int]: + """Returns and clears the mapping of source to destination block IDs. + Will be called after every swapping operations for now, and after every + schedule when BlockManagerV2 become default. + + Returns: + Dict[int, int]: A mapping of source to destination block IDs. + """ mapping = self._swap_mapping.copy() self._swap_mapping.clear() return mapping diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index f0b75a49abd2..0857ed4db978 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Dict, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, @@ -143,6 +141,19 @@ def _free_block_id(self, block_id: BlockId) -> None: if refcount == 0: self._free_block_indices.add(block_id) + def get_device_related_block_id(self, absolute_id: int) -> int: + """Returns the relative block id on certain block allocator + given the absolute block id. + + Args: + absolute_id (int): The absolute block id for the block + in whole allocator. + + Returns: + int: The relative block id on certain device. + """ + return sorted(self._all_block_indices).index(absolute_id) + @property def refcounter(self): return self._refcounter @@ -191,11 +202,11 @@ def get_common_computed_block_ids( """ return [] - def can_swap(self, - blocks: List[Block], - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: - """Determine can we swap in/out the given blocks from certain sequence + def get_num_blocks_touched(self, + blocks: List[Block], + num_lookahead_slots: int = 0) -> int: + """Determine the number of blocks that will be touched by + swapping in/out the given blocks from certain sequence group with the provided num_lookahead_slots. Args: @@ -204,8 +215,8 @@ def can_swap(self, out). Returns: - bool: whether the allocator has capacity to accept the swap - with given blocks and num_lookahead_slots. + int: the number of blocks that will be touched by + swapping in/out the given blocks and num_lookahead_slots. 
""" # NOTE: for naive block, we use set to eliminate common blocks among # seqs, also we compare the empty slots in the mutable blocks with @@ -222,8 +233,7 @@ def can_swap(self, else: old_block_set.add(block.block_id) num_touched_blocks = new_block_count + len(old_block_set) - return self.get_num_free_blocks( - ) - num_touched_blocks >= watermark_blocks + return num_touched_blocks def swap_out(self, blocks: List[Block]) -> None: for block in blocks: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 4f1bd004e846..86051448dd5a 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -236,6 +236,19 @@ def get_num_free_blocks(self) -> int: return self._hashless_allocator.get_num_free_blocks() + len( self._unused_cached_blocks) + def get_device_related_block_id(self, absolute_id: int) -> int: + """Returns the relative block id on certain block allocator + given the absolute block id. + + Args: + absolute_id (int): The absolute block id for the block + in whole allocator. + + Returns: + int: The relative block id on certain device. + """ + return sorted(self._all_block_indices).index(absolute_id) + @property def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids @@ -326,9 +339,9 @@ def get_common_computed_block_ids( def can_swap(self, blocks: List[Block], - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: - """Determine can we swap in/out the given blocks from certain sequence + num_lookahead_slots: int = 0) -> int: + """Determine the number of blocks that will be touched by + swapping in/out the given blocks from certain sequence group with the provided num_lookahead_slots. Args: @@ -337,8 +350,8 @@ def can_swap(self, swap out). Returns: - bool: whether the allocator has capacity to accept the swap - with given blocks and num_lookahead_slots. + int: the number of blocks that will be touched by + swapping in/out the given blocks and num_lookahead_slots. """ num_touched_blocks = 0 for block in blocks: @@ -350,14 +363,26 @@ def can_swap(self, else: if not self.is_block_cached(block): num_touched_blocks += 1 - return self.get_num_free_blocks( - ) - num_touched_blocks >= watermark_blocks + return num_touched_blocks def swap_out(self, blocks: List[Block]) -> None: + """Execute the swap out actions. Basically just free the + given blocks. + + Args: + blocks: List of blocks to be swapped out. + """ for block in blocks: self.free(block) def swap_in(self, blocks: List[Block]) -> None: + """Execute the swap int actions. Change the block id from + old allocator to current allocator for each block to finish + the block table update. + + Args: + blocks: List of blocks to be swapped in. + """ for block in blocks: if block.is_full: alloc = self.allocate_immutable(block.prev_block, diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index b2aaeb33c529..6781c03f5251 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -453,12 +453,7 @@ def can_swap_in(self, num_required_blocks = len(blocks) + num_swapped_seqs return num_free_blocks - num_required_blocks >= self.watermark_blocks - def swap_in(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> Dict[int, int]: - assert (num_lookahead_slots == 0 - ), "BlockSpaceManagerV1 does not support lookahead allocation" - + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. 
mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 095ab13ec2ba..943566e843be 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -228,15 +228,32 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: + """Returns whether we can swap in the given sequence_group + with num_lookahead_slots. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + bool: Whether it's possible to swap in current sequence group. + """ return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, - num_lookahead_slots, self.watermark_blocks) + num_lookahead_slots) + + def swap_in(self, sequence_group: SequenceGroup) -> Dict[int, int]: + """Returns the block id mapping (from CPU to GPU) generated by + swapping in the given sequence_group with num_lookahead_slots. - def swap_in(self, - sequence_group: SequenceGroup, - num_lookahead_slots: int = 0) -> Dict[int, int]: + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + + Returns: + Dict[int, int]: The mapping of swapping block from CPU to GPU. + """ blocks = self._get_blocks_for_swap(sequence_group, - SequenceStatus.SWAPPED, - num_lookahead_slots) + SequenceStatus.SWAPPED) self.block_allocator.swap(blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) @@ -246,15 +263,38 @@ def swap_in(self, # copy-on-writes for the batch. mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - cpu_block_id - self.num_total_gpu_blocks: gpu_block_id + self.block_allocator.get_device_related_block_id( + Device.CPU, cpu_block_id): + self.block_allocator.get_device_related_block_id( + Device.GPU, gpu_block_id) for cpu_block_id, gpu_block_id in mapping.items() } return block_number_mapping def can_swap_out(self, seq_group: SequenceGroup) -> bool: + """Returns whether we can swap out the given sequence_group + with num_lookahead_slots. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + bool: Whether it's possible to swap out current sequence group. + """ return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: + """Returns the block id mapping (from GPU to CPU) generated by + swapping out the given sequence_group with num_lookahead_slots. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + + Returns: + Dict[int, int]: The mapping of swapping block from GPU to CPU. 
+ """ blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) self.block_allocator.swap(blocks=blocks, @@ -262,7 +302,10 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: dest_device=Device.CPU) mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - gpu_block_id: cpu_block_id - self.num_total_gpu_blocks + self.block_allocator.get_device_related_block_id( + Device.GPU, gpu_block_id): + self.block_allocator.get_device_related_block_id( + Device.CPU, cpu_block_id) for gpu_block_id, cpu_block_id in mapping.items() } return block_number_mapping @@ -277,20 +320,46 @@ def _can_swap(self, seq_group: SequenceGroup, device: Device, status: SequenceStatus, - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: + num_lookahead_slots: int = 0) -> bool: + """Returns whether we can swap in/out the given sequence_group + on to the 'device'. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + device (Device): device to swap the 'seq_group' on. + status (SequenceStatus): The status of sequence which is needed + for action. RUNNING for swap out and SWAPPED for swap in + num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + bool: whether we can swap in/out the given sequence_group + on to the 'device'. + """ blocks = self._get_blocks_for_swap(seq_group, status) - return self.block_allocator.can_swap(blocks, device, - num_lookahead_slots, - watermark_blocks) - - def _get_blocks_for_swap(self, - seq_group: SequenceGroup, - status: SequenceStatus, - num_lookahead_slots: int = 0) -> List[Block]: + num_blocks_touched = self.block_allocator.get_num_blocks_touched( + blocks, device, num_lookahead_slots) + watermark_blocks = 0 + if device == Device.GPU: + watermark_blocks = self.watermark_blocks + return self.block_allocator.get_num_free_blocks( + device) - num_blocks_touched > watermark_blocks + + def _get_blocks_for_swap(self, seq_group: SequenceGroup, + status: SequenceStatus) -> List[Block]: + """Returns the list of blocks those are touched by the seq_group + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + status (SequenceStatus): The status of sequence which is needed + for action. RUNNING for swap out and SWAPPED for swap in + + Returns: + The list of blocks those are touched by the seq_group. 
+ """ blocks: Dict[int, List[Block]] = {} for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] - blocks[seq.seq_id] = block_table.get_blocks() + blocks[seq.seq_id] = block_table.blocks combined_blocks = list(chain(*blocks.values())) return combined_blocks From fbb30995b25d949852fde7cc45d8251c4d12effc Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 1 May 2024 07:52:19 +0000 Subject: [PATCH 16/32] test: adding e2e correstness test for preemption by swapping --- tests/core/block/e2e/test_correctness.py | 33 +++++++++++++++++++----- vllm/config.py | 22 +++++++++------- vllm/core/scheduler.py | 12 ++++++--- vllm/engine/arg_utils.py | 2 ++ 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 5a7f828456e2..43d25a966c49 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -22,7 +22,13 @@ @pytest.mark.parametrize("baseline_llm_kwargs", [{ "use_v2_block_manager": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "preemption_mode": "swap" +}, { + "use_v2_block_manager": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, @@ -93,7 +99,13 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, @pytest.mark.parametrize("baseline_llm_kwargs", [{ "use_v2_block_manager": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "preemption_mode": "swap" +}, { + "use_v2_block_manager": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, @@ -177,11 +189,18 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, }]) @pytest.mark.parametrize( "test_llm_kwargs", - [{ - # We run one test with block_size < lookahead_slots, one test with - # block_size > lookahead_slots - "num_lookahead_slots": 10, - }]) + [ + { + # We run one test with block_size < lookahead_slots, one test with + # block_size > lookahead_slots + "num_lookahead_slots": 10, + "preemption_mode": "swap", + }, + { + "num_lookahead_slots": 10, + "preemption_mode": "recompute", + } + ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, diff --git a/vllm/config.py b/vllm/config.py index eef3fc53c3a6..b84b67dd30ad 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -542,18 +542,19 @@ class SchedulerConfig: prompt latency) before scheduling next prompt. enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. 
+ preemption_mode: Whether to perform preemption by swapping or + recomputation (default) """ - def __init__( - self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - use_v2_block_manager: bool = False, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - ) -> None: + def __init__(self, + max_num_batched_tokens: Optional[int], + max_num_seqs: int, + max_model_len: int, + use_v2_block_manager: bool = False, + num_lookahead_slots: int = 0, + delay_factor: float = 0.0, + enable_chunked_prefill: bool = False, + preemption_mode: Optional[str] = None) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -566,6 +567,7 @@ def __init__( self.num_lookahead_slots = num_lookahead_slots self.delay_factor = delay_factor self.chunked_prefill_enabled = enable_chunked_prefill + self.preemption_mode = preemption_mode self._verify_args() diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 9d098801233e..da2a291d0397 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -331,12 +331,14 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out) + self._preempt(victim_seq_group, blocks_to_swap_out, + self.scheduler_config.preemption_mode) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out) + self._preempt(seq_group, blocks_to_swap_out, + self.scheduler_config.preemption_mode) preempted.append(seq_group) break else: @@ -538,7 +540,7 @@ def _preempt( self, seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], - preemption_mode: Optional[PreemptionMode] = None, + preemption_mode: Optional[str] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -556,6 +558,10 @@ def _preempt( preemption_mode = PreemptionMode.RECOMPUTE else: preemption_mode = PreemptionMode.SWAP + elif preemption_mode == "swap": + preemption_mode = PreemptionMode.SWAP + else: + preemption_mode = PreemptionMode.RECOMPUTE if preemption_mode == PreemptionMode.RECOMPUTE: self._preempt_by_recompute(seq_group) elif preemption_mode == PreemptionMode.SWAP: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8d61f2f9ff19..44a14286120f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -63,6 +63,7 @@ class EngineArgs: image_feature_size: Optional[int] = None scheduler_delay_factor: float = 0.0 enable_chunked_prefill: bool = False + preemption_mode: Optional[str] = None def __post_init__(self): if self.tokenizer is None: @@ -417,6 +418,7 @@ def create_engine_configs( num_lookahead_slots=self.num_lookahead_slots, delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, + preemption_mode=self.preemption_mode, ) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, From 66a7bbdf9c528bf765469278b0d651f8c5cf59ce Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 1 May 2024 08:25:32 +0000 Subject: [PATCH 17/32] fix --- tests/core/block/test_block_manager_v2.py | 2 +- vllm/core/block/naive_block.py | 5 ++++- vllm/core/block/prefix_caching_block.py | 7 +++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git 
a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index bbba018a3f92..a75b1f80a6e5 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -106,7 +106,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("block_size", [8]) @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) -@pytest.mark.parametrize("num_lookahead_slots", [0, 2]) +@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) @pytest.mark.parametrize("enable_caching", [False]) def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching): diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 0857ed4db978..1ccbec331aea 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -3,6 +3,7 @@ from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) from vllm.core.block.interfaces import Block, BlockAllocator +from vllm.utils import cdiv BlockId = int Refcount = int @@ -229,7 +230,9 @@ def get_num_blocks_touched(self, if block.num_empty_slots >= num_lookahead_slots: new_block_count += 1 else: - new_block_count += 2 + new_block_count += cdiv( + num_lookahead_slots - block.num_empty_slots, + self._block_size) else: old_block_set.add(block.block_id) num_touched_blocks = new_block_count + len(old_block_set) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 86051448dd5a..a1cbc87f54b0 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -9,6 +9,7 @@ get_all_blocks_recursively) from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator +from vllm.utils import cdiv PrefixHash = int BlockId = int @@ -254,7 +255,7 @@ def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids def is_block_cached(self, block: "PrefixCachingBlock") -> bool: - if block.content_hash not in self._cached_blocks: + if block.content_hash in self._cached_blocks: return True return False @@ -359,7 +360,9 @@ def can_swap(self, if block.num_empty_slots >= num_lookahead_slots: num_touched_blocks += 1 else: - num_touched_blocks += 2 + num_touched_blocks += cdiv( + num_lookahead_slots - block.num_empty_slots, + self._block_size) else: if not self.is_block_cached(block): num_touched_blocks += 1 From 35d391e5f26237be7486d2b5201b3b85a2b54b6e Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 1 May 2024 08:34:51 +0000 Subject: [PATCH 18/32] remove import for __future__.annotations --- vllm/core/block/cpu_gpu_block_allocator.py | 2 -- vllm/core/block/interfaces.py | 2 -- vllm/core/block/prefix_caching_block.py | 1 - 3 files changed, 5 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index be5a20e0ac15..c09b928bc14e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Dict, List, Optional from vllm.core.block.interfaces import (Block, BlockAllocator, diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index d463f7b09131..9f466566f096 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from 
abc import ABC, abstractmethod, abstractproperty from typing import Dict, List, Optional, Protocol diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index a1cbc87f54b0..f24952285367 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,5 +1,4 @@ """Token blocks.""" -from __future__ import annotations from itertools import takewhile from os.path import commonprefix From 13ab5f543816877fe90c0bf9ed5b5f27ec38003e Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Thu, 2 May 2024 00:24:52 +0000 Subject: [PATCH 19/32] fix: address comments --- vllm/config.py | 6 +++- vllm/core/block/cpu_gpu_block_allocator.py | 35 ++++++++++++++-------- vllm/core/block/naive_block.py | 7 +++-- vllm/core/block/prefix_caching_block.py | 6 ++-- vllm/core/block_manager_v2.py | 33 ++++++++++---------- vllm/core/scheduler.py | 13 ++++---- 6 files changed, 55 insertions(+), 45 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b84b67dd30ad..96ffed8cdf34 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -543,7 +543,11 @@ class SchedulerConfig: enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. preemption_mode: Whether to perform preemption by swapping or - recomputation (default) + recomputation. If not specified, we determine the mode as follows: + We use recomputation by default since it incurs lower overhead than + swapping. However, when the sequence group has multiple sequences + (e.g., beam search), recomputation is not currently supported. In + such a case, we use swapping instead. """ def __init__(self, diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index c09b928bc14e..5b185d8466f5 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -176,10 +176,9 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() - def get_device_related_block_id(self, device: Device, - absolute_id: int) -> int: - """Returns the relative block id on certain device given the absolute - block id. + def get_physical_block_id(self, device: Device, absolute_id: int) -> int: + """Returns the zero-offset block id on certain device given the + absolute block id. Args: device (Device): The device for which to query relative block id. @@ -187,29 +186,39 @@ def get_device_related_block_id(self, device: Device, whole allocator. Returns: - int: The relative block id on certain device. + int: The zero-offset block id on certain device. """ - return self._allocators[device].get_device_related_block_id( - absolute_id) + return self._allocators[device].get_physical_block_id(absolute_id) def swap(self, blocks: List[Block], source_device: Device, - dest_device: Device) -> None: + dest_device: Device) -> dict[int, int]: """Execute the swap for the given blocks from source_device - on to dest_device, and save the swap mapping. + on to dest_device, save the current swap mapping and append + them to the accumulated `self._swap_mapping` for each + scheduling move. Args: blocks: List of blocks to be swapped. source_device (Device): Device to swap the 'blocks' from. dest_device (Device): Device to swap the 'blocks' to. + + Returns: + dict[int, int]: Swap mapping from source_device + on to dest_device. 
""" source_block_ids = [block.block_id for block in blocks] self._allocators[source_device].swap_out(blocks) self._allocators[dest_device].swap_in(blocks) dest_block_ids = [block.block_id for block in blocks] - self._swap_mapping = { - src: dest - for src, dest in zip(source_block_ids, dest_block_ids) - } + # self._swap_mapping = { + # src: dest + # for src, dest in zip(source_block_ids, dest_block_ids) + # } + current_swap_mapping = {} + for src, dest in zip(source_block_ids, dest_block_ids): + self._swap_mapping[src] = dest + current_swap_mapping[src] = dest + return current_swap_mapping def get_num_blocks_touched(self, blocks: List[Block], diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 1ccbec331aea..7f205d5df008 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -142,8 +142,8 @@ def _free_block_id(self, block_id: BlockId) -> None: if refcount == 0: self._free_block_indices.add(block_id) - def get_device_related_block_id(self, absolute_id: int) -> int: - """Returns the relative block id on certain block allocator + def get_physical_block_id(self, absolute_id: int) -> int: + """Returns the zero-offset block id on certain block allocator given the absolute block id. Args: @@ -151,7 +151,7 @@ def get_device_related_block_id(self, absolute_id: int) -> int: in whole allocator. Returns: - int: The relative block id on certain device. + int: The zero-offset block id on certain device. """ return sorted(self._all_block_indices).index(absolute_id) @@ -225,6 +225,7 @@ def get_num_blocks_touched(self, # needed. old_block_set = set() new_block_count = 0 + # TODO(cade): make sure the logic is correct and clean it up. for block in blocks: if not block.is_full and num_lookahead_slots != 0: if block.num_empty_slots >= num_lookahead_slots: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index f24952285367..fd90dd02fda8 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -236,8 +236,8 @@ def get_num_free_blocks(self) -> int: return self._hashless_allocator.get_num_free_blocks() + len( self._unused_cached_blocks) - def get_device_related_block_id(self, absolute_id: int) -> int: - """Returns the relative block id on certain block allocator + def get_physical_block_id(self, absolute_id: int) -> int: + """Returns the zero-offset block id on certain block allocator given the absolute block id. Args: @@ -245,7 +245,7 @@ def get_device_related_block_id(self, absolute_id: int) -> int: in whole allocator. Returns: - int: The relative block id on certain device. + int: The rzero-offset block id on certain device. """ return sorted(self._all_block_indices).index(absolute_id) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 943566e843be..82ee336943e3 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -254,20 +254,19 @@ def swap_in(self, sequence_group: SequenceGroup) -> Dict[int, int]: """ blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.SWAPPED) - self.block_allocator.swap(blocks=blocks, - source_device=Device.CPU, - dest_device=Device.GPU) + current_swap_mapping = self.block_allocator.swap( + blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) # NOTE: Once the BlockManagerV1 implementation is deleted, we can # move this get_and_reset_swaps call outside of swap_in/swap_out. 
# Then the scheduler can make calls to get all swaps and all # copy-on-writes for the batch. - mapping = self.block_allocator.get_and_reset_swaps() + block_number_mapping = { - self.block_allocator.get_device_related_block_id( - Device.CPU, cpu_block_id): - self.block_allocator.get_device_related_block_id( - Device.GPU, gpu_block_id) - for cpu_block_id, gpu_block_id in mapping.items() + self.block_allocator.get_physical_block_id(Device.CPU, + cpu_block_id): + self.block_allocator.get_physical_block_id(Device.GPU, + gpu_block_id) + for cpu_block_id, gpu_block_id in current_swap_mapping.items() } return block_number_mapping @@ -297,16 +296,14 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: """ blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) - self.block_allocator.swap(blocks=blocks, - source_device=Device.GPU, - dest_device=Device.CPU) - mapping = self.block_allocator.get_and_reset_swaps() + current_swap_mapping = self.block_allocator.swap( + blocks=blocks, source_device=Device.GPU, dest_device=Device.CPU) block_number_mapping = { - self.block_allocator.get_device_related_block_id( - Device.GPU, gpu_block_id): - self.block_allocator.get_device_related_block_id( - Device.CPU, cpu_block_id) - for gpu_block_id, cpu_block_id in mapping.items() + self.block_allocator.get_physical_block_id(Device.GPU, + gpu_block_id): + self.block_allocator.get_physical_block_id(Device.CPU, + cpu_block_id) + for gpu_block_id, cpu_block_id in current_swap_mapping.items() } return block_number_mapping diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index da2a291d0397..fabe7af42d0f 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -154,6 +154,8 @@ def __init__( self.prev_prompt = False # Latency of the last prompt step self.last_prompt_latency = 0.0 + # preemption mode, RECOMPUTE or SWAP + self.user_specified_preemption_mode = scheduler_config.preemption_mode @property def lora_enabled(self) -> bool: @@ -331,14 +333,12 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out, - self.scheduler_config.preemption_mode) + self._preempt(victim_seq_group, blocks_to_swap_out) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out, - self.scheduler_config.preemption_mode) + self._preempt(seq_group, blocks_to_swap_out) preempted.append(seq_group) break else: @@ -540,7 +540,6 @@ def _preempt( self, seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], - preemption_mode: Optional[str] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -553,12 +552,12 @@ def _preempt( # over sequence groups with a single sequence. # TODO(woosuk): Support recomputation for sequence groups with multiple # sequences. This may require a more sophisticated CUDA kernel. 
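# A minimal, self-contained sketch of the preemption-mode resolution that the
# hunk below installs in Scheduler._preempt. PreemptionMode here is a stand-in
# for the enum in vllm.core.scheduler; treat that and the helper name as
# assumptions of the sketch, not vLLM API.
from enum import Enum
from typing import Optional

class PreemptionMode(Enum):
    SWAP = 1
    RECOMPUTE = 2

def resolve_preemption_mode(user_mode: Optional[str],
                            max_num_running_seqs: int) -> PreemptionMode:
    # No user preference: default to recomputation, except for multi-sequence
    # groups (e.g. beam search), which currently must be swapped out.
    if user_mode is None:
        if max_num_running_seqs == 1:
            return PreemptionMode.RECOMPUTE
        return PreemptionMode.SWAP
    if user_mode == "swap":
        return PreemptionMode.SWAP
    return PreemptionMode.RECOMPUTE

assert resolve_preemption_mode(None, 1) is PreemptionMode.RECOMPUTE
assert resolve_preemption_mode(None, 4) is PreemptionMode.SWAP
assert resolve_preemption_mode("swap", 1) is PreemptionMode.SWAP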
-        if preemption_mode is None:
+        if self.user_specified_preemption_mode is None:
             if seq_group.get_max_num_running_seqs() == 1:
                 preemption_mode = PreemptionMode.RECOMPUTE
             else:
                 preemption_mode = PreemptionMode.SWAP
-        elif preemption_mode == "swap":
+        elif self.user_specified_preemption_mode == "swap":
             preemption_mode = PreemptionMode.SWAP
         else:
             preemption_mode = PreemptionMode.RECOMPUTE
         if preemption_mode == PreemptionMode.RECOMPUTE:
             self._preempt_by_recompute(seq_group)
         elif preemption_mode == PreemptionMode.SWAP:

From a1e228ca08948cde1a55c7c04846682bc184fb42 Mon Sep 17 00:00:00 2001
From: Kaiyang Chen
Date: Thu, 2 May 2024 00:34:03 +0000
Subject: [PATCH 20/32] feat: add preemption mode as a user input arg

---
 vllm/engine/arg_utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 44a14286120f..d365668fb7c1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -55,6 +55,7 @@ class EngineArgs:
     ray_workers_use_nsight: bool = False
     forced_num_gpu_blocks: Optional[int] = None
     num_lookahead_slots: int = 0
+    preemption_mode: Optional[str] = None

     # Related to Vision-language models such as llava
     image_input_type: Optional[str] = None
@@ -63,7 +64,6 @@ class EngineArgs:
     image_feature_size: Optional[int] = None
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
-    preemption_mode: Optional[str] = None

     def __post_init__(self):
         if self.tokenizer is None:
@@ -372,6 +372,13 @@ def add_cli_args(
             default=False,
             help='If True, the prefill requests can be chunked based on the '
             'max_num_batched_tokens')
+        parser.add_argument(
+            '--preemption_mode',
+            type=str,
+            default=None,
+            help='If \'recompute\', the engine performs preemption by '
+            'recomputing; if \'swap\', the engine performs preemption by '
+            'block swapping.')
         return parser

     @classmethod

From 98484194c966c5156732dc49925e99e0485b9221 Mon Sep 17 00:00:00 2001
From: Kaiyang Chen
Date: Thu, 2 May 2024 22:10:08 +0000
Subject: [PATCH 21/32] nit

---
 vllm/core/block/block_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 9813e6882f0e..1704bd41864d 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -78,8 +78,8 @@ def allocate(self,
             device (Device, optional): The device on which the blocks should be
                 allocated. Defaults to Device.GPU.
""" - assert token_ids assert not self._is_allocated + assert token_ids self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, device=device) From 170d5a255f61e06f9a7ac8cf4b17762d6d0a4d95 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 04:55:22 +0000 Subject: [PATCH 22/32] fix: format and test --- tests/core/block/e2e/test_correctness.py | 16 +++++++-- tests/core/block/test_block_manager_v2.py | 3 ++ vllm/core/block/cpu_gpu_block_allocator.py | 18 +++++----- vllm/core/block/interfaces.py | 34 ++++++++++++++++++ vllm/core/block/prefix_caching_block.py | 20 ++++------- vllm/core/block_manager_v2.py | 42 +++++++++++++--------- vllm/core/interfaces.py | 3 +- vllm/core/scheduler.py | 4 ++- vllm/engine/arg_utils.py | 5 --- 9 files changed, 96 insertions(+), 49 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 8d03b0520442..c381b2b886f0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -339,7 +339,13 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, @pytest.mark.parametrize("baseline_llm_kwargs", [{ "use_v2_block_manager": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "preemption_mode": "swap" +}, { + "use_v2_block_manager": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( @@ -414,7 +420,13 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( @pytest.mark.parametrize("baseline_llm_kwargs", [{ "enable_prefix_caching": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "enable_prefix_caching": True, + "preemption_mode": "swap" +}, { + "enable_prefix_caching": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_auto_prefix_caching_with_preemption(baseline_llm_generator, diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index a75b1f80a6e5..0af35f5c2fa2 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -151,3 +151,6 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + +# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level. 
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 7d5b4537b256..f6238b6dc4c7 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -102,7 +102,7 @@ def __init__(self, cpu_block_allocator: BlockAllocator, Device.GPU: gpu_block_allocator, } - self._swap_mapping = {} + self._swap_mapping: Dict[int, int] = {} self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: @@ -198,7 +198,7 @@ def get_physical_block_id(self, device: Device, absolute_id: int) -> int: return self._allocators[device].get_physical_block_id(absolute_id) def swap(self, blocks: List[Block], source_device: Device, - dest_device: Device) -> dict[int, int]: + dest_device: Device) -> Dict[int, int]: """Execute the swap for the given blocks from source_device on to dest_device, save the current swap mapping and append them to the accumulated `self._swap_mapping` for each @@ -217,14 +217,12 @@ def swap(self, blocks: List[Block], source_device: Device, self._allocators[source_device].swap_out(blocks) self._allocators[dest_device].swap_in(blocks) dest_block_ids = [block.block_id for block in blocks] - # self._swap_mapping = { - # src: dest - # for src, dest in zip(source_block_ids, dest_block_ids) - # } - current_swap_mapping = {} + + current_swap_mapping: Dict[int, int] = {} for src, dest in zip(source_block_ids, dest_block_ids): - self._swap_mapping[src] = dest - current_swap_mapping[src] = dest + if src is not None and dest is not None: + self._swap_mapping[src] = dest + current_swap_mapping[src] = dest return current_swap_mapping def get_num_blocks_touched(self, @@ -289,7 +287,7 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: raise NotImplementedError - def get_and_reset_swaps(self) -> dict[int, int]: + def get_and_reset_swaps(self) -> Dict[int, int]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every schedule when BlockManagerV2 become default. 
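The bookkeeping these allocator hunks converge on is that each swap() call records source-to-destination ids into an accumulator, while get_and_reset_swaps() drains it. A small standalone toy of that behavior (names are illustrative only, not vLLM API):

    from typing import Dict, List, Tuple

    class SwapMappingTracker:
        def __init__(self) -> None:
            self._swap_mapping: Dict[int, int] = {}

        def record_swap(self, src_ids: List[int],
                        dst_ids: List[int]) -> Dict[int, int]:
            # Each swap returns its own mapping and also appends it to the
            # accumulated mapping, as the swap() docstring above describes.
            current: Dict[int, int] = {}
            for src, dst in zip(src_ids, dst_ids):
                self._swap_mapping[src] = dst
                current[src] = dst
            return current

        def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
            mapping = list(self._swap_mapping.items())
            self._swap_mapping.clear()
            return mapping

    tracker = SwapMappingTracker()
    tracker.record_swap([0, 1], [7, 8])
    tracker.record_swap([2], [9])
    assert tracker.get_and_reset_swaps() == [(0, 7), (1, 8), (2, 9)]
    assert tracker.get_and_reset_swaps() == []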
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 634c4016ca19..b42d0529446b 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -116,6 +116,18 @@ def get_num_total_blocks(self) -> int: def get_num_free_blocks(self) -> int: pass + @abstractmethod + def get_physical_block_id(self, absolute_id: int) -> int: + pass + + @abstractmethod + def swap_out(self, blocks: List[Block]) -> None: + pass + + @abstractmethod + def swap_in(self, blocks: List[Block]) -> None: + pass + @property @abstractmethod def all_block_ids(self) -> FrozenSet[int]: @@ -149,6 +161,12 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: """NOTE: This should not be used besides Block""" pass + @abstractmethod + def get_num_blocks_touched(self, + blocks: List[Block], + num_lookahead_slots: int = 0) -> int: + pass + class NoFreeBlocksError(ValueError): pass @@ -203,3 +221,19 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None: def get_common_computed_block_ids( self, seq_block_ids: List[List[int]]) -> List[int]: pass + + @abstractmethod + def get_num_blocks_touched(self, + blocks: List[Block], + device: Device, + num_lookahead_slots: int = 0) -> int: + pass + + @abstractmethod + def swap(self, blocks: List[Block], source_device: Device, + dest_device: Device) -> Dict[int, int]: + pass + + @abstractmethod + def get_physical_block_id(self, device: Device, absolute_id: int) -> int: + pass \ No newline at end of file diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 372e0809099e..34f9ad6d53a7 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -289,11 +289,6 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int: def get_num_total_blocks(self) -> int: return self._hashless_allocator.get_num_total_blocks() - def get_num_free_blocks(self) -> int: - # The number of free blocks is the number of hashless free - # blocks plus the number of hashful blocks that are unused. - return self._hashless_allocator.get_num_free_blocks() + len( - self._unused_cached_blocks) def get_physical_block_id(self, absolute_id: int) -> int: """Returns the zero-offset block id on certain block allocator @@ -306,20 +301,19 @@ def get_physical_block_id(self, absolute_id: int) -> int: Returns: int: The rzero-offset block id on certain device. """ - return sorted(self._all_block_indices).index(absolute_id) + return sorted(self.all_block_ids).index(absolute_id) @property def all_block_ids(self) -> FrozenSet[int]: return self._hashless_allocator.all_block_ids - def promote_to_immutable_block(self, block: Block) -> BlockId: - def is_block_cached(self, block: "PrefixCachingBlock") -> bool: + def is_block_cached(self, block: Block) -> bool: + assert block.content_hash is not None if block.content_hash in self._cached_blocks: return True return False - def promote_to_immutable_block(self, - block: "PrefixCachingBlock") -> BlockId: + def promote_to_immutable_block(self, block: Block) -> BlockId: """Once a mutable block is full, it can be promoted to an immutable block. This means that its content can be referenced by future blocks having the same prefix. 
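get_physical_block_id, as implemented above, is just a rank lookup of the absolute id inside the allocator's own id set. A minimal standalone sketch of that lookup:

    from typing import FrozenSet

    def get_physical_block_id(all_block_ids: FrozenSet[int],
                              absolute_id: int) -> int:
        # Zero-offset position of the absolute id within this allocator's id
        # range, mirroring the sorted-index lookup used in the hunk above.
        return sorted(all_block_ids).index(absolute_id)

    # If the CPU allocator owns absolute ids 4..7, absolute id 6 is physical
    # block 2 on that device.
    assert get_physical_block_id(frozenset({4, 5, 6, 7}), 6) == 2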
@@ -431,9 +425,9 @@ def get_common_computed_block_ids( if ids != [] ]) - def can_swap(self, - blocks: List[Block], - num_lookahead_slots: int = 0) -> int: + def get_num_blocks_touched(self, + blocks: List[Block], + num_lookahead_slots: int = 0) -> int: """Determine the number of blocks that will be touched by swapping in/out the given blocks from certain sequence group with the provided num_lookahead_slots. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index d04cfa3fec40..0a7f23e55246 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -239,8 +239,8 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.fork() def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """Returns whether we can swap in the given sequence_group + num_lookahead_slots: int) -> AllocStatus: + """Returns the AllocStatus for the given sequence_group with num_lookahead_slots. Args: @@ -249,23 +249,22 @@ def can_swap_in(self, seq_group: SequenceGroup, speculative decoding, default to 0. Returns: - bool: Whether it's possible to swap in current sequence group. + AllocStatus: The AllocStatus for the given sequence group. """ return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots) - def swap_in(self, sequence_group: SequenceGroup) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: """Returns the block id mapping (from CPU to GPU) generated by - swapping in the given sequence_group with num_lookahead_slots. + swapping in the given seq_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap in. Returns: Dict[int, int]: The mapping of swapping block from CPU to GPU. """ - blocks = self._get_blocks_for_swap(sequence_group, - SequenceStatus.SWAPPED) + blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED) current_swap_mapping = self.block_allocator.swap( blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) # NOTE: Once the BlockManagerV1 implementation is deleted, we can @@ -287,14 +286,18 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap in. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. Returns: bool: Whether it's possible to swap out current sequence group. """ - return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + alloc_status = self._can_swap(seq_group, Device.CPU, + SequenceStatus.RUNNING) + if alloc_status == AllocStatus.OK: + return True + return False def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: """Returns the block id mapping (from GPU to CPU) generated by @@ -329,8 +332,8 @@ def _can_swap(self, seq_group: SequenceGroup, device: Device, status: SequenceStatus, - num_lookahead_slots: int = 0) -> bool: - """Returns whether we can swap in/out the given sequence_group + num_lookahead_slots: int = 0) -> AllocStatus: + """Returns the AllocStatus for swapping in/out the given sequence_group on to the 'device'. Args: @@ -342,7 +345,7 @@ def _can_swap(self, speculative decoding, default to 0. 
Returns: - bool: whether we can swap in/out the given sequence_group + AllocStatus: The AllocStatus for swapping in/out the given sequence_group on to the 'device'. """ blocks = self._get_blocks_for_swap(seq_group, status) @@ -351,8 +354,14 @@ def _can_swap(self, watermark_blocks = 0 if device == Device.GPU: watermark_blocks = self.watermark_blocks - return self.block_allocator.get_num_free_blocks( - device) - num_blocks_touched > watermark_blocks + if self.block_allocator.get_num_total_blocks( + device) < num_blocks_touched: + return AllocStatus.NEVER + elif self.block_allocator.get_num_free_blocks( + device) - num_blocks_touched >= watermark_blocks: + return AllocStatus.OK + else: + return AllocStatus.LATER def _get_blocks_for_swap(self, seq_group: SequenceGroup, status: SequenceStatus) -> List[Block]: @@ -369,6 +378,7 @@ def _get_blocks_for_swap(self, seq_group: SequenceGroup, blocks: Dict[int, List[Block]] = {} for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] - blocks[seq.seq_id] = block_table.blocks + if block_table.blocks is not None: + blocks[seq.seq_id] = block_table.blocks combined_blocks = list(chain(*blocks.values())) return combined_blocks diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 09ccaddb6261..26b6f492168b 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -67,8 +67,7 @@ def can_swap_in(self, seq_group: SequenceGroup, pass @abstractmethod - def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: pass @abstractmethod diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 322489fe9354..aee77d2d7a17 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -523,7 +523,9 @@ def _schedule_swapped( seq_group = swapped_queue[0] # If the sequence group cannot be swapped in, stop. - alloc_status = self.block_manager.can_swap_in(seq_group) + is_prefill = seq_group.is_prefill() + alloc_status = self.block_manager.can_swap_in( + seq_group, self._get_num_lookahead_slots(is_prefill)) if alloc_status == AllocStatus.LATER: break elif alloc_status == AllocStatus.NEVER: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7973f9a6eee1..f1fe84297bd5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -474,11 +474,6 @@ def add_cli_args( 'corresponding to the chosen load_format. ' 'This should be a JSON string that will be ' 'parsed into a dictionary.') - - type=bool, - default=False, - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') parser.add_argument( '--preemption_mode', type=str, From c7a3484324505e25bb8463e0747ee0a1f9b4d710 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 05:00:49 +0000 Subject: [PATCH 23/32] fix: ruff --- vllm/core/block_manager_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0a7f23e55246..4b4f823d6a96 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -345,8 +345,8 @@ def _can_swap(self, speculative decoding, default to 0. Returns: - AllocStatus: The AllocStatus for swapping in/out the given sequence_group - on to the 'device'. + AllocStatus: The AllocStatus for swapping in/out the given + sequence_group on to the 'device'. 
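# The _can_swap decision shown above reduces to a three-way threshold test; a
# minimal standalone sketch, with AllocStatus standing in for the enum from
# vllm.core.interfaces (an assumption of the sketch):
from enum import Enum

class AllocStatus(Enum):
    OK = 1
    LATER = 2
    NEVER = 3

def can_swap_status(num_total_blocks: int, num_free_blocks: int,
                    num_blocks_touched: int,
                    watermark_blocks: int) -> AllocStatus:
    # Never possible if the device cannot hold the touched blocks at all;
    # OK if they fit while staying above the watermark; otherwise retry later.
    if num_total_blocks < num_blocks_touched:
        return AllocStatus.NEVER
    if num_free_blocks - num_blocks_touched >= watermark_blocks:
        return AllocStatus.OK
    return AllocStatus.LATER

assert can_swap_status(16, 10, 4, 2) is AllocStatus.OK
assert can_swap_status(16, 5, 4, 2) is AllocStatus.LATER
assert can_swap_status(3, 3, 4, 0) is AllocStatus.NEVER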
""" blocks = self._get_blocks_for_swap(seq_group, status) num_blocks_touched = self.block_allocator.get_num_blocks_touched( From c252294474bcbfca12a0f8f5355ace088b36bbcc Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 06:11:10 +0000 Subject: [PATCH 24/32] test: add enable_cache=True for test_swap --- tests/core/block/test_block_manager_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 0af35f5c2fa2..3aa9960cc157 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -107,7 +107,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) @pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) -@pytest.mark.parametrize("enable_caching", [False]) +@pytest.mark.parametrize("enable_caching", [False, True]) def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching): """Verify blocks number on src/desc device is correct after swapping in/out From 880b8555e8012032bab013cbe74b9833a3586e72 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 09:01:21 +0000 Subject: [PATCH 25/32] nit --- vllm/core/block/cpu_gpu_block_allocator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index f6238b6dc4c7..5eb206b24c8c 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -240,7 +240,7 @@ def get_num_blocks_touched(self, Returns: int: the number of blocks that will be touched by - swapping in/out the given blocks on to the 'device'. + swapping in/out the given blocks on to the 'device'. """ return self._allocators[device].get_num_blocks_touched( blocks, num_lookahead_slots) From fe13a919646eba82749125deefe6d133a848b5ed Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 15:54:20 +0800 Subject: [PATCH 26/32] fix --- vllm/core/block/cpu_gpu_block_allocator.py | 2 +- vllm/core/block/interfaces.py | 2 +- vllm/core/block_manager_v2.py | 16 +++++++--------- vllm/core/interfaces.py | 1 + vllm/engine/arg_utils.py | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index f97d408a1bc0..7391d8179e1d 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -210,7 +210,7 @@ def swap(self, blocks: List[Block], source_device: Device, dest_device (Device): Device to swap the 'blocks' to. Returns: - dict[int, int]: Swap mapping from source_device + Dict[int, int]: Swap mapping from source_device on to dest_device. 
""" source_block_ids = [block.block_id for block in blocks] diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index c2f2ddaa29ee..7780d2d80bc9 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import FrozenSet, List, Optional, Protocol, Tuple +from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple from vllm.utils import Device diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index d6b55348980d..4d6dec3f467e 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -255,7 +255,7 @@ def can_swap_in(self, seq_group: SequenceGroup, return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots) - def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from CPU to GPU) generated by swapping in the given seq_group with num_lookahead_slots. @@ -263,15 +263,12 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: seq_group (SequenceGroup): The sequence group to swap in. Returns: - Dict[int, int]: The mapping of swapping block from CPU to GPU. + List[Tuple[int, int]]: The mapping of swapping block from CPU + to GPU. """ blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED) current_swap_mapping = self.block_allocator.swap( blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) - # NOTE: Once the BlockManagerV1 implementation is deleted, we can - # move this get_and_reset_swaps call outside of swap_in/swap_out. - # Then the scheduler can make calls to get all swaps and all - # copy-on-writes for the batch. block_number_mapping = { self.block_allocator.get_physical_block_id(Device.CPU, @@ -301,7 +298,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True return False - def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: + def swap_out(self, sequence_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from GPU to CPU) generated by swapping out the given sequence_group with num_lookahead_slots. @@ -309,7 +306,8 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: sequence_group (SequenceGroup): The sequence group to swap in. Returns: - Dict[int, int]: The mapping of swapping block from GPU to CPU. + List[Tuple[int, int]]: The mapping of swapping block from + GPU to CPU. 
""" blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) @@ -322,7 +320,7 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: cpu_block_id) for gpu_block_id, cpu_block_id in current_swap_mapping.items() } - # convert to list of tuples once here + # convert to list of tuples once here return list(block_number_mapping.items()) def get_num_free_gpu_blocks(self) -> int: diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 3de7b78259e7..034f340ad78b 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -69,6 +69,7 @@ def can_swap_in(self, seq_group: SequenceGroup, @abstractmethod def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + pass @abstractmethod def can_swap_out(self, seq_group: SequenceGroup) -> bool: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 100df8b89ce1..2d2ac40a7ceb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -505,7 +505,7 @@ def add_cli_args( help='If \'recompute\', the engine performs preemption by block ' 'swapping; If \'swap\', the engine performs preemption by block ' 'swapping.') - + parser.add_argument( "--served-model-name", nargs="+", From 773d3318cc4340a8107aefe07e5f475354678c87 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 16:28:07 +0800 Subject: [PATCH 27/32] fix: test --- tests/core/block/test_block_manager_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 3aa9960cc157..a872fe995dd3 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -134,7 +134,8 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() mapping = block_manager.swap_out(seq_group) - assert list(mapping.keys()) == gpu_blocks + mapping_keys = [key for key, _ in mapping] + assert mapping_keys == gpu_blocks after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) @@ -147,7 +148,8 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, before_gpu_blocks = block_manager.get_num_free_gpu_blocks() mapping = block_manager.swap_in(seq_group) cpu_blocks = block_manager.get_block_table(prompt) - assert list(mapping.keys()) == [cpu_blocks[0]] + mapping_keys = [key for key, _ in mapping] + assert mapping_keys == [cpu_blocks[0]] after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) From 37d9b3122a5ef7009bb7dca38ef40f3e1bd2fff9 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 18:11:58 +0800 Subject: [PATCH 28/32] test: retry ci tests --- vllm/core/block/cpu_gpu_block_allocator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 7391d8179e1d..20e2e2340f91 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -287,14 +287,14 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: raise 
NotImplementedError - def get_and_reset_swaps(self) -> Dict[int, int]: + def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every schedule when BlockManagerV2 become default. Returns: - Dict[int, int]: A mapping of source to destination block IDs. + List[Tuple[int, int]]: A mapping of source to destination block IDs. """ mapping = self._swap_mapping.copy() self._swap_mapping.clear() - return mapping + return list(mapping.items()) From a2f1df38e64e12838c68f85fc182aba079ba3110 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 19:54:07 +0800 Subject: [PATCH 29/32] retry --- vllm/core/block/cpu_gpu_block_allocator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 20e2e2340f91..07ecab256cd8 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -290,7 +290,7 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every - schedule when BlockManagerV2 become default. + schedule when BlockManagerV2 become default. Currently not useful. Returns: List[Tuple[int, int]]: A mapping of source to destination block IDs. From 216eb7643f4c52b658518ea9425efcc0bd66b79f Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Mon, 13 May 2024 22:05:22 +0800 Subject: [PATCH 30/32] merge --- vllm/config.py | 21 ++++++++++----------- vllm/core/embedding_model_block_manager.py | 3 +-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5f51e9b0dd9e..bb91fca2e518 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -608,17 +608,16 @@ class SchedulerConfig: such a case, we use swapping instead. 
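# A hypothetical construction of SchedulerConfig matching the __init__
# signature below; the numeric values are illustrative only.
from vllm.config import SchedulerConfig

scheduler_config = SchedulerConfig(max_num_batched_tokens=4096,
                                   max_num_seqs=128,
                                   max_model_len=2048,
                                   use_v2_block_manager=True,
                                   preemption_mode="swap")
assert scheduler_config.preemption_mode == "swap"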
""" - def __init__( - self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - use_v2_block_manager: bool = False, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - embedding_mode: Optional[bool] = False, - preemption_mode: Optional[str] = None) -> None: + def __init__(self, + max_num_batched_tokens: Optional[int], + max_num_seqs: int, + max_model_len: int, + use_v2_block_manager: bool = False, + num_lookahead_slots: int = 0, + delay_factor: float = 0.0, + enable_chunked_prefill: bool = False, + embedding_mode: Optional[bool] = False, + preemption_mode: Optional[str] = None) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py index a09d79ec3c42..f2d67306d7ce 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -46,8 +46,7 @@ def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> AllocStatus: return AllocStatus.OK - def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> List[Tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: return None # type: ignore def can_swap_out(self, seq_group: SequenceGroup) -> bool: From 862a5d46b5669088af2f758d466889a8d3fdd076 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Tue, 28 May 2024 21:29:57 +0800 Subject: [PATCH 31/32] fix: ci --- format.sh | 2 +- vllm/core/block/interfaces.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/format.sh b/format.sh index aaec25a8aa0d..c7ac2a56ce11 100755 --- a/format.sh +++ b/format.sh @@ -114,7 +114,7 @@ mypy vllm/model_executor --config-file pyproject.toml CODESPELL_EXCLUDES=( - '--skip' '*docs/source/_build/**,./tests/lora/data' + '--skip' '*docs/source/_build/**,*tests/lora/data/**' ) # check spelling of specified files diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 1c1e7c4c6862..4b20856a1b42 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -238,6 +238,7 @@ def swap(self, blocks: List[Block], source_device: Device, def get_physical_block_id(self, device: Device, absolute_id: int) -> int: pass + @abstractmethod def allocate_or_get_null_block(self) -> Block: """ Null blocks are used as a placeholders for KV cache blocks that have From 29df09262c4e86d0684440b03ef4b5946e45af18 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Thu, 30 May 2024 22:18:58 +0800 Subject: [PATCH 32/32] fix: merge --- tests/core/block/test_block_manager_v2.py | 3 ++- vllm/core/block_manager_v1.py | 2 +- vllm/core/block_manager_v2.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 882dcba3f0d6..d0ca09c4be0d 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -7,7 +7,8 @@ from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import create_seq_group, create_seq_group_encoder_decoder, create_dummy_prompt +from ..utils import (create_dummy_prompt, create_seq_group, + create_seq_group_encoder_decoder) @pytest.mark.parametrize("block_size", [16]) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index decfac8dcead..4010aaf02b82 100644 --- 
a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -541,7 +541,7 @@ def _swap_block_table( return new_block_table - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0541572d497f..121092cf189b 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -6,8 +6,8 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.block.interfaces import Block +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device
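Taken together, swap_in and swap_out now report their result as a list of (source, destination) physical block numbers. A standalone toy of the final rebasing step, in plain Python rather than vLLM API:

    from typing import Dict, List, Tuple

    def to_physical_mapping(swap_mapping: Dict[int, int],
                            source_ids: List[int],
                            dest_ids: List[int]) -> List[Tuple[int, int]]:
        # Rebase absolute allocator ids to zero-offset ids on each device, the
        # way swap_in/swap_out convert the allocator's mapping before handing
        # it to the scheduler as a list of tuples.
        src_sorted = sorted(source_ids)
        dst_sorted = sorted(dest_ids)
        return [(src_sorted.index(src), dst_sorted.index(dst))
                for src, dst in swap_mapping.items()]

    # CPU allocator owns absolute ids 4..7, GPU allocator owns 0..3: swapping
    # CPU block 5 into GPU block 2 is reported to the worker as (1, 2).
    assert to_physical_mapping({5: 2}, source_ids=[4, 5, 6, 7],
                               dest_ids=[0, 1, 2, 3]) == [(1, 2)]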