From 182d4a8b5ecb86221554275e7569c4884187e1ac Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 3 Apr 2024 22:07:04 +0000 Subject: [PATCH 01/32] feat: support swap in/out for block manager v2 --- vllm/core/block/block_table.py | 30 ++++++++- vllm/core/block/cpu_gpu_block_allocator.py | 5 +- vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block_manager_v2.py | 74 ++++++++++++++++++++-- 6 files changed, 105 insertions(+), 7 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ba061bbc4fbc..e87f035ca57b 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -64,6 +64,28 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: sequence of token IDs. """ return cdiv(len(token_ids), block_size) + + def append_by_blocks(self, + token_ids: List[int], + device: Device = Device.GPU) -> Block: + """Allocates memory blocks for storing the given sequence of token IDs. + + This method allocates the required number of blocks to store the given + sequence of token IDs. + + Args: + token_ids (List[int]): The sequence of token IDs to be stored. + device (Device, optional): The device on which the blocks should be + allocated. Defaults to Device.GPU. + """ + block = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + self._blocks.append(block) + self._num_full_slots += len(token_ids) + return block + + def allocate(self, token_ids: List[int], @@ -78,8 +100,8 @@ def allocate(self, device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. """ - assert not self._is_allocated assert token_ids + assert not self._is_allocated self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, device=device) @@ -248,7 +270,11 @@ def _get_all_token_ids(self) -> List[int]: @property def _is_allocated(self) -> bool: return self._blocks is not None - + + @property + def _num_touched_blocks(self) -> int: + return len(self._blocks) + @property def _num_empty_slots(self) -> int: assert self._is_allocated diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 3135e194c593..a44845729a13 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,5 +1,5 @@ +from __future__ import annotations from typing import Dict, List, Optional - from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator @@ -189,6 +189,9 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: # CoW only supported on GPU device = Device.GPU return self._allocators[device].clear_copy_on_writes() + + def increase_ref_count(self, device: Device, block_id: int) -> None: + return self._allocators[device].refcounter().incr(block_id) def mark_blocks_as_computed(self) -> None: # Prefix caching only supported on GPU. 
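A minimal illustrative sketch, not part of the patch series: the block-id convention that the swap_in/swap_out mapping added to block_manager_v2.py below relies on. The CpuGpuBlockAllocator hands out GPU block ids first and CPU block ids after them (the two ranges never overlap), so a CPU block id must be shifted back by the total number of GPU blocks before it can serve as an index into the CPU cache. The numbers here are made up for illustration.

    num_gpu_blocks, num_cpu_blocks = 4, 4
    # GPU ids come first, CPU ids continue after them, e.g. [0..3] and [4..7].
    gpu_block_ids = list(range(num_gpu_blocks))
    cpu_block_ids = list(range(num_gpu_blocks, num_gpu_blocks + num_cpu_blocks))
    assert set(gpu_block_ids).isdisjoint(cpu_block_ids)

    # A swap-in mapping built by the block manager (CPU block id -> GPU block id)
    # is translated into CPU-cache-relative indices before being returned.
    swap_in_mapping = {5: 2, 6: 3}
    block_number_mapping = {
        cpu_id - num_gpu_blocks: gpu_id
        for cpu_id, gpu_id in swap_in_mapping.items()
    }
    assert block_number_mapping == {1: 2, 2: 3}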
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 9f466566f096..eb8c2b73edf0 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,3 +1,4 @@ +from __future__ import annotations from abc import ABC, abstractmethod, abstractproperty from typing import Dict, List, Optional, Protocol diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index f8e9265bb2d6..35539945b942 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,3 +1,4 @@ +from __future__ import annotations from typing import Dict, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 6aa75a8abb80..56efdf0165c8 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,4 +1,5 @@ """Token blocks.""" +from __future__ import annotations from itertools import takewhile from os.path import commonprefix from typing import Dict, Iterable, List, Optional diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 813e71ad883b..01e81896aca8 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -1,6 +1,7 @@ """A block manager that manages token blocks.""" from typing import Dict, List, Optional +from vllm.core.block.interfaces import Block from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator from vllm.core.interfaces import AllocStatus, BlockSpaceManager @@ -227,17 +228,82 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: - return False + """ + We go through all sequence in seq group to get their number of blocks + touched and sum them up to see whether there is enough memory to swap in + """ + num_touched_blocks = 0 + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += ( + block_table.get_num_blocks_touched_by_append_slots( + token_ids=seq.get_token_ids(), + num_lookahead_slots=num_lookahead_slots, + )) + num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) + return num_free_blocks - num_touched_blocks >= self.watermark_blocks + def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - raise NotImplementedError + mapping: Dict[Block, Block] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + new_block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + block_table = self.block_tables[seq.seq_id] + + for cpu_block in block_table: + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + self.block_allocator.increase_ref_count(Device.GPU, gpu_block.block_id()) + else: + gpu_block = new_block_table.append_by_blocks(token_ids=cpu_block.token_ids(), device=Device.GPU) + mapping[cpu_block] = gpu_block + self.block_allocator.free(cpu_block) + self.block_tables[seq.seq_id] = new_block_table + + # NOTE: since the memory operation in physical blocks need the relative position + # of CPU block to its starting address, here we need to shift the block id of cpu + # block back to its relative position within CPU cache + block_number_mapping = { + cpu_block.block_id() - self.num_total_gpu_blocks: gpu_block.block_id() + for cpu_block, 
gpu_block in mapping.items() + } + return block_number_mapping + def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return False + num_touched_blocks = 0 + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + num_touched_blocks += self.block_tables[seq.seq_id]._num_touched_blocks() + return num_touched_blocks <= self.block_allocator.get_num_free_blocks(Device.CPU) def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - raise NotImplementedError + mapping: Dict[Block, Block] = {} + for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + new_block_table = BlockTable( + block_size=self.block_size, + block_allocator=self.block_allocator, + ) + block_table = self.block_tables[seq.seq_id] + + for gpu_block in block_table: + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + self.block_allocator.increase_ref_count(Device.CPU, cpu_block.block_id()) + else: + cpu_block = new_block_table.append_by_blocks(token_ids=gpu_block.token_ids(), device=Device.CPU) + mapping[gpu_block] = cpu_block + self.block_allocator.free(cpu_block) + self.block_tables[seq.seq_id] = new_block_table + + block_number_mapping = { + cpu_block.block_number - self.num_total_gpu_blocks: gpu_block.block_number + for cpu_block, gpu_block in mapping.items() + } + return block_number_mapping def get_num_free_gpu_blocks(self) -> int: return self.block_allocator.get_num_free_blocks(Device.GPU) From b6b4b8fa72e9fedc27d4272102c8030bec96a466 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 3 Apr 2024 22:15:55 +0000 Subject: [PATCH 02/32] fix: linter --- vllm/core/block/block_table.py | 16 ++++------ vllm/core/block/cpu_gpu_block_allocator.py | 4 ++- vllm/core/block/interfaces.py | 1 + vllm/core/block/naive_block.py | 1 + vllm/core/block/prefix_caching_block.py | 1 + vllm/core/block_manager_v2.py | 37 +++++++++++++--------- 6 files changed, 35 insertions(+), 25 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index e87f035ca57b..91ee218fa616 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -64,10 +64,10 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: sequence of token IDs. """ return cdiv(len(token_ids), block_size) - + def append_by_blocks(self, - token_ids: List[int], - device: Device = Device.GPU) -> Block: + token_ids: List[int], + device: Device = Device.GPU) -> Block: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -79,13 +79,11 @@ def append_by_blocks(self, allocated. Defaults to Device.GPU. 
""" block = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) + token_ids=token_ids, + device=device) self._blocks.append(block) self._num_full_slots += len(token_ids) return block - - def allocate(self, token_ids: List[int], @@ -270,11 +268,11 @@ def _get_all_token_ids(self) -> List[int]: @property def _is_allocated(self) -> bool: return self._blocks is not None - + @property def _num_touched_blocks(self) -> int: return len(self._blocks) - + @property def _num_empty_slots(self) -> int: assert self._is_allocated diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index a44845729a13..8d9188286de8 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,5 +1,7 @@ from __future__ import annotations + from typing import Dict, List, Optional + from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator @@ -189,7 +191,7 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: # CoW only supported on GPU device = Device.GPU return self._allocators[device].clear_copy_on_writes() - + def increase_ref_count(self, device: Device, block_id: int) -> None: return self._allocators[device].refcounter().incr(block_id) diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index eb8c2b73edf0..d463f7b09131 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,4 +1,5 @@ from __future__ import annotations + from abc import ABC, abstractmethod, abstractproperty from typing import Dict, List, Optional, Protocol diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 35539945b942..323fb85707ab 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,4 +1,5 @@ from __future__ import annotations + from typing import Dict, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 56efdf0165c8..2232564381ea 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,5 +1,6 @@ """Token blocks.""" from __future__ import annotations + from itertools import takewhile from os.path import commonprefix from typing import Dict, Iterable, List, Optional diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 01e81896aca8..05a0d1110700 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -1,9 +1,9 @@ """A block manager that manages token blocks.""" from typing import Dict, List, Optional -from vllm.core.block.interfaces import Block from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator +from vllm.core.block.interfaces import Block from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device @@ -242,7 +242,6 @@ def can_swap_in(self, seq_group: SequenceGroup, )) num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) return num_free_blocks - num_touched_blocks >= self.watermark_blocks - def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: @@ -257,28 +256,33 @@ def swap_in(self, seq_group: SequenceGroup, for cpu_block in 
block_table: if cpu_block in mapping: gpu_block = mapping[cpu_block] - self.block_allocator.increase_ref_count(Device.GPU, gpu_block.block_id()) + self.block_allocator.increase_ref_count( + Device.GPU, gpu_block.block_id()) else: - gpu_block = new_block_table.append_by_blocks(token_ids=cpu_block.token_ids(), device=Device.GPU) + gpu_block = new_block_table.append_by_blocks( + token_ids=cpu_block.token_ids(), device=Device.GPU) mapping[cpu_block] = gpu_block self.block_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table - # NOTE: since the memory operation in physical blocks need the relative position - # of CPU block to its starting address, here we need to shift the block id of cpu - # block back to its relative position within CPU cache + # NOTE: since the memory operation in physical blocks need the + # relative position of CPU block to its starting address, here + # we need to shift the block id of cpu block back to its relative + # position within CPU cache. block_number_mapping = { - cpu_block.block_id() - self.num_total_gpu_blocks: gpu_block.block_id() + cpu_block.block_id() - self.num_total_gpu_blocks: + gpu_block.block_id() for cpu_block, gpu_block in mapping.items() } return block_number_mapping - def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - num_touched_blocks += self.block_tables[seq.seq_id]._num_touched_blocks() - return num_touched_blocks <= self.block_allocator.get_num_free_blocks(Device.CPU) + num_touched_blocks += self.block_tables[ + seq.seq_id]._num_touched_blocks() + return num_touched_blocks <= self.block_allocator.get_num_free_blocks( + Device.CPU) def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} @@ -292,15 +296,18 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: for gpu_block in block_table: if gpu_block in mapping: cpu_block = mapping[gpu_block] - self.block_allocator.increase_ref_count(Device.CPU, cpu_block.block_id()) + self.block_allocator.increase_ref_count( + Device.CPU, cpu_block.block_id()) else: - cpu_block = new_block_table.append_by_blocks(token_ids=gpu_block.token_ids(), device=Device.CPU) + cpu_block = new_block_table.append_by_blocks( + token_ids=gpu_block.token_ids(), device=Device.CPU) mapping[gpu_block] = cpu_block - self.block_allocator.free(cpu_block) + self.block_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { - cpu_block.block_number - self.num_total_gpu_blocks: gpu_block.block_number + cpu_block.block_number - self.num_total_gpu_blocks: + gpu_block.block_number for cpu_block, gpu_block in mapping.items() } return block_number_mapping From 938d10e638ea72b6aba9f220abe4a10b8b8e475a Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Thu, 4 Apr 2024 02:44:09 +0000 Subject: [PATCH 03/32] fix: fix some bugs and add test --- tests/core/block/test_block_manager_v2.py | 48 ++++++++++++++++++++++- vllm/core/block/block_table.py | 33 ++++++++++------ vllm/core/block/naive_block.py | 3 -- vllm/core/block_manager_v2.py | 24 ++++++------ 4 files changed, 80 insertions(+), 28 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 1e8e4ccdfb15..f1ee17224c7d 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -5,7 +5,7 @@ from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from 
..utils import create_seq_group +from ..utils import create_dummy_prompt, create_seq_group @pytest.mark.parametrize("block_size", [16]) @@ -101,3 +101,49 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, range(prompt_len + num_slots_to_append + num_lookahead_slots)), block_size)) - len(chunk_list(list(range(prompt_len)), block_size)) assert num_consumed_blocks == expected_consumed_blocks + + +@pytest.mark.parametrize("block_size", [8]) +@pytest.mark.parametrize("num_cpu_blocks", [4]) +@pytest.mark.parametrize("num_gpu_blocks", [4]) +@pytest.mark.parametrize("num_lookahead_slots", [2]) +def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots): + block_manager = BlockSpaceManagerV2(block_size, + num_cpu_blocks, + num_gpu_blocks, + watermark=0) + prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) + prompt.status = SequenceStatus.WAITING + block_manager.allocate(seq_group) + # Emulate a forward pass by appending a single token. + # The block manager then knows how many unprocessed + # tokens will be written in the next forward pass. + token_id = 0 + prompt.status = SequenceStatus.RUNNING + prompt.append_token_id(token_id, {token_id: Logprob(0.0)}) + + # Swap seq group from GPU -> CPU. + gpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_out(seq_group) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_out(seq_group) + assert list(mapping.keys()) == gpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) + assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks + prompt.status = SequenceStatus.SWAPPED + + # Swap seq group from CPU -> GPU. + cpu_blocks = block_manager.get_block_table(prompt) + assert block_manager.can_swap_in(seq_group, num_lookahead_slots) + before_cpu_blocks = block_manager.get_num_free_cpu_blocks() + before_gpu_blocks = block_manager.get_num_free_gpu_blocks() + mapping = block_manager.swap_in(seq_group, num_lookahead_slots) + adjusted_cpu_blocks = [block - num_gpu_blocks for block in cpu_blocks] + assert list(mapping.keys()) == adjusted_cpu_blocks + after_cpu_blocks = block_manager.get_num_free_cpu_blocks() + after_gpu_blocks = block_manager.get_num_free_gpu_blocks() + assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks + assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 91ee218fa616..a8fd6665c977 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -65,23 +65,34 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: """ return cdiv(len(token_ids), block_size) - def append_by_blocks(self, - token_ids: List[int], - device: Device = Device.GPU) -> Block: - """Allocates memory blocks for storing the given sequence of token IDs. + def get_blocks(self) -> Optional[List[Block]]: + return self._blocks - This method allocates the required number of blocks to store the given - sequence of token IDs. + def append_by_block(self, + token_ids: List[int], + device: Device = Device.GPU) -> Block: + """Allocates memory block for storing the given sequence + of token IDs and append it back to the block list. 
+ + This method allocates a block to store the given + sequence of token IDs append it back to the block list. Args: token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. """ - block = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - self._blocks.append(block) + blocks = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + # Note: whenever we call append_by_block because of swapping, the tokens + # must fit in a block + assert len(blocks) <= 1 + block = blocks[0] + if not self._is_allocated: + self._blocks = blocks + else: + self._blocks.append(block) self._num_full_slots += len(token_ids) return block @@ -98,8 +109,8 @@ def allocate(self, device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. """ - assert token_ids assert not self._is_allocated + assert token_ids self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, device=device) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 323fb85707ab..2cb6739de7d4 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -92,9 +92,6 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: def free(self, block: Block) -> None: self._free_block_id(block.block_id) - # Mark the block as having no allocation. - block.block_id = None - def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 05a0d1110700..9385a7a9ce87 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -253,14 +253,14 @@ def swap_in(self, seq_group: SequenceGroup, ) block_table = self.block_tables[seq.seq_id] - for cpu_block in block_table: + for cpu_block in block_table.get_blocks(): if cpu_block in mapping: gpu_block = mapping[cpu_block] self.block_allocator.increase_ref_count( Device.GPU, gpu_block.block_id()) else: - gpu_block = new_block_table.append_by_blocks( - token_ids=cpu_block.token_ids(), device=Device.GPU) + gpu_block = new_block_table.append_by_block( + token_ids=cpu_block.token_ids, device=Device.GPU) mapping[cpu_block] = gpu_block self.block_allocator.free(cpu_block) self.block_tables[seq.seq_id] = new_block_table @@ -270,8 +270,7 @@ def swap_in(self, seq_group: SequenceGroup, # we need to shift the block id of cpu block back to its relative # position within CPU cache. 
block_number_mapping = { - cpu_block.block_id() - self.num_total_gpu_blocks: - gpu_block.block_id() + cpu_block.block_id - self.num_total_gpu_blocks: gpu_block.block_id for cpu_block, gpu_block in mapping.items() } return block_number_mapping @@ -280,7 +279,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): num_touched_blocks += self.block_tables[ - seq.seq_id]._num_touched_blocks() + seq.seq_id]._num_touched_blocks return num_touched_blocks <= self.block_allocator.get_num_free_blocks( Device.CPU) @@ -293,22 +292,21 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: ) block_table = self.block_tables[seq.seq_id] - for gpu_block in block_table: + for gpu_block in block_table.get_blocks(): if gpu_block in mapping: cpu_block = mapping[gpu_block] self.block_allocator.increase_ref_count( Device.CPU, cpu_block.block_id()) else: - cpu_block = new_block_table.append_by_blocks( - token_ids=gpu_block.token_ids(), device=Device.CPU) + cpu_block = new_block_table.append_by_block( + token_ids=gpu_block.token_ids, device=Device.CPU) mapping[gpu_block] = cpu_block - self.block_allocator.free(cpu_block) + self.block_allocator.free(gpu_block) self.block_tables[seq.seq_id] = new_block_table block_number_mapping = { - cpu_block.block_number - self.num_total_gpu_blocks: - gpu_block.block_number - for cpu_block, gpu_block in mapping.items() + gpu_block.block_id: cpu_block.block_id - self.num_total_gpu_blocks + for gpu_block, cpu_block in mapping.items() } return block_number_mapping From 9181552465f165b5f2fac45b3c128be4efc02bcb Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Thu, 4 Apr 2024 07:34:18 +0000 Subject: [PATCH 04/32] fix: address comment --- vllm/core/block/block_table.py | 51 ++++++---------- vllm/core/block/cpu_gpu_block_allocator.py | 68 +++++++++++++++++++--- vllm/core/block/naive_block.py | 3 + vllm/core/block/prefix_caching_block.py | 3 + vllm/core/block_manager_v2.py | 44 ++++---------- 5 files changed, 93 insertions(+), 76 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a8fd6665c977..ae7df17f4089 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -68,37 +68,10 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: def get_blocks(self) -> Optional[List[Block]]: return self._blocks - def append_by_block(self, - token_ids: List[int], - device: Device = Device.GPU) -> Block: - """Allocates memory block for storing the given sequence - of token IDs and append it back to the block list. - - This method allocates a block to store the given - sequence of token IDs append it back to the block list. - - Args: - token_ids (List[int]): The sequence of token IDs to be stored. - device (Device, optional): The device on which the blocks should be - allocated. Defaults to Device.GPU. 
- """ - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - # Note: whenever we call append_by_block because of swapping, the tokens - # must fit in a block - assert len(blocks) <= 1 - block = blocks[0] - if not self._is_allocated: - self._blocks = blocks - else: - self._blocks.append(block) - self._num_full_slots += len(token_ids) - return block - def allocate(self, token_ids: List[int], - device: Device = Device.GPU) -> None: + device: Device = Device.GPU, + by_block: bool = False) -> Optional[Block]: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -108,13 +81,23 @@ def allocate(self, token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. + by_block (bool, optional): whether we are allocate block by block. + Set to True when doing cache swapping. Defaults to False. """ - assert not self._is_allocated + assert not self._is_allocated or by_block assert token_ids - self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - self._num_full_slots = len(token_ids) + blocks = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + self._num_full_slots += len(token_ids) + if not (by_block and self._is_allocated): + self._blocks = blocks + else: + # Note: whenever we call allocate with by_block set to True, + # because of swapping, the tokens must fit in a block + assert len(blocks) == 1 + self._blocks.append(blocks[0]) + return blocks[0] def append_token_ids(self, token_ids: List[int], diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8d9188286de8..753629f6f406 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -2,10 +2,12 @@ from typing import Dict, List, Optional +from vllm.core.block.block_table import BlockTable from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator +from vllm.sequence import Sequence from vllm.utils import Device @@ -90,18 +92,17 @@ def create( return CpuGpuBlockAllocator( cpu_block_allocator=cpu_allocator, gpu_block_allocator=gpu_allocator, + block_size=block_size, ) - def __init__( - self, - cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator, - ): + def __init__(self, cpu_block_allocator: BlockAllocator, + gpu_block_allocator: BlockAllocator, block_size: int): assert not ( cpu_block_allocator.all_block_ids & gpu_block_allocator.all_block_ids ), "cpu and gpu block allocators can't have intersection of block ids" + self._block_size = block_size self._allocators = { Device.CPU: cpu_block_allocator, Device.GPU: gpu_block_allocator, @@ -145,6 +146,16 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) + def reference(self, block: Block) -> None: + """Notify the device aware allocator there is new sequence reference + the given block. + + Args: + block (Block): The block to be referenced. 
+ """ + allocator = self._block_ids_to_allocator[block.block_id] + return allocator.reference(block) + def free(self, block: Block) -> None: """Frees the memory occupied by the given block. @@ -192,9 +203,6 @@ def clear_copy_on_writes(self) -> Dict[int, List[int]]: device = Device.GPU return self._allocators[device].clear_copy_on_writes() - def increase_ref_count(self, device: Device, block_id: int) -> None: - return self._allocators[device].refcounter().incr(block_id) - def mark_blocks_as_computed(self) -> None: # Prefix caching only supported on GPU. device = Device.GPU @@ -209,3 +217,47 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) + + def get_seq_swap_out_block_mapping( + self, seq: Sequence, block_table: BlockTable, + mapping: Dict[Block, Block]) -> BlockTable: + # The swap out logic for a sequence, the mapping dict will be updated + # and the new block table for swapped out sequence is returned. + new_block_table = BlockTable( + block_size=self._block_size, + block_allocator=self, + ) + for gpu_block in block_table.get_blocks(): + if gpu_block in mapping: + cpu_block = mapping[gpu_block] + self.reference(cpu_block) + else: + cpu_block = new_block_table.allocate( + token_ids=gpu_block.token_ids, + device=Device.CPU, + by_block=True) + mapping[gpu_block] = cpu_block + self.free(gpu_block) + return new_block_table + + def get_seq_swap_in_block_mapping( + self, seq: Sequence, block_table: BlockTable, + mapping: Dict[Block, Block]) -> BlockTable: + # The swap in logic for a sequence, the mapping dict will be updated + # and the new block table for swapped in sequence is returned. + new_block_table = BlockTable( + block_size=self._block_size, + block_allocator=self, + ) + for cpu_block in block_table.get_blocks(): + if cpu_block in mapping: + gpu_block = mapping[cpu_block] + self.reference(gpu_block) + else: + gpu_block = new_block_table.allocate( + token_ids=cpu_block.token_ids, + device=Device.GPU, + by_block=True) + mapping[cpu_block] = gpu_block + self.free(cpu_block) + return new_block_table diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 2cb6739de7d4..ad0b3a451434 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -92,6 +92,9 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: def free(self, block: Block) -> None: self._free_block_id(block.block_id) + def reference(self, block: Block) -> None: + self._refcounter.incr(block.block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 2232564381ea..d95f4a92ea2a 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -199,6 +199,9 @@ def _free_block_id_for_block(self, block_id: BlockId, assert block.content_hash in self._cached_blocks self._unused_cached_blocks[block.content_hash] = block_id + def reference(self, block: Block) -> None: + self._refcounter.incr(block.block_id) + def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. 
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 9385a7a9ce87..c5a221473117 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -247,23 +247,11 @@ def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - new_block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) block_table = self.block_tables[seq.seq_id] - - for cpu_block in block_table.get_blocks(): - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - self.block_allocator.increase_ref_count( - Device.GPU, gpu_block.block_id()) - else: - gpu_block = new_block_table.append_by_block( - token_ids=cpu_block.token_ids, device=Device.GPU) - mapping[cpu_block] = gpu_block - self.block_allocator.free(cpu_block) - self.block_tables[seq.seq_id] = new_block_table + self.block_tables[ + seq. + seq_id] = self.block_allocator.get_seq_swap_in_block_mapping( + seq, block_table, mapping) # NOTE: since the memory operation in physical blocks need the # relative position of CPU block to its starting address, here @@ -278,31 +266,19 @@ def swap_in(self, seq_group: SequenceGroup, def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - num_touched_blocks += self.block_tables[ - seq.seq_id]._num_touched_blocks + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += block_table._num_touched_blocks return num_touched_blocks <= self.block_allocator.get_num_free_blocks( Device.CPU) def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - new_block_table = BlockTable( - block_size=self.block_size, - block_allocator=self.block_allocator, - ) block_table = self.block_tables[seq.seq_id] - - for gpu_block in block_table.get_blocks(): - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - self.block_allocator.increase_ref_count( - Device.CPU, cpu_block.block_id()) - else: - cpu_block = new_block_table.append_by_block( - token_ids=gpu_block.token_ids, device=Device.CPU) - mapping[gpu_block] = cpu_block - self.block_allocator.free(gpu_block) - self.block_tables[seq.seq_id] = new_block_table + self.block_tables[ + seq. 
+ seq_id] = self.block_allocator.get_seq_swap_out_block_mapping( + seq, block_table, mapping) block_number_mapping = { gpu_block.block_id: cpu_block.block_id - self.num_total_gpu_blocks From e9a907f1d3008f2ce65549aea2727dafba3f3968 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 00:33:12 +0000 Subject: [PATCH 05/32] fix: reduce overestimate for can_swap_in --- vllm/core/block/block_table.py | 36 ++++++++++++++++++++++ vllm/core/block/cpu_gpu_block_allocator.py | 15 +++++++++ vllm/core/block/interfaces.py | 10 ++++++ vllm/core/block/naive_block.py | 27 ++++++++++++++++ vllm/core/block/prefix_caching_block.py | 27 ++++++++++++++-- vllm/core/block_manager_v2.py | 28 ++++++++++++++--- 6 files changed, 136 insertions(+), 7 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index ae7df17f4089..3a1a28713f64 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -307,3 +307,39 @@ def _chunk_token_blocks_for_append( token_blocks = [token_ids[:first_chunk_size]] + chunk_list( token_ids[first_chunk_size:], self._block_size) return token_blocks + + def get_num_cache_blocks_touched_by_swapping_in(self, token_ids: List[int], + num_lookahead_slots: int, + device: Device) -> int: + """Determine how many blocks will be "touched" by swapping in the + token ids. + + This is required for the scheduler to determine whether a sequence can + be swapped in. + """ + all_token_ids = token_ids + [-1] * num_lookahead_slots + token_blocks = self._chunk_token_blocks_for_append(all_token_ids) + prev_block = None + num_blocks_touched = 0 + for token_block in token_blocks: + block = self.block_allocator.mock_mutable(prev_block, token_block, + device) + if not block.prefix_caching_allocator.is_block_cached(block): + num_blocks_touched += 1 + prev_block = block + return num_blocks_touched + + def get_num_naive_blocks_touched_by_swapping_in(self, token_ids: List[int], + num_lookahead_slots: int, + total_touched_blocks: int, + block_set: set) -> None: + num_blocks_touched = self.get_num_blocks_touched_by_append_slots( + token_ids, num_lookahead_slots) + blocks = self.get_blocks() + if num_blocks_touched > len(blocks): + total_touched_blocks += 1 + for block in blocks: + if not block.is_full: + total_touched_blocks += 1 + else: + block_set.add(block.block_id) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 753629f6f406..8101f1e0df46 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -146,6 +146,21 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) + def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], + device: Device) -> Block: + """Mock a new mutable block, linked to the previous block, to help with + content hash calculation. + + Args: + prev_block (Optional[Block]): The previous block in the sequence. If + None, then the block to be allocated is the first block in the + sequence. + + Returns: + Block: The newly allocated mutable block. + """ + return self._allocators[device].mock_mutable(prev_block, token_ids) + def reference(self, block: Block) -> None: """Notify the device aware allocator there is new sequence reference the given block. 
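A small sketch of the estimation idea behind mock_mutable above, which can_swap_in uses later in this patch when prefix caching is enabled: build throwaway blocks only to obtain their content hashes, and count a block as "touched" only if its hash is not already cached, so the scheduler can size a swap without allocating anything. The hash below is a stand-in for the real prefix-caching content hash.

    from typing import Iterable, List, Optional, Set

    def estimate_blocks_touched(token_blocks: Iterable[List[int]],
                                cached_hashes: Set[int]) -> int:
        touched = 0
        prev_hash: Optional[int] = None
        for block_tokens in token_blocks:
            # Stand-in content hash: chains the previous block's hash, the way a
            # prefix-caching hash depends on everything before the block.
            content_hash = hash((prev_hash, tuple(block_tokens)))
            if content_hash not in cached_hashes:
                touched += 1              # a new physical block would be needed
            prev_hash = content_hash
        return touched

    blocks = [[1, 2, 3, 4], [5, 6, 7, 8]]
    assert estimate_blocks_touched(blocks, set()) == 2   # nothing cached yet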
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index d463f7b09131..f6e20ff11a13 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -57,6 +57,11 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int]) -> Block: pass + @abstractmethod + def mock_mutable(self, prev_block: Optional[Block], + token_ids: List[int]) -> Block: + pass + @abstractmethod def free(self, block: Block) -> None: pass @@ -102,6 +107,11 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> Block: pass + @abstractmethod + def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], + device: Device) -> Block: + pass + @abstractmethod def get_num_free_blocks(self, device: Device) -> int: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index ad0b3a451434..5f529bdd6e27 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -89,6 +89,33 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: allocator=self, ) + def mock_mutable( + self, + prev_block: Optional[Block], + token_ids: List[int], + ) -> Block: + """Mock a new mutable block, linked to the previous block, to help with + content hash calculation. + + Args: + prev_block (Optional[Block]): The previous block in the sequence. If + None, then the block to be allocated is the first block in the + sequence. + + Returns: + Block: The newly allocated mutable block. + """ + + # NOTE: we use -1 as block_id for mock block + block_id = -1 + return self._create_block( + prev_block=prev_block, + token_ids=token_ids, + block_id=block_id, + block_size=self._block_size, + allocator=self, + ) + def free(self, block: Block) -> None: self._free_block_id(block.block_id) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index d95f4a92ea2a..1a5a812e13dd 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -164,6 +164,24 @@ def allocate_mutable(self, prev_block: Block) -> Block: # No block available in hashless allocator, nor in unused cache blocks. raise BlockAllocator.NoFreeBlocksError() + def mock_mutable( + self, + prev_block: Optional[Block], + token_ids: List[int], + ) -> Block: + """Mock a new mutable block, linked to the previous block, to help with + content hash calculation. + + Args: + prev_block (Optional[Block]): The previous block in the sequence. If + None, then the block to be allocated is the first block in the + sequence. + + Returns: + Block: The newly allocated mutable block. + """ + return self._hashless_allocator.mock_mutable(prev_block, token_ids) + def _incr_refcount_cached_block(self, content_hash: int, block_id: BlockId) -> None: refcount = self._refcounter.incr(block_id) @@ -234,8 +252,8 @@ def fork(self, last_block: Block) -> List[Block]: return forked_blocks def get_num_free_blocks(self) -> int: - # The number of free blocks is the number of hashless free blocks - # plus the number of hashful blocks that are unused. + # The number of free blocks is the number of hashless free + # blocks plus the number of hashful blocks that are unused. 
return self._hashless_allocator.get_num_free_blocks() + len( self._unused_cached_blocks) @@ -243,6 +261,11 @@ def get_num_free_blocks(self) -> int: def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids + def is_block_cached(self, block: "PrefixCachingBlock") -> bool: + if block.content_hash not in self._cached_blocks: + return True + return False + def promote_to_immutable_block(self, block: "PrefixCachingBlock") -> BlockId: """Once a mutable block is full, it can be promoted to an immutable diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c5a221473117..7dee8349c2f6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -233,13 +233,31 @@ def can_swap_in(self, seq_group: SequenceGroup, touched and sum them up to see whether there is enough memory to swap in """ num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += ( - block_table.get_num_blocks_touched_by_append_slots( + + if self.enable_caching: + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += ( + block_table.get_num_cache_blocks_touched_by_swapping_in( + token_ids=seq.get_token_ids(), + num_lookahead_slots=num_lookahead_slots, + device=Device.GPU)) + else: + # NOTE: for naive block, we go though all the sequence to collect + # a set of immutable block id, and accumulate number of isolated + # blocks (mutable ones and single block caused by lookahead). We + # sum them up at the end to get the final num_touched_blocks + # num_touched_blocks swap in op. + block_set = set() + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + block_table.get_num_naive_blocks_touched_by_swapping_in( token_ids=seq.get_token_ids(), num_lookahead_slots=num_lookahead_slots, - )) + total_touched_blocks=num_touched_blocks, + block_set=block_set) + num_touched_blocks += len(block_set) + num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) return num_free_blocks - num_touched_blocks >= self.watermark_blocks From dcff0e1c76661970eccad95e507b9287614a02dc Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 00:45:17 +0000 Subject: [PATCH 06/32] fix: reuse similar logic in can_swap_in to reduce overestimation in can_swap_out --- vllm/core/block/block_table.py | 18 +++++++++--------- vllm/core/block_manager_v2.py | 27 ++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 3a1a28713f64..095afcee01b1 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -308,14 +308,14 @@ def _chunk_token_blocks_for_append( token_ids[first_chunk_size:], self._block_size) return token_blocks - def get_num_cache_blocks_touched_by_swapping_in(self, token_ids: List[int], - num_lookahead_slots: int, - device: Device) -> int: - """Determine how many blocks will be "touched" by swapping in the + def get_num_cache_blocks_touched_by_swapping(self, token_ids: List[int], + num_lookahead_slots: int, + device: Device) -> int: + """Determine how many blocks will be "touched" by swapping in/out the token ids. This is required for the scheduler to determine whether a sequence can - be swapped in. + be swapped in/out. 
""" all_token_ids = token_ids + [-1] * num_lookahead_slots token_blocks = self._chunk_token_blocks_for_append(all_token_ids) @@ -329,10 +329,10 @@ def get_num_cache_blocks_touched_by_swapping_in(self, token_ids: List[int], prev_block = block return num_blocks_touched - def get_num_naive_blocks_touched_by_swapping_in(self, token_ids: List[int], - num_lookahead_slots: int, - total_touched_blocks: int, - block_set: set) -> None: + def get_num_naive_blocks_touched_by_swapping(self, token_ids: List[int], + num_lookahead_slots: int, + total_touched_blocks: int, + block_set: set) -> None: num_blocks_touched = self.get_num_blocks_touched_by_append_slots( token_ids, num_lookahead_slots) blocks = self.get_blocks() diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 7dee8349c2f6..89a52730d5d6 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -238,7 +238,7 @@ def can_swap_in(self, seq_group: SequenceGroup, for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): block_table = self.block_tables[seq.seq_id] num_touched_blocks += ( - block_table.get_num_cache_blocks_touched_by_swapping_in( + block_table.get_num_cache_blocks_touched_by_swapping( token_ids=seq.get_token_ids(), num_lookahead_slots=num_lookahead_slots, device=Device.GPU)) @@ -251,7 +251,7 @@ def can_swap_in(self, seq_group: SequenceGroup, block_set = set() for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): block_table = self.block_tables[seq.seq_id] - block_table.get_num_naive_blocks_touched_by_swapping_in( + block_table.get_num_naive_blocks_touched_by_swapping( token_ids=seq.get_token_ids(), num_lookahead_slots=num_lookahead_slots, total_touched_blocks=num_touched_blocks, @@ -283,9 +283,26 @@ def swap_in(self, seq_group: SequenceGroup, def can_swap_out(self, seq_group: SequenceGroup) -> bool: num_touched_blocks = 0 - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += block_table._num_touched_blocks + + if self.enable_caching: + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + num_touched_blocks += ( + block_table.get_num_cache_blocks_touched_by_swapping( + token_ids=seq.get_token_ids(), + num_lookahead_slots=0, + device=Device.CPU)) + else: + block_set = set() + for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): + block_table = self.block_tables[seq.seq_id] + block_table.get_num_naive_blocks_touched_by_swapping( + token_ids=seq.get_token_ids(), + num_lookahead_slots=0, + total_touched_blocks=num_touched_blocks, + block_set=block_set) + num_touched_blocks += len(block_set) + return num_touched_blocks <= self.block_allocator.get_num_free_blocks( Device.CPU) From 205dda1ec050189cbbfd697f065cd1b500178444 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 08:48:47 +0000 Subject: [PATCH 07/32] fix: refactor swap in/out logic --- tests/core/block/test_block_manager_v2.py | 13 +-- vllm/core/block/block_table.py | 11 ++ vllm/core/block/cpu_gpu_block_allocator.py | 115 ++++++++++++--------- vllm/core/block/naive_block.py | 5 +- vllm/core/block/prefix_caching_block.py | 4 +- vllm/core/block_manager_v2.py | 25 ++--- 6 files changed, 99 insertions(+), 74 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index f1ee17224c7d..67c0789f03b6 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -107,11 
+107,14 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) @pytest.mark.parametrize("num_lookahead_slots", [2]) -def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots): +@pytest.mark.parametrize("enable_caching", [False]) +def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, + enable_caching): block_manager = BlockSpaceManagerV2(block_size, num_cpu_blocks, num_gpu_blocks, - watermark=0) + watermark=0, + enable_caching=enable_caching) prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1) prompt.status = SequenceStatus.WAITING block_manager.allocate(seq_group) @@ -136,14 +139,12 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots): prompt.status = SequenceStatus.SWAPPED # Swap seq group from CPU -> GPU. - cpu_blocks = block_manager.get_block_table(prompt) assert block_manager.can_swap_in(seq_group, num_lookahead_slots) before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() mapping = block_manager.swap_in(seq_group, num_lookahead_slots) - adjusted_cpu_blocks = [block - num_gpu_blocks for block in cpu_blocks] - assert list(mapping.keys()) == adjusted_cpu_blocks + cpu_blocks = block_manager.get_block_table(prompt) + assert list(mapping.keys()) == [cpu_blocks[0]] after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() - assert before_cpu_blocks + len(cpu_blocks) == after_cpu_blocks assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 095afcee01b1..a15fcd0a752b 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -193,6 +193,17 @@ def free(self) -> None: self._allocator.free(block) self._blocks = None + def swap(self, destination_device: Device) -> "BlockTable": + new_block_table = BlockTable( + block_size=self._block_size, + block_allocator=self._allocator, + ) + for src_block in self.get_blocks(): + self._allocator.update_seq_swap_out_block_mapping( + src_block, new_block_table, destination_device) + self._allocator.free(src_block) + return new_block_table + @property def physical_block_ids(self) -> List[int]: """Returns a list of physical block indices for the blocks in the diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8101f1e0df46..8f00515dfb3a 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -7,7 +7,6 @@ DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator -from vllm.sequence import Sequence from vllm.utils import Device @@ -92,23 +91,22 @@ def create( return CpuGpuBlockAllocator( cpu_block_allocator=cpu_allocator, gpu_block_allocator=gpu_allocator, - block_size=block_size, ) def __init__(self, cpu_block_allocator: BlockAllocator, - gpu_block_allocator: BlockAllocator, block_size: int): + gpu_block_allocator: BlockAllocator): assert not ( cpu_block_allocator.all_block_ids & gpu_block_allocator.all_block_ids ), "cpu and gpu block allocators can't have intersection of block ids" - self._block_size = block_size self._allocators = { Device.CPU: cpu_block_allocator, Device.GPU: gpu_block_allocator, } 
self._block_ids_to_allocator = {} + self._swap_mapping = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: self._block_ids_to_allocator[block_id] = allocator @@ -161,15 +159,15 @@ def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], """ return self._allocators[device].mock_mutable(prev_block, token_ids) - def reference(self, block: Block) -> None: + def reference(self, block_id: int) -> None: """Notify the device aware allocator there is new sequence reference the given block. Args: block (Block): The block to be referenced. """ - allocator = self._block_ids_to_allocator[block.block_id] - return allocator.reference(block) + allocator = self._block_ids_to_allocator[block_id] + return allocator.reference(block_id) def free(self, block: Block) -> None: """Frees the memory occupied by the given block. @@ -233,46 +231,63 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) - def get_seq_swap_out_block_mapping( - self, seq: Sequence, block_table: BlockTable, - mapping: Dict[Block, Block]) -> BlockTable: - # The swap out logic for a sequence, the mapping dict will be updated - # and the new block table for swapped out sequence is returned. - new_block_table = BlockTable( - block_size=self._block_size, - block_allocator=self, - ) - for gpu_block in block_table.get_blocks(): - if gpu_block in mapping: - cpu_block = mapping[gpu_block] - self.reference(cpu_block) - else: - cpu_block = new_block_table.allocate( - token_ids=gpu_block.token_ids, - device=Device.CPU, - by_block=True) - mapping[gpu_block] = cpu_block - self.free(gpu_block) - return new_block_table - - def get_seq_swap_in_block_mapping( - self, seq: Sequence, block_table: BlockTable, - mapping: Dict[Block, Block]) -> BlockTable: - # The swap in logic for a sequence, the mapping dict will be updated - # and the new block table for swapped in sequence is returned. - new_block_table = BlockTable( - block_size=self._block_size, - block_allocator=self, - ) - for cpu_block in block_table.get_blocks(): - if cpu_block in mapping: - gpu_block = mapping[cpu_block] - self.reference(gpu_block) - else: - gpu_block = new_block_table.allocate( - token_ids=cpu_block.token_ids, - device=Device.GPU, - by_block=True) - mapping[cpu_block] = gpu_block - self.free(cpu_block) - return new_block_table + def update_seq_swap_out_block_mapping(self, block: Block, + block_table: BlockTable, + destination_device: Device) -> None: + if block.block_id in self._swap_mapping: + dest_block_id = self._swap_mapping[block.block_id] + self.reference(dest_block_id) + else: + dest_block = block_table.allocate(token_ids=block.token_ids, + device=destination_device, + by_block=True) + self._swap_mapping[block.block_id] = dest_block.block_id + + def get_and_reset_swaps(self) -> dict: + mapping = self._swap_mapping.copy() + self._swap_mapping.clear() + return mapping + + # def get_seq_swap_out_block_mapping( + # self, seq: Sequence, block_table: BlockTable, + # mapping: Dict[Block, Block]) -> BlockTable: + # # The swap out logic for a sequence, the mapping dict will be updated + # # and the new block table for swapped out sequence is returned. 
+ # new_block_table = BlockTable( + # block_size=self._block_size, + # block_allocator=self, + # ) + # for src_block in block_table.get_blocks(): + # if src_block in mapping: + # cpu_block = mapping[src_block] + # self.reference(cpu_block) + # else: + # cpu_block = new_block_table.allocate( + # token_ids=src_block.token_ids, + # device=Device.CPU, + # by_block=True) + # mapping[src_block] = cpu_block + # self.free(src_block) + # return new_block_table + + # def get_seq_swap_in_block_mapping( + # self, seq: Sequence, block_table: BlockTable, + # mapping: Dict[Block, Block]) -> BlockTable: + # # The swap in logic for a sequence, the mapping dict will be updated + # # and the new block table for swapped in sequence is returned. + # new_block_table = BlockTable( + # block_size=self._block_size, + # block_allocator=self, + # ) + # for cpu_block in block_table.get_blocks(): + # if cpu_block in mapping: + # gpu_block = mapping[cpu_block] + # self.reference(gpu_block) + # else: + # gpu_block = new_block_table.allocate( + # token_ids=cpu_block.token_ids, + # device=Device.GPU, + # by_block=True) + # mapping[cpu_block] = gpu_block + # self.free(cpu_block) + # return new_block_table diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 5f529bdd6e27..60aa145b00b4 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -118,9 +118,10 @@ def mock_mutable( def free(self, block: Block) -> None: self._free_block_id(block.block_id) + block.block_id = None - def reference(self, block: Block) -> None: - self._refcounter.incr(block.block_id) + def reference(self, block_id: int) -> None: + self._refcounter.incr(block_id) def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 1a5a812e13dd..b3ade6b23d3f 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -217,8 +217,8 @@ def _free_block_id_for_block(self, block_id: BlockId, assert block.content_hash in self._cached_blocks self._unused_cached_blocks[block.content_hash] = block_id - def reference(self, block: Block) -> None: - self._refcounter.incr(block.block_id) + def reference(self, block_id: int) -> None: + self._refcounter.incr(block_id) def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 89a52730d5d6..7a330079887f 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -176,7 +176,6 @@ def append_slots( token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()), num_lookahead_slots=num_lookahead_slots, ) - # Return any new copy-on-writes. new_cows = self.block_allocator.clear_copy_on_writes() return new_cows @@ -263,21 +262,20 @@ def can_swap_in(self, seq_group: SequenceGroup, def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: - mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): block_table = self.block_tables[seq.seq_id] - self.block_tables[ - seq. 
- seq_id] = self.block_allocator.get_seq_swap_in_block_mapping( - seq, block_table, mapping) + new_block_table = block_table.swap(destination_device=Device.GPU) + self.block_tables[seq.seq_id] = new_block_table + self.append_slots(seq=seq, num_lookahead_slots=num_lookahead_slots) # NOTE: since the memory operation in physical blocks need the # relative position of CPU block to its starting address, here # we need to shift the block id of cpu block back to its relative # position within CPU cache. + mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - cpu_block.block_id - self.num_total_gpu_blocks: gpu_block.block_id - for cpu_block, gpu_block in mapping.items() + cpu_block_id - self.num_total_gpu_blocks: gpu_block_id + for cpu_block_id, gpu_block_id in mapping.items() } return block_number_mapping @@ -310,14 +308,13 @@ def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): block_table = self.block_tables[seq.seq_id] - self.block_tables[ - seq. - seq_id] = self.block_allocator.get_seq_swap_out_block_mapping( - seq, block_table, mapping) + new_block_table = block_table.swap(destination_device=Device.CPU) + self.block_tables[seq.seq_id] = new_block_table + mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - gpu_block.block_id: cpu_block.block_id - self.num_total_gpu_blocks - for gpu_block, cpu_block in mapping.items() + gpu_block_id: cpu_block_id - self.num_total_gpu_blocks + for gpu_block_id, cpu_block_id in mapping.items() } return block_number_mapping From 3bb125c223ea72e58862b49a890b064e800f4d82 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 5 Apr 2024 08:58:14 +0000 Subject: [PATCH 08/32] misc: remove useless code --- vllm/core/block/cpu_gpu_block_allocator.py | 44 ---------------------- 1 file changed, 44 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8f00515dfb3a..1c9bc0edc368 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -247,47 +247,3 @@ def get_and_reset_swaps(self) -> dict: mapping = self._swap_mapping.copy() self._swap_mapping.clear() return mapping - - # def get_seq_swap_out_block_mapping( - # self, seq: Sequence, block_table: BlockTable, - # mapping: Dict[Block, Block]) -> BlockTable: - # # The swap out logic for a sequence, the mapping dict will be updated - # # and the new block table for swapped out sequence is returned. - # new_block_table = BlockTable( - # block_size=self._block_size, - # block_allocator=self, - # ) - # for src_block in block_table.get_blocks(): - # if src_block in mapping: - # cpu_block = mapping[src_block] - # self.reference(cpu_block) - # else: - # cpu_block = new_block_table.allocate( - # token_ids=src_block.token_ids, - # device=Device.CPU, - # by_block=True) - # mapping[src_block] = cpu_block - # self.free(src_block) - # return new_block_table - - # def get_seq_swap_in_block_mapping( - # self, seq: Sequence, block_table: BlockTable, - # mapping: Dict[Block, Block]) -> BlockTable: - # # The swap in logic for a sequence, the mapping dict will be updated - # # and the new block table for swapped in sequence is returned. 
- # new_block_table = BlockTable( - # block_size=self._block_size, - # block_allocator=self, - # ) - # for cpu_block in block_table.get_blocks(): - # if cpu_block in mapping: - # gpu_block = mapping[cpu_block] - # self.reference(gpu_block) - # else: - # gpu_block = new_block_table.allocate( - # token_ids=cpu_block.token_ids, - # device=Device.GPU, - # by_block=True) - # mapping[cpu_block] = gpu_block - # self.free(cpu_block) - # return new_block_table From 403a9bd4bada9666eec4c83ffeb7fa5e4b1ffbc6 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 02:40:13 +0000 Subject: [PATCH 09/32] fix: refactor can_swap_in/out --- vllm/core/block/block_table.py | 38 +-------- vllm/core/block/cpu_gpu_block_allocator.py | 8 ++ vllm/core/block/naive_block.py | 34 +++++++++ vllm/core/block/prefix_caching_block.py | 29 +++++++ vllm/core/block_manager_v2.py | 89 +++++++--------------- 5 files changed, 101 insertions(+), 97 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a15fcd0a752b..796b69280b4c 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -82,7 +82,7 @@ def allocate(self, device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. by_block (bool, optional): whether we are allocate block by block. - Set to True when doing cache swapping. Defaults to False. + Set to True when doing cache swapping. Default to False. """ assert not self._is_allocated or by_block assert token_ids @@ -318,39 +318,3 @@ def _chunk_token_blocks_for_append( token_blocks = [token_ids[:first_chunk_size]] + chunk_list( token_ids[first_chunk_size:], self._block_size) return token_blocks - - def get_num_cache_blocks_touched_by_swapping(self, token_ids: List[int], - num_lookahead_slots: int, - device: Device) -> int: - """Determine how many blocks will be "touched" by swapping in/out the - token ids. - - This is required for the scheduler to determine whether a sequence can - be swapped in/out. 
- """ - all_token_ids = token_ids + [-1] * num_lookahead_slots - token_blocks = self._chunk_token_blocks_for_append(all_token_ids) - prev_block = None - num_blocks_touched = 0 - for token_block in token_blocks: - block = self.block_allocator.mock_mutable(prev_block, token_block, - device) - if not block.prefix_caching_allocator.is_block_cached(block): - num_blocks_touched += 1 - prev_block = block - return num_blocks_touched - - def get_num_naive_blocks_touched_by_swapping(self, token_ids: List[int], - num_lookahead_slots: int, - total_touched_blocks: int, - block_set: set) -> None: - num_blocks_touched = self.get_num_blocks_touched_by_append_slots( - token_ids, num_lookahead_slots) - blocks = self.get_blocks() - if num_blocks_touched > len(blocks): - total_touched_blocks += 1 - for block in blocks: - if not block.is_full: - total_touched_blocks += 1 - else: - block_set.add(block.block_id) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 1c9bc0edc368..8bb27f2da484 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -204,6 +204,14 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() + def can_swap(self, + blocks: List[Block], + device: Device, + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + return self._allocators[device].can_swap(blocks, num_lookahead_slots, + watermark_blocks) + def clear_copy_on_writes(self) -> Dict[int, List[int]]: """Clears the copy-on-write (CoW) state and returns the mapping of source to destination block IDs. diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 60aa145b00b4..422b130c1194 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -221,6 +221,40 @@ def get_common_computed_block_ids( """ return [] + def can_swap(self, + blocks: List[Block], + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + """Determine can we swap in/out the given blocks from certain sequence + group with the provided num_lookahead_slots. + + Args: + blocks (List[Block]): The potential blocks to swap. + num_lookahead_slots (int): number of lookahead slots (0 for swap + out). + + Returns: + bool: whether the allocator has capacity to accept the swap + with given blocks and num_lookahead_slots. + """ + # NOTE: for naive block, we use set to eliminate common blocks among + # seqs, also we compare the empty slots in the mutable blocks with + # lookahead slots to get the number of unique new block that are + # needed. 
+ old_block_set = set() + new_block_count = 0 + for block in blocks: + if not block.is_full and num_lookahead_slots != 0: + if block.num_empty_slots >= num_lookahead_slots: + new_block_count += 1 + else: + new_block_count += 2 + else: + old_block_set.add(block.block_id) + num_touched_blocks = new_block_count + len(old_block_set) + return self.get_num_free_blocks( + ) - num_touched_blocks >= watermark_blocks + class NaiveBlock(Block): """An implementation of the Block class that does not support prefix diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index b3ade6b23d3f..70b3775ffdd6 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -345,6 +345,35 @@ def get_common_computed_block_ids( ] return commonprefix([ids for ids in ids_list if ids != []]) + def can_swap(self, + blocks: List[Block], + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + """Determine can we swap in/out the given blocks from certain sequence + group with the provided num_lookahead_slots. + + Args: + blocks (List[Block]): The potential blocks to swap. + num_lookahead_slots (int): number of lookahead slots (0 for + swap out). + + Returns: + bool: whether the allocator has capacity to accept the swap + with given blocks and num_lookahead_slots. + """ + num_touched_blocks = 0 + for block in blocks: + if not block.is_full: + if block.num_empty_slots >= num_lookahead_slots: + num_touched_blocks += 1 + else: + num_touched_blocks += 2 + else: + if not self.is_block_cached(block): + num_touched_blocks += 1 + return self.get_num_free_blocks( + ) - num_touched_blocks >= watermark_blocks + class PrefixCachingBlock(Block): """A block implementation that supports prefix caching. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 7a330079887f..c4b414cb1f0b 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -1,4 +1,5 @@ """A block manager that manages token blocks.""" +from itertools import chain from typing import Dict, List, Optional from vllm.core.block.block_table import BlockTable @@ -225,41 +226,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.fork() - def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """ - We go through all sequence in seq group to get their number of blocks - touched and sum them up to see whether there is enough memory to swap in - """ - num_touched_blocks = 0 - - if self.enable_caching: - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += ( - block_table.get_num_cache_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=num_lookahead_slots, - device=Device.GPU)) - else: - # NOTE: for naive block, we go though all the sequence to collect - # a set of immutable block id, and accumulate number of isolated - # blocks (mutable ones and single block caused by lookahead). We - # sum them up at the end to get the final num_touched_blocks - # num_touched_blocks swap in op. 
- block_set = set() - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - block_table.get_num_naive_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=num_lookahead_slots, - total_touched_blocks=num_touched_blocks, - block_set=block_set) - num_touched_blocks += len(block_set) - - num_free_blocks = self.block_allocator.get_num_free_blocks(Device.GPU) - return num_free_blocks - num_touched_blocks >= self.watermark_blocks - def swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> Dict[int, int]: for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): @@ -279,31 +245,6 @@ def swap_in(self, seq_group: SequenceGroup, } return block_number_mapping - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - num_touched_blocks = 0 - - if self.enable_caching: - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - num_touched_blocks += ( - block_table.get_num_cache_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=0, - device=Device.CPU)) - else: - block_set = set() - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - block_table.get_num_naive_blocks_touched_by_swapping( - token_ids=seq.get_token_ids(), - num_lookahead_slots=0, - total_touched_blocks=num_touched_blocks, - block_set=block_set) - num_touched_blocks += len(block_set) - - return num_touched_blocks <= self.block_allocator.get_num_free_blocks( - Device.CPU) - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: mapping: Dict[Block, Block] = {} for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): @@ -323,3 +264,31 @@ def get_num_free_gpu_blocks(self) -> int: def get_num_free_cpu_blocks(self) -> int: return self.block_allocator.get_num_free_blocks(Device.CPU) + + def can_swap_in(self, seq_group: SequenceGroup, + num_lookahead_slots: int) -> bool: + return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, + num_lookahead_slots, self.watermark_blocks) + + def can_swap_out(self, seq_group: SequenceGroup) -> bool: + return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + + def _can_swap(self, + seq_group: SequenceGroup, + device: Device, + status: SequenceStatus, + num_lookahead_slots: int = 0, + watermark_blocks: int = 0) -> bool: + blocks = self._get_blocks_for_swap(seq_group, status) + return self.block_allocator.can_swap(blocks, device, + num_lookahead_slots, + watermark_blocks) + + def _get_blocks_for_swap(self, seq_group: SequenceGroup, + status: SequenceStatus) -> List[Block]: + blocks: Dict[int, List[Block]] = {} + for seq in seq_group.get_seqs(status=status): + block_table = self.block_tables[seq.seq_id] + blocks[seq.seq_id] = block_table.get_blocks() + combined_blocks = list(chain(*blocks.values())) + return combined_blocks From 3237d633a36c914bdcea86bdb907bd12748476c4 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 02:43:04 +0000 Subject: [PATCH 10/32] fix: remove unused code --- vllm/core/block/cpu_gpu_block_allocator.py | 15 ------------ vllm/core/block/interfaces.py | 10 -------- vllm/core/block/naive_block.py | 27 ---------------------- vllm/core/block/prefix_caching_block.py | 18 --------------- 4 files changed, 70 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 8bb27f2da484..4289de2c52ae 100644 --- 
a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -144,21 +144,6 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) - def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], - device: Device) -> Block: - """Mock a new mutable block, linked to the previous block, to help with - content hash calculation. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - return self._allocators[device].mock_mutable(prev_block, token_ids) - def reference(self, block_id: int) -> None: """Notify the device aware allocator there is new sequence reference the given block. diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index f6e20ff11a13..d463f7b09131 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -57,11 +57,6 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int]) -> Block: pass - @abstractmethod - def mock_mutable(self, prev_block: Optional[Block], - token_ids: List[int]) -> Block: - pass - @abstractmethod def free(self, block: Block) -> None: pass @@ -107,11 +102,6 @@ def allocate_immutable(self, prev_block: Optional[Block], token_ids: List[int], device: Device) -> Block: pass - @abstractmethod - def mock_mutable(self, prev_block: Optional[Block], token_ids: List[int], - device: Device) -> Block: - pass - @abstractmethod def get_num_free_blocks(self, device: Device) -> int: pass diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 422b130c1194..fec19d1be404 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -89,33 +89,6 @@ def allocate_mutable(self, prev_block: Optional[Block]) -> Block: allocator=self, ) - def mock_mutable( - self, - prev_block: Optional[Block], - token_ids: List[int], - ) -> Block: - """Mock a new mutable block, linked to the previous block, to help with - content hash calculation. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. - """ - - # NOTE: we use -1 as block_id for mock block - block_id = -1 - return self._create_block( - prev_block=prev_block, - token_ids=token_ids, - block_id=block_id, - block_size=self._block_size, - allocator=self, - ) - def free(self, block: Block) -> None: self._free_block_id(block.block_id) block.block_id = None diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 70b3775ffdd6..060a2691e82a 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -164,24 +164,6 @@ def allocate_mutable(self, prev_block: Block) -> Block: # No block available in hashless allocator, nor in unused cache blocks. raise BlockAllocator.NoFreeBlocksError() - def mock_mutable( - self, - prev_block: Optional[Block], - token_ids: List[int], - ) -> Block: - """Mock a new mutable block, linked to the previous block, to help with - content hash calculation. - - Args: - prev_block (Optional[Block]): The previous block in the sequence. If - None, then the block to be allocated is the first block in the - sequence. - - Returns: - Block: The newly allocated mutable block. 
- """ - return self._hashless_allocator.mock_mutable(prev_block, token_ids) - def _incr_refcount_cached_block(self, content_hash: int, block_id: BlockId) -> None: refcount = self._refcounter.incr(block_id) From 413124794d7bc7f97e1bb9db720bf7f031b7b380 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 02:44:31 +0000 Subject: [PATCH 11/32] fix: remove unused code --- vllm/core/block/block_table.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 796b69280b4c..f271ddf86950 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -274,10 +274,6 @@ def _get_all_token_ids(self) -> List[int]: def _is_allocated(self) -> bool: return self._blocks is not None - @property - def _num_touched_blocks(self) -> int: - return len(self._blocks) - @property def _num_empty_slots(self) -> int: assert self._is_allocated From 0067ddff1c00fa9f8cff2aea21c22f784713ae5c Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 04:52:14 +0000 Subject: [PATCH 12/32] fix: refactor swap in/out oprations --- vllm/core/block/block_table.py | 11 ---- vllm/core/block/common.py | 1 - vllm/core/block/cpu_gpu_block_allocator.py | 34 ++++------ vllm/core/block/naive_block.py | 14 +++++ vllm/core/block/prefix_caching_block.py | 14 +++++ vllm/core/block_manager_v2.py | 72 +++++++++++----------- 6 files changed, 76 insertions(+), 70 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index f271ddf86950..a237390b5aa8 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -193,17 +193,6 @@ def free(self) -> None: self._allocator.free(block) self._blocks = None - def swap(self, destination_device: Device) -> "BlockTable": - new_block_table = BlockTable( - block_size=self._block_size, - block_allocator=self._allocator, - ) - for src_block in self.get_blocks(): - self._allocator.update_seq_swap_out_block_mapping( - src_block, new_block_table, destination_device) - self._allocator.free(src_block) - return new_block_table - @property def physical_block_ids(self) -> List[int]: """Returns a list of physical block indices for the blocks in the diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py index 50c70533c4fb..d9f07321950f 100644 --- a/vllm/core/block/common.py +++ b/vllm/core/block/common.py @@ -129,7 +129,6 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: assert refcount != 0 if refcount > 1: src_block_id = block_id - # Decrement refcount of the old block. self._allocator.free(block) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 4289de2c52ae..742c5bfc835e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -2,7 +2,6 @@ from typing import Dict, List, Optional -from vllm.core.block.block_table import BlockTable from vllm.core.block.interfaces import (Block, BlockAllocator, DeviceAwareBlockAllocator) from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator @@ -144,16 +143,6 @@ def allocate_immutable(self, prev_block: Optional[Block], return self._allocators[device].allocate_immutable( prev_block, token_ids) - def reference(self, block_id: int) -> None: - """Notify the device aware allocator there is new sequence reference - the given block. - - Args: - block (Block): The block to be referenced. 
- """ - allocator = self._block_ids_to_allocator[block_id] - return allocator.reference(block_id) - def free(self, block: Block) -> None: """Frees the memory occupied by the given block. @@ -189,6 +178,17 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() + def swap(self, blocks: List[Block], source_device: Device, + dest_device: Device) -> None: + source_block_ids = [block.block_id for block in blocks] + self._allocators[source_device].swap_out(blocks) + self._allocators[dest_device].swap_in(blocks) + dest_block_ids = [block.block_id for block in blocks] + self._swap_mapping = { + src: dest + for src, dest in zip(source_block_ids, dest_block_ids) + } + def can_swap(self, blocks: List[Block], device: Device, @@ -224,18 +224,6 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) - def update_seq_swap_out_block_mapping(self, block: Block, - block_table: BlockTable, - destination_device: Device) -> None: - if block.block_id in self._swap_mapping: - dest_block_id = self._swap_mapping[block.block_id] - self.reference(dest_block_id) - else: - dest_block = block_table.allocate(token_ids=block.token_ids, - device=destination_device, - by_block=True) - self._swap_mapping[block.block_id] = dest_block.block_id - def get_and_reset_swaps(self) -> dict: mapping = self._swap_mapping.copy() self._swap_mapping.clear() diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index fec19d1be404..90d708c6e40e 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -228,6 +228,20 @@ def can_swap(self, return self.get_num_free_blocks( ) - num_touched_blocks >= watermark_blocks + def swap_out(self, blocks: List[Block]) -> None: + for block in blocks: + self.free(block) + + def swap_in(self, blocks: List[Block]) -> None: + for block in blocks: + if block.is_full: + alloc = self.allocate_immutable(block.prev_block, + block.token_ids) + else: + alloc = self.allocate_mutable(block.prev_block) + alloc.append_token_ids(block.token_ids) + block.block_id = alloc.block_id + class NaiveBlock(Block): """An implementation of the Block class that does not support prefix diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 060a2691e82a..f9a809dae1e1 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -356,6 +356,20 @@ def can_swap(self, return self.get_num_free_blocks( ) - num_touched_blocks >= watermark_blocks + def swap_out(self, blocks: List[Block]) -> None: + for block in blocks: + self.free(block) + + def swap_in(self, blocks: List[Block]) -> None: + for block in blocks: + if block.is_full: + alloc = self.allocate_immutable(block.prev_block, + block.token_ids) + else: + alloc = self.allocate_mutable(block.prev_block) + alloc.append_token_ids(block.token_ids) + block.block_id = alloc.block_id + class PrefixCachingBlock(Block): """A block implementation that supports prefix caching. 
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index c4b414cb1f0b..b3a22fe17089 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -226,39 +226,6 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.fork() - def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> Dict[int, int]: - for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): - block_table = self.block_tables[seq.seq_id] - new_block_table = block_table.swap(destination_device=Device.GPU) - self.block_tables[seq.seq_id] = new_block_table - self.append_slots(seq=seq, num_lookahead_slots=num_lookahead_slots) - - # NOTE: since the memory operation in physical blocks need the - # relative position of CPU block to its starting address, here - # we need to shift the block id of cpu block back to its relative - # position within CPU cache. - mapping = self.block_allocator.get_and_reset_swaps() - block_number_mapping = { - cpu_block_id - self.num_total_gpu_blocks: gpu_block_id - for cpu_block_id, gpu_block_id in mapping.items() - } - return block_number_mapping - - def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]: - mapping: Dict[Block, Block] = {} - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): - block_table = self.block_tables[seq.seq_id] - new_block_table = block_table.swap(destination_device=Device.CPU) - self.block_tables[seq.seq_id] = new_block_table - - mapping = self.block_allocator.get_and_reset_swaps() - block_number_mapping = { - gpu_block_id: cpu_block_id - self.num_total_gpu_blocks - for gpu_block_id, cpu_block_id in mapping.items() - } - return block_number_mapping - def get_num_free_gpu_blocks(self) -> int: return self.block_allocator.get_num_free_blocks(Device.GPU) @@ -273,6 +240,39 @@ def can_swap_in(self, seq_group: SequenceGroup, def can_swap_out(self, seq_group: SequenceGroup) -> bool: return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + def swap_in(self, + sequence_group: SequenceGroup, + num_lookahead_slots: int = 0) -> Dict[int, int]: + blocks = self._get_blocks_for_swap(sequence_group, + SequenceStatus.SWAPPED, + num_lookahead_slots) + self.block_allocator.swap(blocks=blocks, + source_device=Device.CPU, + dest_device=Device.GPU) + # NOTE: Once the BlockManagerV1 implementation is deleted, we can + # move this get_and_reset_swaps call outside of swap_in/swap_out. + # Then the scheduler can make calls to get all swaps and all + # copy-on-writes for the batch. 
+ mapping = self.block_allocator.get_and_reset_swaps() + block_number_mapping = { + cpu_block_id - self.num_total_gpu_blocks: gpu_block_id + for cpu_block_id, gpu_block_id in mapping.items() + } + return block_number_mapping + + def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: + blocks = self._get_blocks_for_swap(sequence_group, + SequenceStatus.RUNNING) + self.block_allocator.swap(blocks=blocks, + source_device=Device.GPU, + dest_device=Device.CPU) + mapping = self.block_allocator.get_and_reset_swaps() + block_number_mapping = { + gpu_block_id: cpu_block_id - self.num_total_gpu_blocks + for gpu_block_id, cpu_block_id in mapping.items() + } + return block_number_mapping + def _can_swap(self, seq_group: SequenceGroup, device: Device, @@ -284,8 +284,10 @@ def _can_swap(self, num_lookahead_slots, watermark_blocks) - def _get_blocks_for_swap(self, seq_group: SequenceGroup, - status: SequenceStatus) -> List[Block]: + def _get_blocks_for_swap(self, + seq_group: SequenceGroup, + status: SequenceStatus, + num_lookahead_slots: int = 0) -> List[Block]: blocks: Dict[int, List[Block]] = {} for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] From b8aee85fe71d5f84798362403a67c0f6829d78cf Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 04:56:47 +0000 Subject: [PATCH 13/32] fix --- vllm/core/block/naive_block.py | 3 --- vllm/core/block/prefix_caching_block.py | 3 --- vllm/core/block_manager_v2.py | 18 +++++++++--------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 90d708c6e40e..f0b75a49abd2 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -93,9 +93,6 @@ def free(self, block: Block) -> None: self._free_block_id(block.block_id) block.block_id = None - def reference(self, block_id: int) -> None: - self._refcounter.incr(block_id) - def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index f9a809dae1e1..4f1bd004e846 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -199,9 +199,6 @@ def _free_block_id_for_block(self, block_id: BlockId, assert block.content_hash in self._cached_blocks self._unused_cached_blocks[block.content_hash] = block_id - def reference(self, block_id: int) -> None: - self._refcounter.incr(block_id) - def fork(self, last_block: Block) -> List[Block]: """Creates a new sequence of blocks that shares the same underlying memory as the original sequence. 
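
At this point in the series, GPU and CPU blocks share one absolute id space inside the device-aware allocator (GPU ids first, then CPU ids), while the cache engine expects ids relative to each device's own cache; that is why the swap_in/swap_out hunks above shift CPU ids by num_total_gpu_blocks. A small sketch of that translation, assuming a 4-block GPU cache; the helper names swap_in_block_number_mapping and swap_out_block_number_mapping are illustrative, not part of the patch.

from typing import Dict

NUM_TOTAL_GPU_BLOCKS = 4  # assumed GPU cache size for this sketch


def swap_in_block_number_mapping(swaps: Dict[int, int]) -> Dict[int, int]:
    # swaps maps absolute CPU block ids to absolute GPU block ids, as
    # returned by get_and_reset_swaps(); the cache engine wants the CPU
    # side expressed relative to the start of the CPU cache.
    return {
        cpu_id - NUM_TOTAL_GPU_BLOCKS: gpu_id
        for cpu_id, gpu_id in swaps.items()
    }


def swap_out_block_number_mapping(swaps: Dict[int, int]) -> Dict[int, int]:
    # Mirror image for swap out: GPU ids are already cache-relative,
    # so only the CPU ids are shifted back.
    return {
        gpu_id: cpu_id - NUM_TOTAL_GPU_BLOCKS
        for gpu_id, cpu_id in swaps.items()
    }


# Absolute CPU block 5 is the second block of the CPU cache when there
# are 4 GPU blocks, so a swap-in into GPU block 1 becomes {1: 1}.
assert swap_in_block_number_mapping({5: 1}) == {1: 1}
assert swap_out_block_number_mapping({1: 5}) == {1: 1}

Later in the series (PATCH 15/32) this arithmetic moves into the allocators as get_device_related_block_id, which keeps the absolute-to-relative translation next to the id bookkeeping instead of in the block manager.
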
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index b3a22fe17089..095ab13ec2ba 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -226,20 +226,11 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: src_block_table = self.block_tables[parent_seq.seq_id] self.block_tables[child_seq.seq_id] = src_block_table.fork() - def get_num_free_gpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.GPU) - - def get_num_free_cpu_blocks(self) -> int: - return self.block_allocator.get_num_free_blocks(Device.CPU) - def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots, self.watermark_blocks) - def can_swap_out(self, seq_group: SequenceGroup) -> bool: - return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) - def swap_in(self, sequence_group: SequenceGroup, num_lookahead_slots: int = 0) -> Dict[int, int]: @@ -260,6 +251,9 @@ def swap_in(self, } return block_number_mapping + def can_swap_out(self, seq_group: SequenceGroup) -> bool: + return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) @@ -273,6 +267,12 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: } return block_number_mapping + def get_num_free_gpu_blocks(self) -> int: + return self.block_allocator.get_num_free_blocks(Device.GPU) + + def get_num_free_cpu_blocks(self) -> int: + return self.block_allocator.get_num_free_blocks(Device.CPU) + def _can_swap(self, seq_group: SequenceGroup, device: Device, From cba0f62e65716794e635a912c217a5d211adc9e4 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 12 Apr 2024 04:59:20 +0000 Subject: [PATCH 14/32] fix --- vllm/core/block/block_table.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index a237390b5aa8..30fd3050f0b4 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -70,8 +70,7 @@ def get_blocks(self) -> Optional[List[Block]]: def allocate(self, token_ids: List[int], - device: Device = Device.GPU, - by_block: bool = False) -> Optional[Block]: + device: Device = Device.GPU) -> None: """Allocates memory blocks for storing the given sequence of token IDs. This method allocates the required number of blocks to store the given @@ -81,23 +80,13 @@ def allocate(self, token_ids (List[int]): The sequence of token IDs to be stored. device (Device, optional): The device on which the blocks should be allocated. Defaults to Device.GPU. - by_block (bool, optional): whether we are allocate block by block. - Set to True when doing cache swapping. Default to False. 
""" - assert not self._is_allocated or by_block assert token_ids - blocks = self._allocate_blocks_for_token_ids(prev_block=None, - token_ids=token_ids, - device=device) - self._num_full_slots += len(token_ids) - if not (by_block and self._is_allocated): - self._blocks = blocks - else: - # Note: whenever we call allocate with by_block set to True, - # because of swapping, the tokens must fit in a block - assert len(blocks) == 1 - self._blocks.append(blocks[0]) - return blocks[0] + assert not self._is_allocated + self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, + token_ids=token_ids, + device=device) + self._num_full_slots = len(token_ids) def append_token_ids(self, token_ids: List[int], From 0430758205644020226cd80dcc519a7c4dc64fb7 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Tue, 30 Apr 2024 18:53:37 +0000 Subject: [PATCH 15/32] doc: adding docstring --- tests/core/block/test_block_manager_v2.py | 7 +- vllm/core/block/block_table.py | 7 +- vllm/core/block/cpu_gpu_block_allocator.py | 59 ++++++++++-- vllm/core/block/naive_block.py | 32 +++--- vllm/core/block/prefix_caching_block.py | 39 ++++++-- vllm/core/block_manager_v1.py | 7 +- vllm/core/block_manager_v2.py | 107 +++++++++++++++++---- 7 files changed, 202 insertions(+), 56 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 67c0789f03b6..bbba018a3f92 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -106,10 +106,13 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("block_size", [8]) @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) -@pytest.mark.parametrize("num_lookahead_slots", [2]) +@pytest.mark.parametrize("num_lookahead_slots", [0, 2]) @pytest.mark.parametrize("enable_caching", [False]) def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching): + """Verify blocks number on src/desc device is correct after swapping in/out + sequence group (not missing or extra blocks). 
+ """ block_manager = BlockSpaceManagerV2(block_size, num_cpu_blocks, num_gpu_blocks, @@ -142,7 +145,7 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, assert block_manager.can_swap_in(seq_group, num_lookahead_slots) before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() - mapping = block_manager.swap_in(seq_group, num_lookahead_slots) + mapping = block_manager.swap_in(seq_group) cpu_blocks = block_manager.get_block_table(prompt) assert list(mapping.keys()) == [cpu_blocks[0]] after_cpu_blocks = block_manager.get_num_free_cpu_blocks() diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index 30fd3050f0b4..9813e6882f0e 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -65,9 +65,6 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: """ return cdiv(len(token_ids), block_size) - def get_blocks(self) -> Optional[List[Block]]: - return self._blocks - def allocate(self, token_ids: List[int], device: Device = Device.GPU) -> None: @@ -252,6 +249,10 @@ def _get_all_token_ids(self) -> List[int]: def _is_allocated(self) -> bool: return self._blocks is not None + @property + def blocks(self) -> Optional[List[Block]]: + return self._blocks + @property def _num_empty_slots(self) -> int: assert self._is_allocated diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 742c5bfc835e..be5a20e0ac15 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -178,8 +178,32 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() + def get_device_related_block_id(self, device: Device, + absolute_id: int) -> int: + """Returns the relative block id on certain device given the absolute + block id. + + Args: + device (Device): The device for which to query relative block id. + absolute_id (int): The absolute block id for the block in + whole allocator. + + Returns: + int: The relative block id on certain device. + """ + return self._allocators[device].get_device_related_block_id( + absolute_id) + def swap(self, blocks: List[Block], source_device: Device, dest_device: Device) -> None: + """Execute the swap for the given blocks from source_device + on to dest_device, and save the swap mapping. + + Args: + blocks: List of blocks to be swapped. + source_device (Device): Device to swap the 'blocks' from. + dest_device (Device): Device to swap the 'blocks' to. + """ source_block_ids = [block.block_id for block in blocks] self._allocators[source_device].swap_out(blocks) self._allocators[dest_device].swap_in(blocks) @@ -189,13 +213,25 @@ def swap(self, blocks: List[Block], source_device: Device, for src, dest in zip(source_block_ids, dest_block_ids) } - def can_swap(self, - blocks: List[Block], - device: Device, - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: - return self._allocators[device].can_swap(blocks, num_lookahead_slots, - watermark_blocks) + def get_num_blocks_touched(self, + blocks: List[Block], + device: Device, + num_lookahead_slots: int = 0) -> int: + """Returns the number of blocks that will be touched by + swapping in/out the given blocks on to the 'device'. + + Args: + blocks: List of blocks to be swapped. + device (Device): Device to swap the 'blocks' on. 
+ num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + int: the number of blocks that will be touched by + swapping in/out the given blocks on to the 'device'. + """ + return self._allocators[device].get_num_blocks_touched( + blocks, num_lookahead_slots) def clear_copy_on_writes(self) -> Dict[int, List[int]]: """Clears the copy-on-write (CoW) state and returns the mapping of @@ -224,7 +260,14 @@ def get_common_computed_block_ids( def all_block_ids(self) -> frozenset[int]: return frozenset(self._block_ids_to_allocator.keys()) - def get_and_reset_swaps(self) -> dict: + def get_and_reset_swaps(self) -> dict[int, int]: + """Returns and clears the mapping of source to destination block IDs. + Will be called after every swapping operations for now, and after every + schedule when BlockManagerV2 become default. + + Returns: + Dict[int, int]: A mapping of source to destination block IDs. + """ mapping = self._swap_mapping.copy() self._swap_mapping.clear() return mapping diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index f0b75a49abd2..0857ed4db978 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Dict, Iterable, List, Optional, Set from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, @@ -143,6 +141,19 @@ def _free_block_id(self, block_id: BlockId) -> None: if refcount == 0: self._free_block_indices.add(block_id) + def get_device_related_block_id(self, absolute_id: int) -> int: + """Returns the relative block id on certain block allocator + given the absolute block id. + + Args: + absolute_id (int): The absolute block id for the block + in whole allocator. + + Returns: + int: The relative block id on certain device. + """ + return sorted(self._all_block_indices).index(absolute_id) + @property def refcounter(self): return self._refcounter @@ -191,11 +202,11 @@ def get_common_computed_block_ids( """ return [] - def can_swap(self, - blocks: List[Block], - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: - """Determine can we swap in/out the given blocks from certain sequence + def get_num_blocks_touched(self, + blocks: List[Block], + num_lookahead_slots: int = 0) -> int: + """Determine the number of blocks that will be touched by + swapping in/out the given blocks from certain sequence group with the provided num_lookahead_slots. Args: @@ -204,8 +215,8 @@ def can_swap(self, out). Returns: - bool: whether the allocator has capacity to accept the swap - with given blocks and num_lookahead_slots. + int: the number of blocks that will be touched by + swapping in/out the given blocks and num_lookahead_slots. 
""" # NOTE: for naive block, we use set to eliminate common blocks among # seqs, also we compare the empty slots in the mutable blocks with @@ -222,8 +233,7 @@ def can_swap(self, else: old_block_set.add(block.block_id) num_touched_blocks = new_block_count + len(old_block_set) - return self.get_num_free_blocks( - ) - num_touched_blocks >= watermark_blocks + return num_touched_blocks def swap_out(self, blocks: List[Block]) -> None: for block in blocks: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 4f1bd004e846..86051448dd5a 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -236,6 +236,19 @@ def get_num_free_blocks(self) -> int: return self._hashless_allocator.get_num_free_blocks() + len( self._unused_cached_blocks) + def get_device_related_block_id(self, absolute_id: int) -> int: + """Returns the relative block id on certain block allocator + given the absolute block id. + + Args: + absolute_id (int): The absolute block id for the block + in whole allocator. + + Returns: + int: The relative block id on certain device. + """ + return sorted(self._all_block_indices).index(absolute_id) + @property def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids @@ -326,9 +339,9 @@ def get_common_computed_block_ids( def can_swap(self, blocks: List[Block], - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: - """Determine can we swap in/out the given blocks from certain sequence + num_lookahead_slots: int = 0) -> int: + """Determine the number of blocks that will be touched by + swapping in/out the given blocks from certain sequence group with the provided num_lookahead_slots. Args: @@ -337,8 +350,8 @@ def can_swap(self, swap out). Returns: - bool: whether the allocator has capacity to accept the swap - with given blocks and num_lookahead_slots. + int: the number of blocks that will be touched by + swapping in/out the given blocks and num_lookahead_slots. """ num_touched_blocks = 0 for block in blocks: @@ -350,14 +363,26 @@ def can_swap(self, else: if not self.is_block_cached(block): num_touched_blocks += 1 - return self.get_num_free_blocks( - ) - num_touched_blocks >= watermark_blocks + return num_touched_blocks def swap_out(self, blocks: List[Block]) -> None: + """Execute the swap out actions. Basically just free the + given blocks. + + Args: + blocks: List of blocks to be swapped out. + """ for block in blocks: self.free(block) def swap_in(self, blocks: List[Block]) -> None: + """Execute the swap int actions. Change the block id from + old allocator to current allocator for each block to finish + the block table update. + + Args: + blocks: List of blocks to be swapped in. + """ for block in blocks: if block.is_full: alloc = self.allocate_immutable(block.prev_block, diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index b2aaeb33c529..6781c03f5251 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -453,12 +453,7 @@ def can_swap_in(self, num_required_blocks = len(blocks) + num_swapped_seqs return num_free_blocks - num_required_blocks >= self.watermark_blocks - def swap_in(self, - seq_group: SequenceGroup, - num_lookahead_slots: int = 0) -> Dict[int, int]: - assert (num_lookahead_slots == 0 - ), "BlockSpaceManagerV1 does not support lookahead allocation" - + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: # CPU block -> GPU block. 
mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {} for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED): diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 095ab13ec2ba..943566e843be 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -228,15 +228,32 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> bool: + """Returns whether we can swap in the given sequence_group + with num_lookahead_slots. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + bool: Whether it's possible to swap in current sequence group. + """ return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, - num_lookahead_slots, self.watermark_blocks) + num_lookahead_slots) + + def swap_in(self, sequence_group: SequenceGroup) -> Dict[int, int]: + """Returns the block id mapping (from CPU to GPU) generated by + swapping in the given sequence_group with num_lookahead_slots. - def swap_in(self, - sequence_group: SequenceGroup, - num_lookahead_slots: int = 0) -> Dict[int, int]: + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + + Returns: + Dict[int, int]: The mapping of swapping block from CPU to GPU. + """ blocks = self._get_blocks_for_swap(sequence_group, - SequenceStatus.SWAPPED, - num_lookahead_slots) + SequenceStatus.SWAPPED) self.block_allocator.swap(blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) @@ -246,15 +263,38 @@ def swap_in(self, # copy-on-writes for the batch. mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - cpu_block_id - self.num_total_gpu_blocks: gpu_block_id + self.block_allocator.get_device_related_block_id( + Device.CPU, cpu_block_id): + self.block_allocator.get_device_related_block_id( + Device.GPU, gpu_block_id) for cpu_block_id, gpu_block_id in mapping.items() } return block_number_mapping def can_swap_out(self, seq_group: SequenceGroup) -> bool: + """Returns whether we can swap out the given sequence_group + with num_lookahead_slots. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + bool: Whether it's possible to swap out current sequence group. + """ return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: + """Returns the block id mapping (from GPU to CPU) generated by + swapping out the given sequence_group with num_lookahead_slots. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + + Returns: + Dict[int, int]: The mapping of swapping block from GPU to CPU. 
+ """ blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) self.block_allocator.swap(blocks=blocks, @@ -262,7 +302,10 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: dest_device=Device.CPU) mapping = self.block_allocator.get_and_reset_swaps() block_number_mapping = { - gpu_block_id: cpu_block_id - self.num_total_gpu_blocks + self.block_allocator.get_device_related_block_id( + Device.GPU, gpu_block_id): + self.block_allocator.get_device_related_block_id( + Device.CPU, cpu_block_id) for gpu_block_id, cpu_block_id in mapping.items() } return block_number_mapping @@ -277,20 +320,46 @@ def _can_swap(self, seq_group: SequenceGroup, device: Device, status: SequenceStatus, - num_lookahead_slots: int = 0, - watermark_blocks: int = 0) -> bool: + num_lookahead_slots: int = 0) -> bool: + """Returns whether we can swap in/out the given sequence_group + on to the 'device'. + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + device (Device): device to swap the 'seq_group' on. + status (SequenceStatus): The status of sequence which is needed + for action. RUNNING for swap out and SWAPPED for swap in + num_lookahead_slots (int): Number of lookahead slots used in + speculative decoding, default to 0. + + Returns: + bool: whether we can swap in/out the given sequence_group + on to the 'device'. + """ blocks = self._get_blocks_for_swap(seq_group, status) - return self.block_allocator.can_swap(blocks, device, - num_lookahead_slots, - watermark_blocks) - - def _get_blocks_for_swap(self, - seq_group: SequenceGroup, - status: SequenceStatus, - num_lookahead_slots: int = 0) -> List[Block]: + num_blocks_touched = self.block_allocator.get_num_blocks_touched( + blocks, device, num_lookahead_slots) + watermark_blocks = 0 + if device == Device.GPU: + watermark_blocks = self.watermark_blocks + return self.block_allocator.get_num_free_blocks( + device) - num_blocks_touched > watermark_blocks + + def _get_blocks_for_swap(self, seq_group: SequenceGroup, + status: SequenceStatus) -> List[Block]: + """Returns the list of blocks those are touched by the seq_group + + Args: + sequence_group (SequenceGroup): The sequence group to swap in. + status (SequenceStatus): The status of sequence which is needed + for action. RUNNING for swap out and SWAPPED for swap in + + Returns: + The list of blocks those are touched by the seq_group. 
+ """ blocks: Dict[int, List[Block]] = {} for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] - blocks[seq.seq_id] = block_table.get_blocks() + blocks[seq.seq_id] = block_table.blocks combined_blocks = list(chain(*blocks.values())) return combined_blocks From fbb30995b25d949852fde7cc45d8251c4d12effc Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 1 May 2024 07:52:19 +0000 Subject: [PATCH 16/32] test: adding e2e correstness test for preemption by swapping --- tests/core/block/e2e/test_correctness.py | 33 +++++++++++++++++++----- vllm/config.py | 22 +++++++++------- vllm/core/scheduler.py | 12 ++++++--- vllm/engine/arg_utils.py | 2 ++ 4 files changed, 49 insertions(+), 20 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 5a7f828456e2..43d25a966c49 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -22,7 +22,13 @@ @pytest.mark.parametrize("baseline_llm_kwargs", [{ "use_v2_block_manager": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "preemption_mode": "swap" +}, { + "use_v2_block_manager": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, @@ -93,7 +99,13 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator, @pytest.mark.parametrize("baseline_llm_kwargs", [{ "use_v2_block_manager": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "preemption_mode": "swap" +}, { + "use_v2_block_manager": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, @@ -177,11 +189,18 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator, }]) @pytest.mark.parametrize( "test_llm_kwargs", - [{ - # We run one test with block_size < lookahead_slots, one test with - # block_size > lookahead_slots - "num_lookahead_slots": 10, - }]) + [ + { + # We run one test with block_size < lookahead_slots, one test with + # block_size > lookahead_slots + "num_lookahead_slots": 10, + "preemption_mode": "swap", + }, + { + "num_lookahead_slots": 10, + "preemption_mode": "recompute", + } + ]) @pytest.mark.parametrize("batch_size", [4]) @pytest.mark.parametrize("seed", [1]) def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, diff --git a/vllm/config.py b/vllm/config.py index eef3fc53c3a6..b84b67dd30ad 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -542,18 +542,19 @@ class SchedulerConfig: prompt latency) before scheduling next prompt. enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. 
+ preemption_mode: Whether to perform preemption by swapping or + recomputation (default) """ - def __init__( - self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - use_v2_block_manager: bool = False, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - ) -> None: + def __init__(self, + max_num_batched_tokens: Optional[int], + max_num_seqs: int, + max_model_len: int, + use_v2_block_manager: bool = False, + num_lookahead_slots: int = 0, + delay_factor: float = 0.0, + enable_chunked_prefill: bool = False, + preemption_mode: Optional[str] = None) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: @@ -566,6 +567,7 @@ def __init__( self.num_lookahead_slots = num_lookahead_slots self.delay_factor = delay_factor self.chunked_prefill_enabled = enable_chunked_prefill + self.preemption_mode = preemption_mode self._verify_args() diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 9d098801233e..da2a291d0397 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -331,12 +331,14 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out) + self._preempt(victim_seq_group, blocks_to_swap_out, + self.scheduler_config.preemption_mode) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out) + self._preempt(seq_group, blocks_to_swap_out, + self.scheduler_config.preemption_mode) preempted.append(seq_group) break else: @@ -538,7 +540,7 @@ def _preempt( self, seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], - preemption_mode: Optional[PreemptionMode] = None, + preemption_mode: Optional[str] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -556,6 +558,10 @@ def _preempt( preemption_mode = PreemptionMode.RECOMPUTE else: preemption_mode = PreemptionMode.SWAP + elif preemption_mode == "swap": + preemption_mode = PreemptionMode.SWAP + else: + preemption_mode = PreemptionMode.RECOMPUTE if preemption_mode == PreemptionMode.RECOMPUTE: self._preempt_by_recompute(seq_group) elif preemption_mode == PreemptionMode.SWAP: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8d61f2f9ff19..44a14286120f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -63,6 +63,7 @@ class EngineArgs: image_feature_size: Optional[int] = None scheduler_delay_factor: float = 0.0 enable_chunked_prefill: bool = False + preemption_mode: Optional[str] = None def __post_init__(self): if self.tokenizer is None: @@ -417,6 +418,7 @@ def create_engine_configs( num_lookahead_slots=self.num_lookahead_slots, delay_factor=self.scheduler_delay_factor, enable_chunked_prefill=self.enable_chunked_prefill, + preemption_mode=self.preemption_mode, ) lora_config = LoRAConfig( max_lora_rank=self.max_lora_rank, From 66a7bbdf9c528bf765469278b0d651f8c5cf59ce Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 1 May 2024 08:25:32 +0000 Subject: [PATCH 17/32] fix --- tests/core/block/test_block_manager_v2.py | 2 +- vllm/core/block/naive_block.py | 5 ++++- vllm/core/block/prefix_caching_block.py | 7 +++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git 
a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index bbba018a3f92..a75b1f80a6e5 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -106,7 +106,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("block_size", [8]) @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) -@pytest.mark.parametrize("num_lookahead_slots", [0, 2]) +@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) @pytest.mark.parametrize("enable_caching", [False]) def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching): diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 0857ed4db978..1ccbec331aea 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -3,6 +3,7 @@ from vllm.core.block.common import (CopyOnWriteTracker, RefCounter, get_all_blocks_recursively) from vllm.core.block.interfaces import Block, BlockAllocator +from vllm.utils import cdiv BlockId = int Refcount = int @@ -229,7 +230,9 @@ def get_num_blocks_touched(self, if block.num_empty_slots >= num_lookahead_slots: new_block_count += 1 else: - new_block_count += 2 + new_block_count += cdiv( + num_lookahead_slots - block.num_empty_slots, + self._block_size) else: old_block_set.add(block.block_id) num_touched_blocks = new_block_count + len(old_block_set) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 86051448dd5a..a1cbc87f54b0 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -9,6 +9,7 @@ get_all_blocks_recursively) from vllm.core.block.interfaces import Block, BlockAllocator from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator +from vllm.utils import cdiv PrefixHash = int BlockId = int @@ -254,7 +255,7 @@ def all_block_ids(self) -> frozenset[int]: return self._hashless_allocator.all_block_ids def is_block_cached(self, block: "PrefixCachingBlock") -> bool: - if block.content_hash not in self._cached_blocks: + if block.content_hash in self._cached_blocks: return True return False @@ -359,7 +360,9 @@ def can_swap(self, if block.num_empty_slots >= num_lookahead_slots: num_touched_blocks += 1 else: - num_touched_blocks += 2 + num_touched_blocks += cdiv( + num_lookahead_slots - block.num_empty_slots, + self._block_size) else: if not self.is_block_cached(block): num_touched_blocks += 1 From 35d391e5f26237be7486d2b5201b3b85a2b54b6e Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Wed, 1 May 2024 08:34:51 +0000 Subject: [PATCH 18/32] remove import for __future__.annotations --- vllm/core/block/cpu_gpu_block_allocator.py | 2 -- vllm/core/block/interfaces.py | 2 -- vllm/core/block/prefix_caching_block.py | 1 - 3 files changed, 5 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index be5a20e0ac15..c09b928bc14e 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Dict, List, Optional from vllm.core.block.interfaces import (Block, BlockAllocator, diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index d463f7b09131..9f466566f096 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from 
abc import ABC, abstractmethod, abstractproperty from typing import Dict, List, Optional, Protocol diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index a1cbc87f54b0..f24952285367 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -1,5 +1,4 @@ """Token blocks.""" -from __future__ import annotations from itertools import takewhile from os.path import commonprefix From 13ab5f543816877fe90c0bf9ed5b5f27ec38003e Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Thu, 2 May 2024 00:24:52 +0000 Subject: [PATCH 19/32] fix: address comments --- vllm/config.py | 6 +++- vllm/core/block/cpu_gpu_block_allocator.py | 35 ++++++++++++++-------- vllm/core/block/naive_block.py | 7 +++-- vllm/core/block/prefix_caching_block.py | 6 ++-- vllm/core/block_manager_v2.py | 33 ++++++++++---------- vllm/core/scheduler.py | 13 ++++---- 6 files changed, 55 insertions(+), 45 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b84b67dd30ad..96ffed8cdf34 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -543,7 +543,11 @@ class SchedulerConfig: enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. preemption_mode: Whether to perform preemption by swapping or - recomputation (default) + recomputation. If not specified, we determine the mode as follows: + We use recomputation by default since it incurs lower overhead than + swapping. However, when the sequence group has multiple sequences + (e.g., beam search), recomputation is not currently supported. In + such a case, we use swapping instead. """ def __init__(self, diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index c09b928bc14e..5b185d8466f5 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -176,10 +176,9 @@ def get_num_free_blocks(self, device: Device) -> int: """ return self._allocators[device].get_num_free_blocks() - def get_device_related_block_id(self, device: Device, - absolute_id: int) -> int: - """Returns the relative block id on certain device given the absolute - block id. + def get_physical_block_id(self, device: Device, absolute_id: int) -> int: + """Returns the zero-offset block id on certain device given the + absolute block id. Args: device (Device): The device for which to query relative block id. @@ -187,29 +186,39 @@ def get_device_related_block_id(self, device: Device, whole allocator. Returns: - int: The relative block id on certain device. + int: The zero-offset block id on certain device. """ - return self._allocators[device].get_device_related_block_id( - absolute_id) + return self._allocators[device].get_physical_block_id(absolute_id) def swap(self, blocks: List[Block], source_device: Device, - dest_device: Device) -> None: + dest_device: Device) -> dict[int, int]: """Execute the swap for the given blocks from source_device - on to dest_device, and save the swap mapping. + on to dest_device, save the current swap mapping and append + them to the accumulated `self._swap_mapping` for each + scheduling move. Args: blocks: List of blocks to be swapped. source_device (Device): Device to swap the 'blocks' from. dest_device (Device): Device to swap the 'blocks' to. + + Returns: + dict[int, int]: Swap mapping from source_device + on to dest_device. 
""" source_block_ids = [block.block_id for block in blocks] self._allocators[source_device].swap_out(blocks) self._allocators[dest_device].swap_in(blocks) dest_block_ids = [block.block_id for block in blocks] - self._swap_mapping = { - src: dest - for src, dest in zip(source_block_ids, dest_block_ids) - } + # self._swap_mapping = { + # src: dest + # for src, dest in zip(source_block_ids, dest_block_ids) + # } + current_swap_mapping = {} + for src, dest in zip(source_block_ids, dest_block_ids): + self._swap_mapping[src] = dest + current_swap_mapping[src] = dest + return current_swap_mapping def get_num_blocks_touched(self, blocks: List[Block], diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py index 1ccbec331aea..7f205d5df008 100644 --- a/vllm/core/block/naive_block.py +++ b/vllm/core/block/naive_block.py @@ -142,8 +142,8 @@ def _free_block_id(self, block_id: BlockId) -> None: if refcount == 0: self._free_block_indices.add(block_id) - def get_device_related_block_id(self, absolute_id: int) -> int: - """Returns the relative block id on certain block allocator + def get_physical_block_id(self, absolute_id: int) -> int: + """Returns the zero-offset block id on certain block allocator given the absolute block id. Args: @@ -151,7 +151,7 @@ def get_device_related_block_id(self, absolute_id: int) -> int: in whole allocator. Returns: - int: The relative block id on certain device. + int: The zero-offset block id on certain device. """ return sorted(self._all_block_indices).index(absolute_id) @@ -225,6 +225,7 @@ def get_num_blocks_touched(self, # needed. old_block_set = set() new_block_count = 0 + # TODO(cade): make sure the logic is correct and clean it up. for block in blocks: if not block.is_full and num_lookahead_slots != 0: if block.num_empty_slots >= num_lookahead_slots: diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index f24952285367..fd90dd02fda8 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -236,8 +236,8 @@ def get_num_free_blocks(self) -> int: return self._hashless_allocator.get_num_free_blocks() + len( self._unused_cached_blocks) - def get_device_related_block_id(self, absolute_id: int) -> int: - """Returns the relative block id on certain block allocator + def get_physical_block_id(self, absolute_id: int) -> int: + """Returns the zero-offset block id on certain block allocator given the absolute block id. Args: @@ -245,7 +245,7 @@ def get_device_related_block_id(self, absolute_id: int) -> int: in whole allocator. Returns: - int: The relative block id on certain device. + int: The rzero-offset block id on certain device. """ return sorted(self._all_block_indices).index(absolute_id) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 943566e843be..82ee336943e3 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -254,20 +254,19 @@ def swap_in(self, sequence_group: SequenceGroup) -> Dict[int, int]: """ blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.SWAPPED) - self.block_allocator.swap(blocks=blocks, - source_device=Device.CPU, - dest_device=Device.GPU) + current_swap_mapping = self.block_allocator.swap( + blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) # NOTE: Once the BlockManagerV1 implementation is deleted, we can # move this get_and_reset_swaps call outside of swap_in/swap_out. 
# Then the scheduler can make calls to get all swaps and all # copy-on-writes for the batch. - mapping = self.block_allocator.get_and_reset_swaps() + block_number_mapping = { - self.block_allocator.get_device_related_block_id( - Device.CPU, cpu_block_id): - self.block_allocator.get_device_related_block_id( - Device.GPU, gpu_block_id) - for cpu_block_id, gpu_block_id in mapping.items() + self.block_allocator.get_physical_block_id(Device.CPU, + cpu_block_id): + self.block_allocator.get_physical_block_id(Device.GPU, + gpu_block_id) + for cpu_block_id, gpu_block_id in current_swap_mapping.items() } return block_number_mapping @@ -297,16 +296,14 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: """ blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) - self.block_allocator.swap(blocks=blocks, - source_device=Device.GPU, - dest_device=Device.CPU) - mapping = self.block_allocator.get_and_reset_swaps() + current_swap_mapping = self.block_allocator.swap( + blocks=blocks, source_device=Device.GPU, dest_device=Device.CPU) block_number_mapping = { - self.block_allocator.get_device_related_block_id( - Device.GPU, gpu_block_id): - self.block_allocator.get_device_related_block_id( - Device.CPU, cpu_block_id) - for gpu_block_id, cpu_block_id in mapping.items() + self.block_allocator.get_physical_block_id(Device.GPU, + gpu_block_id): + self.block_allocator.get_physical_block_id(Device.CPU, + cpu_block_id) + for gpu_block_id, cpu_block_id in current_swap_mapping.items() } return block_number_mapping diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index da2a291d0397..fabe7af42d0f 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -154,6 +154,8 @@ def __init__( self.prev_prompt = False # Latency of the last prompt step self.last_prompt_latency = 0.0 + # preemption mode, RECOMPUTE or SWAP + self.user_specified_preemption_mode = scheduler_config.preemption_mode @property def lora_enabled(self) -> bool: @@ -331,14 +333,12 @@ def _schedule(self) -> SchedulerOutputs: if self.running: # Preempt the lowest-priority sequence groups. victim_seq_group = self.running.pop() - self._preempt(victim_seq_group, blocks_to_swap_out, - self.scheduler_config.preemption_mode) + self._preempt(victim_seq_group, blocks_to_swap_out) preempted.append(victim_seq_group) else: # No other sequence groups can be preempted. # Preempt the current sequence group. - self._preempt(seq_group, blocks_to_swap_out, - self.scheduler_config.preemption_mode) + self._preempt(seq_group, blocks_to_swap_out) preempted.append(seq_group) break else: @@ -540,7 +540,6 @@ def _preempt( self, seq_group: SequenceGroup, blocks_to_swap_out: Dict[int, int], - preemption_mode: Optional[str] = None, ) -> None: # If preemption mode is not specified, we determine the mode as follows: # We use recomputation by default since it incurs lower overhead than @@ -553,12 +552,12 @@ def _preempt( # over sequence groups with a single sequence. # TODO(woosuk): Support recomputation for sequence groups with multiple # sequences. This may require a more sophisticated CUDA kernel. 
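# A minimal, self-contained sketch of the preemption-mode resolution that the
# hunk below installs in Scheduler._preempt. PreemptionMode here is a stand-in
# for the enum in vllm.core.scheduler; treat that and the helper name as
# assumptions of the sketch, not vLLM API.
from enum import Enum
from typing import Optional

class PreemptionMode(Enum):
    SWAP = 1
    RECOMPUTE = 2

def resolve_preemption_mode(user_mode: Optional[str],
                            max_num_running_seqs: int) -> PreemptionMode:
    # No user preference: default to recomputation, except for multi-sequence
    # groups (e.g. beam search), which currently must be swapped out.
    if user_mode is None:
        if max_num_running_seqs == 1:
            return PreemptionMode.RECOMPUTE
        return PreemptionMode.SWAP
    if user_mode == "swap":
        return PreemptionMode.SWAP
    return PreemptionMode.RECOMPUTE

assert resolve_preemption_mode(None, 1) is PreemptionMode.RECOMPUTE
assert resolve_preemption_mode(None, 4) is PreemptionMode.SWAP
assert resolve_preemption_mode("swap", 1) is PreemptionMode.SWAP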
-        if preemption_mode is None:
+        if self.user_specified_preemption_mode is None:
             if seq_group.get_max_num_running_seqs() == 1:
                 preemption_mode = PreemptionMode.RECOMPUTE
             else:
                 preemption_mode = PreemptionMode.SWAP
-        elif preemption_mode == "swap":
+        elif self.user_specified_preemption_mode == "swap":
             preemption_mode = PreemptionMode.SWAP
         else:
             preemption_mode = PreemptionMode.RECOMPUTE
         if preemption_mode == PreemptionMode.RECOMPUTE:
             self._preempt_by_recompute(seq_group)
         elif preemption_mode == PreemptionMode.SWAP:

From a1e228ca08948cde1a55c7c04846682bc184fb42 Mon Sep 17 00:00:00 2001
From: Kaiyang Chen
Date: Thu, 2 May 2024 00:34:03 +0000
Subject: [PATCH 20/32] feat: add preemption mode as a user input arg

---
 vllm/engine/arg_utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 44a14286120f..d365668fb7c1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -55,6 +55,7 @@ class EngineArgs:
     ray_workers_use_nsight: bool = False
     forced_num_gpu_blocks: Optional[int] = None
     num_lookahead_slots: int = 0
+    preemption_mode: Optional[str] = None

     # Related to Vision-language models such as llava
     image_input_type: Optional[str] = None
@@ -63,7 +64,6 @@ class EngineArgs:
     image_feature_size: Optional[int] = None
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
-    preemption_mode: Optional[str] = None

     def __post_init__(self):
         if self.tokenizer is None:
@@ -372,6 +372,13 @@ def add_cli_args(
             default=False,
             help='If True, the prefill requests can be chunked based on the '
             'max_num_batched_tokens')
+        parser.add_argument(
+            '--preemption_mode',
+            type=str,
+            default=None,
+            help='If \'recompute\', the engine performs preemption by '
+            'recomputing; if \'swap\', the engine performs preemption by '
+            'block swapping.')
         return parser

     @classmethod

From 98484194c966c5156732dc49925e99e0485b9221 Mon Sep 17 00:00:00 2001
From: Kaiyang Chen
Date: Thu, 2 May 2024 22:10:08 +0000
Subject: [PATCH 21/32] nit

---
 vllm/core/block/block_table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 9813e6882f0e..1704bd41864d 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -78,8 +78,8 @@ def allocate(self,
             device (Device, optional): The device on which the blocks should be
                 allocated. Defaults to Device.GPU.
""" - assert token_ids assert not self._is_allocated + assert token_ids self._blocks = self._allocate_blocks_for_token_ids(prev_block=None, token_ids=token_ids, device=device) From 170d5a255f61e06f9a7ac8cf4b17762d6d0a4d95 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 04:55:22 +0000 Subject: [PATCH 22/32] fix: format and test --- tests/core/block/e2e/test_correctness.py | 16 +++++++-- tests/core/block/test_block_manager_v2.py | 3 ++ vllm/core/block/cpu_gpu_block_allocator.py | 18 +++++----- vllm/core/block/interfaces.py | 34 ++++++++++++++++++ vllm/core/block/prefix_caching_block.py | 20 ++++------- vllm/core/block_manager_v2.py | 42 +++++++++++++--------- vllm/core/interfaces.py | 3 +- vllm/core/scheduler.py | 4 ++- vllm/engine/arg_utils.py | 5 --- 9 files changed, 96 insertions(+), 49 deletions(-) diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py index 8d03b0520442..c381b2b886f0 100644 --- a/tests/core/block/e2e/test_correctness.py +++ b/tests/core/block/e2e/test_correctness.py @@ -339,7 +339,13 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator, @pytest.mark.parametrize("baseline_llm_kwargs", [{ "use_v2_block_manager": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "use_v2_block_manager": True, + "preemption_mode": "swap" +}, { + "use_v2_block_manager": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( @@ -414,7 +420,13 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption( @pytest.mark.parametrize("baseline_llm_kwargs", [{ "enable_prefix_caching": False }]) -@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}]) +@pytest.mark.parametrize("test_llm_kwargs", [{ + "enable_prefix_caching": True, + "preemption_mode": "swap" +}, { + "enable_prefix_caching": True, + "preemption_mode": "recompute" +}]) @pytest.mark.parametrize("batch_size", [10]) @pytest.mark.parametrize("seed", [1]) def test_auto_prefix_caching_with_preemption(baseline_llm_generator, diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index a75b1f80a6e5..0af35f5c2fa2 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -151,3 +151,6 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) + + +# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level. 
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 7d5b4537b256..f6238b6dc4c7 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -102,7 +102,7 @@ def __init__(self, cpu_block_allocator: BlockAllocator, Device.GPU: gpu_block_allocator, } - self._swap_mapping = {} + self._swap_mapping: Dict[int, int] = {} self._block_ids_to_allocator: Dict[int, BlockAllocator] = {} for _, allocator in self._allocators.items(): for block_id in allocator.all_block_ids: @@ -198,7 +198,7 @@ def get_physical_block_id(self, device: Device, absolute_id: int) -> int: return self._allocators[device].get_physical_block_id(absolute_id) def swap(self, blocks: List[Block], source_device: Device, - dest_device: Device) -> dict[int, int]: + dest_device: Device) -> Dict[int, int]: """Execute the swap for the given blocks from source_device on to dest_device, save the current swap mapping and append them to the accumulated `self._swap_mapping` for each @@ -217,14 +217,12 @@ def swap(self, blocks: List[Block], source_device: Device, self._allocators[source_device].swap_out(blocks) self._allocators[dest_device].swap_in(blocks) dest_block_ids = [block.block_id for block in blocks] - # self._swap_mapping = { - # src: dest - # for src, dest in zip(source_block_ids, dest_block_ids) - # } - current_swap_mapping = {} + + current_swap_mapping: Dict[int, int] = {} for src, dest in zip(source_block_ids, dest_block_ids): - self._swap_mapping[src] = dest - current_swap_mapping[src] = dest + if src is not None and dest is not None: + self._swap_mapping[src] = dest + current_swap_mapping[src] = dest return current_swap_mapping def get_num_blocks_touched(self, @@ -289,7 +287,7 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: raise NotImplementedError - def get_and_reset_swaps(self) -> dict[int, int]: + def get_and_reset_swaps(self) -> Dict[int, int]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every schedule when BlockManagerV2 become default. 
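The bookkeeping these allocator hunks converge on is that each swap() call records source-to-destination ids into an accumulator, while get_and_reset_swaps() drains it. A small standalone toy of that behavior (names are illustrative only, not vLLM API):

    from typing import Dict, List, Tuple

    class SwapMappingTracker:
        def __init__(self) -> None:
            self._swap_mapping: Dict[int, int] = {}

        def record_swap(self, src_ids: List[int],
                        dst_ids: List[int]) -> Dict[int, int]:
            # Each swap returns its own mapping and also appends it to the
            # accumulated mapping, as the swap() docstring above describes.
            current: Dict[int, int] = {}
            for src, dst in zip(src_ids, dst_ids):
                self._swap_mapping[src] = dst
                current[src] = dst
            return current

        def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
            mapping = list(self._swap_mapping.items())
            self._swap_mapping.clear()
            return mapping

    tracker = SwapMappingTracker()
    tracker.record_swap([0, 1], [7, 8])
    tracker.record_swap([2], [9])
    assert tracker.get_and_reset_swaps() == [(0, 7), (1, 8), (2, 9)]
    assert tracker.get_and_reset_swaps() == []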
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 634c4016ca19..b42d0529446b 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -116,6 +116,18 @@ def get_num_total_blocks(self) -> int: def get_num_free_blocks(self) -> int: pass + @abstractmethod + def get_physical_block_id(self, absolute_id: int) -> int: + pass + + @abstractmethod + def swap_out(self, blocks: List[Block]) -> None: + pass + + @abstractmethod + def swap_in(self, blocks: List[Block]) -> None: + pass + @property @abstractmethod def all_block_ids(self) -> FrozenSet[int]: @@ -149,6 +161,12 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: """NOTE: This should not be used besides Block""" pass + @abstractmethod + def get_num_blocks_touched(self, + blocks: List[Block], + num_lookahead_slots: int = 0) -> int: + pass + class NoFreeBlocksError(ValueError): pass @@ -203,3 +221,19 @@ def mark_blocks_as_computed(self, block_ids: List[int]) -> None: def get_common_computed_block_ids( self, seq_block_ids: List[List[int]]) -> List[int]: pass + + @abstractmethod + def get_num_blocks_touched(self, + blocks: List[Block], + device: Device, + num_lookahead_slots: int = 0) -> int: + pass + + @abstractmethod + def swap(self, blocks: List[Block], source_device: Device, + dest_device: Device) -> Dict[int, int]: + pass + + @abstractmethod + def get_physical_block_id(self, device: Device, absolute_id: int) -> int: + pass \ No newline at end of file diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index 372e0809099e..34f9ad6d53a7 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -289,11 +289,6 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int: def get_num_total_blocks(self) -> int: return self._hashless_allocator.get_num_total_blocks() - def get_num_free_blocks(self) -> int: - # The number of free blocks is the number of hashless free - # blocks plus the number of hashful blocks that are unused. - return self._hashless_allocator.get_num_free_blocks() + len( - self._unused_cached_blocks) def get_physical_block_id(self, absolute_id: int) -> int: """Returns the zero-offset block id on certain block allocator @@ -306,20 +301,19 @@ def get_physical_block_id(self, absolute_id: int) -> int: Returns: int: The rzero-offset block id on certain device. """ - return sorted(self._all_block_indices).index(absolute_id) + return sorted(self.all_block_ids).index(absolute_id) @property def all_block_ids(self) -> FrozenSet[int]: return self._hashless_allocator.all_block_ids - def promote_to_immutable_block(self, block: Block) -> BlockId: - def is_block_cached(self, block: "PrefixCachingBlock") -> bool: + def is_block_cached(self, block: Block) -> bool: + assert block.content_hash is not None if block.content_hash in self._cached_blocks: return True return False - def promote_to_immutable_block(self, - block: "PrefixCachingBlock") -> BlockId: + def promote_to_immutable_block(self, block: Block) -> BlockId: """Once a mutable block is full, it can be promoted to an immutable block. This means that its content can be referenced by future blocks having the same prefix. 
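get_physical_block_id, as implemented above, is just a rank lookup of the absolute id inside the allocator's own id set. A minimal standalone sketch of that lookup:

    from typing import FrozenSet

    def get_physical_block_id(all_block_ids: FrozenSet[int],
                              absolute_id: int) -> int:
        # Zero-offset position of the absolute id within this allocator's id
        # range, mirroring the sorted-index lookup used in the hunk above.
        return sorted(all_block_ids).index(absolute_id)

    # If the CPU allocator owns absolute ids 4..7, absolute id 6 is physical
    # block 2 on that device.
    assert get_physical_block_id(frozenset({4, 5, 6, 7}), 6) == 2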
@@ -431,9 +425,9 @@ def get_common_computed_block_ids( if ids != [] ]) - def can_swap(self, - blocks: List[Block], - num_lookahead_slots: int = 0) -> int: + def get_num_blocks_touched(self, + blocks: List[Block], + num_lookahead_slots: int = 0) -> int: """Determine the number of blocks that will be touched by swapping in/out the given blocks from certain sequence group with the provided num_lookahead_slots. diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index d04cfa3fec40..0a7f23e55246 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -239,8 +239,8 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None: self.block_tables[child_seq.seq_id] = src_block_table.fork() def can_swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> bool: - """Returns whether we can swap in the given sequence_group + num_lookahead_slots: int) -> AllocStatus: + """Returns the AllocStatus for the given sequence_group with num_lookahead_slots. Args: @@ -249,23 +249,22 @@ def can_swap_in(self, seq_group: SequenceGroup, speculative decoding, default to 0. Returns: - bool: Whether it's possible to swap in current sequence group. + AllocStatus: The AllocStatus for the given sequence group. """ return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots) - def swap_in(self, sequence_group: SequenceGroup) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: """Returns the block id mapping (from CPU to GPU) generated by - swapping in the given sequence_group with num_lookahead_slots. + swapping in the given seq_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap in. Returns: Dict[int, int]: The mapping of swapping block from CPU to GPU. """ - blocks = self._get_blocks_for_swap(sequence_group, - SequenceStatus.SWAPPED) + blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED) current_swap_mapping = self.block_allocator.swap( blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) # NOTE: Once the BlockManagerV1 implementation is deleted, we can @@ -287,14 +286,18 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap in. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. Returns: bool: Whether it's possible to swap out current sequence group. """ - return self._can_swap(seq_group, Device.CPU, SequenceStatus.RUNNING) + alloc_status = self._can_swap(seq_group, Device.CPU, + SequenceStatus.RUNNING) + if alloc_status == AllocStatus.OK: + return True + return False def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: """Returns the block id mapping (from GPU to CPU) generated by @@ -329,8 +332,8 @@ def _can_swap(self, seq_group: SequenceGroup, device: Device, status: SequenceStatus, - num_lookahead_slots: int = 0) -> bool: - """Returns whether we can swap in/out the given sequence_group + num_lookahead_slots: int = 0) -> AllocStatus: + """Returns the AllocStatus for swapping in/out the given sequence_group on to the 'device'. Args: @@ -342,7 +345,7 @@ def _can_swap(self, speculative decoding, default to 0. 
Returns: - bool: whether we can swap in/out the given sequence_group + AllocStatus: The AllocStatus for swapping in/out the given sequence_group on to the 'device'. """ blocks = self._get_blocks_for_swap(seq_group, status) @@ -351,8 +354,14 @@ def _can_swap(self, watermark_blocks = 0 if device == Device.GPU: watermark_blocks = self.watermark_blocks - return self.block_allocator.get_num_free_blocks( - device) - num_blocks_touched > watermark_blocks + if self.block_allocator.get_num_total_blocks( + device) < num_blocks_touched: + return AllocStatus.NEVER + elif self.block_allocator.get_num_free_blocks( + device) - num_blocks_touched >= watermark_blocks: + return AllocStatus.OK + else: + return AllocStatus.LATER def _get_blocks_for_swap(self, seq_group: SequenceGroup, status: SequenceStatus) -> List[Block]: @@ -369,6 +378,7 @@ def _get_blocks_for_swap(self, seq_group: SequenceGroup, blocks: Dict[int, List[Block]] = {} for seq in seq_group.get_seqs(status=status): block_table = self.block_tables[seq.seq_id] - blocks[seq.seq_id] = block_table.blocks + if block_table.blocks is not None: + blocks[seq.seq_id] = block_table.blocks combined_blocks = list(chain(*blocks.values())) return combined_blocks diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 09ccaddb6261..26b6f492168b 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -67,8 +67,7 @@ def can_swap_in(self, seq_group: SequenceGroup, pass @abstractmethod - def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: pass @abstractmethod diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 322489fe9354..aee77d2d7a17 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -523,7 +523,9 @@ def _schedule_swapped( seq_group = swapped_queue[0] # If the sequence group cannot be swapped in, stop. - alloc_status = self.block_manager.can_swap_in(seq_group) + is_prefill = seq_group.is_prefill() + alloc_status = self.block_manager.can_swap_in( + seq_group, self._get_num_lookahead_slots(is_prefill)) if alloc_status == AllocStatus.LATER: break elif alloc_status == AllocStatus.NEVER: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 7973f9a6eee1..f1fe84297bd5 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -474,11 +474,6 @@ def add_cli_args( 'corresponding to the chosen load_format. ' 'This should be a JSON string that will be ' 'parsed into a dictionary.') - - type=bool, - default=False, - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') parser.add_argument( '--preemption_mode', type=str, From c7a3484324505e25bb8463e0747ee0a1f9b4d710 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 05:00:49 +0000 Subject: [PATCH 23/32] fix: ruff --- vllm/core/block_manager_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0a7f23e55246..4b4f823d6a96 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -345,8 +345,8 @@ def _can_swap(self, speculative decoding, default to 0. Returns: - AllocStatus: The AllocStatus for swapping in/out the given sequence_group - on to the 'device'. + AllocStatus: The AllocStatus for swapping in/out the given + sequence_group on to the 'device'. 
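# The _can_swap decision shown above reduces to a three-way threshold test; a
# minimal standalone sketch, with AllocStatus standing in for the enum from
# vllm.core.interfaces (an assumption of the sketch):
from enum import Enum

class AllocStatus(Enum):
    OK = 1
    LATER = 2
    NEVER = 3

def can_swap_status(num_total_blocks: int, num_free_blocks: int,
                    num_blocks_touched: int,
                    watermark_blocks: int) -> AllocStatus:
    # Never possible if the device cannot hold the touched blocks at all;
    # OK if they fit while staying above the watermark; otherwise retry later.
    if num_total_blocks < num_blocks_touched:
        return AllocStatus.NEVER
    if num_free_blocks - num_blocks_touched >= watermark_blocks:
        return AllocStatus.OK
    return AllocStatus.LATER

assert can_swap_status(16, 10, 4, 2) is AllocStatus.OK
assert can_swap_status(16, 5, 4, 2) is AllocStatus.LATER
assert can_swap_status(3, 3, 4, 0) is AllocStatus.NEVER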
""" blocks = self._get_blocks_for_swap(seq_group, status) num_blocks_touched = self.block_allocator.get_num_blocks_touched( From c252294474bcbfca12a0f8f5355ace088b36bbcc Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 06:11:10 +0000 Subject: [PATCH 24/32] test: add enable_cache=True for test_swap --- tests/core/block/test_block_manager_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 0af35f5c2fa2..3aa9960cc157 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -107,7 +107,7 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append, @pytest.mark.parametrize("num_cpu_blocks", [4]) @pytest.mark.parametrize("num_gpu_blocks", [4]) @pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10]) -@pytest.mark.parametrize("enable_caching", [False]) +@pytest.mark.parametrize("enable_caching", [False, True]) def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, enable_caching): """Verify blocks number on src/desc device is correct after swapping in/out From 880b8555e8012032bab013cbe74b9833a3586e72 Mon Sep 17 00:00:00 2001 From: Kaiyang Chen Date: Fri, 3 May 2024 09:01:21 +0000 Subject: [PATCH 25/32] nit --- vllm/core/block/cpu_gpu_block_allocator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index f6238b6dc4c7..5eb206b24c8c 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -240,7 +240,7 @@ def get_num_blocks_touched(self, Returns: int: the number of blocks that will be touched by - swapping in/out the given blocks on to the 'device'. + swapping in/out the given blocks on to the 'device'. """ return self._allocators[device].get_num_blocks_touched( blocks, num_lookahead_slots) From fe13a919646eba82749125deefe6d133a848b5ed Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 15:54:20 +0800 Subject: [PATCH 26/32] fix --- vllm/core/block/cpu_gpu_block_allocator.py | 2 +- vllm/core/block/interfaces.py | 2 +- vllm/core/block_manager_v2.py | 16 +++++++--------- vllm/core/interfaces.py | 1 + vllm/engine/arg_utils.py | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index f97d408a1bc0..7391d8179e1d 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -210,7 +210,7 @@ def swap(self, blocks: List[Block], source_device: Device, dest_device (Device): Device to swap the 'blocks' to. Returns: - dict[int, int]: Swap mapping from source_device + Dict[int, int]: Swap mapping from source_device on to dest_device. 
""" source_block_ids = [block.block_id for block in blocks] diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index c2f2ddaa29ee..7780d2d80bc9 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import FrozenSet, List, Optional, Protocol, Tuple +from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple from vllm.utils import Device diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index d6b55348980d..4d6dec3f467e 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -255,7 +255,7 @@ def can_swap_in(self, seq_group: SequenceGroup, return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED, num_lookahead_slots) - def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from CPU to GPU) generated by swapping in the given seq_group with num_lookahead_slots. @@ -263,15 +263,12 @@ def swap_in(self, seq_group: SequenceGroup) -> Dict[int, int]: seq_group (SequenceGroup): The sequence group to swap in. Returns: - Dict[int, int]: The mapping of swapping block from CPU to GPU. + List[Tuple[int, int]]: The mapping of swapping block from CPU + to GPU. """ blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED) current_swap_mapping = self.block_allocator.swap( blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU) - # NOTE: Once the BlockManagerV1 implementation is deleted, we can - # move this get_and_reset_swaps call outside of swap_in/swap_out. - # Then the scheduler can make calls to get all swaps and all - # copy-on-writes for the batch. block_number_mapping = { self.block_allocator.get_physical_block_id(Device.CPU, @@ -301,7 +298,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: return True return False - def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: + def swap_out(self, sequence_group: SequenceGroup) -> List[Tuple[int, int]]: """Returns the block id mapping (from GPU to CPU) generated by swapping out the given sequence_group with num_lookahead_slots. @@ -309,7 +306,8 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: sequence_group (SequenceGroup): The sequence group to swap in. Returns: - Dict[int, int]: The mapping of swapping block from GPU to CPU. + List[Tuple[int, int]]: The mapping of swapping block from + GPU to CPU. 
""" blocks = self._get_blocks_for_swap(sequence_group, SequenceStatus.RUNNING) @@ -322,7 +320,7 @@ def swap_out(self, sequence_group: SequenceGroup) -> Dict[int, int]: cpu_block_id) for gpu_block_id, cpu_block_id in current_swap_mapping.items() } - # convert to list of tuples once here + # convert to list of tuples once here return list(block_number_mapping.items()) def get_num_free_gpu_blocks(self) -> int: diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 3de7b78259e7..034f340ad78b 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -69,6 +69,7 @@ def can_swap_in(self, seq_group: SequenceGroup, @abstractmethod def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + pass @abstractmethod def can_swap_out(self, seq_group: SequenceGroup) -> bool: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 100df8b89ce1..2d2ac40a7ceb 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -505,7 +505,7 @@ def add_cli_args( help='If \'recompute\', the engine performs preemption by block ' 'swapping; If \'swap\', the engine performs preemption by block ' 'swapping.') - + parser.add_argument( "--served-model-name", nargs="+", From 773d3318cc4340a8107aefe07e5f475354678c87 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 16:28:07 +0800 Subject: [PATCH 27/32] fix: test --- tests/core/block/test_block_manager_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 3aa9960cc157..a872fe995dd3 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -134,7 +134,8 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, before_cpu_blocks = block_manager.get_num_free_cpu_blocks() before_gpu_blocks = block_manager.get_num_free_gpu_blocks() mapping = block_manager.swap_out(seq_group) - assert list(mapping.keys()) == gpu_blocks + mapping_keys = [key for key, _ in mapping] + assert mapping_keys == gpu_blocks after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks) @@ -147,7 +148,8 @@ def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots, before_gpu_blocks = block_manager.get_num_free_gpu_blocks() mapping = block_manager.swap_in(seq_group) cpu_blocks = block_manager.get_block_table(prompt) - assert list(mapping.keys()) == [cpu_blocks[0]] + mapping_keys = [key for key, _ in mapping] + assert mapping_keys == [cpu_blocks[0]] after_cpu_blocks = block_manager.get_num_free_cpu_blocks() after_gpu_blocks = block_manager.get_num_free_gpu_blocks() assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks) From 37d9b3122a5ef7009bb7dca38ef40f3e1bd2fff9 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 18:11:58 +0800 Subject: [PATCH 28/32] test: retry ci tests --- vllm/core/block/cpu_gpu_block_allocator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 7391d8179e1d..20e2e2340f91 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -287,14 +287,14 @@ def promote_to_immutable_block(self, block: Block) -> BlockId: def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: raise 
NotImplementedError - def get_and_reset_swaps(self) -> Dict[int, int]: + def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every schedule when BlockManagerV2 become default. Returns: - Dict[int, int]: A mapping of source to destination block IDs. + List[Tuple[int, int]]: A mapping of source to destination block IDs. """ mapping = self._swap_mapping.copy() self._swap_mapping.clear() - return mapping + return list(mapping.items()) From a2f1df38e64e12838c68f85fc182aba079ba3110 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Fri, 10 May 2024 19:54:07 +0800 Subject: [PATCH 29/32] retry --- vllm/core/block/cpu_gpu_block_allocator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py index 20e2e2340f91..07ecab256cd8 100644 --- a/vllm/core/block/cpu_gpu_block_allocator.py +++ b/vllm/core/block/cpu_gpu_block_allocator.py @@ -290,7 +290,7 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]: def get_and_reset_swaps(self) -> List[Tuple[int, int]]: """Returns and clears the mapping of source to destination block IDs. Will be called after every swapping operations for now, and after every - schedule when BlockManagerV2 become default. + schedule when BlockManagerV2 become default. Currently not useful. Returns: List[Tuple[int, int]]: A mapping of source to destination block IDs. From 216eb7643f4c52b658518ea9425efcc0bd66b79f Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Mon, 13 May 2024 22:05:22 +0800 Subject: [PATCH 30/32] merge --- vllm/config.py | 21 ++++++++++----------- vllm/core/embedding_model_block_manager.py | 3 +-- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 5f51e9b0dd9e..bb91fca2e518 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -608,17 +608,16 @@ class SchedulerConfig: such a case, we use swapping instead. 
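# A hypothetical construction of SchedulerConfig matching the __init__
# signature below; the numeric values are illustrative only.
from vllm.config import SchedulerConfig

scheduler_config = SchedulerConfig(max_num_batched_tokens=4096,
                                   max_num_seqs=128,
                                   max_model_len=2048,
                                   use_v2_block_manager=True,
                                   preemption_mode="swap")
assert scheduler_config.preemption_mode == "swap"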
""" - def __init__( - self, - max_num_batched_tokens: Optional[int], - max_num_seqs: int, - max_model_len: int, - use_v2_block_manager: bool = False, - num_lookahead_slots: int = 0, - delay_factor: float = 0.0, - enable_chunked_prefill: bool = False, - embedding_mode: Optional[bool] = False, - preemption_mode: Optional[str] = None) -> None: + def __init__(self, + max_num_batched_tokens: Optional[int], + max_num_seqs: int, + max_model_len: int, + use_v2_block_manager: bool = False, + num_lookahead_slots: int = 0, + delay_factor: float = 0.0, + enable_chunked_prefill: bool = False, + embedding_mode: Optional[bool] = False, + preemption_mode: Optional[str] = None) -> None: if max_num_batched_tokens is not None: self.max_num_batched_tokens = max_num_batched_tokens else: diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py index a09d79ec3c42..f2d67306d7ce 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -46,8 +46,7 @@ def can_swap_in(self, seq_group: SequenceGroup, num_lookahead_slots: int) -> AllocStatus: return AllocStatus.OK - def swap_in(self, seq_group: SequenceGroup, - num_lookahead_slots: int) -> List[Tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: return None # type: ignore def can_swap_out(self, seq_group: SequenceGroup) -> bool: From 862a5d46b5669088af2f758d466889a8d3fdd076 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Tue, 28 May 2024 21:29:57 +0800 Subject: [PATCH 31/32] fix: ci --- format.sh | 2 +- vllm/core/block/interfaces.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/format.sh b/format.sh index aaec25a8aa0d..c7ac2a56ce11 100755 --- a/format.sh +++ b/format.sh @@ -114,7 +114,7 @@ mypy vllm/model_executor --config-file pyproject.toml CODESPELL_EXCLUDES=( - '--skip' '*docs/source/_build/**,./tests/lora/data' + '--skip' '*docs/source/_build/**,*tests/lora/data/**' ) # check spelling of specified files diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py index 1c1e7c4c6862..4b20856a1b42 100644 --- a/vllm/core/block/interfaces.py +++ b/vllm/core/block/interfaces.py @@ -238,6 +238,7 @@ def swap(self, blocks: List[Block], source_device: Device, def get_physical_block_id(self, device: Device, absolute_id: int) -> int: pass + @abstractmethod def allocate_or_get_null_block(self) -> Block: """ Null blocks are used as a placeholders for KV cache blocks that have From 29df09262c4e86d0684440b03ef4b5946e45af18 Mon Sep 17 00:00:00 2001 From: Kaiyang-Chen Date: Thu, 30 May 2024 22:18:58 +0800 Subject: [PATCH 32/32] fix: merge --- tests/core/block/test_block_manager_v2.py | 3 ++- vllm/core/block_manager_v1.py | 2 +- vllm/core/block_manager_v2.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py index 882dcba3f0d6..d0ca09c4be0d 100644 --- a/tests/core/block/test_block_manager_v2.py +++ b/tests/core/block/test_block_manager_v2.py @@ -7,7 +7,8 @@ from vllm.sequence import Logprob, SequenceStatus from vllm.utils import chunk_list -from ..utils import create_seq_group, create_seq_group_encoder_decoder, create_dummy_prompt +from ..utils import (create_dummy_prompt, create_seq_group, + create_seq_group_encoder_decoder) @pytest.mark.parametrize("block_size", [16]) diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index decfac8dcead..4010aaf02b82 100644 --- 
a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -541,7 +541,7 @@ def _swap_block_table( return new_block_table - def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: + def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: request_id = seq_group.request_id diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 0541572d497f..121092cf189b 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -6,8 +6,8 @@ from vllm.core.block.block_table import BlockTable from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator -from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.block.interfaces import Block +from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.sequence import Sequence, SequenceGroup, SequenceStatus from vllm.utils import Device
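Taken together, swap_in and swap_out now report their result as a list of (source, destination) physical block numbers. A standalone toy of the final rebasing step, in plain Python rather than vLLM API:

    from typing import Dict, List, Tuple

    def to_physical_mapping(swap_mapping: Dict[int, int],
                            source_ids: List[int],
                            dest_ids: List[int]) -> List[Tuple[int, int]]:
        # Rebase absolute allocator ids to zero-offset ids on each device, the
        # way swap_in/swap_out convert the allocator's mapping before handing
        # it to the scheduler as a list of tuples.
        src_sorted = sorted(source_ids)
        dst_sorted = sorted(dest_ids)
        return [(src_sorted.index(src), dst_sorted.index(dst))
                for src, dst in swap_mapping.items()]

    # CPU allocator owns absolute ids 4..7, GPU allocator owns 0..3: swapping
    # CPU block 5 into GPU block 2 is reported to the worker as (1, 2).
    assert to_physical_mapping({5: 2}, source_ids=[4, 5, 6, 7],
                               dest_ids=[0, 1, 2, 3]) == [(1, 2)]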