From 2dd0759df4a18f20de41b7d21789895ff4ed2d05 Mon Sep 17 00:00:00 2001 From: Stanislaw Wozniak Date: Wed, 6 May 2026 03:40:18 -0400 Subject: [PATCH 1/3] Optimized LRU logic Signed-off-by: Stanislaw Wozniak --- vllm/v1/core/block_pool.py | 20 ++++++++++++++------ vllm/v1/core/kv_cache_utils.py | 27 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 9097079ef33a..74458390502d 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -413,13 +413,21 @@ def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None: ordered_blocks: A list of blocks to free ordered by their eviction priority. """ - # Materialize the iterable to allow multiple passes. - blocks_list = list(ordered_blocks) - for block in blocks_list: + # Identify blocks with hash (LRU cache) and blocks without hash (will never match in APC) + blocks_with_hash = [] + blocks_without_hash = [] + for block in ordered_blocks: block.ref_cnt -= 1 - self.free_block_queue.append_n( - [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null] - ) + if block.ref_cnt == 0 and not block.is_null: + if block.block_hash is None: + blocks_without_hash.append(block) + else: + blocks_with_hash.append(block) + + # Allow immediate reallocation of blocks without hash + self.free_block_queue.prepend_n(blocks_without_hash) + # Append to LRU queue blocks for potential reuse + self.free_block_queue.append_n(blocks_with_hash) def evict_blocks(self, block_ids: set[int]) -> None: """evict blocks from the prefix cache by their block IDs. 
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index b57e10b67faa..79b4eefbb3bc 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -349,6 +349,33 @@ def append_n(self, blocks: list[KVCacheBlock]) -> None: self.num_free_blocks += len(blocks) + def prepend_n(self, blocks: list[KVCacheBlock]) -> None: + """Put a list of blocks at the head of the free list (for immediate reuse). + + Args: + blocks: The blocks to prepend. + """ + if len(blocks) == 0: + return + + first_block = self.fake_free_list_head.next_free_block + assert first_block is not None, ( + "next_free_block of fake_free_list_head should always exist" + ) + + # Add inter-connections between consecutive blocks + prev_block = self.fake_free_list_head + for block in blocks: + block.prev_free_block = prev_block + prev_block.next_free_block = block + prev_block = block + + # Connect the last block of `blocks` to the original first block + prev_block.next_free_block = first_block + first_block.prev_free_block = prev_block + + self.num_free_blocks += len(blocks) + def get_all_free_blocks(self) -> list[KVCacheBlock]: """Get all free blocks in the free list. Mainly used for testing. From 1c3ca429de9819deccf380c637cbb63da0b8bffd Mon Sep 17 00:00:00 2001 From: Stanislaw Wozniak Date: Thu, 14 May 2026 12:22:14 -0400 Subject: [PATCH 2/3] Pre-commit fixes Signed-off-by: Stanislaw Wozniak --- vllm/v1/core/block_pool.py | 6 +++--- vllm/v1/core/kv_cache_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 74458390502d..c45f5b3765f5 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -413,17 +413,17 @@ def free_blocks(self, ordered_blocks: Iterable[KVCacheBlock]) -> None: ordered_blocks: A list of blocks to free ordered by their eviction priority. 
""" - # Identify blocks with hash (LRU cache) and blocks without hash (will never match in APC) + # Identify blocks with hash (LRU cache) and without it (will never match in APC) blocks_with_hash = [] blocks_without_hash = [] for block in ordered_blocks: block.ref_cnt -= 1 if block.ref_cnt == 0 and not block.is_null: if block.block_hash is None: - blocks_without_hash.append(block) + blocks_without_hash.append(block) else: blocks_with_hash.append(block) - + # Allow immediate reallocation of blocks without hash self.free_block_queue.prepend_n(blocks_without_hash) # Append to LRU queue blocks for potential reuse diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 79b4eefbb3bc..9b480ff988e7 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -362,7 +362,7 @@ def prepend_n(self, blocks: list[KVCacheBlock]) -> None: assert first_block is not None, ( "next_free_block of fake_free_list_head should always exist" ) - + # Add inter-connections between consecutive blocks prev_block = self.fake_free_list_head for block in blocks: From 28c3805363e951dc6745545a5de5930f75314a28 Mon Sep 17 00:00:00 2001 From: Stanislaw Wozniak Date: Wed, 10 Jun 2026 11:39:09 -0400 Subject: [PATCH 3/3] Comments cleanup Signed-off-by: Stanislaw Wozniak --- vllm/v1/core/block_pool.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index 52af8fabcae3..7fb5dbd1e75e 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -439,13 +439,12 @@ def free_blocks( else: blocks_with_hash.append(block) - # Append to LRU queue based on priority passed via parameter if prepend: self.free_block_queue.prepend_n(blocks_with_hash) else: self.free_block_queue.append_n(blocks_with_hash) - # Always allow immediate reallocation of blocks without hash + # Blocks without hash always get evicted first - prepend them last to the head 
self.free_block_queue.prepend_n(blocks_without_hash) def evict_blocks(self, block_ids: set[int]) -> None: