vllm/v1/core/kv_cache_manager.py (8 additions & 3 deletions)
@@ -164,7 +164,8 @@ def allocate_slots(
         self,
         request: Request,
         num_tokens: int,
-        new_computed_blocks: Optional[list[KVCacheBlock]] = None
+        new_computed_blocks: Optional[list[KVCacheBlock]] = None,
+        num_spec_tokens: int = 0,

Collaborator:
I have two points to discuss:

  1. Should we use "num_lookahead_tokens" to reduce confusion? After all, these slots are for the proposed tokens that will be verified in the next step.
  2. Should we consider these slots together with the preallocated blocks? Specifically, if the preallocated blocks can already cover the spec tokens, do we even need to allocate additional slots?

Contributor:
I have seen the term lookahead_tokens before. Can you share why it is more general than spec_tokens? Is it because it can also cover jump tokens?

Collaborator:
No, jump tokens should be counted in new_tokens. I just feel num_spec_tokens is confusing because here it means the spec tokens we're going to propose by the end of this step, while Request also has spec_tokens, and those were generated by the previous step for verification.

Collaborator:
+1 to @comaniac; I have the same two questions.

Collaborator Author:
  1. I am good with num_lookahead_tokens, will change it here.
  2. Yeah, sure, we can do it in a more conservative way:
     preallocated_blocks -= num_lookahead_tokens // block_size

Contributor:
    preallocated_blocks -= num_lookahead_tokens // block_size

We might have to revert this once the number of draft tokens becomes large, especially with tree attention, since then the number of draft tokens is roughly equal to the number of preallocated tokens, which would lead to frequent block allocations.

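For concreteness, a minimal sketch of the conservative adjustment discussed above; the function name and variables are illustrative stand-ins, not vLLM's actual code:

    # Sketch only: shrink the preallocation by however many whole blocks the
    # lookahead (spec) tokens will consume, so the total reservation stays flat.
    def adjusted_preallocation(num_preallocated_blocks: int,
                               num_lookahead_tokens: int,
                               block_size: int) -> int:
        reduction = num_lookahead_tokens // block_size
        return max(0, num_preallocated_blocks - reduction)

    # With 16-token blocks and 4 lookahead tokens, nothing is trimmed yet; the
    # reduction only kicks in once the lookahead spans whole blocks, which is
    # the regime the comment above is worried about.
    assert adjusted_preallocation(4, 4, 16) == 4
    assert adjusted_preallocation(4, 32, 16) == 2
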
     ) -> Optional[list[KVCacheBlock]]:
         """Add slots for a request with new tokens to append.

@@ -174,6 +175,9 @@ def allocate_slots(
                 not include the tokens that have already been computed.
             new_computed_blocks: A list of new computed blocks just hitting the
                 prefix caching.
+            num_spec_tokens: The number of speculative tokens to allocate.
+                This field is only used by EAGLE; we allocate these slots for
+                the proposer heads.

         Blocks layout:
         -----------------------------------------------------------------------
@@ -211,8 +215,9 @@ def allocate_slots(
         # the new prefix caching hits
         num_computed_tokens = (request.num_computed_tokens +
                                len(new_computed_blocks) * self.block_size)
-        num_required_blocks = cdiv(num_computed_tokens + num_tokens,
-                                   self.block_size)
+        num_required_blocks = cdiv(
+            num_computed_tokens + num_tokens + num_spec_tokens,
+            self.block_size)
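
As a worked sketch of the change above, with a standalone ceiling-division helper and made-up numbers (these values are assumptions for illustration, not taken from the PR):

    # Ceiling division, matching the cdiv used in the diff above.
    def cdiv(a: int, b: int) -> int:
        return -(-a // b)

    block_size = 16
    num_computed_tokens = 30   # tokens already held in the KV cache
    num_tokens = 8             # new tokens to verify this step
    num_spec_tokens = 4        # draft tokens the proposer will emit

    # Before this PR: only computed + new tokens were counted.
    cdiv(num_computed_tokens + num_tokens, block_size)                    # 38 tokens -> 3 blocks
    # With this PR: the proposer's tokens are reserved up front as well.
    cdiv(num_computed_tokens + num_tokens + num_spec_tokens, block_size)  # 42 tokens -> 3 blocks

    # Here the extra reservation is free; it only costs an additional block
    # when num_computed_tokens + num_tokens lands near the end of a block.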

Contributor (@ekagra-ranjan), Apr 11, 2025:
@luyuzhe111 @wwl2755 - moving the discussion of why this PR is expected to improve the AL here.

I have a hypothesis. Without this PR, the queries in the draft can go out of bounds in the block_table and pick up an incorrect address and value, which will corrupt the answer. block_table is used in the FA CUDA kernels, and maybe we don't check for illegal memory accesses there.

Let's say the page size is 16. This corruption will arise when there are fewer than K slots left in the last block. The preallocated-block computation (extra 4 blocks) won't trigger in this case since the last block is not full. As K increases, the chances of this increase, so K=4 has a higher chance of hitting it than K=2, which is reflected here.

But then block_table is also gathered here to form the slot_mapping for the queries, so an out-of-bounds index should have given an error, which it did not when using bs=1 with MTBench, so I am not sure the above hypothesis is correct.

Let me know what you guys think.

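A rough sketch of the window described in this hypothesis, treating K as the number of draft tokens and assuming a 16-token page; the helper below is purely illustrative:

    # Per the hypothesis: when the last block is only partially filled, the
    # preallocation path does not add blocks, so the K draft tokens must fit
    # into whatever slots remain in that block.
    block_size = 16

    def can_overflow(tokens_in_last_block: int, k: int) -> bool:
        free_slots = block_size - tokens_in_last_block
        return 0 < free_slots < k

    for k in (2, 4):
        dangerous = sum(can_overflow(fill, k) for fill in range(1, block_size + 1))
        print(f"K={k}: {dangerous}/{block_size} fill levels could spill past the block")
    # K=2 -> 1/16, K=4 -> 3/16: the window grows with K, consistent with the
    # observation that K=4 shows the problem more often than K=2.
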
Contributor:
@WoosukKwon @LiuXiaoxuanPKU - can you also share your insight as to why this PR is expected to increase AL?

Contributor (@wwl2755), Apr 11, 2025:
QQ: has the claim that "this PR can increase AL" already been benchmarked, or is it set up as a goal of this PR?

Collaborator Author (@LiuXiaoxuanPKU), Apr 11, 2025:
From a high level: without this PR, the current scheduler does not actually allocate slots for the proposed tokens; it only allocates slots for verification. Therefore, it's not guaranteed that the KV cache of the proposer heads is not contaminated.

Contributor (@ekagra-ranjan), Apr 11, 2025:
@LiuXiaoxuanPKU can you help us understand at a slightly deeper level which code line would be at fault?

My understanding is that if the scheduler doesn't allocate slots for the proposed tokens, then torch should have thrown an error here when the new proposed tokens become the query. However, that didn't happen in our MTBench benchmark, so is there perhaps no corruption without this PR?

Collaborator Author:
Thanks for asking! The code here will not trigger an error because block_table is always a tensor of shape [batch_size, max_num_blocks_per_request]; if blocks are not allocated, their entries in the block table default to 0.

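A toy illustration of why the lookup fails silently rather than raising; the shapes and values below are made up for the example:

    import torch

    batch_size, max_num_blocks_per_request = 2, 8
    # Fixed-shape block table; unallocated entries keep the default value 0.
    block_table = torch.zeros(batch_size, max_num_blocks_per_request, dtype=torch.int32)
    block_table[0, :3] = torch.tensor([17, 42, 5], dtype=torch.int32)  # request 0 owns 3 blocks

    # A draft-token query that logically needs a 4th block still indexes in bounds.
    physical_block = block_table[0, 3]  # tensor(0, dtype=torch.int32), no IndexError
    # The query silently reads/writes physical block 0, which is the corruption risk.
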
         num_new_blocks = (num_required_blocks - len(req_blocks) -
                           len(new_computed_blocks))

vllm/v1/core/sched/scheduler.py (10 additions & 2 deletions)
@@ -7,7 +7,8 @@
 from collections.abc import Iterable
 from typing import Optional, Union

-from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import (CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.logger import init_logger
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
 from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
@@ -38,6 +39,7 @@ def __init__(
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
         kv_cache_config: KVCacheConfig,
+        speculative_config: SpeculativeConfig,
         structured_output_manager: StructuredOutputManager,
         mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
         include_finished_set: bool = False,
@@ -112,6 +114,10 @@ def __init__(
         self.encoder_cache_manager = EncoderCacheManager(
             cache_size=encoder_cache_size)

+        self.num_spec_tokens = 0
+        if speculative_config and speculative_config.method == "eagle":
+            self.num_spec_tokens = speculative_config.num_speculative_tokens

     def schedule(self) -> SchedulerOutput:
         # NOTE(woosuk) on the scheduling algorithm:
         # There's no "decoding phase" nor "prefill phase" in the scheduler.
@@ -188,7 +194,9 @@ def schedule(self) -> SchedulerOutput:

             while True:
                 new_blocks = self.kv_cache_manager.allocate_slots(
-                    request, num_new_tokens)
+                    request,
+                    num_new_tokens,
+                    num_spec_tokens=self.num_spec_tokens)

Contributor (@ekagra-ranjan), Apr 10, 2025:
My understanding is that num_new_tokens is needed for the target model to verify the spec token ids from the previous step, while num_spec_tokens is the number of spec tokens the draft model is supposed to generate at the end of this step.

Based on that, if num_new_tokens is 8 and num_spec_tokens is 4, we can end up allocating 1 block (16 tokens) such that the same block holds both the target model's and the draft model's KV cache?

Contributor:
My understanding is similar. My interpretation is that it temporarily acquires num_spec_tokens extra slots for the draft tokens, and the extra reservation does not accumulate across iterations.

Contributor (@ekagra-ranjan), Apr 10, 2025:
If the same blocks are shared by the target and draft models, won't that be an issue? The KV caches of the target and draft model would be adjacent in the logical mapping of the block tables, so the draft model would attend to the target's KV cache.

Contributor:
Hmm, I think it should not cause a problem as long as the actual starting KV cache slot for the draft model is tracked somehow?

Collaborator Author:
Thanks for the discussion here!

  1. num_new_tokens is for verification; num_spec_tokens is for the proposer heads. "Based on that, if num_new_tokens is 8 and num_spec_tokens is 4, we can end up allocating 1 block (16 tokens) such that the same block holds both the target model's and the draft model's KV cache?" --> yes, exactly.
  2. KV cache corruption: currently, each layer's KV cache is allocated independently, but they all share the same slot mapping. We can think of the KV cache as a map: {layer0_kv: [], layer1_kv: [], ..., layerk_kv: [], eagle_layer_kv: []}. During each generation step, using the example above, we first verify tokens, which writes KV to layer0_kv...layerk_kv with slot mapping [0,1,2,3,4,5,6,7]; it does not write to the draft KV. If, say, only 2 tokens are accepted, 3 tokens are generated. In the proposing phase, we send those three tokens to the EAGLE proposer with slot mapping [0,1,2], which populates the draft KV cache for the generated tokens and also proposes the next token. We allocate 12 slots (8+4) in total because it's possible that all tokens (with slot ids 0-7) are accepted; in that case, the proposed tokens need to write to the KV cache at ids [8,9,10,11].

Let me know if there is any confusion here!

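A compact sketch of the bookkeeping described above, with plain Python lists standing in for the per-layer KV caches (everything here is illustrative; the real caches are tensors indexed through slot_mapping):

    num_new_tokens, num_spec_tokens = 8, 4
    slots_reserved = num_new_tokens + num_spec_tokens   # 12 slots reserved up front

    # Separate buffer per target layer plus one for the EAGLE proposer layer;
    # they are distinct buffers but share the same slot ids.
    kv_cache = {f"layer{i}_kv": [None] * slots_reserved for i in range(2)}
    kv_cache["eagle_layer_kv"] = [None] * slots_reserved

    # Verification: the target layers write KV for all 8 candidate tokens.
    for name in ("layer0_kv", "layer1_kv"):
        for slot in range(num_new_tokens):               # slots [0..7]
            kv_cache[name][slot] = f"target_kv_{slot}"

    # Suppose only 2 drafts are accepted, so 3 tokens were generated in total.
    for slot in range(3):                                # slot mapping [0, 1, 2]
        kv_cache["eagle_layer_kv"][slot] = f"draft_kv_{slot}"

    # Worst case, all 8 candidates are accepted and the proposer needs slots
    # 8-11, which is exactly why 8 + 4 slots are reserved instead of 8.
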
Contributor (@ekagra-ranjan), Apr 10, 2025:
I think it makes sense now. The block_table where the allocated slots get recorded is shared across all layers, and EAGLE is just a layer on top of the target model's layers. When we add blocks for num_new_tokens + num_spec_tokens, the target model will use just the num_new_tokens slots, but when all the drafts are accepted, the draft layer will use the full num_new_tokens + num_spec_tokens slots.

                 if new_blocks is None:
                     # The request cannot be scheduled.
                     # Preempt the lowest-priority request.
vllm/v1/engine/core.py (1 addition & 0 deletions)
@@ -98,6 +98,7 @@ def __init__(
             cache_config=vllm_config.cache_config,
             lora_config=vllm_config.lora_config,
             kv_cache_config=kv_cache_config,
+            speculative_config=vllm_config.speculative_config,
             structured_output_manager=self.structured_output_manager,
             include_finished_set=vllm_config.parallel_config.data_parallel_size
             > 1,