vllm-project · yewentao256 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 3, 2026
@@ -211,6 +211,7 @@ def __init__(
 
         speculative_config = vllm_config.speculative_config
         self.use_eagle = False
+        self.use_dflash = False
         self.num_spec_tokens = self.num_lookahead_tokens = 0
         if speculative_config:
             self.num_spec_tokens = speculative_config.num_speculative_tokens
@@ -220,6 +221,7 @@ def __init__(
             if speculative_config.uses_draft_model():
                 self.num_lookahead_tokens = self.num_spec_tokens
             if speculative_config.use_dflash():
+                self.use_dflash = True
                 # DFlash requires an extra lookahead slot since it uses in-fill-style
                 # decoding instead of standard next-token sampling, so it has a query
                 # for the last sampled token plus queries for each draft token.
@@ -731,9 +733,19 @@ def schedule(self) -> SchedulerOutput:
                 # extra block gets allocated which
                 # creates a mismatch between the number
                 # of local and remote blocks.
-                limit_lookahead_tokens = load_kv_async and self.use_eagle
+                is_pd_prefill_producer = (
+                    request.num_computed_tokens == 0
+                    and request.kv_transfer_params is not None
+                    and request.kv_transfer_params.get("do_remote_decode", False)
+                )
                 effective_lookahead_tokens = (
-                    0 if limit_lookahead_tokens else self.num_lookahead_tokens
+                    0
+                    if (
+                        self.use_eagle
+                        and not self.use_dflash
+                        and (load_kv_async or is_pd_prefill_producer)
+                    )
+                    else self.num_lookahead_tokens
                 )
 
                 # Determine if we need to allocate cross-attention blocks.