From df4ff4996466f42f3033d94a76da762cb63a08a4 Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 22 Jan 2026 17:46:38 +0000 Subject: [PATCH] [Bugfix][ptd_eagle] Fix buffer overflow in PTD EAGLE speculative decoding Parallel draft methods (PTD EAGLE) generate K draft tokens in a single forward pass using mask tokens, which requires larger buffers than sequential drafting. The inherited buffer allocation formula was insufficient, causing crashes under load. Bug manifestation: - Sequential EAGLE: needs max_num_batched_tokens + max_num_seqs tokens - Parallel draft: needs max_num_batched_tokens + max_num_seqs * num_speculative_tokens tokens - Error: "AssertionError: Shape: 8213 out of considered ranges: [(1, 8192)]" This fix addresses two critical issues: 1. Buffer Allocation (eagle.py): - Corrects max_num_tokens formula for parallel draft generation pattern - Reallocates all buffers (input_ids, positions, hidden_states, slot_buffer) - Adds ~6MB memory overhead (negligible for 3-4x speedup) 2. Compilation Ranges (vllm.py): - Extends compile_ranges_split_points when parallel_draft=True - Ensures CUDA graph compilation handles expanded token counts - Adds informative logging for parallel draft detection The bug was caught during benchmarking with 100 prompts (1600 input, 600 output tokens) where batch size reached 8192 tokens + 7 requests * 3 masks = 8213 tokens, exceeding the compilation range of 8192. 
Tested-by: Load testing with max batch size configurations Signed-off-by: Li Zhang Simplify updates to eagle files Signed-off-by: Li Zhang Minor format updates --- vllm/config/vllm.py | 21 +++++++++++++-------- vllm/v1/spec_decode/eagle.py | 6 +++++- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 513f0afbc1..9b99dee17d 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -1218,16 +1218,21 @@ def _set_compile_ranges(self): computed_compile_ranges_split_points = [] # The upper bound of the compile ranges is the max_num_batched_tokens. - # For speculative decoding with draft model, the compile range must be extended - # by 1 for each sequence. + # For speculative decoding, the compile range must be extended + # - Sequential: + 1 * max_num_seqs (one draft token per iteration) + # - Parallel draft: + num_speculative_tokens * max_num_seqs compile_range_end = self.scheduler_config.max_num_batched_tokens if compile_range_end is not None: - do_extend: bool = ( - self.speculative_config is not None - and self.speculative_config.uses_draft_model() - ) - if do_extend: - compile_range_end += self.scheduler_config.max_num_seqs + if self.speculative_config is not None and ( + self.speculative_config.uses_draft_model() + or self.speculative_config.use_eagle() + ): + multiplier = ( + self.speculative_config.num_speculative_tokens + if self.speculative_config.parallel_draft + else 1 + ) + compile_range_end += multiplier * self.scheduler_config.max_num_seqs computed_compile_ranges_split_points.append(compile_range_end) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 1ae058c2ea..e10f401f1f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -76,8 +76,12 @@ def __init__( self.num_speculative_tokens = self.speculative_config.num_speculative_tokens # The drafter can get longer sequences than the target model. 
max_batch_size = vllm_config.scheduler_config.max_num_seqs + multiplier = ( + self.num_speculative_tokens if self.speculative_config.parallel_draft else 1 + ) self.max_num_tokens = ( - vllm_config.scheduler_config.max_num_batched_tokens + max_batch_size + vllm_config.scheduler_config.max_num_batched_tokens + + max_batch_size * multiplier ) self.token_arange_np = np.arange(self.max_num_tokens) # We need to get the hidden size from the draft model config because