From 44f77152519e5006c6ed6864f128ef2fc9bd3c70 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Fri, 9 Jan 2026 20:37:37 +0000 Subject: [PATCH 1/9] Added qwen3 vision language moe support for speculative decoding Signed-off-by: shanjiaz --- vllm/model_executor/models/qwen3_vl_moe.py | 8 ++++++++ vllm/v1/spec_decode/eagle.py | 5 ++++- vllm/v1/worker/gpu_model_runner.py | 9 ++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen3_vl_moe.py b/vllm/model_executor/models/qwen3_vl_moe.py index 3186804488e5..9efaf2405267 100644 --- a/vllm/model_executor/models/qwen3_vl_moe.py +++ b/vllm/model_executor/models/qwen3_vl_moe.py @@ -110,9 +110,14 @@ def forward( assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] residual = intermediate_tensors["residual"] + + aux_hidden_states = [] for layer_idx, layer in islice( enumerate(self.layers), self.start_layer, self.end_layer ): + if layer_idx in self.aux_hidden_state_layers: + aux_hidden_states.append(hidden_states + residual) + hidden_states, residual = layer( positions, hidden_states, @@ -132,6 +137,9 @@ def forward( {"hidden_states": hidden_states, "residual": residual} ) hidden_states, _ = self.norm(hidden_states, residual) + + if len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states def load_fused_expert_weights( diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f6d198f63aff..8fbfff7b7997 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -117,7 +117,9 @@ def __init__( self.input_ids = torch.zeros( self.max_num_tokens, dtype=torch.int32, device=device ) - self.uses_mrope = self.vllm_config.model_config.uses_mrope + # Use draft model's M-RoPE setting, not target model's + # Draft models may be text-only even if target is multimodal + self.uses_mrope = self.draft_model_config.uses_mrope if self.uses_mrope: # NOTE: `mrope_positions` is implemented 
with one additional dummy # position on purpose to make it non-contiguous so that it can work @@ -1040,6 +1042,7 @@ def load_model(self, target_model: nn.Module) -> None: if self.get_model_name(target_model) in [ "Qwen2_5_VLForConditionalGeneration", "Qwen3VLForConditionalGeneration", + "Qwen3VLMoeForConditionalGeneration", ]: self.model.config.image_token_index = target_model.config.image_token_id elif self.get_model_name(target_model) == "PixtralForConditionalGeneration": diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 5228167ed3c6..9f164bc88042 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3751,6 +3751,12 @@ def propose_draft_token_ids( else: mm_embed_inputs = None + # Convert M-RoPE positions to 1D if draft model is text-only + if not self.drafter.uses_mrope: + assert target_positions.dim() == 2 + # For text inputs, all M-RoPE dimensions are identical + target_positions = target_positions[0] + draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, @@ -4441,12 +4447,13 @@ def _dummy_run( inputs_embeds=inputs_embeds, **model_kwargs, ) - + if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: hidden_states = outputs + if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) # Eagle currently only supports PIECEWISE cudagraphs. 
From bbef7e7b3e3466cf9d8c1b4ec144052ddda1802c Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Fri, 9 Jan 2026 21:32:54 +0000 Subject: [PATCH 2/9] min diff Signed-off-by: shanjiaz --- vllm/v1/spec_decode/eagle.py | 5 +++++ vllm/v1/worker/gpu_model_runner.py | 9 +-------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 8fbfff7b7997..376f401eebf5 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -251,6 +251,11 @@ def propose( if last_token_indices is None: last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 + # Convert M-RoPE positions to 1D if draft model is text-only + if not self.uses_mrope and target_positions.dim() == 2: + # For text inputs, all M-RoPE dimensions are identical + target_positions = target_positions[0] + if self.method == "eagle3": assert isinstance(self.model, Eagle3LlamaForCausalLM) target_hidden_states = self.model.combine_hidden_states( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9f164bc88042..debde2e28bb9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -3751,12 +3751,6 @@ def propose_draft_token_ids( else: mm_embed_inputs = None - # Convert M-RoPE positions to 1D if draft model is text-only - if not self.drafter.uses_mrope: - assert target_positions.dim() == 2 - # For text inputs, all M-RoPE dimensions are identical - target_positions = target_positions[0] - draft_token_ids = self.drafter.propose( target_token_ids=target_token_ids, target_positions=target_positions, @@ -4447,13 +4441,12 @@ def _dummy_run( inputs_embeds=inputs_embeds, **model_kwargs, ) - + if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: hidden_states = outputs - if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) # Eagle currently only supports PIECEWISE cudagraphs. 
From 86e804fbe92d24009dece995dded8a2137157297 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Fri, 9 Jan 2026 21:33:42 +0000 Subject: [PATCH 3/9] min diff Signed-off-by: shanjiaz --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index debde2e28bb9..081944993bee 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4441,7 +4441,7 @@ def _dummy_run( inputs_embeds=inputs_embeds, **model_kwargs, ) - + if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: From 35a1024154b9b1445740f3a6678c9a5bc2522033 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Fri, 9 Jan 2026 21:35:07 +0000 Subject: [PATCH 4/9] white space Signed-off-by: shanjiaz --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 081944993bee..5228167ed3c6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4441,7 +4441,7 @@ def _dummy_run( inputs_embeds=inputs_embeds, **model_kwargs, ) - + if self.use_aux_hidden_state_outputs: hidden_states, _ = outputs else: From 4bef2f954499a97828e079b7ea8de34986c68f2a Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 13 Jan 2026 20:01:44 +0000 Subject: [PATCH 5/9] Added test and refined conditions. 
Signed-off-by: shanjiaz --- tests/v1/spec_decode/test_speculators_eagle3.py | 4 ++++ vllm/v1/spec_decode/eagle.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/v1/spec_decode/test_speculators_eagle3.py b/tests/v1/spec_decode/test_speculators_eagle3.py index 9a252cfffc8f..f00aa56fa8d5 100644 --- a/tests/v1/spec_decode/test_speculators_eagle3.py +++ b/tests/v1/spec_decode/test_speculators_eagle3.py @@ -19,6 +19,10 @@ "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized", id="qwen3-eagle3-speculator", ), + pytest.param( + "nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3", + id="qwen3-30b-moe-vl-eagle3-speculator", + ), pytest.param( "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16", id="qwen3-eagle3-speculator-w4a16-verifier", diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0494f75a16f1..8bee32b1ef56 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -254,7 +254,7 @@ def propose( last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 # Convert M-RoPE positions to 1D if draft model is text-only - if not self.uses_mrope and target_positions.dim() == 2: + if not self.uses_mrope and self.vllm_config.model_config.uses_mrope: # For text inputs, all M-RoPE dimensions are identical target_positions = target_positions[0] From b27e6c4df80d552f5de589af48cb0a11db765d95 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Mon, 19 Jan 2026 20:05:23 +0000 Subject: [PATCH 6/9] move logic to set_positions Signed-off-by: shanjiaz --- vllm/v1/spec_decode/eagle.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index f0daee8a5b51..759aa105901f 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -229,6 +229,10 @@ def _set_positions(self, num_tokens: int, positions: torch.Tensor): if self.uses_mrope: self.mrope_positions[:, :num_tokens] = 
positions else: + # Convert M-RoPE positions to 1D if target model uses M-RoPE but draft doesn't + # For text inputs, all M-RoPE dimensions are identical + if self.vllm_config.model_config.uses_mrope: + positions = positions[0] self.positions[:num_tokens] = positions def propose( @@ -253,11 +257,6 @@ def propose( if last_token_indices is None: last_token_indices = common_attn_metadata.query_start_loc[1:] - 1 - # Convert M-RoPE positions to 1D if draft model is text-only - if not self.uses_mrope and self.vllm_config.model_config.uses_mrope: - # For text inputs, all M-RoPE dimensions are identical - target_positions = target_positions[0] - if self.method == "eagle3": assert isinstance(self.model, Eagle3LlamaForCausalLM) target_hidden_states = self.model.combine_hidden_states( @@ -337,6 +336,9 @@ def propose( input_ids = self.input_ids[:num_input_tokens] inputs_embeds = None + # Get normalized positions (needed both for model forward pass and later indexing) + all_positions = self._get_positions(num_input_tokens) + with set_forward_context( per_layer_attn_metadata, self.vllm_config, @@ -346,7 +348,7 @@ def propose( ): ret_hidden_states = self.model( input_ids=input_ids, - positions=self._get_positions(num_input_tokens), + positions=all_positions, hidden_states=self.hidden_states[:num_input_tokens], inputs_embeds=inputs_embeds, ) @@ -363,10 +365,11 @@ def propose( draft_token_ids = logits.argmax(dim=-1) return draft_token_ids.view(-1, 1) + # Extract positions at last_token_indices (reuse all_positions from above) if self.uses_mrope: - positions = target_positions[:, last_token_indices] + positions = all_positions[:, last_token_indices] else: - positions = target_positions[last_token_indices] + positions = all_positions[last_token_indices] if self.method in ( "deepseek_mtp", "ernie_mtp", From f8cbcaf4650df7066306ce7e0670eb85bce6f429 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 20 Jan 2026 14:35:39 +0000 Subject: [PATCH 7/9] format Signed-off-by: shanjiaz --- 
vllm/v1/spec_decode/eagle.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 3852a21e1ef0..0913bbccbb91 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -234,8 +234,9 @@ def _set_positions(self, num_tokens: int, positions: torch.Tensor): if self.uses_mrope: self.mrope_positions[:, :num_tokens] = positions else: - # Convert M-RoPE positions to 1D if target model uses M-RoPE but draft doesn't - # For text inputs, all M-RoPE dimensions are identical + # Convert M-RoPE positions if target model uses M-RoPE + # but draft doesn't. For text inputs, all M-RoPE + # dimensions are identical if self.vllm_config.model_config.uses_mrope: positions = positions[0] self.positions[:num_tokens] = positions From 23798e3ab17c68745174e953e26cf789d3d390e0 Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 20 Jan 2026 14:36:38 +0000 Subject: [PATCH 8/9] min diff Signed-off-by: shanjiaz --- vllm/v1/spec_decode/eagle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 0913bbccbb91..54c9e81c4100 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -374,7 +374,6 @@ def propose( draft_token_ids = logits.argmax(dim=-1) return draft_token_ids.view(-1, 1) - # Extract positions at last_token_indices (reuse all_positions from above) if self.uses_mrope: positions = self.positions[:, last_token_indices] else: From 654ddb762b79e6db1a51a6ff4905cf40a5646bab Mon Sep 17 00:00:00 2001 From: shanjiaz Date: Tue, 20 Jan 2026 16:45:27 +0000 Subject: [PATCH 9/9] remove test for now Signed-off-by: shanjiaz --- tests/v1/spec_decode/test_speculators_eagle3.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/v1/spec_decode/test_speculators_eagle3.py b/tests/v1/spec_decode/test_speculators_eagle3.py index f00aa56fa8d5..9a252cfffc8f 100644 --- a/tests/v1/spec_decode/test_speculators_eagle3.py +++ 
b/tests/v1/spec_decode/test_speculators_eagle3.py @@ -19,10 +19,6 @@ "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized", id="qwen3-eagle3-speculator", ), - pytest.param( - "nm-testing/Speculator-Qwen3-30B-MOE-VL-Eagle3", - id="qwen3-30b-moe-vl-eagle3-speculator", - ), pytest.param( "nm-testing/Speculator-Qwen3-8B-Eagle3-converted-071-quantized-w4a16", id="qwen3-eagle3-speculator-w4a16-verifier",