diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index dc6db0138806..2520403634e4 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -703,6 +703,15 @@ def propose_tree( # Copy inputs to buffer for cudagraph. num_tokens = attn_metadata.num_actual_tokens input_ids = tree_input_ids.view(-1) + + # Handle -1 sentinel values from padded speculation for MTP models + # which call embed_tokens() and can't handle invalid indices + if self.method == "mtp": + # Filter out -1 sentinel values that mark discarded/invalid + # tokens + vocab_size = self.model.model.embed_tokens.weight.size(0) + input_ids = torch.clamp(input_ids, min=0, max=vocab_size - 1) + self.input_ids[:num_tokens] = input_ids self.positions[:num_tokens] = tree_positions.view(-1) self.hidden_states[:num_tokens] = tree_hidden_states.view(