1 change: 1 addition & 0 deletions .github/workflows/_e2e_test.yaml
@@ -215,6 +215,7 @@ jobs:
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_expert_parallel.py
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_external_launcher.py
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_full_graph_mode.py
pytest -sv --durations=0 tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py
# torch 2.8 doesn't work with lora, fix me
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_ilama_lora_tp2.py

153 changes: 153 additions & 0 deletions tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py
@@ -0,0 +1,153 @@
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# Run `pytest tests/e2e/multicard/2-cards/spec_decode/test_spec_decode.py`.

from __future__ import annotations

import os
from unittest.mock import patch

import pytest
from transformers import AutoTokenizer
from vllm import SamplingParams
from vllm.config import CompilationConfig
from vllm.v1.metrics.reader import Counter, Vector

from tests.e2e.conftest import VllmRunner

os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

MODELS = {
"eagle3": {
"main": "Qwen/Qwen3-8B",
"spec": "RedHatAI/Qwen3-8B-speculator.eagle3",
},
}

# NOTE: the golden values may change (eagle_proposer currently runs only in
# eager mode), so update them if CI fails but you observe better acceptance.
BASELINES_SP = {
"eagle3": [0.68, 0.40, 0.18],
}


@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
@pytest.mark.parametrize("method", ["eagle3"])
@pytest.mark.parametrize("num_speculative_tokens", [3])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False])
def test_eagle3_sp_acceptance(
method: str,
num_speculative_tokens: int,
disable_padded_drafter_batch: bool,
async_scheduling: bool,
):
if disable_padded_drafter_batch and async_scheduling:
pytest.skip(
"skip disable_padded_drafter_batch=True and async_scheduling=True",
)

main_model_name = MODELS[method]["main"]
spec_model_name = MODELS[method]["spec"]

tokenizer = AutoTokenizer.from_pretrained(
main_model_name,
trust_remote_code=True,
)
sampling_params = SamplingParams(
temperature=0,
ignore_eos=False,
max_tokens=256,
)

# SP (sequence parallelism) is only enabled when query_lens > 1000
prompts = [
{
"role": "user",
"content": " " * 1000 + "Hello, my name is",
},
{
"role": "user",
"content": " " * 1000 + "The president of the United States is",
},
{
"role": "user",
"content": " " * 1000 + "The capital of France is",
},
{
"role": "user",
"content": " " * 1000 + "The future of AI is",
},
]
prompts = [
tokenizer.apply_chat_template(
[prompt],
tokenize=False,
add_generation_prompt=True,
) for prompt in prompts
]

speculative_config = {
"enforce_eager": True,
"method": method,
"num_speculative_tokens": num_speculative_tokens,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
"model": spec_model_name,
}

compilation_config = CompilationConfig(cudagraph_mode="FULL_DECODE_ONLY",
cudagraph_capture_sizes=[12])

with VllmRunner(
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=2,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
) as llm:
_ = llm.generate(prompts, sampling_params)
metrics = llm.model.get_metrics()

num_drafts = 0
num_accepted_tokens_per_pos = [0] * num_speculative_tokens
for metric in metrics:
if metric.name == "vllm:spec_decode_num_drafts":
assert isinstance(metric, Counter)
num_drafts += metric.value
elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
assert isinstance(metric, Vector)
for pos in range(len(metric.values)):
num_accepted_tokens_per_pos[pos] += metric.values[pos]

acceptance_per_pos = [
num_accepted_tokens / num_drafts
for num_accepted_tokens in num_accepted_tokens_per_pos
]
golden = BASELINES_SP[method]

match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
if not match:
print(f"acceptance_per_pos: {acceptance_per_pos}")
print(f"golden: {golden}")

assert match
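For context, the per-position acceptance rates asserted above also determine the average number of draft tokens accepted per verification step: by linearity of expectation it is simply their sum. A minimal illustrative sketch using the eagle3 baseline from this test (not part of the PR):

# Illustrative only: aggregate the per-position baselines into the expected
# number of accepted draft tokens per step. Each step also emits one token
# from the target model, so roughly 2.26 tokens are produced per step here.
baseline = BASELINES_SP["eagle3"]           # [0.68, 0.40, 0.18]
mean_accepted_drafts = sum(baseline)        # ~1.26
tokens_per_step = 1 + mean_accepted_drafts  # ~2.26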
112 changes: 0 additions & 112 deletions tests/e2e/singlecard/spec_decode/test_v1_spec_decode.py
@@ -34,10 +34,6 @@
"eagle3": [0.68, 0.40, 0.18],
}

BASELINES_SP = {
"eagle3": [0.68, 0.40, 0.18],
}


@pytest.fixture
def test_prompts():
@@ -381,111 +377,3 @@ def test_llama_qwen_eagle_acceptance(
print(f"golden: {golden}")

assert match


# TODO the function of sp in eagle3 is improving gradually,
# there are still problems when enable sp + dp and some unknown scenes.
# this e2e should also be improving gradually.
@pytest.mark.parametrize("method", ["eagle3"])
@pytest.mark.parametrize("num_speculative_tokens", [3])
@pytest.mark.parametrize("disable_padded_drafter_batch", [True, False])
@pytest.mark.parametrize("async_scheduling", [True, False])
def test_eagle3_sp_acceptance(
method: str,
num_speculative_tokens: int,
disable_padded_drafter_batch: bool,
async_scheduling: bool,
):
if disable_padded_drafter_batch and async_scheduling:
pytest.skip(
"skip disable_padded_drafter_batch=True and async_scheduling=True",
)

main_model_name = MODELS[method]["main"]
spec_model_name = MODELS[method]["spec"]

tokenizer = AutoTokenizer.from_pretrained(
main_model_name,
trust_remote_code=True,
)
sampling_params = SamplingParams(
temperature=0,
ignore_eos=False,
max_tokens=256,
)

# sp will only be enabled when query_lens > 1000
prompts = [
{
"role": "user",
"content": " " * 1000 + "Hello, my name is",
},
{
"role": "user",
"content": " " * 1000 + "The president of the United States is",
},
{
"role": "user",
"content": " " * 1000 + "The capital of France is",
},
{
"role": "user",
"content": " " * 1000 + "The future of AI is",
},
]
prompts = [
tokenizer.apply_chat_template(
[prompt],
tokenize=False,
add_generation_prompt=True,
) for prompt in prompts
]

speculative_config = {
"method": method,
"num_speculative_tokens": num_speculative_tokens,
"disable_padded_drafter_batch": disable_padded_drafter_batch,
"model": spec_model_name,
}

compilation_config = CompilationConfig(cudagraph_capture_sizes=[12])

with VllmRunner(
main_model_name,
enforce_eager=True,
max_model_len=8192,
disable_log_stats=False,
tensor_parallel_size=1,
max_num_seqs=256,
distributed_executor_backend="mp",
gpu_memory_utilization=0.7,
speculative_config=speculative_config,
compilation_config=compilation_config,
async_scheduling=async_scheduling,
) as llm:
_ = llm.generate(prompts, sampling_params)
metrics = llm.model.get_metrics()

num_drafts = 0
num_accepted_tokens_per_pos = [0] * num_speculative_tokens
for metric in metrics:
if metric.name == "vllm:spec_decode_num_drafts":
assert isinstance(metric, Counter)
num_drafts += metric.value
elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
assert isinstance(metric, Vector)
for pos in range(len(metric.values)):
num_accepted_tokens_per_pos[pos] += metric.values[pos]

acceptance_per_pos = [
num_accepted_tokens / num_drafts
for num_accepted_tokens in num_accepted_tokens_per_pos
]
golden = BASELINES_SP[method]

match = all(abs(a - b) < 0.06 for a, b in zip(acceptance_per_pos, golden))
if not match:
print(f"acceptance_per_pos: {acceptance_per_pos}")
print(f"golden: {golden}")

assert match
12 changes: 10 additions & 2 deletions tests/ut/spec_decode/test_eagle_proposer.py
@@ -275,7 +275,9 @@ def tearDown(self):
self.mock_cpugpubuffer.stop()
self.mock_supports_multimodal_inputs.stop()

@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
# CPU does not support parallel groups, so `sp` cannot be enabled either
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
**{"return_value.sp_enabled": False})
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_basic(self, mock_context, mock_get_context):
num_tokens = 32
@@ -288,7 +290,9 @@ def test_dummy_run_basic(self, mock_context, mock_get_context):

self.assertTrue(self.proposer.model.call_count == 4)

@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context")
# CPU does not support parallel groups, so `sp` cannot be enabled either
@patch("vllm_ascend.spec_decode.eagle_proposer.get_forward_context",
**{"return_value.sp_enabled": False})
@patch("vllm_ascend.spec_decode.eagle_proposer.set_ascend_forward_context")
def test_dummy_run_with_prefill(self, mock_context, mock_get_context):
mock_context.return_value.__enter__.return_value = None
@@ -306,6 +310,8 @@ def test_dummy_run_in_graph_capture(self, mock_context, mock_get_context,
mock_return_context = MagicMock()
mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
mock_return_context.capturing = True
# CPU does not support parallel groups, so `sp` cannot be enabled either
mock_return_context.sp_enabled = False
mock_get_context.return_value = mock_return_context
self.proposer.use_cuda_graph = True
# cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
@@ -326,6 +332,8 @@ def test_dummy_run_in_graph_run(self, mock_context, mock_get_context,
mock_return_context = MagicMock()
mock_return_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
mock_return_context.capturing = False
# CPU does not support parallel groups, so `sp` cannot be enabled either
mock_return_context.sp_enabled = False
mock_get_context.return_value = mock_return_context
self.proposer.use_cuda_graph = True
# cpu does not support `torch.ops.vllm.maybe_pad_and_reduce`
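The dotted keyword used in the patches above, **{"return_value.sp_enabled": False}, relies on unittest.mock forwarding extra keyword arguments to the created MagicMock, where dotted names configure nested attributes. The patched get_forward_context() therefore returns a mock whose sp_enabled is False, keeping these CPU-only unit tests off the sequence-parallel path. A standalone sketch of the same mechanism (the function below is a stand-in, not the real patch target):

from unittest.mock import patch

def get_forward_context():  # stand-in for the function patched in the tests
    raise RuntimeError("not available in this sketch")

# Dotted keys are forwarded to the MagicMock and configure nested attributes,
# exactly as in the test file above.
with patch(f"{__name__}.get_forward_context",
           **{"return_value.sp_enabled": False}) as mocked:
    ctx = get_forward_context()
    assert ctx.sp_enabled is False
    assert mocked.called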
7 changes: 5 additions & 2 deletions vllm_ascend/ascend_forward_context.py
@@ -14,7 +14,7 @@
from vllm_ascend.ascend_config import get_ascend_config
from vllm_ascend.utils import (AscendDeviceType, enable_sp, flashcomm2_enable,
get_ascend_device_type, has_layer_idx,
is_moe_model,
is_drafter_moe_model, is_moe_model,
speculative_enable_dispatch_gmm_combine_decode)


@@ -73,7 +73,10 @@ def set_ascend_forward_context(
# the performance benefits can be maximized. Conversely, if the concurrency is below the threshold,
# the performance may degrade due to the switching of communication methods.
mmrs_fusion = True
if is_moe_model(vllm_config):
# The main model and the drafter model may have different architectures
is_context_moe_model = is_drafter_moe_model(vllm_config) \
if is_draft_model else is_moe_model(vllm_config)
if is_context_moe_model:
sp_enabled = enable_sp(vllm_config) and num_tokens is not None
mmrs_fusion = False
else:
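The branch added above matters when the target and drafter differ in architecture (for example a dense main model paired with an MoE drafter, or the reverse): the sequence-parallel/MMRS decision has to follow the model the forward context is being built for. A minimal sketch of that selection, reusing the helper names imported in this diff (their internals are assumed, not shown):

from vllm_ascend.utils import is_drafter_moe_model, is_moe_model

# Sketch only: mirrors the branch added in set_ascend_forward_context above.
# is_moe_model / is_drafter_moe_model are the helpers imported in this file;
# their exact behaviour is assumed from the diff.
def _is_context_moe_model(vllm_config, is_draft_model: bool) -> bool:
    if is_draft_model:
        # Drafter forward context: inspect the drafter's architecture.
        return is_drafter_moe_model(vllm_config)
    # Main-model forward context: inspect the target model's architecture.
    return is_moe_model(vllm_config)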