2 changes: 2 additions & 0 deletions .github/workflows/scripts/config.yaml
@@ -5,6 +5,8 @@ e2e-singlecard:
estimated_time: 69
- name: tests/e2e/singlecard/test_auto_fit_max_mode_len.py
estimated_time: 70
- name: tests/e2e/singlecard/test_eager_mode_acc.py
estimated_time: 255
- name: tests/e2e/singlecard/test_aclgraph_accuracy.py
estimated_time: 839
- name: tests/e2e/singlecard/test_aclgraph_batch_invariant.py
97 changes: 20 additions & 77 deletions tests/e2e/singlecard/test_aclgraph_accuracy.py
@@ -21,97 +21,61 @@

import pytest

from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, gen_and_valid
from tests.e2e.conftest import wait_until_npu_memory_free
from tests.e2e.singlecard.utils import PROMPTS_LONG, PROMPTS_SHORT, LLMTestCase, compare_logprobs

# ---------------------------------------------------------------------------
# Test cases – no golden_answers needed; accuracy is verified via logprob
# comparison against an eager-mode baseline. Token 0 covers the prefill
# forward pass; tokens 1-2 cover decode forward passes.
# ---------------------------------------------------------------------------

CASE_QWEN_ACLGRAPH = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
)

CASE_DS_ACLGRAPH = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
" here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of",
],
)

CASE_QWEN_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of France is also the capital of the Republic of France. The capital of France is also the capital of the European Union. The capital of",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
)

CASE_DS_FULL = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_SHORT,
golden_answers=[
"\nI am a 20 year old female, and I have been suffering from depression for 3 years now. I have been on medication for 2",
" a man who has been in the public eye for decades. He has been a senator, a governor, and a businessman. He has also been married to the",
" Paris, which is also the largest city in the country. The city is located on the River Seine and is known for its beautiful architecture, museums, and art",
" here, and it’s not what you think.\nThe future of AI is here, and it’s not what you think.\nThe future of",
],
)

CASE_QWEN_FULL_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
golden_answers=[
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
],
)

CASE_DS_FULL_DECODE_ONLY = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm thinking that the area of the triangle is $1/2$ times the area",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x = \\alpha$ be the common root",
],
)

CASE_QWEN_EX = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_LONG,
golden_answers=[
" \n\nTo solve this problem, we need to use the Law of Sines and Law of Cosines. Let me start by drawing triangle $ABC$ with the",
" \n\nTo solve this problem, we can use the following approach: Let $P$ be the perimeter of the square. Then, the expected value of the area",
" \n\nTo solve this problem, we can use the following approach: Let $ \\alpha $ be the common real root of the two equations. Then, we can",
],
)

CASE_DS_EX = LLMTestCase(
model="vllm-ascend/DeepSeek-V2-Lite-W8A8",
quantization="ascend",
prompts=PROMPTS_LONG,
golden_answers=[
"\n\nSelect an assignment template",
"\n\nI'm not sure how to approach this problem. I'm thinking that the area of the triangle is $1/2$ times the area",
"\n\n## Answer\n\n$a + b + c = 0$\n\nSolution\n\nLet $x = \\alpha$ be the common root",
],
)


@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_ACLGRAPH, CASE_DS_ACLGRAPH])
def test_piecewise_res_consistency(cur_case: LLMTestCase):
runner_kwargs = {
@@ -120,14 +84,10 @@ def test_piecewise_res_consistency(cur_case: LLMTestCase):
"cudagraph_capture_sizes": [1, 2, 4, 8],
"quantization": cur_case.quantization,
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL, CASE_DS_FULL])
def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -137,14 +97,10 @@ def test_full_res_consistency(cur_case: LLMTestCase, monkeypatch):
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"quantization": cur_case.quantization,
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_FULL_DECODE_ONLY, CASE_DS_FULL_DECODE_ONLY])
def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -155,14 +111,10 @@ def test_full_decode_only_res_consistency(cur_case: LLMTestCase, monkeypatch):
"quantization": cur_case.quantization,
"additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": False}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX, CASE_DS_EX])
def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -173,17 +125,13 @@ def test_npugraph_ex_res_consistency(cur_case: LLMTestCase, monkeypatch):
"compilation_config": {"cudagraph_capture_sizes": [4, 8, 32, 64], "cudagraph_mode": "FULL_DECODE_ONLY"},
"additional_config": {"ascend_compilation_config": {"enable_npugraph_ex": True}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


# Accuracy has already been verified in the previous test case.
# This test checks that the feature works properly with the static
# kernel enabled and that the kernel is uninstalled as expected.
@wait_until_npu_memory_free(0.7)
@pytest.mark.parametrize("cur_case", [CASE_QWEN_EX])
def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
monkeypatch.delenv("HCCL_OP_EXPANSION_MODE", raising=False)
@@ -199,14 +147,9 @@ def test_npugraph_ex_with_static_kernel(cur_case: LLMTestCase, monkeypatch):
}
},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)

# Check whether the static kernel is properly uninstall
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)

# Check whether the static kernel is properly uninstalled
ascend_home_path = os.environ["ASCEND_HOME_PATH"]
static_kernel_install_path = os.path.join(ascend_home_path, "opp/static_kernel/ai_core")
assert not os.path.exists(static_kernel_install_path)
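
The compare_logprobs helper imported above lives in tests/e2e/singlecard/utils and its implementation is not part of this diff. Below is a minimal sketch of the idea it names, assuming vLLM's LLM/SamplingParams API: run the same prompts twice, once in the graph mode under test and once with eager mode forced as the baseline, then compare the sampled tokens' logprobs. The function name, tolerance, and single-process structure are illustrative, not the actual utility.

import math

from vllm import LLM, SamplingParams


def compare_logprobs_sketch(runner_kwargs: dict, prompts: list[str],
                            num_tokens: int = 3, tol: float = 0.05) -> None:
    # Greedy decoding; logprobs=0 asks vLLM to return only the sampled
    # token's logprob at each step.
    params = SamplingParams(max_tokens=num_tokens, temperature=0.0, logprobs=0)

    def run(overrides: dict) -> list[list[float]]:
        llm = LLM(**{**runner_kwargs, **overrides})
        rows = []
        for out in llm.generate(prompts, params):
            seq = out.outputs[0]
            # seq.logprobs holds one {token_id: Logprob} dict per generated
            # token (token 0 = prefill step, tokens 1..n-1 = decode steps).
            rows.append([d[tok].logprob
                         for tok, d in zip(seq.token_ids, seq.logprobs)])
        return rows

    baseline = run({"enforce_eager": True})  # eager-mode reference
    candidate = run({})                      # graph mode from runner_kwargs
    # The real tests free one engine before starting the next; this sketch
    # keeps both alive for brevity.
    for base_row, cand_row in zip(baseline, candidate):
        for base_lp, cand_lp in zip(base_row, cand_row):
            assert math.isclose(base_lp, cand_lp, abs_tol=tol)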
68 changes: 68 additions & 0 deletions tests/e2e/singlecard/test_eager_mode_acc.py
@@ -0,0 +1,68 @@
#
# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file tests accuracy via lm-eval.
It evaluates GSM8K through lm-eval's in-process vLLM harness with
eager mode enforced, and checks the measured exact-match score
against a per-model expected value within a fixed tolerance.
"""

import lm_eval
import pytest

MODEL_NAMES = ["Qwen/Qwen3-0.6B", "vllm-ascend/DeepSeek-V2-Lite-W8A8"]
NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUES = {"Qwen/Qwen3-0.6B": 0.414, "vllm-ascend/DeepSeek-V2-Lite-W8A8": 0.34}


def run_test(model_name, more_args=None):
"""Run the end to end accuracy test."""

# NOTE: Do not add any spaces to the string below, as this will cause parameter parsing errors.
model_args = f"pretrained={model_name},max_model_len=4096,enforce_eager=True"

if more_args is not None:
model_args = "{},{}".format(model_args, more_args)

results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks="gsm8k",
batch_size="auto",
)

measured_value = results["results"][TASK][FILTER]
assert model_name in EXPECTED_VALUES, f"Cannot find the expected value for the model {model_name=}"
expected_value = EXPECTED_VALUES[model_name]
assert abs(measured_value - expected_value) < RTOL, (
f"Expected: {expected_value} | Measured: {measured_value}"
)


@pytest.mark.parametrize("model", MODEL_NAMES)
def test_lm_eval_accuracy(model):
"""Run with the V1 Engine."""
more_args = None
run_test(model, more_args)
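
For local debugging, the same check can be reproduced outside pytest. A minimal sketch using the values from this diff (the expectation in the comment mirrors EXPECTED_VALUES and RTOL above; it is not an independent measurement):

import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=Qwen/Qwen3-0.6B,max_model_len=4096,enforce_eager=True",
    tasks="gsm8k",
    batch_size="auto",
)
# Result keys are "<metric>,<filter>" strings, matching FILTER above.
score = results["results"]["gsm8k"]["exact_match,strict-match"]
print(f"gsm8k exact_match,strict-match = {score:.3f}")  # expected 0.414 +/- 0.03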
53 changes: 6 additions & 47 deletions tests/e2e/singlecard/test_xlite.py
@@ -15,7 +15,8 @@
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without xlite.
Compare the outputs of vLLM with and without xlite via logprob-based accuracy
check (3 tokens: 1 prefill + 2 decode).

Run `pytest tests/e2e/singlecard/test_xlite.py`.
"""
@@ -25,51 +26,19 @@
import os

import pytest
from vllm import SamplingParams

from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, gen_and_valid
from tests.e2e.singlecard.utils import PROMPTS_SHORT, LLMTestCase, compare_logprobs

os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"

CASE_DECODE_ONLY = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=PROMPTS_SHORT,
golden_answers=[
"Hello, my name is Lina. I'm a 22-year-old student from China.",
"The president of the United States is the same as the president of the United Nations. This is because the president",
"The capital of France is Paris. The capital of France is also the capital of the French Republic.",
"The future of AI is not just a technological challenge but a profound transformation of how we live, work",
],
sampling_params=SamplingParams(
max_tokens=15,
temperature=0.0,
top_p=1.0,
top_k=0,
n=1,
),
)

CASE_FULL = LLMTestCase(
model="Qwen/Qwen3-0.6B",
prompts=[
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
],
golden_answers=[
" Lina. I'm a 22-year-old student from China. I'm interested in studying in the US. I'm looking for a job in the",
" the same as the president of the United Nations. This is because the president of the United States is the same as the president of the United Nations. The president",
" Paris. The capital of Italy is Rome. The capital of Spain is Madrid. The capital of China is Beijing. The capital of Japan is Tokyo. The capital",
" not just a technological challenge but a profound transformation of how we live, work, and interact with the world. As we stand at the intersection of artificial intelligence and",
],
sampling_params=SamplingParams(
max_tokens=32,
temperature=0.0,
top_p=1.0,
top_k=0,
n=1,
),
prompts=PROMPTS_SHORT,
)


@@ -82,12 +51,7 @@ def test_models_with_xlite_decode_only(cur_case: LLMTestCase):
"block_size": 128,
"additional_config": {"xlite_graph_config": {"enabled": True}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)


@pytest.mark.parametrize("cur_case", [CASE_FULL])
@@ -98,9 +62,4 @@ def test_models_with_xlite_full_mode(cur_case: LLMTestCase):
"block_size": 128,
"additional_config": {"xlite_graph_config": {"enabled": True, "full_mode": True}},
}
gen_and_valid(
runner_kwargs=runner_kwargs,
prompts=cur_case.prompts,
sampling_params=cur_case.sampling_params,
golden_answers=cur_case.golden_answers,
)
compare_logprobs(runner_kwargs=runner_kwargs, prompts=cur_case.prompts)
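
Both tests route the xlite switch through additional_config, with full_mode distinguishing the two cases. A sketch of the same engine configuration constructed directly (values follow the runner_kwargs visible in this diff; additional_config is the vllm-ascend pass-through used above):

import os

from vllm import LLM

os.environ["VLLM_ASCEND_ENABLE_NZ"] = "2"  # set at module import above

llm = LLM(
    model="Qwen/Qwen3-0.6B",
    block_size=128,
    additional_config={"xlite_graph_config": {"enabled": True, "full_mode": True}},
)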