tests/e2e/multicard/2-cards/test_quantization.py: 65 changes (21 additions, 44 deletions)
@@ -16,63 +16,40 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
+import pytest
 
 from tests.e2e.conftest import VllmRunner
 
 
-def test_qwen2_5_w8a8_external_quantized_tp2():
-    example_prompts = [
-        "The president of the United States is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
+TEST_CASES = [
+    pytest.param(
         "neuralmagic/Qwen2.5-3B-quantized.w8a8",
-        tensor_parallel_size=2,
-        cudagraph_capture_sizes=[1, 2, 4, 8],
-        max_model_len=4096,
-        gpu_memory_utilization=0.8,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    golden_results = [
-        "The president of the United States is the head of state and",
-    ]
-
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-
-def test_qwen3_moe_w8a8_dynamic_llm_compressor():
-    example_prompts = [
-        "The president of the United States is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
+        id="dense-w8a8",
+    ),
+    pytest.param(
         "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
-        tensor_parallel_size=2,
-        max_model_len=4096,
-        gpu_memory_utilization=0.8,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    golden_results = [
-        "The president of the United States is the head of state and",
-    ]
-
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
+        id="moe-w8a8-dynamic",
+    ),
+    pytest.param(
+        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+        id="moe-w4a8-dynamic",
+    ),
+    pytest.param(
+        "billy800/Qwen3-30B-A3B-Instruct-2507-AWQ",
+        id="moe-awq-4bit",
+    ),
+]
 
 
-def test_qwen3_moe_w4a8_dynamic_llm_compressor():
+@pytest.mark.parametrize("model_id", TEST_CASES)
+def test_quantization_tp2(model_id):
     example_prompts = [
         "The president of the United States is",
     ]
     max_tokens = 5
     with VllmRunner(
-        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+        model_id,
         tensor_parallel_size=2,
         cudagraph_capture_sizes=[1, 2, 4, 8],
         max_model_len=4096,
         gpu_memory_utilization=0.8,
     ) as vllm_model:
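
The refactor above collapses three near-identical tests into one test parametrized over a list of pytest.param entries keyed by explicit ids. A minimal, self-contained sketch of that pattern, with hypothetical stand-in case names and no VllmRunner dependency, could look like this:

import pytest

# Hypothetical stand-in cases; the real TEST_CASES list holds quantized model repo ids.
CASES = [
    pytest.param("org/dense-model.w8a8", id="dense-w8a8"),
    pytest.param("org/moe-model.w4a8", id="moe-w4a8-dynamic"),
]


@pytest.mark.parametrize("model_id", CASES)
def test_pattern(model_id):
    # pytest reports each case as test_pattern[dense-w8a8], test_pattern[moe-w4a8-dynamic], ...
    assert isinstance(model_id, str)

Assuming the ids carry over as shown in the diff, a single case of the new test should be selectable by node id, e.g. pytest "tests/e2e/multicard/2-cards/test_quantization.py::test_quantization_tp2[moe-awq-4bit]", or via -k moe-awq-4bit.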