tests/e2e/multicard/2-cards/test_quantization.py: 65 changes (21 additions, 44 deletions)
@@ -16,63 +16,40 @@
 # This file is a part of the vllm-ascend project.
 # Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
 #
+import pytest
 
 from tests.e2e.conftest import VllmRunner
 
 
-def test_qwen2_5_w8a8_external_quantized_tp2():
-    example_prompts = [
-        "The president of the United States is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
+TEST_CASES = [
+    pytest.param(
         "neuralmagic/Qwen2.5-3B-quantized.w8a8",
-        tensor_parallel_size=2,
-        cudagraph_capture_sizes=[1, 2, 4, 8],
-        max_model_len=4096,
-        gpu_memory_utilization=0.8,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    golden_results = [
-        "The president of the United States is the head of state and",
-    ]
-
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
-
-
-def test_qwen3_moe_w8a8_dynamic_llm_compressor():
-    example_prompts = [
-        "The president of the United States is",
-    ]
-    max_tokens = 5
-    with VllmRunner(
+        id="dense-w8a8",
+    ),
+    pytest.param(
         "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
-        tensor_parallel_size=2,
-        max_model_len=4096,
-        gpu_memory_utilization=0.8,
-    ) as vllm_model:
-        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    golden_results = [
-        "The president of the United States is the head of state and",
-    ]
-
-    for i in range(len(vllm_output)):
-        assert golden_results[i] == vllm_output[i][1]
-        print(f"Generated text: {vllm_output[i][1]!r}")
+        id="moe-w8a8-dynamic",
+    ),
+    pytest.param(
+        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+        id="moe-w4a8-dynamic",
+    ),
+    pytest.param(
+        "billy800/Qwen3-30B-A3B-Instruct-2507-AWQ",
+        id="moe-awq-4bit",
+    ),
+]
 
 
-def test_qwen3_moe_w4a8_dynamic_llm_compressor():
+@pytest.mark.parametrize("model_id", TEST_CASES)
+def test_quantization_tp2(model_id):
     example_prompts = [
         "The president of the United States is",
     ]
     max_tokens = 5
     with VllmRunner(
-        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
+        model_id,
         tensor_parallel_size=2,
         cudagraph_capture_sizes=[1, 2, 4, 8],
         max_model_len=4096,
         gpu_memory_utilization=0.8,
     ) as vllm_model:
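
The refactor above collapses three near-identical tests into one test parametrized over a list of pytest.param entries keyed by explicit ids. A minimal, self-contained sketch of that pattern, with hypothetical stand-in case names and no VllmRunner dependency, could look like this:

import pytest

# Hypothetical stand-in cases; the real TEST_CASES list holds quantized model repo ids.
CASES = [
    pytest.param("org/dense-model.w8a8", id="dense-w8a8"),
    pytest.param("org/moe-model.w4a8", id="moe-w4a8-dynamic"),
]


@pytest.mark.parametrize("model_id", CASES)
def test_pattern(model_id):
    # pytest reports each case as test_pattern[dense-w8a8], test_pattern[moe-w4a8-dynamic], ...
    assert isinstance(model_id, str)

Assuming the ids carry over as shown in the diff, a single case of the new test should be selectable by node id, e.g. pytest "tests/e2e/multicard/2-cards/test_quantization.py::test_quantization_tp2[moe-awq-4bit]", or via -k moe-awq-4bit.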