Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/misc/model_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"BAAI/bge-small-en-v1.5",
"BAAI/kernel_meta",
"ByteDance-Seed/BAGEL-7B-MoT",
"cpatonn-mirror/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
"DeepSeek-ai/DeepSeek-OCR",
"DevQuasar/deepseek-ai.DeepSeek-V3.2-BF16",
"Eco-Tech/DeepSeek-V3.1-w8a8-mtp-QuaRot",
Expand Down
88 changes: 37 additions & 51 deletions tests/e2e/multicard/2-cards/test_quantization.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,67 +21,53 @@
from tests.e2e.conftest import VllmRunner


def test_qwen2_5_w8a8_external_quantized_tp2():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
"neuralmagic/Qwen2.5-3B-quantized.w8a8",
tensor_parallel_size=2,
cudagraph_capture_sizes=[1, 2, 4, 8],
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)
# Expected text shared by every case below; the test compares it with the
# greedy output of each checkpoint.
_EXPECTED_COMPLETION = "The president of the United States is the head of state and"

# (model id, pytest case id) for each quantized checkpoint under test.
_QUANTIZED_MODELS = (
    ("neuralmagic/Qwen2.5-3B-quantized.w8a8", "dense-w8a8"),
    (
        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w8a8",
        "moe-w8a8-dynamic",
    ),
    (
        "vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
        "moe-w4a8-dynamic",
    ),
    (
        "cpatonn-mirror/Qwen3-30B-A3B-Thinking-2507-AWQ-4bit",
        "moe-w4a16-dynamic",
    ),
)

# One parametrized case per checkpoint: (model id, list of expected texts).
# Each case gets its own single-element list.
TEST_CASES = [
    pytest.param(model_name, [_EXPECTED_COMPLETION], id=case_id)
    for model_name, case_id in _QUANTIZED_MODELS
]

golden_results = [
'The president of the United States is the head of state and',
]

for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")


def test_qwen3_moe_w8a8_dynamic_llm_compressor():
@pytest.mark.parametrize("model_id, golden_results", TEST_CASES)
def test_compressed_tensors_tp2(model_id, golden_results):
    """Compare greedy output of a quantized checkpoint with its golden text.

    The model is run through ``VllmRunner`` with ``tensor_parallel_size=2``.

    Args:
        model_id: Model identifier passed to ``VllmRunner``.
        golden_results: Expected text for each prompt, in prompt order.
    """
    example_prompts = [
        "The president of the United States is",
    ]
    max_tokens = 5
    with VllmRunner(
        model_id,
        max_model_len=4096,
        tensor_parallel_size=2,
        cudagraph_capture_sizes=[1, 2, 4, 8],
        gpu_memory_utilization=0.8,
    ) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    # Without this check an empty or short output would skip comparisons and
    # let the test pass vacuously.
    assert len(vllm_output) == len(golden_results)

    for expected_text, output in zip(golden_results, vllm_output):
        # Index 1 of each output entry holds the text that is compared.
        generated_text = output[1]
        assert expected_text == generated_text
        print(f"Generated text: {generated_text!r}")

def test_qwen3_moe_w4a8_dynamic_llm_compressor():
example_prompts = [
"The president of the United States is",
]
max_tokens = 5
with VllmRunner(
"vllm-ascend/Qwen3-30B-A3B-Instruct-2507-quantized.w4a8",
tensor_parallel_size=2,
max_model_len=4096,
gpu_memory_utilization=0.8,
) as vllm_model:
vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

golden_results = [
'The president of the United States is the head of state and',
]

for i in range(len(vllm_output)):
assert golden_results[i] == vllm_output[i][1]
print(f"Generated text: {vllm_output[i][1]!r}")
87 changes: 87 additions & 0 deletions tests/ut/quantization/test_quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
import os
import tempfile
from unittest.mock import MagicMock, patch
import torch

from tests.ut.base import TestBase
from vllm_ascend.quantization.modelslim_config import MODELSLIM_CONFIG_FILENAME
from vllm_ascend.quantization.utils import (
detect_quantization_method,
maybe_auto_detect_quantization,
pack_to_int32,
unpack_from_int32,
)
from vllm_ascend.utils import ASCEND_QUANTIZATION_METHOD, COMPRESSED_TENSORS_METHOD

Expand Down Expand Up @@ -180,3 +183,87 @@ def test_no_detection_emits_no_log(self, mock_detect):
maybe_auto_detect_quantization(vllm_config)

self.assertIsNone(vllm_config.model_config.quantization)


class TestUnpackFromInt32(TestBase):
    """Checks ``unpack_from_int32`` output dtype/shape and its argument assertions."""

    def test_unpack_from_int32_packed_dim_1(self):
        """Unpacking along dim 1 yields an int8 tensor of the requested shape."""
        weight = torch.tensor([[305419896, -1420531520]], dtype=torch.int32)
        shape = torch.Size([1, 8])
        num_bits = 4

        result = unpack_from_int32(weight, shape, num_bits, packed_dim=1)

        self.assertEqual(result.dtype, torch.int8)
        self.assertEqual(result.shape, shape)

    def test_unpack_from_int32_packed_dim_0(self):
        """Unpacking along dim 0 yields an int8 tensor of the requested shape."""
        weight = torch.tensor([[305419896], [-1420531520]], dtype=torch.int32)
        shape = torch.Size([8, 1])
        num_bits = 4

        result = unpack_from_int32(weight, shape, num_bits, packed_dim=0)

        self.assertEqual(result.dtype, torch.int8)
        self.assertEqual(result.shape, shape)

    def test_unpack_from_int32_assertions(self):
        """An int64 weight and ``num_bits=16`` are each expected to be rejected."""
        # Inputs are built outside assertRaises so that only the call under
        # test can satisfy the expectation.
        int64_weight = torch.tensor([[1, 2]], dtype=torch.int64)
        with self.assertRaises(AssertionError):
            unpack_from_int32(int64_weight, torch.Size([8, 1]), 4)

        int32_weight = torch.tensor([[1, 2]], dtype=torch.int32)
        with self.assertRaises(AssertionError):
            unpack_from_int32(int32_weight, torch.Size([8, 1]), 16)


class TestPackToInt32(TestBase):
    """Checks ``pack_to_int32`` output dtype/shape and its input assertions."""

    @patch(
        "vllm_ascend.quantization.utils.torch_npu.npu_convert_weight_to_int4pack"
    )
    def test_pack_to_int32_int8(self, mock_npu_convert_weight_to_int4pack):
        """An int8 weight is packed without calling the NPU int4-pack op."""
        weight = torch.zeros((2, 8, 16), dtype=torch.int8)

        result = pack_to_int32(weight)

        # No return value is configured on the mock: this path must not
        # reach it at all.
        mock_npu_convert_weight_to_int4pack.assert_not_called()
        self.assertEqual(result.dtype, torch.int32)
        self.assertEqual(result.shape, torch.Size([2, 8, 4]))

    @patch(
        "vllm_ascend.quantization.utils.torch_npu.npu_convert_weight_to_int4pack"
    )
    def test_pack_to_int32_int32(self, mock_npu_convert_weight_to_int4pack):
        """An int32 weight keeps its dtype and shape with the NPU op stubbed out."""
        # Identity stand-in for the patched NPU op: it returns its input.
        mock_npu_convert_weight_to_int4pack.side_effect = lambda weight: weight
        weight = torch.zeros((2, 8, 8), dtype=torch.int32)

        result = pack_to_int32(weight)

        self.assertEqual(result.dtype, torch.int32)
        self.assertEqual(result.shape, weight.shape)

    def test_pack_to_int32_assertion_dim(self):
        """A 2-D weight is expected to be rejected."""
        # Inputs in the tests below are built outside assertRaises so that
        # only the call under test can satisfy the expectation.
        weight = torch.zeros((8, 8), dtype=torch.int8)
        with self.assertRaises(AssertionError):
            pack_to_int32(weight)

    def test_pack_to_int32_assertion_dtype(self):
        """A float32 weight is expected to be rejected."""
        weight = torch.zeros((2, 8, 8), dtype=torch.float32)
        with self.assertRaises(AssertionError):
            pack_to_int32(weight)

    def test_pack_to_int32_assertion_divisible(self):
        """A last dimension of 7 is expected to be rejected for int32 and int8."""
        int32_weight = torch.zeros((2, 8, 7), dtype=torch.int32)
        with self.assertRaises(AssertionError):
            pack_to_int32(int32_weight)

        int8_weight = torch.zeros((2, 8, 7), dtype=torch.int8)
        with self.assertRaises(AssertionError):
            pack_to_int32(int8_weight)
90 changes: 2 additions & 88 deletions tests/ut/quantization/test_w4a16.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,93 +3,7 @@
import torch

from tests.ut.base import TestBase
from vllm_ascend.quantization.methods.w4a16 import (AscendW4A16FusedMoEMethod,
pack_to_int32,
unpack_from_int32)


class TestUnpackFromInt32(TestBase):

def test_unpack_from_int32_packed_dim_1(self):
weight = torch.tensor([[305419896, -1420531520]], dtype=torch.int32)
shape = torch.Size([1, 8])
num_bits = 4

result = unpack_from_int32(weight, shape, num_bits, packed_dim=1)

self.assertEqual(result.dtype, torch.int8)
self.assertEqual(result.shape, shape)

def test_unpack_from_int32_packed_dim_0(self):
weight = torch.tensor([[305419896], [-1420531520]], dtype=torch.int32)
shape = torch.Size([8, 1])
num_bits = 4

result = unpack_from_int32(weight, shape, num_bits, packed_dim=0)

self.assertEqual(result.dtype, torch.int8)
self.assertEqual(result.shape, shape)

def test_unpack_from_int32_assertions(self):
with self.assertRaises(AssertionError):
weight = torch.tensor([[1, 2]], dtype=torch.int64)
unpack_from_int32(weight, torch.Size([8, 1]), 4)

with self.assertRaises(AssertionError):
weight = torch.tensor([[1, 2]], dtype=torch.int32)
unpack_from_int32(weight, torch.Size([8, 1]), 16)


class TestPackToInt32(TestBase):

@patch(
"vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
)
def test_pack_to_int32_int8(self, mock_npu_convert_weight_to_int4pack):
mock_npu_convert_weight_to_int4pack.return_value = torch.zeros(
(2, 4), dtype=torch.int32)

weight = torch.zeros((2, 8, 16), dtype=torch.int8)
result = pack_to_int32(weight)

self.assertEqual(result.dtype, torch.int32)
mock_npu_convert_weight_to_int4pack.assert_not_called()

self.assertEqual(result.shape, torch.Size([2, 8, 4]))

@patch(
"vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
)
def test_pack_to_int32_int32(self, mock_npu_convert_weight_to_int4pack):

def mock_convert_weight(weight):
return weight

mock_npu_convert_weight_to_int4pack.side_effect = mock_convert_weight
weight = torch.zeros((2, 8, 8), dtype=torch.int32)
result = pack_to_int32(weight)

self.assertEqual(result.dtype, torch.int32)
self.assertEqual(result.shape, weight.shape)

def test_pack_to_int32_assertion_dim(self):
with self.assertRaises(AssertionError):
weight = torch.zeros((8, 8), dtype=torch.int8)
pack_to_int32(weight)

def test_pack_to_int32_assertion_dtype(self):
with self.assertRaises(AssertionError):
weight = torch.zeros((2, 8, 8), dtype=torch.float32)
pack_to_int32(weight)

def test_pack_to_int32_assertion_divisible(self):
with self.assertRaises(AssertionError):
weight = torch.zeros((2, 8, 7), dtype=torch.int32)
pack_to_int32(weight)

with self.assertRaises(AssertionError):
weight = torch.zeros((2, 8, 7), dtype=torch.int8)
pack_to_int32(weight)
from vllm_ascend.quantization.methods.w4a16 import AscendW4A16FusedMoEMethod


class TestAscendW4A16FusedMoEMethod(TestBase):
Expand Down Expand Up @@ -219,7 +133,7 @@ def build_layer(self):
return layer

@patch(
"vllm_ascend.quantization.methods.w4a16.torch_npu.npu_convert_weight_to_int4pack"
"vllm_ascend.quantization.utils.torch_npu.npu_convert_weight_to_int4pack"
)
def test_process_weights_after_loading_with_transpose(
self, mock_npu_convert_weight_to_int4pack):
Expand Down
6 changes: 5 additions & 1 deletion vllm_ascend/ops/fused_moe/fused_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,11 @@ def __init__(self, *args, **kwargs):
"weight_loader": self.weight_loader,
}
# need full intermediate size pre-sharding for WNA16 act order
if self.quant_method.__class__.__name__ in ("GPTQMarlinMoEMethod", "CompressedTensorsWNA16MoEMethod"):
if self.quant_method.__class__.__name__ in (
"GPTQMarlinMoEMethod",
"CompressedTensorsWNA16MarlinMoEMethod",
"CompressedTensorsWNA16MoEMethod",
):
moe_quant_params["intermediate_size_full"] = intermediate_size
self.quant_method.create_weights(layer=self, **moe_quant_params)

Expand Down
6 changes: 4 additions & 2 deletions vllm_ascend/ops/layernorm.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,10 @@ def __init__(
vllm_config = get_current_vllm_config()
self.bias = None
# quantization with anti_method m4 will generate none-zero norm bias
if vllm_config.quant_config is not None and any(
"norm.bias" in name for name in vllm_config.quant_config.quant_description
if (
vllm_config.quant_config is not None
and hasattr(vllm_config.quant_config, "quant_description")
and any("norm.bias" in name for name in vllm_config.quant_config.quant_description)
):
self.bias = torch.nn.Parameter(torch.zeros(hidden_size), requires_grad=False)

Expand Down
1 change: 1 addition & 0 deletions vllm_ascend/patch/worker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@
import vllm_ascend.patch.worker.patch_routed_experts_capturer # noqa
import vllm_ascend.patch.worker.patch_npugraph_ex_triton # noqa
import vllm_ascend.patch.worker.patch_kimi_k25 # noqa
import vllm_ascend.patch.worker.patch_quantization # noqa
Loading
Loading