1 change: 1 addition & 0 deletions .github/workflows/_e2e_test.yaml
@@ -240,6 +240,7 @@ jobs:
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_fc1_tp2
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_dense_prefetch_mlp_weight_tp2
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_deepseek3_2_w8a8_pruning_mtp_tp2_ep
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_inference_distributed.py::test_qwen3_w4a4_distributed_tp2

pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_offline_weight_load.py
pytest -sv --durations=0 tests/e2e/multicard/2-cards/test_pipeline_parallel.py
19 changes: 19 additions & 0 deletions tests/e2e/multicard/2-cards/test_offline_inference_distributed.py
@@ -41,6 +41,10 @@
"vllm-ascend/Qwen3-1.7B-W4A8-V1",
]

QWEN_W4A4_MODELS = [
"Eco-Tech/Qwen3-32B-w4a4-LAOS",
]

DEEPSEEK_W4A8_MODELS = [
"vllm-ascend/DeepSeek-V3.1-W4A8-puring",
]
@@ -261,3 +265,18 @@ def test_deepseek3_2_w8a8_pruning_mtp_tp2_ep():
            reasoning_parser="deepseek_v3",
            tokenizer_mode="deepseek_v32") as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)


@pytest.mark.parametrize("model", QWEN_W4A4_MODELS)
def test_qwen3_w4a4_distributed_tp2(model):
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    with VllmRunner(
            snapshot_download(model),
            tensor_parallel_size=2,
            cudagraph_capture_sizes=[1, 2, 4, 8],
            quantization="ascend",
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
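
For readers reproducing this outside the test harness, the test boils down to a plain offline-inference call. A minimal sketch (assuming a 2-NPU machine with vllm-ascend installed; the sampling settings mirror the greedy, 5-token test above):

from vllm import LLM, SamplingParams

llm = LLM(
    model="Eco-Tech/Qwen3-32B-w4a4-LAOS",
    tensor_parallel_size=2,
    quantization="ascend",  # route linear layers through the Ascend quant methods
)
params = SamplingParams(temperature=0.0, max_tokens=5)  # greedy decoding
print(llm.generate(["Hello, my name is"], params)[0].outputs[0].text)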
4 changes: 4 additions & 0 deletions vllm_ascend/quantization/utils.py
@@ -6,6 +6,7 @@
from vllm_ascend.utils import COMPRESSED_TENSORS_METHOD

from .w4a4_flatquant_dynamic import AscendW4A4FlatQuantDynamicLinearMethod
from .w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod
from .w4a8_dynamic import (AscendW4A8DynamicFusedMoEMethod,
                           AscendW4A8DynamicLinearMethod)
from .w4a16 import AscendW4A16FusedMoEMethod
@@ -25,6 +26,9 @@
"linear": AscendW4A8DynamicLinearMethod,
"moe": AscendW4A8DynamicFusedMoEMethod,
},
"W4A4_DYNAMIC": {
"linear": AscendW4A4LaosDynamicLinearMethod,
},
"W4A4_FLATQUANT_DYNAMIC": {
"linear": AscendW4A4FlatQuantDynamicLinearMethod,
},
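
This mapping ties a checkpoint's declared quant type to a concrete method class. A minimal sketch of the dispatch it enables (the mapping's real name is defined earlier in utils.py and is not shown in this hunk, so SUPPORTED_METHODS below is a stand-in):

SUPPORTED_METHODS = {
    "W4A8_DYNAMIC": {
        "linear": AscendW4A8DynamicLinearMethod,
        "moe": AscendW4A8DynamicFusedMoEMethod,
    },
    "W4A4_DYNAMIC": {
        "linear": AscendW4A4LaosDynamicLinearMethod,
    },
}

def get_linear_method(quant_type: str):
    # A checkpoint described as "W4A4_DYNAMIC" now resolves to the LAOS
    # linear method instead of being rejected as unsupported.
    return SUPPORTED_METHODS[quant_type]["linear"]()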
110 changes: 110 additions & 0 deletions vllm_ascend/quantization/w4a4_laos_dynamic.py
@@ -0,0 +1,110 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Any, Dict, Optional

import torch
import torch_npu


class AscendW4A4LaosDynamicLinearMethod:
    """Linear method for Ascend W4A4_DYNAMIC (LAOS).

    Implements W4A4 quantization with the LAOS approach and dynamic
    activation quantization:
    - Weight: 4-bit per-channel quantization with scale and offset, stored as int8.
    - Activation: 4-bit dynamic per-token quantization.
    """

    def __init__(self):
        self.transpose_weight = True
        self.rotation_type = None

    def set_rotation_config(self, prefix, metadata):
        # prefix is the module path, e.g. "model.layers.<idx>.self_attn.o_proj";
        # its third component is the layer index.
        layer_idx = prefix.split(".")[2]
        if prefix.endswith("o_proj"):
            layers = metadata["quarot"]["heads_rotation"]["layers"]
            if layer_idx in layers:
                return "heads_rotation"
        if prefix.endswith("down_proj"):
            layers = metadata["quarot"]["kronecker_rotation"]["layers"]
            if layer_idx in layers:
                return "kronecker_rotation"
        return None

    @staticmethod
    def get_weight(input_size: int, output_size: int,
                   params_dtype: torch.dtype) -> Dict[str, Any]:
        # The 4-bit weights are carried in an int8 tensor until they are
        # packed into the int4 layout in process_weights_after_loading.
        return {"weight": torch.empty(output_size, input_size, dtype=torch.int8)}

    @staticmethod
    def get_pertensor_param(params_dtype: torch.dtype) -> Dict[str, Any]:
        return {}

    def get_perchannel_param(self, output_size: int,
                             params_dtype: torch.dtype) -> Dict[str, Any]:
        params_dict = {
            "weight_scale": torch.empty(output_size, 1, dtype=torch.float32),
            "weight_offset": torch.empty(output_size, 1, dtype=torch.float32),
        }
        # Rotation matrices are fixed-size for the supported model geometry:
        # the 64x64 matrix mixes attention heads for o_proj, and the two
        # 160x160 factors form a Kronecker rotation over down_proj's input.
        if self.rotation_type == "heads_rotation":
            params_dict["heads_rotation"] = torch.zeros((64, 64), dtype=torch.float32)
        if self.rotation_type == "kronecker_rotation":
            params_dict["kronecker_rotation_n"] = torch.zeros((160, 160), dtype=torch.float32)
            params_dict["kronecker_rotation_m"] = torch.zeros((160, 160), dtype=torch.float32)
        return params_dict

    def get_pergroup_param(self,
                           input_size: int,
                           output_size: int,
                           params_dtype: torch.dtype,
                           layer_type: Optional[str] = None) -> Dict[str, Any]:
        return {}

    def apply_rotation(self, layer, x):
        init_shape = x.shape
        dtype = x.dtype
        if self.rotation_type == "heads_rotation":
            # Rotate across attention heads: reshape to (tokens, heads, 128)
            # and left-multiply each token's head block by Q1.T.
            Q1 = layer.heads_rotation
            scaled_x = x.reshape(-1, Q1.shape[1], 128)
            scaled_x = torch.matmul(Q1.T, scaled_x).reshape(init_shape)
            return scaled_x.to(dtype)
        if self.rotation_type == "kronecker_rotation":
            # Kronecker-factored rotation: reshape each token to an (m, n)
            # block and compute Q1.T @ X @ Q2 with two small matmuls instead
            # of one (m*n, m*n) matmul.
            Q1 = layer.kronecker_rotation_m
            Q2 = layer.kronecker_rotation_n
            scaled_x = x.reshape(-1, Q1.shape[0], Q2.shape[0])
            scaled_x = torch.matmul(scaled_x, Q2)
            scaled_x = torch.matmul(Q1.T, scaled_x)
            scaled_x = scaled_x.reshape(init_shape)
            return scaled_x.to(dtype)
        return x

    def apply(
        self,
        layer: torch.nn.Module,
        x: torch.Tensor,
        bias: Optional[torch.Tensor] = None,
        tp_rank: Optional[int] = None,
    ) -> torch.Tensor:
        dtype = x.dtype
        # Dynamic per-token activation quantization to packed int4 (quint4x2).
        x, pertoken_scale = torch_npu.npu_dynamic_quant(x, dst_type=torch.quint4x2)
        pertoken_scale = pertoken_scale.reshape(-1)
        # int4 x int4 matmul, dequantized on the fly with the per-channel
        # weight scale and the per-token activation scale.
        return torch_npu.npu_quant_matmul(x,
                                          layer.weight.data,
                                          scale=layer.weight_scale.data.view(-1),
                                          pertoken_scale=pertoken_scale,
                                          bias=None,
                                          output_dtype=dtype)

    def process_weights_after_loading(self, layer: torch.nn.Module):
        layer.weight_scale.data = layer.weight_scale.data.to(torch.float32)
        # Pack the int8-carried 4-bit weights into the NPU int4 layout.
        layer.weight.data = torch_npu.npu_convert_weight_to_int4pack(
            layer.weight.data.to(torch.int32))
        if self.transpose_weight:
            layer.weight.data = layer.weight.data.transpose(-1, -2)
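
Taken together, the new method follows the usual create -> load -> pack -> apply lifecycle. A condensed sketch of that flow (shapes and the host layer are illustrative; an NPU with torch_npu is required for the packing and matmul steps):

import torch
from vllm_ascend.quantization.w4a4_laos_dynamic import AscendW4A4LaosDynamicLinearMethod

method = AscendW4A4LaosDynamicLinearMethod()

# 1. Create parameters: an int8 carrier for the 4-bit weights plus the
#    per-channel scale/offset (1024 -> 2048 is an illustrative shape).
layer = torch.nn.Module()
params = {**method.get_weight(1024, 2048, torch.bfloat16),
          **method.get_perchannel_param(2048, torch.bfloat16)}
for name, tensor in params.items():
    layer.register_parameter(
        name, torch.nn.Parameter(tensor.npu(), requires_grad=False))

# 2. After checkpoint loading: pack int8 -> int4 and transpose for the matmul.
method.process_weights_after_loading(layer)

# 3. Forward pass: per-token dynamic int4 activation quant + int4 matmul.
x = torch.randn(8, 1024, dtype=torch.bfloat16).npu()
y = method.apply(layer, x)  # -> shape (8, 2048), dtype bfloat16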