Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/_e2e_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ jobs:
pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
pytest -sv tests/e2e/multicard/test_prefix_caching.py
pytest -sv tests/e2e/multicard/test_qwen3_moe.py
pytest -sv tests/e2e/multicard/test_quantization.py

e2e-4-cards:
name: multicard-4
Expand Down
1 change: 1 addition & 0 deletions docs/source/user_guide/feature_guide/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ This section provides a detailed usage guide of vLLM Ascend features.
:maxdepth: 1
graph_mode
quantization
quantization-llm-compressor
sleep_mode
structured_output
lora
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# llm-compressor Quantization Guide
Comment thread
LHXuuu marked this conversation as resolved.
Outdated

Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of its weights and activation values, thereby saving memory and improving inference speed.

## Install llm-compressor

To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM.

Install llm-compressor:

```bash
pip install llmcompressor
```

## Generate the W8A8 weights
Comment thread
LHXuuu marked this conversation as resolved.

```bash
cd examples/quantization/llm-compressor

python3 w8a8_int8_dynamic.py
```

For more details, see the [Official Sample](https://github.com/vllm-project/llm-compressor/tree/main/examples).

## Run the model

Now, you can run the quantized model with vLLM Ascend. Examples for online and offline inference are provided as follows:

### Offline inference

```python
import torch

from vllm import LLM, SamplingParams

prompts = [
"Hello, my name is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40)

llm = LLM(model="{quantized_model_save_path}",
max_model_len=2048,
trust_remote_code=True)

outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

### Online inference

Start the quantized model using vLLM Ascend; no modifications to the startup command are required.
164 changes: 164 additions & 0 deletions examples/quantization/llm-compressor/w8a8_int8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
import os
import torch

from datasets import load_dataset
from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \
AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy

def _int8_symmetric_static_args(strategy):
    """Build 8-bit INT, symmetric, non-dynamic quantization args for *strategy*."""
    return QuantizationArgs(
        num_bits=8,
        type=QuantizationType.INT,
        strategy=strategy,
        symmetric=True,
        dynamic=False,
    )


# W8A8 scheme for every "Linear" target: weights use the CHANNEL strategy and
# input activations use the TENSOR strategy; both are symmetric and static.
W8A8_W_cha_A_ten_static_symmetric = {
    "group_0": QuantizationScheme(
        targets=["Linear"],
        weights=_int8_symmetric_static_args(QuantizationStrategy.CHANNEL),
        input_activations=_int8_symmetric_static_args(
            QuantizationStrategy.TENSOR),
    ),
}

# Quantization algorithms selectable through the "modifier" setting.
MODIFIER_DICT = dict(
    PTQ=QuantizationModifier,
    AWQ=AWQModifier,
    GPTQ=GPTQModifier,
)

# Named quantization schemes selectable through the "schemes" setting.
SCHEMES_DICT = dict(
    W8A8_W_cha_A_ten_static_symmetric=W8A8_W_cha_A_ten_static_symmetric,
)

# Model loader class, keyed by the `model_type` reported by the model config.
MODEL_DICT = dict(qwen3=AutoModelForCausalLM)

# Tokenizer loader class, keyed by the `model_type` reported by the model config.
TOKENIZER_DICT = dict(qwen3=AutoTokenizer)


def load_environment_variables():
    """Collect the script settings, allowing environment-variable overrides.

    Every setting is looked up under the upper-cased name of its key
    (``MODEL_PATH``, ``EXPORT_PATH``, ``MODIFIER``, ``SCHEMES``,
    ``CALIB_PROMPT_PATH``). When a variable is not set, the built-in default
    below is used, so calling this with a clean environment yields the same
    values as before.

    If ``EXPORT_PATH`` is set to an empty string, the export path is derived
    as ``<model_path>-<modifier>[-<schemes>]``.

    Returns:
        dict: the resolved settings, keyed by ``model_path``, ``export_path``,
        ``modifier``, ``schemes`` and ``calib_prompt_path``.

    Side effects:
        Creates the export directory if it does not exist yet.
    """
    defaults = {
        'model_path': "Qwen/Qwen3-32B",
        'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric",
        'modifier': "GPTQ",
        'schemes': "W8A8_W_cha_A_ten_static_symmetric",
        'calib_prompt_path': "HuggingFaceH4/ultrachat_200k"
    }
    env_vars = {
        key: os.environ.get(key.upper(), default)
        for key, default in defaults.items()
    }

    # No usable export path: derive one from the model, modifier and scheme.
    if not env_vars['export_path']:
        derived = env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier']
        if env_vars['schemes']:
            derived += "-" + env_vars['schemes']
        env_vars['export_path'] = derived
    os.makedirs(env_vars['export_path'], exist_ok=True)

    return env_vars


def load_calibration_text_dataset(calib_prompt_path, tokenizer):
    """Load calibration data from a local directory and tokenize it.

    Args:
        calib_prompt_path: Directory that is listed with ``os.listdir`` and
            is expected to contain ``.jsonl`` or ``.parquet`` files. When
            both kinds are present, ``.jsonl`` takes precedence.
        tokenizer: Callable tokenizer exposing ``chat_template`` and
            ``apply_chat_template``.

    Returns:
        The dataset after mapping ``preprocess`` and ``tokenize`` over it,
        with the pre-tokenization columns removed.

    Raises:
        ValueError: If the directory holds neither ``.jsonl`` nor
            ``.parquet`` files.
    """
    # List the directory once and reuse the result for both format checks.
    file_names = [name.lower() for name in os.listdir(calib_prompt_path)]

    # Load dataset
    if any(name.endswith('.jsonl') for name in file_names):
        ds = load_dataset('json', data_dir=calib_prompt_path, split='validation')
    elif any(name.endswith('.parquet') for name in file_names):
        ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]")
    else:
        raise ValueError(
            "Unsupported calibration file format: no .jsonl or .parquet "
            "files found in {}".format(calib_prompt_path))

    # Preprocess dataset
    def preprocess(example):
        if tokenizer.chat_template is not None:
            return {"text": tokenizer.apply_chat_template(
                example["messages"], tokenize=False)}
        # NOTE(review): without a chat template the raw "messages" value is
        # passed through unchanged as "text" - confirm the tokenizer accepts
        # that structure.
        return {"text": example["messages"]}

    # Tokenize inputs
    def tokenize(sample):
        return tokenizer(
            sample["text"],
            add_special_tokens=False,
        )

    ds = ds.map(preprocess)
    ds = ds.map(tokenize, remove_columns=ds.column_names)
    return ds


def data_collator(batch):
    """Turn the single sample in ``batch`` into a dict of tensors.

    The batch must hold exactly one sample. The ``pixel_values`` entry is
    converted to a bfloat16 tensor; every other entry becomes a long tensor.
    """
    assert len(batch) == 1
    sample = batch[0]
    tensors = {}
    for field, raw in sample.items():
        target_dtype = torch.bfloat16 if field == "pixel_values" else torch.long
        tensors[field] = torch.tensor(raw, dtype=target_dtype)
    return tensors


def quantize_model(model, env_vars, dataset_dict=None):
    """Run a one-shot llmcompressor pass over ``model``.

    Args:
        model: The model object handed to ``oneshot``.
        env_vars: Settings dict; ``env_vars['modifier']`` selects the entry
            of ``MODIFIER_DICT`` and ``env_vars['schemes']`` the entry of
            ``SCHEMES_DICT``.
        dataset_dict: Calibration dataset forwarded to ``oneshot``.

    Returns:
        None. The result of ``oneshot`` is not returned.
    """
    modifier_cls = MODIFIER_DICT[env_vars['modifier']]
    scheme_groups = SCHEMES_DICT[env_vars['schemes']]

    # Modules excluded from quantization: "lm_head" plus everything matching
    # the "re:.*mlp.down_proj" pattern.
    excluded = ["lm_head", "re:.*mlp.down_proj"]

    # The llmcompressor recipe holds a single modifier.
    modifier = modifier_cls(config_groups=scheme_groups, ignore=excluded)

    oneshot(
        model=model,
        dataset=dataset_dict,
        recipe=[modifier],
        trust_remote_code_model=True,
    )


def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
    """Write the model, then the tokenizer, to ``save_path``.

    ``save_compressed`` is forwarded only to the model's ``save_pretrained``.
    """
    writers = (
        (model, {"save_compressed": save_compressed}),
        (tokenizer, {}),
    )
    for artifact, options in writers:
        artifact.save_pretrained(save_path, **options)


if __name__ == '__main__':
    # Resolve the script settings.
    env_vars = load_environment_variables()

    # The config's model_type selects the model and tokenizer loader classes.
    config = AutoConfig.from_pretrained(env_vars['model_path'], trust_remote_code=True)
    model_type = config.model_type

    # Fail with a readable message instead of a bare KeyError when the
    # model type has no registered loader.
    if model_type not in MODEL_DICT or model_type not in TOKENIZER_DICT:
        raise ValueError(
            "Unsupported model type {!r}; supported model types: {}".format(
                model_type, sorted(set(MODEL_DICT) & set(TOKENIZER_DICT))))

    model = MODEL_DICT[model_type].from_pretrained(
        env_vars['model_path'], torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = TOKENIZER_DICT[model_type].from_pretrained(
        env_vars['model_path'], trust_remote_code=True)

    # Load the calibration dataset, falling back to a local default directory.
    if env_vars["calib_prompt_path"] is None:
        env_vars["calib_prompt_path"] = "dataset/ultrachat_200k"

    ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer)

    # Quantize the model
    quantize_model(model, env_vars, ds)

    # Save the quantized model in compressed form.
    save_quantized_model(model, tokenizer, env_vars['export_path'], True)
83 changes: 83 additions & 0 deletions examples/quantization/llm-compressor/w8a8_int8_dynamic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.utils import dispatch_for_generation

# Model to quantize and calibration dataset to use.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Number of calibration samples and maximum tokenized length.
# 512 samples is a good place to start; more samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load the model and its tokenizer.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Take the first NUM_CALIBRATION_SAMPLES rows of the split, then shuffle them
# with a fixed seed.
ds = load_dataset(
    DATASET_ID,
    split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]",
).shuffle(seed=42)


def preprocess(example):
    """Render ``example["messages"]`` through the chat template as plain text."""
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


ds = ds.map(preprocess)


def tokenize(sample):
    """Tokenize ``sample["text"]``, truncated to MAX_SEQUENCE_LENGTH.

    No padding is applied and no special tokens are added.
    """
    options = {
        "padding": False,
        "max_length": MAX_SEQUENCE_LENGTH,
        "truncation": True,
        "add_special_tokens": False,
    }
    return tokenizer(sample["text"], **options)


# Replace the existing columns with the tokenizer output.
ds = ds.map(tokenize, remove_columns=ds.column_names)

# Quantization recipe, applied in order:
#   1. SmoothQuant, to make the activations easier to quantize.
#   2. GPTQ with the W8A8 scheme on Linear layers (lm_head excluded):
#      int8 weights (static per channel), int8 activations (dynamic per token).
smooth_quant = SmoothQuantModifier(smoothing_strength=0.8)
gptq = GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"])
recipe = [smooth_quant, gptq]

# Run the one-shot calibration and quantization pass.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

# Generate a short sample to confirm the quantized model's output looks sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
encoded_prompt = tokenizer("Hello my name is", return_tensors="pt")
input_ids = encoded_prompt.input_ids.to("npu")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save the compressed model and the tokenizer under a directory named after
# the last path component of MODEL_ID.
model_name = MODEL_ID.rstrip("/").split("/")[-1]
SAVE_DIR = model_name + "-W8A8-Dynamic-Per-Token"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
11 changes: 10 additions & 1 deletion mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,15 @@ ignore_missing_imports = True
[mypy-lm_eval.*]
ignore_missing_imports = True

[mypy-compressed_tensors.*]
ignore_missing_imports = True

[mypy-datasets.*]
ignore_missing_imports = True

[mypy-llmcompressor.*]
ignore_missing_imports = True

[mypy-msprobe.*]
ignore_missing_imports = True
allow_untyped_imports = True
allow_untyped_imports = True
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ requires = [
"quart",
"numba",
"opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm
"compressed_tensors>=0.11.0"
Comment thread
LHXuuu marked this conversation as resolved.
]
build-backend = "setuptools.build_meta"

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ torchvision
wheel
pandas-stubs
opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm
compressed_tensors>=0.11.0
Comment thread
LHXuuu marked this conversation as resolved.

# requirements for disaggregated prefill
msgpack
Expand Down
40 changes: 40 additions & 0 deletions tests/e2e/multicard/test_quantization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
#
"""Smoke-test greedy generation with a W8A8-quantized model on multiple cards.

Run `pytest tests/e2e/multicard/test_quantization.py`.
"""
from modelscope import snapshot_download # type: ignore

from tests.e2e.conftest import VllmRunner


def test_models_distributed_quantized_W8A8():
    """Run greedy generation on a W8A8-quantized checkpoint with tensor parallelism.

    This is a smoke test: it passes when the model loads and generates
    without raising. The generated text is not checked.
    """
    example_prompts = [
        "Hello, my name is",
    ]
    max_tokens = 5
    # NOTE(review): tensor_parallel_size=4 needs four devices - confirm the CI
    # job that runs this file provides them.
    # The runner is used as a context manager, so no explicit `del` of the
    # bound name is needed afterwards.
    with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
                    tensor_parallel_size=4,
                    max_model_len=4096,
                    gpu_memory_utilization=0.8,
                    distributed_executor_backend="mp",
                    enforce_eager=True) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
Loading
Loading