vllm-project · wangxiyuan · Nov 28, 2025 · Nov 13, 2025 · Nov 20, 2025 · Nov 21, 2025
@@ -207,6 +207,7 @@ jobs:
           pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
           pytest -sv tests/e2e/multicard/test_prefix_caching.py
           pytest -sv tests/e2e/multicard/test_qwen3_moe.py
+          pytest -sv tests/e2e/multicard/test_quantization.py
 
   e2e-4-cards:
     name: multicard-4

@@ -7,6 +7,7 @@ This section provides a detailed usage guide of vLLM Ascend features.
 :maxdepth: 1
 graph_mode
 quantization
+quantization-llm-compressor
 sleep_mode
 structured_output
 lora

@@ -0,0 +1,55 @@
+# llm-compressor Quantization Guide
+
+Model quantization is a technique that reduces the size and computational requirements of a model by lowering the data precision of its weights and activation values, thereby saving memory and improving inference speed.
+
+## Install llm-compressor
+
+To quantize a model, you should install [llm-compressor](https://github.com/vllm-project/llm-compressor/blob/main/README.md). It is a unified library for creating compressed models for faster inference with vLLM.
+
+Install llm-compressor:
+
+```bash
+pip install llmcompressor
+```
+
+### Generate the W8A8 weights
+
+```bash
+cd examples/quantization/llm-compressor
+
+python3 w8a8_int8_dynamic.py
+```
+
+For more details, see the [official examples](https://github.com/vllm-project/llm-compressor/tree/main/examples).
+
+## Run the model
+
+Now, you can run the quantized model with vLLM Ascend. Examples for online and offline inference are provided as follows:
+
+### Offline inference
+
+```python
+import torch
+
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40)
+
+llm = LLM(model="{quantized_model_save_path}",
+          max_model_len=2048,
+          trust_remote_code=True)
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+
+### Online inference
+
+Start the quantized model using vLLM Ascend; no modifications to the startup command are required.
@@ -0,0 +1,164 @@
+import os
+import torch
+
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration, \
+    AutoTokenizer, AutoProcessor, AutoConfig, AutoImageProcessor
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
+from llmcompressor.modifiers.quantization import GPTQModifier, QuantizationModifier
+from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme, QuantizationType, QuantizationStrategy
+
+W8A8_W_cha_A_ten_static_symmetric = {  # 8-bit INT weights (per channel) + 8-bit INT activations (per tensor)
+    "group_0": QuantizationScheme(
+        targets=["Linear"],
+        weights=QuantizationArgs(
+            num_bits=8,
+            type=QuantizationType.INT,
+            strategy=QuantizationStrategy.CHANNEL,
+            symmetric=True,
+            dynamic=False  # static (non-dynamic) weight quantization
+        ),
+        input_activations=QuantizationArgs(
+            num_bits=8,
+            type=QuantizationType.INT,
+            strategy=QuantizationStrategy.TENSOR,
+            symmetric=True,
+            dynamic=False  # static (non-dynamic) activation quantization
+        ),
+    ),
+}
+
+# Supported modifiers: maps the 'modifier' setting to the llmcompressor modifier class.
+MODIFIER_DICT = {
+    "PTQ": QuantizationModifier,
+    "AWQ": AWQModifier,
+    "GPTQ": GPTQModifier,
+}
+
+# Supported schemes: maps the 'schemes' setting to the config_groups dict passed to the modifier.
+SCHEMES_DICT = {
+    "W8A8_W_cha_A_ten_static_symmetric": W8A8_W_cha_A_ten_static_symmetric,
+}
+
+MODEL_DICT = {  # model loader class, keyed by the config's model_type
+    "qwen3": AutoModelForCausalLM,
+}
+
+TOKENIZER_DICT = {  # tokenizer loader class, keyed by the config's model_type
+    "qwen3": AutoTokenizer,
+}
+
+
+def load_environment_variables():
+    env_vars = {
+        'model_path': "Qwen/Qwen3-32B",
+        'export_path': "/llm-compressor/export/GPTQ/W8A8_W_cha_A_ten_static_symmetric",
+        'modifier': "GPTQ",
+        'schemes': "W8A8_W_cha_A_ten_static_symmetric",
+        'calib_prompt_path': "HuggingFaceH4/ultrachat_200k"
+    }
+
+    # verify export model path
+    if env_vars['export_path'] is None:
+        env_vars['export_path'] = env_vars['model_path'].rstrip("/") + "-" + env_vars['modifier']
+        if env_vars['schemes'] is not None:
+            env_vars['export_path'] += "-" + env_vars['schemes']
+    os.makedirs(env_vars['export_path'], exist_ok=True)
+
+    return env_vars
+
+
+def load_calibration_text_dataset(calib_prompt_path, tokenizer):
+    # Load dataset
+    for f in os.listdir(calib_prompt_path):
+        print(f)
+    if any(f.lower().endswith('.jsonl') for f in os.listdir(calib_prompt_path)):
+        ds = load_dataset('json', data_dir=calib_prompt_path, split='validation')
+    elif any(f.lower().endswith('.parquet') for f in os.listdir(calib_prompt_path)):
+        ds = load_dataset("parquet", data_dir=calib_prompt_path, split="train[:512]")
+    else:
+        raise ValueError("Unsupported calibration file format: {}".format(
+            calib_prompt_path.split('.')[-1]))
+
+    # Preprocess dataset
+    def preprocess(example):
+        if tokenizer.chat_template is not None:
+            return {"text": tokenizer.apply_chat_template(
+                example["messages"], tokenize=False)}
+        else:
+            return {"text": example["messages"]}
+
+    # Tokenize inputs
+    def tokenize(sample):
+        return tokenizer(
+            sample["text"],
+            add_special_tokens=False,
+        )
+
+    ds = ds.map(preprocess)
+    ds = ds.map(tokenize, remove_columns=ds.column_names)
+    return ds
+
+
+# Define a oneshot data collator for multimodal inputs.
+def data_collator(batch):
+    assert len(batch) == 1
+    return {
+        key: torch.tensor(value, dtype=torch.bfloat16 if key == "pixel_values" else torch.long)
+        for key, value in batch[0].items()
+    }
+
+
+def quantize_model(model, env_vars, dataset_dict=None):
+    # since the MoE gate layers are sensitive to quantization, we add them to the ignore
+    # list so they remain at full precision
+    ignore = ["lm_head", "re:.*mlp.down_proj"]
+
+    # define a llmcompressor recipe
+    recipe = [
+        MODIFIER_DICT[env_vars['modifier']](
+            config_groups=SCHEMES_DICT[env_vars['schemes']],
+            ignore=ignore,
+        ),
+    ]
+
+    # quantize the model
+    oneshot(
+        model=model,
+        dataset=dataset_dict,
+        recipe=recipe,
+        trust_remote_code_model=True,
+    )
+
+
+def save_quantized_model(model, tokenizer, save_path, save_compressed=False):
+    model.save_pretrained(save_path, save_compressed=save_compressed)
+    tokenizer.save_pretrained(save_path)
+
+
+if __name__ == '__main__':
+    # Build the run settings (model path, export path, modifier, scheme, calibration data).
+    env_vars = load_environment_variables()
+
+    # Read the model type from the config; it selects the loader classes in MODEL_DICT / TOKENIZER_DICT.
+    config = AutoConfig.from_pretrained(env_vars['model_path'], trust_remote_code=True)
+    model_type = config.model_type
+
+    model = MODEL_DICT[model_type].from_pretrained(
+        env_vars['model_path'], torch_dtype="auto", trust_remote_code=True
+    )
+    tokenizer = TOKENIZER_DICT[model_type].from_pretrained(env_vars['model_path'], trust_remote_code=True)
+
+    # Load the calibration dataset; use "dataset/ultrachat_200k" when no path is configured.
+    if env_vars["calib_prompt_path"] is None:
+        env_vars["calib_prompt_path"] = "dataset/ultrachat_200k"
+
+    ds = load_calibration_text_dataset(env_vars["calib_prompt_path"], tokenizer)
+
+    # Quantize the model with the tokenized calibration dataset.
+    quantize_model(model, env_vars, ds)
+
+    # Save the quantized model and its tokenizer to the export path (save_compressed=True).
+    save_quantized_model(model, tokenizer, env_vars['export_path'], True)
@@ -0,0 +1,83 @@
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import GPTQModifier
+from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
+from llmcompressor.utils import dispatch_for_generation
+
+# Select the model, then load it together with its tokenizer.
+MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
+model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+
+# Select the calibration dataset and the split to sample from.
+DATASET_ID = "HuggingFaceH4/ultrachat_200k"
+DATASET_SPLIT = "train_sft"
+
+# Select number of samples. 512 samples is a good place to start.
+# Increasing the number of samples can improve accuracy.
+NUM_CALIBRATION_SAMPLES = 512
+MAX_SEQUENCE_LENGTH = 2048  # used as the tokenizer truncation limit and as oneshot's max_seq_length
+
+# Load the first NUM_CALIBRATION_SAMPLES rows of the split, then shuffle them with a fixed seed.
+ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
+ds = ds.shuffle(seed=42)
+
+
+def preprocess(example):
+    return {
+        "text": tokenizer.apply_chat_template(
+            example["messages"],
+            tokenize=False,
+        )
+    }
+
+
+ds = ds.map(preprocess)
+
+
+# Tokenize inputs.
+def tokenize(sample):
+    return tokenizer(
+        sample["text"],
+        padding=False,
+        max_length=MAX_SEQUENCE_LENGTH,
+        truncation=True,
+        add_special_tokens=False,
+    )
+
+
+ds = ds.map(tokenize, remove_columns=ds.column_names)
+
+# Configure algorithms. In this case, we:
+#   * apply SmoothQuant to make the activations easier to quantize
+#   * quantize the weights to int8 with GPTQ (static per channel)
+#   * quantize the activations to int8 (dynamic per token)
+recipe = [
+    SmoothQuantModifier(smoothing_strength=0.8),
+    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
+]
+
+# Apply algorithms and save to output_dir
+oneshot(
+    model=model,
+    dataset=ds,
+    recipe=recipe,
+    max_seq_length=MAX_SEQUENCE_LENGTH,
+    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
+)
+
+# Confirm generations of the quantized model look sane.
+print("\n\n")
+print("========== SAMPLE GENERATION ==============")
+dispatch_for_generation(model)
+input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("npu")
+output = model.generate(input_ids, max_new_tokens=100)
+print(tokenizer.decode(output[0]))
+print("==========================================\n\n")
+
+# Save to disk compressed.
+SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-W8A8-Dynamic-Per-Token"
+model.save_pretrained(SAVE_DIR, save_compressed=True)
+tokenizer.save_pretrained(SAVE_DIR)
@@ -15,6 +15,15 @@ ignore_missing_imports = True
 [mypy-lm_eval.*]
 ignore_missing_imports = True
 
+[mypy-compressed_tensors.*]
+ignore_missing_imports = True
+
+[mypy-datasets.*]
+ignore_missing_imports = True
+
+[mypy-llmcompressor.*]
+ignore_missing_imports = True
+
 [mypy-msprobe.*]
 ignore_missing_imports = True
-allow_untyped_imports = True
+allow_untyped_imports = True
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ requires = [
     "quart",
     "numba",
     "opencv-python-headless<=4.11.0.86", # Required to avoid numpy version conflict with vllm
+    "compressed_tensors>=0.11.0"
 ]
 build-backend = "setuptools.build_meta"
 

@@ -16,6 +16,7 @@ torchvision
 wheel
 pandas-stubs
 opencv-python-headless<=4.11.0.86 # Required to avoid numpy version conflict with vllm
+compressed_tensors>=0.11.0
 
 # requirements for disaggregated prefill
 msgpack

@@ -0,0 +1,40 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py
+#
+"""Smoke-test greedy generation with a W8A8 quantized model under tensor parallelism.
+
+Run `pytest tests/e2e/multicard/test_quantization.py`.
+"""
+from modelscope import snapshot_download  # type: ignore
+
+from tests.e2e.conftest import VllmRunner
+
+
+def test_models_distributed_quantized_W8A8():
+    example_prompts = [
+        "Hello, my name is",
+    ]
+    max_tokens = 5
+    with VllmRunner(snapshot_download("neuralmagic/Qwen2.5-3B-quantized.w8a8"),
+                    tensor_parallel_size=4,
+                    max_model_len=4096,
+                    gpu_memory_utilization=0.8,
+                    distributed_executor_backend="mp",
+                    enforce_eager=True) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
+        del vllm_model