From 61dee6546fbcc89b85661ebe401e72ff23b20051 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 10:18:42 +0800 Subject: [PATCH 01/26] [Feature] Add Int8 quantization support for Z-Image and Qwen-Image Signed-off-by: juboyu <767868009@qq.com> --- docs/.nav.yml | 1 + .../user_guide/diffusion/quantization/int8.md | 75 ++++ .../diffusion/quantization/overview.md | 9 +- docs/user_guide/diffusion_acceleration.md | 32 +- .../text_to_image/text_to_image.py | 2 +- .../quantization/test_int8_config.py | 98 ++++++ vllm_omni/diffusion/quantization/__init__.py | 3 + vllm_omni/diffusion/quantization/int8.py | 333 ++++++++++++++++++ 8 files changed, 544 insertions(+), 9 deletions(-) create mode 100644 docs/user_guide/diffusion/quantization/int8.md create mode 100644 tests/diffusion/quantization/test_int8_config.py create mode 100644 vllm_omni/diffusion/quantization/int8.py diff --git a/docs/.nav.yml b/docs/.nav.yml index 07db1b46512..5cf14f8f333 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -46,6 +46,7 @@ nav: - Quantization: - Overview: user_guide/diffusion/quantization/overview.md - FP8: user_guide/diffusion/quantization/fp8.md + - Int8: user_guide/diffusion/quantization/int8.md - Parallelism Acceleration: user_guide/diffusion/parallelism_acceleration.md - CPU Offloading: user_guide/diffusion/cpu_offload_diffusion.md - ComfyUI: features/comfyui.md diff --git a/docs/user_guide/diffusion/quantization/int8.md b/docs/user_guide/diffusion/quantization/int8.md new file mode 100644 index 00000000000..1e7853c3eb4 --- /dev/null +++ b/docs/user_guide/diffusion/quantization/int8.md @@ -0,0 +1,75 @@ +# Int8 Quantization + +## Overview + +Int8 quantization converts BF16/FP16 weights to Int8 at model load time. No calibration or pre-quantized checkpoint needed. + +Depending on the model, either all layers can be quantized, or some sensitive layers should stay in BF16/FP16. See the [per-model table](#supported-models) for which case applies. + +## Configuration + +1. **Python API**: set `quantization="int8"`. To skip sensitive layers, use `quantization_config` with `ignored_layers`. + +```python +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +# All layers quantized +omni = Omni(model="", quantization="int8") + +# Skip sensitive layers +omni = Omni( + model="", + quantization_config={ + "method": "int8", + "ignored_layers": [""], + }, +) + +outputs = omni.generate( + "A cat sitting on a windowsill", + OmniDiffusionSamplingParams(num_inference_steps=50), +) +``` + +2. **CLI**: pass `--quantization int8` and optionally `--ignored-layers`. + +```bash +# All layers +python text_to_image.py --model --quantization int8 + +# Skip sensitive layers +python text_to_image.py --model --quantization int8 --ignored-layers "img_mlp" + +# Online serving +vllm serve --omni --quantization int8 +``` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `method` | str | — | Quantization method (`"int8"`) | +| `ignored_layers` | list[str] | `[]` | Layer name patterns to keep in BF16/FP16 | +| `activation_scheme` | str | `"dynamic"` | `"dynamic"` (no calibration) | + + +The available `ignored_layers` names depend on the model architecture (e.g., `to_qkv`, `to_out`, `img_mlp`, `txt_mlp`). Consult the transformer source for your target model. + +## Supported Models + +| Model | HF Models | Recommendation | `ignored_layers` | +|-------|-----------|---------------|------------------| +| Z-Image | `Tongyi-MAI/Z-Image-Turbo` | All layers | None | +| Qwen-Image | `Qwen/Qwen-Image`, `Qwen/Qwen-Image-2512` | All layers | None | + +## Combining with Other Features + +Int8 quantization can be combined with cache acceleration: + +```python +omni = Omni( + model="", + quantization="int8", + cache_backend="tea_cache", + cache_config={"rel_l1_thresh": 0.2}, +) +``` diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md index 7dede292fc4..8c005c7b8da 100644 --- a/docs/user_guide/diffusion/quantization/overview.md +++ b/docs/user_guide/diffusion/quantization/overview.md @@ -7,11 +7,18 @@ vLLM-Omni supports quantization of DiT linear layers to reduce memory usage and | Method | Guide | |--------|-------| | FP8 | [FP8](fp8.md) | +| Int8 | [Int8](int8.md) | -## Device Compatibility +## Device Compatibility for FP8 | GPU Generation | Example GPUs | FP8 Mode | |---------------|-------------------|----------| | Ada/Hopper (SM 89+) | RTX 4090, H100, H200 | Full W8A8 with native hardware | Kernel selection is automatic. + +## Device Compatibility for Int8 + +| NPU Generation | Int8 Mode | +|---------------|----------| +| Atlas A2/A3 | W8A8 | diff --git a/docs/user_guide/diffusion_acceleration.md b/docs/user_guide/diffusion_acceleration.md index 1a2e0a7d23f..615770506cd 100644 --- a/docs/user_guide/diffusion_acceleration.md +++ b/docs/user_guide/diffusion_acceleration.md @@ -16,7 +16,7 @@ Both methods can provide significant speedups (typically **1.5x-2.0x**) while ma vLLM-Omni also supports quantization methods: -3. **[FP8 Quantization](diffusion/quantization/overview.md)** - Reduces DiT linear layers from BF16 to FP8, providing ~1.28x speedup with minimal quality loss. Supports per-layer skip for sensitive layers. +3. **[Quantization](diffusion/quantization/overview.md)** - Reduces DiT linear layers from BF16 to FP8 or Int8 (for NPU), providing ~1.28x speedup with minimal quality loss. Supports per-layer skip for sensitive layers. vLLM-Omni also supports parallelism methods for diffusion models, including: @@ -44,6 +44,7 @@ vLLM-Omni also supports parallelism methods for diffusion models, including: | Method | Configuration | Description | Best For | |--------|--------------|-------------|----------| | **FP8** | `quantization="fp8"` | FP8 W8A8 on Ada/Hopper, weight-only on older GPUs | Memory reduction, inference speedup | +| **Int8** | `quantization="int8"` | Int8 W8A8 on NPUs | Memory reduction, inference speedup | ## Supported Models @@ -74,11 +75,11 @@ The following table shows which models are currently supported by each accelerat ### Quantization -| Model | Model Identifier | FP8 | -|-------|------------------|:---:| -| **Qwen-Image** | `Qwen/Qwen-Image` | ✅ | -| **Qwen-Image-2512** | `Qwen/Qwen-Image-2512` | ✅ | -| **Z-Image** | `Tongyi-MAI/Z-Image-Turbo` | ✅ | +| Model | Model Identifier | FP8 | Int8 | +|-------|------------------|:---:|:---:| +| **Qwen-Image** | `Qwen/Qwen-Image` | ✅ | ✅ | +| **Qwen-Image-2512** | `Qwen/Qwen-Image-2512` | ✅ | ✅ | +| **Z-Image** | `Tongyi-MAI/Z-Image-Turbo` | ✅ | ✅ | ## Performance Benchmarks @@ -307,13 +308,30 @@ outputs = omni.generate( ) ``` +### Using Int8 Quantization(only for NPU) + +```python +from vllm_omni import Omni +from vllm_omni.inputs.data import OmniDiffusionSamplingParams + +omni = Omni( + model="", + quantization="int8", +) + +outputs = omni.generate( + "A cat sitting on a windowsill", + OmniDiffusionSamplingParams(num_inference_steps=50), +) +``` + ## Documentation For detailed information on each acceleration method: - **[TeaCache Guide](diffusion/teacache.md)** - Complete TeaCache documentation, configuration options, and best practices - **[Cache-DiT Acceleration Guide](diffusion/cache_dit_acceleration.md)** - Comprehensive Cache-DiT guide covering DBCache, TaylorSeer, SCM, and configuration parameters -- **[FP8 Quantization Guide](diffusion/quantization/overview.md)** - FP8 quantization for DiT models with per-layer control +- **[Quantization Guide](diffusion/quantization/overview.md)** - Quantization for DiT models with per-layer control - **[Tensor Parallelism](diffusion/parallelism_acceleration.md#tensor-parallelism)** - Guidance on how to enable TP for diffusion models. - **[Sequence Parallelism](diffusion/parallelism_acceleration.md#sequence-parallelism)** - Guidance on how to set sequence parallelism with configuration. - **[CFG-Parallel](diffusion/parallelism_acceleration.md#cfg-parallel)** - Guidance on how to set CFG-Parallel to run positive/negative branches across ranks. diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index db87abf008d..0ad5982aee2 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -117,7 +117,7 @@ def parse_args() -> argparse.Namespace: "--quantization", type=str, default=None, - choices=["fp8"], + choices=["fp8", "int8"], help="Quantization method for the transformer. " "Options: 'fp8' (FP8 W8A8 on Ada/Hopper, weight-only on older GPUs). " "Default: None (no quantization, uses BF16).", diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py new file mode 100644 index 00000000000..32e6c6dd94c --- /dev/null +++ b/tests/diffusion/quantization/test_int8_config.py @@ -0,0 +1,98 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for Int8 quantization config.""" + +import pytest + +def test_int8_config_creation(): + """Test that Int8 config can be created.""" + from vllm_omni.diffusion.quantization import get_diffusion_quant_config + + config = get_diffusion_quant_config("int8") + assert config is not None + assert config.get_name() == "int8" + + +def test_vllm_config_extraction(): + """Test that vLLM config can be extracted from diffusion config.""" + from vllm_omni.diffusion.quantization import ( + get_diffusion_quant_config, + get_vllm_quant_config_for_layers, + ) + + diff_config = get_diffusion_quant_config("int8") + vllm_config = get_vllm_quant_config_for_layers(diff_config) + assert vllm_config is not None + assert vllm_config.activation_scheme == "dynamic" + +def test_supported_methods(): + """Test that supported methods list is correct.""" + from vllm_omni.diffusion.quantization import SUPPORTED_QUANTIZATION_METHODS + + assert "int8" in SUPPORTED_QUANTIZATION_METHODS + + +def test_int8_config_with_custom_params(): + """Test Int8 config with custom parameters.""" + from vllm_omni.diffusion.quantization import get_diffusion_quant_config + + config = get_diffusion_quant_config( + "int8", + activation_scheme="dynamic", + ignored_layers=["proj_out"], + ) + assert config is not None + assert config.activation_scheme == "dynamic" + assert "proj_out" in config.ignored_layers + + +def test_quantization_integration(): + """Test end-to-end quantization flow through OmniDiffusionConfig.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + # Test with quantization string only + config = OmniDiffusionConfig(model="test", quantization="int8") + assert config.quantization_config is not None + assert config.quantization_config.get_name() == "int8" + + # Test with quantization_config dict + config2 = OmniDiffusionConfig( + model="test", + quantization_config={"method": "int8", "activation_scheme": "dynamic"}, + ) + assert config2.quantization_config is not None + assert config2.quantization_config.get_name() == "int8" + assert config2.quantization_config.activation_scheme == "dynamic" + + # Test that vLLM config can be extracted + vllm_config = config.quantization_config.get_vllm_quant_config() + assert vllm_config is not None + + +def test_quantization_dict_not_mutated(): + """Test that passing a dict to quantization_config doesn't mutate it.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + original_dict = {"method": "int8", "activation_scheme": "dynamic"} + dict_copy = original_dict.copy() + + OmniDiffusionConfig(model="test", quantization_config=original_dict) + + # Original dict should be unchanged + assert original_dict == dict_copy + + +def test_quantization_conflicting_methods_warning(caplog): + """Test warning when quantization and quantization_config['method'] conflict.""" + import logging + + from vllm_omni.diffusion.data import OmniDiffusionConfig + + with caplog.at_level(logging.WARNING): + config = OmniDiffusionConfig( + model="test", + quantization="int8", # This should be overridden + quantization_config={"method": "int8", "activation_scheme": "dynamic"}, + ) + # No warning when methods match + assert config.quantization_config is not None diff --git a/vllm_omni/diffusion/quantization/__init__.py b/vllm_omni/diffusion/quantization/__init__.py index cc1bb547f77..8476d14e3c8 100644 --- a/vllm_omni/diffusion/quantization/__init__.py +++ b/vllm_omni/diffusion/quantization/__init__.py @@ -28,6 +28,7 @@ from .base import DiffusionQuantizationConfig from .fp8 import DiffusionFp8Config +from .int8 import DiffusionInt8Config if TYPE_CHECKING: from vllm.model_executor.layers.quantization.base_config import ( @@ -40,6 +41,7 @@ # To add a new method, create a new config class and register it here _QUANT_CONFIG_REGISTRY: dict[str, type[DiffusionQuantizationConfig]] = { "fp8": DiffusionFp8Config, + "int8": DiffusionInt8Config, } SUPPORTED_QUANTIZATION_METHODS = list(_QUANT_CONFIG_REGISTRY.keys()) @@ -108,6 +110,7 @@ def get_vllm_quant_config_for_layers( __all__ = [ "DiffusionQuantizationConfig", "DiffusionFp8Config", + "DiffusionInt8Config", "get_diffusion_quant_config", "get_vllm_quant_config_for_layers", "SUPPORTED_QUANTIZATION_METHODS", diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py new file mode 100644 index 00000000000..c1e6d6c8289 --- /dev/null +++ b/vllm_omni/diffusion/quantization/int8.py @@ -0,0 +1,333 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""INT8 quantization config for diffusion transformers.""" + +from collections.abc import Callable, Sequence + +from typing import Any, Dict, List, Mapping, Optional +from typing import TYPE_CHECKING + +import torch +from torch.nn import Module + +import torch_npu + +from vllm.model_executor.parameter import ( + BlockQuantScaleParameter, + ModelWeightParameter, + PerTensorScaleParameter, + ChannelQuantScaleParameter, +) + +from vllm.model_executor.layers.quantization import QuantizationMethods + +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + QuantizeMethodBase, +) + +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + is_layer_skipped, +) + +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, +) + +from vllm.model_executor.utils import replace_parameter, set_weight_attrs + +from vllm.logger import init_logger + +from vllm_omni.platforms import current_omni_platform + +from .base import DiffusionQuantizationConfig + +if TYPE_CHECKING: + from vllm.model_executor.models.utils import WeightsMapper + +# Dynamic quantization is supported first. +ACTIVATION_SCHEMES=["dynamic"] + +logger = init_logger(__name__) + + +def create_int8_weight_parameter( + output_size_per_partition: int, + input_size_per_partition: int, + weight_loader: Callable | None, +) -> torch.nn.Parameter: + """ + Create int8 weight parameter. + """ + from vllm.model_executor.parameter import ModelWeightParameter + + return ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype = torch.int8, + ), + input_dim = 1, + output_dim = 0, + weight_loader = weight_loader, + ) + +def create_int8_scale_parameter( + parameter_type: torch.nn.Parameter, + output_partition_sizes: list[int], + input_size_per_partition: int, + block_size: list[int] | None, + weight_loader: Callable | None, + params_dtype: torch.dtype, +) -> torch.nn.Parameter: + """ + Create scale parameter based on quantization strategy + """ + if parameter_type == ChannelQuantScaleParameter: + scale = parameter_type( + data = torch.empty((sum(output_partition_sizes),1), dtype = torch.float32), + output_dim = 0, + weight_loader=weight_loader, + ) + else: + raise ValueError(f"Unknown parameter type: {parameter_type}") + + return scale + +class Int8Config(QuantizationConfig): + """ + Config class for Int8. + """ + + def __init__( + self, + is_checkpoint_int8_serialized: bool = False, + activation_scheme: str = "dynamic", + ignored_layers: list[str] | None = None, + ) -> None: + super().__init__() + + self.is_checkpoint_int8_serialized = is_checkpoint_int8_serialized + + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError(f"Unsupported activation scheme {activation_scheme}") + self.activation_scheme = activation_scheme + self.ignored_layers = ignored_layers or [] + + @classmethod + def get_name(cls) -> QuantizationMethods: + return "int8" + + @classmethod + def get_supported_act_dtypes(cls) -> list[torch.dtype]: + return [torch.bfloat16, torch.float16] + + @classmethod + def get_min_capability(cls) -> int: + return 75 + + @classmethod + def get_config_filenames(cls) -> List[str]: + return [] + + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): + if self.ignored_layers is not None: + self.ignored_layers = hf_to_vllm_mapper.apply_list(self.ignored_layers) + + @classmethod + def from_config(cls, config: dict[str, Any]) -> Int8Config: + quant_method = cls.get_from_keys(config, ["quant_method"]) + is_checkpoint_int8_serialized = "int8" in quant_method + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + + if not ignored_layers: + ignored_layers = cls.get_frm_keys_or( + config, ["modules_to_not_convert"], None + ) + return cls( + is_checkpoint_int8_serialized = is_checkpoint_int8_serialized, + activation_scheme = activation_scheme, + ignored_layers = ignored_layers, + ) + + def get_quant_method( + self, + layer: torch.nn.Module, + prefix: str, + ) -> Optional["QuantizeMethodBase"]: + if current_omni_platform.is_npu(): + if isinstance(layer, LinearBase): + if is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping, + ): + return UnquantizedLinearMethod() + if not self.is_checkpoint_int8_serialized: + online_method = Int8OnlineLinearMethod(self) + return online_method + else: + offline_method = Int8LinearMethod(self) + return offline_method + else: + logger.warning("The current platform is not supported.") + return None + +class Int8LinearMethod(LinearMethodBase): + """ + Linear method for Int8 + Supports loading Int8 checkpoints with static weight scale and dynamic activation scale. + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: Int8Config): + self.quant_config = quant_config + self.out_dtype = torch.get_default_dtype() + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + + weight = create_int8_weight_parameter( + output_size_per_partition=output_size_per_partition, + input_size_per_partition=input_size_per_partition, + weight_loader=weight_loader, + ) + layer.register_parameter("weight", weight) + + # WEIGHT OFFSET + offset = create_int8_scale_parameter( + ChannelQuantScaleParameter, + output_partition_sizes, + input_size_per_partition, + None, + weight_loader, + params_dtype + ) + layer.register_parameter("weight_offset", offset) + + # WEIGHT SCALE + scale = create_int8_scale_parameter( + ChannelQuantScaleParameter, + output_partition_sizes, + input_size_per_partition, + None, + weight_loader, + params_dtype, + ) + layer.register_parameter("weight_scale",scale) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + ori_shape = x.shape + ori_dtype = x.dtype + + x = x.reshape(-1, ori_shape[-1]) + quantized_x, pertoken_scale = torch_npu.npu_dynamic_quant(x) + + output = torch_npu.npu_quant_matmul( + quantized_x, + layer.weight, + layer.weight_scale, + bias = bias, + pertoken_scale = pertoken_scale, + output_dtype = ori_dtype, + ) + output = output.reshape(*ori_shape[:-1], -1) + return output + +class Int8OnlineLinearMethod(Int8LinearMethod): + """ + Online version of Int8LinearMethod, loads the fp16/bf16 checkpoint + and quantized the weights during loading. + """ + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype = torch.int8, + ), + input_dim = 1, + output_dim = 0, + weight_loader = weight_loader, + ) + layer.register_parameter("weight", weight) + + def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return + + layer.input-scale = None, + qweight, weight_scale = torch_npu.npu_dynamic_quant(layer.weight) + + weight = qweight.t() + + # Update layer with new values. + replace_parameter(layer, "weight", weight.data) + replace_parameter(layer, "weight_scale", weight_scale.data) + +class DiffusionInt8Config(DiffusionQuantizationConfig): + """ + Int8 quantization config optimized for diggusion transformers. + + Args: + activation_scheme: Activation quantization scheme. + - "dynamic": Per-token dynamic scaling (default, no calibration) + Format: [block_n, block_k]. If None, uses per-tensor scaling. + ignored_layers: List of layer name patterns to skip quantization. + """ + + def __int__( + self, + activation_scheme: str = "dynamic", + ignored_layers: list[str] | None = None, + ): + # Create underlying vLLM Int8 config + self._vllm_config = Int8Config( + is_checkpoint_int8_serialized=False, + activation_scheme=activation_scheme, + ignored_layers=ignored_layers, + ) + + def get_vllm_quant_config(self) -> QuantizationConfig: + return self._vllm_config \ No newline at end of file From aa3598e68736076703056a1da29d68afb5f444db Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 16:25:22 +0800 Subject: [PATCH 02/26] fix process_weights_after_loading Signed-off-by: juboyu <767868009@qq.com> --- .../quantization/test_int8_config.py | 27 ++++++++++-- vllm_omni/diffusion/quantization/int8.py | 42 ++++++++++--------- 2 files changed, 46 insertions(+), 23 deletions(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 32e6c6dd94c..eba981abdd3 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -25,13 +25,26 @@ def test_vllm_config_extraction(): assert vllm_config is not None assert vllm_config.activation_scheme == "dynamic" -def test_supported_methods(): - """Test that supported methods list is correct.""" - from vllm_omni.diffusion.quantization import SUPPORTED_QUANTIZATION_METHODS +def test_none_quantization(): + """Test that None quantization returns None config.""" + from vllm_omni.diffusion.quantization import ( + get_diffusion_quant_config, + get_vllm_quant_config_for_layers, + ) - assert "int8" in SUPPORTED_QUANTIZATION_METHODS + config = get_diffusion_quant_config(None) + assert config is None + vllm_config = get_vllm_quant_config_for_layers(config) + assert vllm_config is None +def test_invalid_quantization(): + """Test that invalid quantization method raises error.""" + from vllm_omni.diffusion.quantization import get_diffusion_quant_config + + with pytest.raises(ValueError, match="Unknown quantization method"): + get_diffusion_quant_config("invalid_method") + def test_int8_config_with_custom_params(): """Test Int8 config with custom parameters.""" from vllm_omni.diffusion.quantization import get_diffusion_quant_config @@ -45,6 +58,12 @@ def test_int8_config_with_custom_params(): assert config.activation_scheme == "dynamic" assert "proj_out" in config.ignored_layers +def test_supported_methods(): + """Test that supported methods list is correct.""" + from vllm_omni.diffusion.quantization import SUPPORTED_QUANTIZATION_METHODS + + assert "int8" in SUPPORTED_QUANTIZATION_METHODS + def test_quantization_integration(): """Test end-to-end quantization flow through OmniDiffusionConfig.""" diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index c1e6d6c8289..c79be301d2e 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -48,7 +48,7 @@ from vllm.model_executor.models.utils import WeightsMapper # Dynamic quantization is supported first. -ACTIVATION_SCHEMES=["dynamic"] +ACTIVATION_SCHEMES = ["dynamic"] logger = init_logger(__name__) @@ -59,7 +59,7 @@ def create_int8_weight_parameter( weight_loader: Callable | None, ) -> torch.nn.Parameter: """ - Create int8 weight parameter. + Create int8 weight parameter. """ from vllm.model_executor.parameter import ModelWeightParameter @@ -83,7 +83,7 @@ def create_int8_scale_parameter( params_dtype: torch.dtype, ) -> torch.nn.Parameter: """ - Create scale parameter based on quantization strategy + Create scale parameter based on quantization strategy """ if parameter_type == ChannelQuantScaleParameter: scale = parameter_type( @@ -98,7 +98,7 @@ def create_int8_scale_parameter( class Int8Config(QuantizationConfig): """ - Config class for Int8. + Config class for Int8. """ def __init__( @@ -144,7 +144,7 @@ def from_config(cls, config: dict[str, Any]) -> Int8Config: ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) if not ignored_layers: - ignored_layers = cls.get_frm_keys_or( + ignored_layers = cls.get_from_keys_or( config, ["modules_to_not_convert"], None ) return cls( @@ -235,6 +235,11 @@ def create_weights( ) layer.register_parameter("weight_scale",scale) + def process_weights_after_loading(self,layer: Module) -> None: + layer.weight.data = layer.weight.data.t().contiguous() + layer.weight_scale.data = layer.weight_scale.data.squeeze() + layer.weight_offset.data = layer.weight_offset.data.squeeze() + def apply( self, layer: torch.nn.Module, @@ -285,7 +290,7 @@ def create_weights( data=torch.empty( output_size_per_partition, input_size_per_partition, - dtype = torch.int8, + dtype = params_dtype, ), input_dim = 1, output_dim = 0, @@ -294,21 +299,20 @@ def create_weights( layer.register_parameter("weight", weight) def process_weights_after_loading(self, layer: Module) -> None: - if getattr(layer, "_already_called_process_weights_after_loading", False): - return - - layer.input-scale = None, qweight, weight_scale = torch_npu.npu_dynamic_quant(layer.weight) - weight = qweight.t() + layer.weight = None + torch.npu.empty_cache() + + weight = qweight.t().contiguous() # Update layer with new values. - replace_parameter(layer, "weight", weight.data) - replace_parameter(layer, "weight_scale", weight_scale.data) + replace_parameter(layer, "weight", weight) + replace_parameter(layer, "weight_scale", weight_scale) class DiffusionInt8Config(DiffusionQuantizationConfig): """ - Int8 quantization config optimized for diggusion transformers. + Int8 quantization config optimized for diffusion transformers. Args: activation_scheme: Activation quantization scheme. @@ -322,12 +326,12 @@ def __int__( activation_scheme: str = "dynamic", ignored_layers: list[str] | None = None, ): + self.activation_scheme=activation_scheme + self.ignored_layers=ignored_layers or [] + # Create underlying vLLM Int8 config self._vllm_config = Int8Config( - is_checkpoint_int8_serialized=False, + is_checkpoint_int8_serialized=False, # Online quantization activation_scheme=activation_scheme, ignored_layers=ignored_layers, - ) - - def get_vllm_quant_config(self) -> QuantizationConfig: - return self._vllm_config \ No newline at end of file + ) \ No newline at end of file From e3948198f6825a1c169cdaa5092f07ff346f1d1e Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 16:33:24 +0800 Subject: [PATCH 03/26] fix DiffusionInt8Config init Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index c79be301d2e..0207e139793 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -321,7 +321,7 @@ class DiffusionInt8Config(DiffusionQuantizationConfig): ignored_layers: List of layer name patterns to skip quantization. """ - def __int__( + def __init__( self, activation_scheme: str = "dynamic", ignored_layers: list[str] | None = None, From f4282df54d0a2eebf5bcc4afffa91bd2ea219835 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 16:38:07 +0800 Subject: [PATCH 04/26] fix Int8Config's function from_config undefine Int8Config Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 0207e139793..e8e79bcbb68 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -137,7 +137,7 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): self.ignored_layers = hf_to_vllm_mapper.apply_list(self.ignored_layers) @classmethod - def from_config(cls, config: dict[str, Any]) -> Int8Config: + def from_config(cls, config: dict[str, Any]) -> "Int8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_int8_serialized = "int8" in quant_method activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) From b1c29a4118ab1a91fe1fb042c87cfe489e07f522 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 16:52:34 +0800 Subject: [PATCH 05/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 197 +++++++++++------------ 1 file changed, 94 insertions(+), 103 deletions(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index e8e79bcbb68..35d249511a8 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -2,43 +2,33 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """INT8 quantization config for diffusion transformers.""" -from collections.abc import Callable, Sequence +from collections.abc import Callable -from typing import Any, Dict, List, Mapping, Optional +from typing import Any, Optional from typing import TYPE_CHECKING import torch -from torch.nn import Module - import torch_npu - -from vllm.model_executor.parameter import ( - BlockQuantScaleParameter, - ModelWeightParameter, - PerTensorScaleParameter, - ChannelQuantScaleParameter, +from torch.nn import Module +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ( + LinearBase, + LinearMethodBase, + UnquantizedLinearMethod, ) - from vllm.model_executor.layers.quantization import QuantizationMethods - from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) - from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped, ) - -from vllm.model_executor.layers.linear import ( - LinearBase, - LinearMethodBase, - UnquantizedLinearMethod, +from vllm.model_executor.parameter import ( + ModelWeightParameter, + ChannelQuantScaleParameter, ) - -from vllm.model_executor.utils import replace_parameter, set_weight_attrs - -from vllm.logger import init_logger +from vllm.model_executor.utils import replace_parameter from vllm_omni.platforms import current_omni_platform @@ -48,15 +38,15 @@ from vllm.model_executor.models.utils import WeightsMapper # Dynamic quantization is supported first. -ACTIVATION_SCHEMES = ["dynamic"] +ACTIVATION_SCHEMES = ["dynamic"] logger = init_logger(__name__) def create_int8_weight_parameter( - output_size_per_partition: int, - input_size_per_partition: int, - weight_loader: Callable | None, + output_size_per_partition: int, + input_size_per_partition: int, + weight_loader: Callable | None, ) -> torch.nn.Parameter: """ Create int8 weight parameter. @@ -67,45 +57,46 @@ def create_int8_weight_parameter( data=torch.empty( output_size_per_partition, input_size_per_partition, - dtype = torch.int8, + dtype=torch.int8, ), - input_dim = 1, - output_dim = 0, - weight_loader = weight_loader, + input_dim=1, + output_dim=0, + weight_loader=weight_loader, ) def create_int8_scale_parameter( - parameter_type: torch.nn.Parameter, - output_partition_sizes: list[int], - input_size_per_partition: int, - block_size: list[int] | None, - weight_loader: Callable | None, - params_dtype: torch.dtype, + parameter_type: torch.nn.Parameter, + output_partition_sizes: list[int], + input_size_per_partition: int, + block_size: list[int] | None, + weight_loader: Callable | None, + params_dtype: torch.dtype, ) -> torch.nn.Parameter: """ Create scale parameter based on quantization strategy """ if parameter_type == ChannelQuantScaleParameter: scale = parameter_type( - data = torch.empty((sum(output_partition_sizes),1), dtype = torch.float32), - output_dim = 0, + data=torch.empty((sum(output_partition_sizes),1), dtype = torch.float32), + output_dim=0, weight_loader=weight_loader, ) else: raise ValueError(f"Unknown parameter type: {parameter_type}") - + return scale + class Int8Config(QuantizationConfig): """ Config class for Int8. """ - + def __init__( - self, - is_checkpoint_int8_serialized: bool = False, - activation_scheme: str = "dynamic", - ignored_layers: list[str] | None = None, + self, + is_checkpoint_int8_serialized: bool = False, + activation_scheme: str = "dynamic", + ignored_layers: list[str] | None = None, ) -> None: super().__init__() @@ -115,23 +106,23 @@ def __init__( raise ValueError(f"Unsupported activation scheme {activation_scheme}") self.activation_scheme = activation_scheme self.ignored_layers = ignored_layers or [] - + @classmethod def get_name(cls) -> QuantizationMethods: return "int8" - + @classmethod def get_supported_act_dtypes(cls) -> list[torch.dtype]: return [torch.bfloat16, torch.float16] - + @classmethod def get_min_capability(cls) -> int: return 75 - + @classmethod - def get_config_filenames(cls) -> List[str]: + def get_config_filenames(cls) -> list[str]: return [] - + def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): if self.ignored_layers is not None: self.ignored_layers = hf_to_vllm_mapper.apply_list(self.ignored_layers) @@ -144,19 +135,17 @@ def from_config(cls, config: dict[str, Any]) -> "Int8Config": ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) if not ignored_layers: - ignored_layers = cls.get_from_keys_or( - config, ["modules_to_not_convert"], None - ) + ignored_layers = cls.get_from_keys_or(config, ["modules_to_not_convert"], None) return cls( - is_checkpoint_int8_serialized = is_checkpoint_int8_serialized, - activation_scheme = activation_scheme, - ignored_layers = ignored_layers, + is_checkpoint_int8_serialized=is_checkpoint_int8_serialized, + activation_scheme=activation_scheme, + ignored_layers=ignored_layers, ) def get_quant_method( - self, - layer: torch.nn.Module, - prefix: str, + self, + layer: torch.nn.Module, + prefix: str, ) -> Optional["QuantizeMethodBase"]: if current_omni_platform.is_npu(): if isinstance(layer, LinearBase): @@ -175,7 +164,8 @@ def get_quant_method( else: logger.warning("The current platform is not supported.") return None - + + class Int8LinearMethod(LinearMethodBase): """ Linear method for Int8 @@ -184,25 +174,25 @@ class Int8LinearMethod(LinearMethodBase): Args: quant_config: The quantization config. """ - + def __init__(self, quant_config: Int8Config): self.quant_config = quant_config self.out_dtype = torch.get_default_dtype() def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, ): output_size_per_partition = sum(output_partition_sizes) weight_loader = extra_weight_attrs.get("weight_loader") layer.logical_widths = output_partition_sizes - layer.input_size_per_partition = input_size_per_partition + layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.orig_dtype = params_dtype @@ -220,10 +210,10 @@ def create_weights( input_size_per_partition, None, weight_loader, - params_dtype + params_dtype, ) layer.register_parameter("weight_offset", offset) - + # WEIGHT SCALE scale = create_int8_scale_parameter( ChannelQuantScaleParameter, @@ -233,36 +223,37 @@ def create_weights( weight_loader, params_dtype, ) - layer.register_parameter("weight_scale",scale) + layer.register_parameter("weight_scale", scale) - def process_weights_after_loading(self,layer: Module) -> None: + def process_weights_after_loading(self, layer: Module) -> None: layer.weight.data = layer.weight.data.t().contiguous() layer.weight_scale.data = layer.weight_scale.data.squeeze() layer.weight_offset.data = layer.weight_offset.data.squeeze() def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: torch.Tensor | None = None, + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, ) -> torch.Tensor: ori_shape = x.shape ori_dtype = x.dtype x = x.reshape(-1, ori_shape[-1]) quantized_x, pertoken_scale = torch_npu.npu_dynamic_quant(x) - + output = torch_npu.npu_quant_matmul( quantized_x, layer.weight, layer.weight_scale, - bias = bias, - pertoken_scale = pertoken_scale, - output_dtype = ori_dtype, + bias=bias, + pertoken_scale=pertoken_scale, + output_dtype=ori_dtype, ) output = output.reshape(*ori_shape[:-1], -1) return output - + + class Int8OnlineLinearMethod(Int8LinearMethod): """ Online version of Int8LinearMethod, loads the fp16/bf16 checkpoint @@ -270,19 +261,19 @@ class Int8OnlineLinearMethod(Int8LinearMethod): """ def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, ): output_size_per_partition = sum(output_partition_sizes) weight_loader = extra_weight_attrs.get("weight_loader") layer.logical_widths = output_partition_sizes - layer.input_size_per_partition = input_size_per_partition + layer.input_size_per_partition = input_size_per_partition layer.output_size_per_partition = output_size_per_partition layer.orig_dtype = params_dtype @@ -290,11 +281,11 @@ def create_weights( data=torch.empty( output_size_per_partition, input_size_per_partition, - dtype = params_dtype, + dtype=params_dtype, ), - input_dim = 1, - output_dim = 0, - weight_loader = weight_loader, + input_dim=1, + output_dim=0, + weight_loader=weight_loader, ) layer.register_parameter("weight", weight) @@ -322,16 +313,16 @@ class DiffusionInt8Config(DiffusionQuantizationConfig): """ def __init__( - self, - activation_scheme: str = "dynamic", - ignored_layers: list[str] | None = None, + self, + activation_scheme: str = "dynamic", + ignored_layers: list[str] | None = None, ): - self.activation_scheme=activation_scheme - self.ignored_layers=ignored_layers or [] + self.activation_scheme = activation_scheme + self.ignored_layers = ignored_layers or [] # Create underlying vLLM Int8 config self._vllm_config = Int8Config( - is_checkpoint_int8_serialized=False, # Online quantization + is_checkpoint_int8_serialized=False, # Online quantization activation_scheme=activation_scheme, ignored_layers=ignored_layers, - ) \ No newline at end of file + ) From 2a95e10d1b896ed7b87433416e0607650e74ab7b Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 17:05:09 +0800 Subject: [PATCH 06/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- tests/diffusion/quantization/test_int8_config.py | 4 ++++ vllm_omni/diffusion/quantization/int8.py | 16 ++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index eba981abdd3..bb3acf1650a 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -4,6 +4,7 @@ import pytest + def test_int8_config_creation(): """Test that Int8 config can be created.""" from vllm_omni.diffusion.quantization import get_diffusion_quant_config @@ -25,6 +26,7 @@ def test_vllm_config_extraction(): assert vllm_config is not None assert vllm_config.activation_scheme == "dynamic" + def test_none_quantization(): """Test that None quantization returns None config.""" from vllm_omni.diffusion.quantization import ( @@ -45,6 +47,7 @@ def test_invalid_quantization(): with pytest.raises(ValueError, match="Unknown quantization method"): get_diffusion_quant_config("invalid_method") + def test_int8_config_with_custom_params(): """Test Int8 config with custom parameters.""" from vllm_omni.diffusion.quantization import get_diffusion_quant_config @@ -58,6 +61,7 @@ def test_int8_config_with_custom_params(): assert config.activation_scheme == "dynamic" assert "proj_out" in config.ignored_layers + def test_supported_methods(): """Test that supported methods list is correct.""" from vllm_omni.diffusion.quantization import SUPPORTED_QUANTIZATION_METHODS diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 35d249511a8..1770705601c 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -3,9 +3,7 @@ """INT8 quantization config for diffusion transformers.""" from collections.abc import Callable - -from typing import Any, Optional -from typing import TYPE_CHECKING +from typing import Any, Optional,TYPE_CHECKING import torch import torch_npu @@ -25,8 +23,8 @@ is_layer_skipped, ) from vllm.model_executor.parameter import ( - ModelWeightParameter, ChannelQuantScaleParameter, + ModelWeightParameter, ) from vllm.model_executor.utils import replace_parameter @@ -64,6 +62,7 @@ def create_int8_weight_parameter( weight_loader=weight_loader, ) + def create_int8_scale_parameter( parameter_type: torch.nn.Parameter, output_partition_sizes: list[int], @@ -77,7 +76,7 @@ def create_int8_scale_parameter( """ if parameter_type == ChannelQuantScaleParameter: scale = parameter_type( - data=torch.empty((sum(output_partition_sizes),1), dtype = torch.float32), + data=torch.empty((sum(output_partition_sizes),1), dtype=torch.float32), output_dim=0, weight_loader=weight_loader, ) @@ -141,7 +140,7 @@ def from_config(cls, config: dict[str, Any]) -> "Int8Config": activation_scheme=activation_scheme, ignored_layers=ignored_layers, ) - + def get_quant_method( self, layer: torch.nn.Module, @@ -249,7 +248,7 @@ def apply( bias=bias, pertoken_scale=pertoken_scale, output_dtype=ori_dtype, - ) + ) output = output.reshape(*ori_shape[:-1], -1) return output @@ -261,7 +260,7 @@ class Int8OnlineLinearMethod(Int8LinearMethod): """ def create_weights( - self, + self, layer: torch.nn.Module, input_size_per_partition: int, output_partition_sizes: list[int], @@ -289,6 +288,7 @@ def create_weights( ) layer.register_parameter("weight", weight) + def process_weights_after_loading(self, layer: Module) -> None: qweight, weight_scale = torch_npu.npu_dynamic_quant(layer.weight) From a3fcc33c94309feb422cdc8090fdba0c09c5ec6e Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 25 Feb 2026 17:16:13 +0800 Subject: [PATCH 07/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 1770705601c..44932a54d30 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -3,7 +3,7 @@ """INT8 quantization config for diffusion transformers.""" from collections.abc import Callable -from typing import Any, Optional,TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional import torch import torch_npu @@ -76,7 +76,7 @@ def create_int8_scale_parameter( """ if parameter_type == ChannelQuantScaleParameter: scale = parameter_type( - data=torch.empty((sum(output_partition_sizes),1), dtype=torch.float32), + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), output_dim=0, weight_loader=weight_loader, ) @@ -288,7 +288,6 @@ def create_weights( ) layer.register_parameter("weight", weight) - def process_weights_after_loading(self, layer: Module) -> None: qweight, weight_scale = torch_npu.npu_dynamic_quant(layer.weight) @@ -301,6 +300,7 @@ def process_weights_after_loading(self, layer: Module) -> None: replace_parameter(layer, "weight", weight) replace_parameter(layer, "weight_scale", weight_scale) + class DiffusionInt8Config(DiffusionQuantizationConfig): """ Int8 quantization config optimized for diffusion transformers. From 17aab526f568160cd500bc7656b3611b211b0288 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Thu, 26 Feb 2026 11:05:39 +0800 Subject: [PATCH 08/26] add quant_config_cls and fix import torch_npu Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 44932a54d30..43038950193 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Any, Optional import torch -import torch_npu from torch.nn import Module from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( @@ -235,6 +234,8 @@ def apply( x: torch.Tensor, bias: torch.Tensor | None = None, ) -> torch.Tensor: + import torch_npu + ori_shape = x.shape ori_dtype = x.dtype @@ -289,6 +290,8 @@ def create_weights( layer.register_parameter("weight", weight) def process_weights_after_loading(self, layer: Module) -> None: + import torch_npu + qweight, weight_scale = torch_npu.npu_dynamic_quant(layer.weight) layer.weight = None @@ -311,6 +314,7 @@ class DiffusionInt8Config(DiffusionQuantizationConfig): Format: [block_n, block_k]. If None, uses per-tensor scaling. ignored_layers: List of layer name patterns to skip quantization. """ + quant_config_cls = Int8Config def __init__( self, From 9ecc62f8febb3228ccbfbde6f1bffc6a486735b8 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Thu, 26 Feb 2026 11:10:51 +0800 Subject: [PATCH 09/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 43038950193..04f36d579af 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -235,7 +235,7 @@ def apply( bias: torch.Tensor | None = None, ) -> torch.Tensor: import torch_npu - + ori_shape = x.shape ori_dtype = x.dtype @@ -314,6 +314,7 @@ class DiffusionInt8Config(DiffusionQuantizationConfig): Format: [block_n, block_k]. If None, uses per-tensor scaling. ignored_layers: List of layer name patterns to skip quantization. """ + quant_config_cls = Int8Config def __init__( From 6173d12a1b60f19a0f464e73a46d8b018b9b4aeb Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Tue, 3 Mar 2026 14:41:51 +0800 Subject: [PATCH 10/26] fix invalid character Signed-off-by: juboyu <767868009@qq.com> --- examples/offline_inference/text_to_image/text_to_image.py | 2 +- vllm_omni/diffusion/quantization/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 09a8965ec38..1cc195f0b35 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -132,7 +132,7 @@ def parse_args() -> argparse.Namespace: "--quantization", type=str, default=None, - choices=["fp8", "int8","gguf"], + choices=["fp8", "int8", "gguf"], help="Quantization method for the transformer. " "Options: 'fp8' (FP8 W8A8 on Ada/Hopper, weight-only on older GPUs), 'int8' (Int8 W8A8 on NPUs), 'gguf' (GGUF quantized weights)." "Default: None (no quantization, uses BF16).", diff --git a/vllm_omni/diffusion/quantization/__init__.py b/vllm_omni/diffusion/quantization/__init__.py index de7f33994d7..eb4b8ea9f5e 100644 --- a/vllm_omni/diffusion/quantization/__init__.py +++ b/vllm_omni/diffusion/quantization/__init__.py @@ -28,8 +28,8 @@ from .base import DiffusionQuantizationConfig from .fp8 import DiffusionFp8Config -from .int8 import DiffusionInt8Config from .gguf import DiffusionGgufConfig +from .int8 import DiffusionInt8Config if TYPE_CHECKING: from vllm.model_executor.layers.quantization.base_config import ( From 6ce717af77ad7b63d6f01f75f382935c4530f1cf Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 11 Mar 2026 09:36:28 +0800 Subject: [PATCH 11/26] add int8 for GPU Signed-off-by: juboyu <767868009@qq.com> --- .../diffusion/quantization/overview.md | 6 - docs/user_guide/diffusion_acceleration.md | 6 +- .../quantization/test_int8_config.py | 170 ++++++++++++++++-- vllm_omni/diffusion/quantization/int8.py | 145 +++++++++------ 4 files changed, 247 insertions(+), 80 deletions(-) diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md index 8ce982c8f0a..b996c64de2c 100644 --- a/docs/user_guide/diffusion/quantization/overview.md +++ b/docs/user_guide/diffusion/quantization/overview.md @@ -17,9 +17,3 @@ vLLM-Omni supports quantization of DiT linear layers to reduce memory usage and | Ada/Hopper (SM 89+) | RTX 4090, H100, H200 | Full W8A8 with native hardware | Kernel selection is automatic. - -## Device Compatibility for Int8 - -| NPU Generation | Int8 Mode | -|---------------|----------| -| Atlas A2/A3 | W8A8 | diff --git a/docs/user_guide/diffusion_acceleration.md b/docs/user_guide/diffusion_acceleration.md index e61e0b7e337..ee3b28329b7 100644 --- a/docs/user_guide/diffusion_acceleration.md +++ b/docs/user_guide/diffusion_acceleration.md @@ -16,7 +16,7 @@ Both methods can provide significant speedups (typically **1.5x-2.0x**) while ma vLLM-Omni also supports quantization methods: -3. **[Quantization](diffusion/quantization/overview.md)** - Reduces DiT linear layers from BF16 to FP8 or Int8 (for NPU), providing ~1.28x speedup with minimal quality loss. Supports per-layer skip for sensitive layers. +3. **[Quantization](diffusion/quantization/overview.md)** - Reduces DiT linear layers from BF16 to FP8 or Int8, providing ~1.28x speedup with minimal quality loss. Supports per-layer skip for sensitive layers. vLLM-Omni also supports parallelism methods for diffusion models, including: @@ -46,7 +46,7 @@ vLLM-Omni also supports parallelism methods for diffusion models, including: | Method | Configuration | Description | Best For | |--------|--------------|-------------|----------| | **FP8** | `quantization="fp8"` | FP8 W8A8 on Ada/Hopper, weight-only on older GPUs | Memory reduction, inference speedup | -| **Int8** | `quantization="int8"` | Int8 W8A8 on NPUs | Memory reduction, inference speedup | +| **Int8** | `quantization="int8"` | Int8 W8A8 | Memory reduction, inference speedup | ## Supported Models @@ -335,7 +335,7 @@ outputs = omni.generate( ) ``` -### Using Int8 Quantization(only for NPU) +### Using Int8 Quantization ```python from vllm_omni import Omni diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index bb3acf1650a..77df4afba59 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -3,12 +3,26 @@ """Unit tests for Int8 quantization config.""" import pytest +from unittest.mock import Mock, MagicMock, patch +from pytest_mock import MockerFixture +import torch +from torch.nn import Parameter, Module + +from vllm.model_executor.layers.linear import ( + LinearBase, + UnquantizedLinearMethod +) +from vllm.model_executor.parameter import ( + ModelWeightParameter +) +from vllm_omni.diffusion.quantization import ( + get_diffusion_quant_config, + get_vllm_quant_config_for_layers, +) def test_int8_config_creation(): """Test that Int8 config can be created.""" - from vllm_omni.diffusion.quantization import get_diffusion_quant_config - config = get_diffusion_quant_config("int8") assert config is not None assert config.get_name() == "int8" @@ -16,11 +30,6 @@ def test_int8_config_creation(): def test_vllm_config_extraction(): """Test that vLLM config can be extracted from diffusion config.""" - from vllm_omni.diffusion.quantization import ( - get_diffusion_quant_config, - get_vllm_quant_config_for_layers, - ) - diff_config = get_diffusion_quant_config("int8") vllm_config = get_vllm_quant_config_for_layers(diff_config) assert vllm_config is not None @@ -29,11 +38,6 @@ def test_vllm_config_extraction(): def test_none_quantization(): """Test that None quantization returns None config.""" - from vllm_omni.diffusion.quantization import ( - get_diffusion_quant_config, - get_vllm_quant_config_for_layers, - ) - config = get_diffusion_quant_config(None) assert config is None vllm_config = get_vllm_quant_config_for_layers(config) @@ -42,16 +46,12 @@ def test_none_quantization(): def test_invalid_quantization(): """Test that invalid quantization method raises error.""" - from vllm_omni.diffusion.quantization import get_diffusion_quant_config - with pytest.raises(ValueError, match="Unknown quantization method"): get_diffusion_quant_config("invalid_method") def test_int8_config_with_custom_params(): """Test Int8 config with custom parameters.""" - from vllm_omni.diffusion.quantization import get_diffusion_quant_config - config = get_diffusion_quant_config( "int8", activation_scheme="dynamic", @@ -119,3 +119,141 @@ def test_quantization_conflicting_methods_warning(caplog): ) # No warning when methods match assert config.quantization_config is not None + + +def test_get_quant_method(mocker: MockerFixture): + """Test for get_quant_method method for GPU""" + from vllm_omni.diffusion.quantization.int8 import Int8OnlineLinearMethod + + diff_config = get_diffusion_quant_config("int8") + vllm_config = get_vllm_quant_config_for_layers(diff_config) + + def _fake_init(self, quant_config): + pass + + layer = MagicMock(spec=LinearBase) + mocker.patch.object(Int8OnlineLinearMethod, "__init__", _fake_init) + + prefix = "test_layer" + + # Mock the platform to be GPU + with patch('vllm_omni.platforms.current_omni_platform.is_npu', return_value=False): + method = vllm_config.get_quant_method(layer, prefix) + assert isinstance(method, Int8OnlineLinearMethod) + + # Test skipping quantization for a layer + vllm_config.ignored_layers = [prefix] + method = vllm_config.get_quant_method(layer, prefix) + assert isinstance(method, UnquantizedLinearMethod) + + +def test_get_npu_quant_method(): + """Test for get_quant_method method for NPU""" + from vllm_omni.diffusion.quantization.int8 import NPUInt8OnlineLinearMethod + + diff_config = get_diffusion_quant_config("int8") + vllm_config = get_vllm_quant_config_for_layers(diff_config) + + layer = MagicMock(spec=LinearBase) + prefix = "test_layer" + + # Mock the platform to be GPU + with patch('vllm_omni.platforms.current_omni_platform.is_npu', return_value=True): + method = vllm_config.get_quant_method(layer, prefix) + assert isinstance(method, NPUInt8OnlineLinearMethod) + + # Test skipping quantization for a layer + vllm_config.ignored_layers = [prefix] + method = vllm_config.get_quant_method(layer, prefix) + assert isinstance(method, UnquantizedLinearMethod) + + +class TestInt8LinearMethod: + @pytest.fixture + def mock_quant_config(self, mocker): + return mocker.Mock() + + @pytest.fixture + def mock_kernel(self, mocker): + kernel = mocker.Mock() + kernel.process_weights_after_loading = mocker.Mock() + kernel.apply_weights = mocker.Mock(return_value=torch.randn(1, 10)) + return kernel + + @pytest.fixture + def patch_deps(self, mocker, mock_kernel): + # mock init_int8_linear_kernel + mocker.patch("vllm_omni.diffusion.quantization.int8.init_int8_linear_kernel", + return_value=mock_kernel) + return mock_kernel + + def test_init(self, patch_deps, mock_quant_config): + # test for Int8LinearMethod init + from vllm_omni.diffusion.quantization.int8 import Int8LinearMethod + from vllm_omni.diffusion.quantization.int8 import init_int8_linear_kernel + + method = Int8LinearMethod(mock_quant_config) + + assert method.quant_config == mock_quant_config + init_int8_linear_kernel.assert_called_once_with( + is_channelwise=False, + is_static_input_scheme=False, + input_symmetric=True, + module_name="Int8LinearMethod" + ) + assert method.int8_linear == patch_deps + + def test_process_weights_after_loading(self, patch_deps, mock_quant_config): + from vllm_omni.diffusion.quantization.int8 import Int8LinearMethod + method = Int8LinearMethod(mock_quant_config) + layer = Module() + + method.process_weights_after_loading(layer) + patch_deps.process_weights_after_loading.assert_called_once_with(layer) + + def test_apply(self, patch_deps, mock_quant_config): + from vllm_omni.diffusion.quantization.int8 import Int8LinearMethod + method = Int8LinearMethod(mock_quant_config) + layer = Module() + x = torch.randn(1, 128) + bias = torch.randn(128) + + output = method.apply(layer, x, bias) + + patch_deps.apply_weights.assert_called_once_with(layer, x, bias) + assert isinstance(output, torch.Tensor) + + +class TestInt8OnlineLinearMethod: + @pytest.fixture + def mock_quant_config(self, mocker): + return mocker.Mock() + + @pytest.fixture + def mock_deps(self, mocker): + # mock kernel + mock_kernel = mocker.Mock() + mocker.patch("vllm_omni.diffusion.quantization.int8.init_int8_linear_kernel", + return_value=mock_kernel) + mocker.patch("vllm_omni.diffusion.quantization.int8.replace_parameter") + + # mock scaled_int8_quant return value + mock_qweight = torch.ones((128, 64), dtype=torch.int8) + mock_scale = torch.tensor([0.5]) + mock_quant = mocker.patch("vllm_omni.diffusion.quantization.int8.ops.scaled_int8_quant", + return_value=(mock_qweight, mock_scale, None)) + return { + "kernel": mock_kernel, + "quant": mock_quant, + "mock_qweight": mock_qweight, + "mock_scale": mock_scale + } + + def test_process_weights_after_loading(self, mock_deps, mock_quant_config): + from vllm_omni.diffusion.quantization.int8 import Int8OnlineLinearMethod + + method = Int8OnlineLinearMethod(mock_quant_config) + layer = Module() + layer.weight = Parameter(torch.randn(128, 64)) + method.process_weights_after_loading(layer) + mock_deps["quant"].assert_called_once_with(layer.weight, scale=None) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 04f36d579af..4cea27e5964 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -7,6 +7,7 @@ import torch from torch.nn import Module +from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( LinearBase, @@ -18,6 +19,9 @@ QuantizationConfig, QuantizeMethodBase, ) +from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( + init_int8_linear_kernel, +) from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped, ) @@ -48,8 +52,6 @@ def create_int8_weight_parameter( """ Create int8 weight parameter. """ - from vllm.model_executor.parameter import ModelWeightParameter - return ModelWeightParameter( data=torch.empty( output_size_per_partition, @@ -154,7 +156,10 @@ def get_quant_method( ): return UnquantizedLinearMethod() if not self.is_checkpoint_int8_serialized: - online_method = Int8OnlineLinearMethod(self) + if current_omni_platform.is_cuda(): + online_method = Int8OnlineLinearMethod(self) + elif current_omni_platform.is_npu(): + online_method = NPUInt8OnlineLinearMethod(self) return online_method else: offline_method = Int8LinearMethod(self) @@ -164,7 +169,7 @@ def get_quant_method( return None -class Int8LinearMethod(LinearMethodBase): +class BaseInt8LinearMethod(LinearMethodBase): """ Linear method for Int8 Supports loading Int8 checkpoints with static weight scale and dynamic activation scale. @@ -201,32 +206,75 @@ def create_weights( ) layer.register_parameter("weight", weight) - # WEIGHT OFFSET - offset = create_int8_scale_parameter( - ChannelQuantScaleParameter, - output_partition_sizes, - input_size_per_partition, - None, - weight_loader, - params_dtype, - ) - layer.register_parameter("weight_offset", offset) + if self.quant_config.is_checkpoint_int8_serialized: + scale = create_int8_scale_parameter( + ChannelQuantScaleParameter, + output_partition_sizes, + input_size_per_partition, + None, + weight_loader, + params_dtype, + ) + layer.register_parameter("weight_scale", scale) - # WEIGHT SCALE - scale = create_int8_scale_parameter( - ChannelQuantScaleParameter, - output_partition_sizes, - input_size_per_partition, - None, - weight_loader, - params_dtype, + def process_weights_after_loading(self, layer: Module) -> None: + raise NotImplementedError(f"No BaseInt8LinearMethod process_weights_after_loading implementation.") + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + raise NotImplementedError(f"No BaseInt8LinearMethod apply implementation.") + + +class Int8LinearMethod(BaseInt8LinearMethod): + """ + Linear method for Int8 + Supports loading Int8 checkpoints. + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: Int8Config): + super().__init__(quant_config) + + self.int8_linear = init_int8_linear_kernel( + is_channelwise=False, + is_static_input_scheme=False, + input_symmetric=True, + module_name=self.__class__.__name__, ) - layer.register_parameter("weight_scale", scale) + + def process_weights_after_loading(self, layer: Module) -> None: + self.int8_linear.process_weights_after_loading(layer) + + def apply( + self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.int8_linear.apply_weights(layer, x, bias) + + +class NPUInt8LinearMethod(BaseInt8LinearMethod): + """ + NPU Linear method for Int8 + Supports loading Int8 checkpoints. + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: Int8Config): + super().__init__(quant_config) def process_weights_after_loading(self, layer: Module) -> None: layer.weight.data = layer.weight.data.t().contiguous() layer.weight_scale.data = layer.weight_scale.data.squeeze() - layer.weight_offset.data = layer.weight_offset.data.squeeze() def apply( self, @@ -260,47 +308,34 @@ class Int8OnlineLinearMethod(Int8LinearMethod): and quantized the weights during loading. """ - def create_weights( - self, - layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: list[int], - input_size: int, - output_size: int, - params_dtype: torch.dtype, - **extra_weight_attrs, - ): - output_size_per_partition = sum(output_partition_sizes) - weight_loader = extra_weight_attrs.get("weight_loader") - layer.logical_widths = output_partition_sizes - layer.input_size_per_partition = input_size_per_partition - layer.output_size_per_partition = output_size_per_partition - layer.orig_dtype = params_dtype + def process_weights_after_loading(self, layer: Module) -> None: + qweight, weight_scale = ops.scaled_int8_quant(layer.weight, scale=None) + weight = qweight.t() - weight = ModelWeightParameter( - data=torch.empty( - output_size_per_partition, - input_size_per_partition, - dtype=params_dtype, - ), - input_dim=1, - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight", weight) + # Update layer with new values. + replace_parameter(layer, "weight", weight) + replace_parameter(layer, "weight_scale", weight_scale) + + +class NPUInt8OnlineLinearMethod(NPUInt8LinearMethod): + """ + NPU Online version of Int8LinearMethod, loads the fp16/bf16 checkpoint + and quantized the weights during loading. + """ def process_weights_after_loading(self, layer: Module) -> None: import torch_npu - qweight, weight_scale = torch_npu.npu_dynamic_quant(layer.weight) + weight = layer.weight + qweight, weight_scale = torch_npu.npu_dynamic_quant(weight) - layer.weight = None + del weight torch.npu.empty_cache() - weight = qweight.t().contiguous() + qweight = qweight.t().contiguous() # Update layer with new values. - replace_parameter(layer, "weight", weight) + replace_parameter(layer, "weight", qweight) replace_parameter(layer, "weight_scale", weight_scale) From d5ac438e990b98e332fe6ec7d438c2109606608e Mon Sep 17 00:00:00 2001 From: Alicia <115451386+congw729@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:00:33 +0800 Subject: [PATCH 12/26] [CI] Add scripts for bechmark collection and email distribution. (#1307) Signed-off-by: Alicia <115451386+congw729@users.noreply.github.com> --- .buildkite/test-nightly.yml | 27 +- pyproject.toml | 3 +- tests/perf/scripts/run_benchmark.py | 17 +- tools/nightly/generate_nightly_perf_excel.py | 437 +++++++++++++++++++ tools/nightly/send_nightly_perf_email.py | 233 ++++++++++ 5 files changed, 704 insertions(+), 13 deletions(-) create mode 100644 tools/nightly/generate_nightly_perf_excel.py create mode 100644 tools/nightly/send_nightly_perf_email.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 7b535033e05..8b21acfa7c5 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -1,5 +1,5 @@ steps: - - label: "Omni Model Test with H100" + - label: ":full_moon: Omni Model Test with H100" timeout_in_minutes: 90 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" @@ -41,7 +41,7 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate - - label: "Omni Model Test" + - label: ":full_moon: Omni Model Test" timeout_in_minutes: 60 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" @@ -56,18 +56,22 @@ steps: always-pull: true shm-size: "8gb" propagate-environment: true + shm-size: "8gb" environment: - "HF_HOME=/fsx/hf_cache" volumes: - "/fsx/hf_cache:/fsx/hf_cache" - - label: "Omni Model Perf Test" - timeout_in_minutes: 120 + - label: ":full_moon: Omni Model Perf Test with H100" + key: nightly-performance + timeout_in_minutes: 180 depends_on: upload-nightly-pipeline if: build.env("NIGHTLY") == "1" commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export BENCHMARK_DIR=tests - pytest -s -v tests/perf/scripts/run_benchmark.py + - buildkite-agent artifact upload "tests/*.json" agents: queue: "mithril-h100-pool" plugins: @@ -96,3 +100,18 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + + - label: ":email: Nightly Perf Collection & Email" + key: nightly-perf-distribution + depends_on: nightly-performance + if: build.env("NIGHTLY") == "1" + commands: + - pip install openpyxl + - export DEFAULT_INPUT_DIR=tests + - export DEFAULT_OUTPUT_DIR=tests + - buildkite-agent artifact download "tests/*.json" . --step nightly-performance + - python tools/nightly/generate_nightly_perf_excel.py + - python tools/nightly/send_nightly_perf_email.py + - buildkite-agent artifact upload "tests/*.xlsx" + agents: + queue: "cpu_queue_premerge" diff --git a/pyproject.toml b/pyproject.toml index 73ad69c253e..95e43d87199 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,8 @@ dev = [ "imageio[ffmpeg]>=0.6.0", "opencv-python>=4.12.0.88", "mooncake-transfer-engine==0.3.8.post1", - "av" # for ComfyUI tests + "av", # for ComfyUI tests + "openpyxl>=3.0.0", # for nightly CI ] docs = [ diff --git a/tests/perf/scripts/run_benchmark.py b/tests/perf/scripts/run_benchmark.py index 6886661629e..113c7e8264e 100644 --- a/tests/perf/scripts/run_benchmark.py +++ b/tests/perf/scripts/run_benchmark.py @@ -1,9 +1,5 @@ -import os - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" -os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" - import json +import os import subprocess import threading from datetime import datetime @@ -14,6 +10,9 @@ from tests.conftest import OmniServer, modify_stage_config +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + def load_configs(config_path: str) -> list[dict[str, Any]]: try: @@ -123,6 +122,8 @@ def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_promp "--endpoint", "/v1/chat/completions", "--save-result", + "--result-dir", + os.environ.get("BENCHMARK_DIR", "tests"), "--result-filename", result_filename, ] @@ -137,9 +138,9 @@ def run_benchmark(args: list, test_name: str, flow, dataset_name: str, num_promp for line in iter(process.stderr.readline, ""): print(line, end=" ") - if "--result-dir" in args: - index = args.index("--result-dir") - result_dir = args[index + 1] + if "--result-dir" in command: + index = command.index("--result-dir") + result_dir = command[index + 1] else: result_dir = "./" diff --git a/tools/nightly/generate_nightly_perf_excel.py b/tools/nightly/generate_nightly_perf_excel.py new file mode 100644 index 00000000000..a9376475d8c --- /dev/null +++ b/tools/nightly/generate_nightly_perf_excel.py @@ -0,0 +1,437 @@ +#!/usr/bin/env python3 +""" +Generate a nightly Excel performance report from JSON results. + +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +from collections.abc import Iterable, Sequence +from datetime import datetime, timezone +from typing import Any + +from openpyxl import Workbook +from openpyxl.styles import PatternFill +from openpyxl.utils import get_column_letter + +LOGGER = logging.getLogger(__name__) + +GREY_BLOCK_FILL = PatternFill(start_color="D3D3D3", fill_type="solid") + +# Benchmark metric columns: grey the latest row's cell when value changed vs previous date. +BENCHMARK_COLUMNS: tuple[str, ...] = ( + "num_prompts", + "request_rate", + "burstiness", + "max_concurrency", + "duration", + "completed", + "failed", + "request_throughput", + "output_throughput", + "total_token_throughput", + "mean_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "p99_tpot_ms", + "mean_itl_ms", + "p99_itl_ms", + "mean_e2el_ms", + "p99_e2el_ms", + "mean_audio_rtf", + "p99_audio_rtf", + "mean_audio_duration_s", + "p99_audio_duration_s", +) +# Columns that get float coercion and number format in Excel. Excludes request_rate ("inf" str) +# and max_concurrency (null); leave those as-is. If they become float in the future, they are +# still written correctly from JSON without coercion here. +NUMERIC_FORMAT_COLUMNS: tuple[str, ...] = tuple( + c for c in BENCHMARK_COLUMNS if c not in ("request_rate", "max_concurrency") +) + +_COLUMNS_FILENAME = "nightly_perf_summary_columns.txt" +DEFAULT_INPUT_DIR = os.getenv("DEFAULT_INPUT_DIR") if os.getenv("DEFAULT_INPUT_DIR") else "tests" +DEFAULT_OUTPUT_DIR = os.getenv("DEFAULT_OUTPUT_DIR") if os.getenv("DEFAULT_OUTPUT_DIR") else "tests" + + +def _load_summary_columns(script_dir: str) -> list[str]: + """Load summary column names from a file next to this script; fallback to default if missing.""" + path = os.path.join(script_dir, _COLUMNS_FILENAME) + default = [ + "date", + "endpoint_type", + "backend", + "model_id", + "tokenizer_id", + "num_prompts", + "request_rate", + "burstiness", + "max_concurrency", + "duration", + "completed", + "failed", + "request_throughput", + "output_throughput", + "total_token_throughput", + "mean_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "p99_tpot_ms", + "mean_itl_ms", + "p99_itl_ms", + "mean_e2el_ms", + "p99_e2el_ms", + "mean_audio_rtf", + "p99_audio_rtf", + "mean_audio_duration_s", + "p99_audio_duration_s", + "commit_sha", + "build_id", + "build_url", + "source_file", + ] + if not os.path.isfile(path): + return default + columns: list[str] = [] + with open(path, encoding="utf-8") as f: + for line in f: + s = line.strip() + if s and not s.startswith("#"): + columns.append(s) + return columns if columns else default + + +def _vllm_omni_root() -> str: + """Resolve vllm-omni repo root: directory that contains a 'tests' subdir (and usually 'tools').""" + path = os.path.dirname(os.path.abspath(__file__)) + while path and path != os.path.dirname(path): + if os.path.isdir(os.path.join(path, "tests")): + return path + path = os.path.dirname(path) + return os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")) + + +def _default_input_dir() -> str: + """Default: vllm-omni root / DEFAULT_INPUT_DIR (where performance JSON files live).""" + root = _vllm_omni_root() + return os.path.join(root, DEFAULT_INPUT_DIR) + + +def _default_output_file() -> str: + """Default: vllm-omni root / DEFAULT_OUTPUT_DIR / nightly_perf_.xlsx.""" + ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S") + return os.path.join(_vllm_omni_root(), DEFAULT_OUTPUT_DIR, f"nightly_perf_{ts}.xlsx") + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Read performance JSON files from vllm-omni/tests/ and generate an Excel report." + ) + parser.add_argument( + "--input-dir", + type=str, + default=_default_input_dir(), + help="Directory containing performance JSON files; default is /DEFAULT_INPUT_DIR.", + ) + parser.add_argument( + "--output-file", + type=str, + default=_default_output_file(), + help="Output path of the Excel report; \ + default is /DEFAULT_OUTPUT_DIR/nightly_perf_.xlsx.", + ) + parser.add_argument( + "--commit-sha", + type=str, + default=None, + help="Optional commit SHA; defaults to environment variable BUILDKITE_COMMIT if unset.", + ) + parser.add_argument( + "--build-id", + type=str, + default=None, + help="Optional build ID; defaults to environment variable BUILDKITE_BUILD_ID if unset.", + ) + parser.add_argument( + "--build-url", + type=str, + default=None, + help="Optional build URL; defaults to environment variable BUILDKITE_BUILD_URL if unset.", + ) + return parser.parse_args() + + +def _load_json_file(path: str) -> dict[str, Any] | None: + """Safely load a single JSON file; return None and log a warning on failure.""" + try: + with open(path, encoding="utf-8") as f: + data = json.load(f) + except (OSError, json.JSONDecodeError) as exc: + LOGGER.warning("failed to load json '%s': %s", path, exc) + return None + + if not isinstance(data, dict): + LOGGER.warning("json root in '%s' is not an object, skip", path) + return None + + return data + + +def _iter_json_records(input_dir: str) -> Iterable[dict[str, Any]]: + """Iterate over JSON files in the input directory and yield normalized records. + commit_sha/build_id/build_url are not set here; they are applied later only to + rows with the latest date (see _apply_build_metadata_to_latest_only). + """ + if not os.path.isdir(input_dir): + LOGGER.warning("input dir '%s' does not exist or is not a directory", input_dir) + return + + for entry in sorted(os.listdir(input_dir)): + if not entry.endswith(".json"): + continue + full_path = os.path.join(input_dir, entry) + if not os.path.isfile(full_path): + continue + + data = _load_json_file(full_path) + if data is None: + continue + + record: dict[str, Any] = dict(data) + record.setdefault("date", datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")) + record["source_file"] = os.path.basename(full_path) + yield record + + +def _collect_records(input_dir: str) -> list[dict[str, Any]]: + """Collect all JSON records into a list.""" + records: list[dict[str, Any]] = [] + for record in _iter_json_records(input_dir): + records.append(record) + return records + + +def _apply_build_metadata_to_latest_only( + records: Sequence[dict[str, Any]], + commit_sha: str | None, + build_id: str | None, + build_url: str | None, +) -> None: + """Set commit_sha, build_id, build_url only on rows with the latest date. + Other rows get None so that build info is not duplicated for older benchmark data. + """ + if not records: + return + max_date = max((r.get("date") or "") for r in records) + for r in records: + if (r.get("date") or "") == max_date: + r["commit_sha"] = commit_sha + r["build_id"] = build_id + r["build_url"] = build_url + else: + r["commit_sha"] = None + r["build_id"] = None + r["build_url"] = None + + +def _sort_records_for_summary(records: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Sort so that same model_id is grouped, newest date first within each group.""" + by_date_desc = sorted(records, key=lambda r: (r.get("date") or ""), reverse=True) + return sorted(by_date_desc, key=lambda r: (r.get("model_id") or "")) + + +def _values_differ(a: Any, b: Any) -> bool: + """Return True if two values are considered different; avoid direct float equality.""" + if a is None and b is None: + return False + if a is None or b is None: + return True + if isinstance(a, float) and isinstance(b, float): + if a != a and b != b: + return False + if a != a or b != b: + return True + return abs(a - b) > 1e-9 + return a != b + + +def _apply_benchmark_change_highlight( + ws, + summary_columns: Sequence[str], + records: Sequence[dict[str, Any]], +) -> None: + """Grey cells in the latest row of each model when a benchmark metric changed vs previous date.""" + if not records: + return + col_to_index = {c: i + 1 for i, c in enumerate(summary_columns)} + # Walk by model_id blocks (records already sorted by model_id, date desc). + i = 0 + while i < len(records): + model_id = records[i].get("model_id") + block_start = i + while i < len(records) and records[i].get("model_id") == model_id: + i += 1 + block_end = i + newest_idx = block_start + prev_idx = block_start + 1 if block_start + 1 < block_end else None + if prev_idx is None: + continue + excel_row = newest_idx + 2 + for col in BENCHMARK_COLUMNS: + if col not in col_to_index: + continue + cur_val = records[newest_idx].get(col) + prev_val = records[prev_idx].get(col) + if _values_differ(cur_val, prev_val): + ws.cell(row=excel_row, column=col_to_index[col]).fill = GREY_BLOCK_FILL + + +def _build_raw_columns( + records: Sequence[dict[str, Any]], + summary_columns: Sequence[str], +) -> list[str]: + """Infer the column set for the raw sheet based on all records.""" + keys: set[str] = set() + for record in records: + keys.update(record.keys()) + # Ensure summary columns appear first; remaining columns sorted alphabetically. + ordered_keys: list[str] = [] + for key in summary_columns: + if key in keys: + ordered_keys.append(key) + keys.discard(key) + ordered_keys.extend(sorted(keys)) + return ordered_keys + + +def _to_float_if_numeric(value: Any) -> Any: + """Coerce to float when possible so Excel treats as number; avoid #### from narrow columns.""" + if value is None: + return value + if isinstance(value, (int, float)): + return float(value) if isinstance(value, int) else value + if isinstance(value, str): + try: + return float(value) + except (ValueError, TypeError): + return value + return value + + +def _write_sheet( + ws, + columns: Sequence[str], + rows: Iterable[dict[str, Any]], + numeric_columns: Sequence[str] = (), +) -> None: + """Write column names and row data into the given worksheet.""" + numeric_set = set(numeric_columns) + ws.append(list(columns)) + for record in rows: + row_values = [] + for col in columns: + v = record.get(col) + if col in numeric_set: + v = _to_float_if_numeric(v) + row_values.append(v) + ws.append(row_values) + + +def _format_benchmark_columns( + ws, + columns: Sequence[str], + num_data_rows: int, +) -> None: + """Set number format and column width for numeric benchmark columns so values display (no ####).""" + numeric_set = set(NUMERIC_FORMAT_COLUMNS) + for c, col_name in enumerate(columns): + if col_name not in numeric_set: + continue + col_letter = get_column_letter(c + 1) + ws.column_dimensions[col_letter].width = 14 + for r in range(2, 2 + num_data_rows): + cell = ws.cell(row=r, column=c + 1) + if cell.value is not None and isinstance(cell.value, (int, float)): + cell.number_format = "0.0000" + elif isinstance(cell.value, str): + try: + cell.value = float(cell.value) + cell.number_format = "0.0000" + except (ValueError, TypeError): + pass + + +def _ensure_parent_dir(path: str) -> None: + """Ensure that the parent directory of the output file exists.""" + parent = os.path.dirname(os.path.abspath(path)) + if not parent: + return + os.makedirs(parent, exist_ok=True) + + +def generate_excel_report( + input_dir: str, + output_file: str, + commit_sha: str | None, + build_id: str | None, + build_url: str | None, +) -> None: + """Main logic: load JSON records and generate an Excel report.""" + script_dir = os.path.dirname(os.path.abspath(__file__)) + summary_columns = _load_summary_columns(script_dir) + + records = _collect_records(input_dir) + if not records: + LOGGER.warning("no valid json records found under '%s'", input_dir) + + sorted_records = _sort_records_for_summary(records) + _apply_build_metadata_to_latest_only(sorted_records, commit_sha, build_id, build_url) + + wb = Workbook() + ws_summary = wb.active + ws_summary.title = "summary" + + _write_sheet(ws_summary, summary_columns, sorted_records, numeric_columns=NUMERIC_FORMAT_COLUMNS) + _format_benchmark_columns(ws_summary, summary_columns, len(sorted_records)) + _apply_benchmark_change_highlight(ws_summary, summary_columns, sorted_records) + + if sorted_records: + raw_columns = _build_raw_columns(sorted_records, summary_columns) + ws_raw = wb.create_sheet(title="raw") + _write_sheet(ws_raw, raw_columns, sorted_records) + + _ensure_parent_dir(output_file) + wb.save(output_file) + LOGGER.info("excel report saved to '%s'", output_file) + + +def main() -> None: + """Command-line entrypoint.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + ) + + args = parse_args() + + commit_sha = args.commit_sha or os.getenv("BUILDKITE_COMMIT") + build_id = args.build_id or os.getenv("BUILDKITE_BUILD_ID") + build_url = args.build_url or os.getenv("BUILDKITE_BUILD_URL") + + generate_excel_report( + input_dir=args.input_dir, + output_file=args.output_file, + commit_sha=commit_sha, + build_id=build_id, + build_url=build_url, + ) + + +if __name__ == "__main__": + main() diff --git a/tools/nightly/send_nightly_perf_email.py b/tools/nightly/send_nightly_perf_email.py new file mode 100644 index 00000000000..af11cc73a06 --- /dev/null +++ b/tools/nightly/send_nightly_perf_email.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +""" +Send the nightly performance Excel report by email to DAILY_EMAIL_LIST. + +Reads SMTP and recipient config from environment variables. Use --dry-run to +validate config and print subject/body without sending. +""" + +from __future__ import annotations + +import argparse +import logging +import os +import smtplib +import sys +from email.mime.application import MIMEApplication +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + +LOGGER = logging.getLogger(__name__) + +# Env keys for SMTP and recipients. +ENV_SMTP_HOST = "SMTP_HOST" +ENV_SMTP_PORT = "SMTP_PORT" +ENV_SMTP_USERNAME = "SMTP_USERNAME" +ENV_SMTP_PASSWORD = "SMTP_PASSWORD" +ENV_DAILY_EMAIL_LIST = "DAILY_EMAIL_LIST" +ENV_EMAIL_SENDER = "EMAIL_SENDER" +ENV_EMAIL_SUBJECT_PREFIX = "EMAIL_SUBJECT_PREFIX" +ENV_BUILD_URL = "BUILDKITE_BUILD_URL" +ENV_COMMIT = "BUILDKITE_COMMIT" + +DEFAULT_OUTPUT_DIR = os.getenv("DEFAULT_OUTPUT_DIR") + +# Do not attach Excel if size >= this (bytes); body will suggest downloading from build URL. +MAX_ATTACHMENT_BYTES = 20 * 1024 * 1024 + +SMTP_RETRIES = 3 +SMTP_RETRY_DELAY_SEC = 5 + + +def _get_required_env() -> dict[str, str]: + """Read required env vars; raise SystemExit with clear message if any missing.""" + required = { + ENV_SMTP_HOST: os.environ.get(ENV_SMTP_HOST), + ENV_SMTP_PORT: os.environ.get(ENV_SMTP_PORT), + ENV_SMTP_USERNAME: os.environ.get(ENV_SMTP_USERNAME), + ENV_SMTP_PASSWORD: os.environ.get(ENV_SMTP_PASSWORD), + ENV_DAILY_EMAIL_LIST: os.environ.get(ENV_DAILY_EMAIL_LIST), + } + missing = [k for k, v in required.items() if not (v and str(v).strip())] + if missing: + raise SystemExit(f"Missing required env vars: {', '.join(missing)}. Set them (e.g. in Buildkite secrets).") + return {k: str(v).strip() for k, v in required.items()} + + +def _get_latest_file(folder_path: str) -> str: + """Get the latest modified file from the folder path.""" + files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".xlsx")] + if not files: + raise SystemExit(f"No Excel files found in {folder_path}") + return max(files, key=os.path.getmtime) + + +def _recipients_list(comma_separated: str) -> list[str]: + """Parse DAILY_EMAIL_LIST into a list of addresses.""" + return [a.strip() for a in comma_separated.split(",") if a.strip()] + + +def _build_body( + date_str: str, + commit_sha: str | None, + build_url: str | None, + attachment_skipped: bool, +) -> str: + """Plain-text body with build metadata; note when attachment was skipped.""" + lines = [ + f"Nightly performance report date: {date_str}", + "", + f"Commit: {commit_sha or 'N/A'}", + f"Build: {build_url or 'N/A'}", + "", + ] + if attachment_skipped: + lines.append( + "Report file was too large to attach. Please download the Excel from the build artifacts (Build URL above)." + ) + lines.append("") + return "\n".join(lines) + + +def _build_subject(prefix: str | None, date_str: str) -> str: + """Subject line: optional prefix + date.""" + base = f"Nightly Perf {date_str}" + if prefix and prefix.strip(): + return f"{prefix.strip()} {base}" + return base + + +def _send_mail( + report_file: str, + date_str: str, + dry_run: bool, +) -> None: + """Load config, build message, and send (or dry-run).""" + cfg = _get_required_env() + recipients = _recipients_list(cfg[ENV_DAILY_EMAIL_LIST]) + if not recipients: + raise SystemExit("DAILY_EMAIL_LIST is empty after parsing.") + + commit_sha = os.environ.get(ENV_COMMIT) + build_url = os.environ.get(ENV_BUILD_URL) + sender = os.environ.get(ENV_EMAIL_SENDER) or cfg[ENV_SMTP_USERNAME] + prefix = os.environ.get(ENV_EMAIL_SUBJECT_PREFIX) + + size = os.path.getsize(report_file) if os.path.isfile(report_file) else 0 + attach_excel = size < MAX_ATTACHMENT_BYTES and size > 0 + attachment_skipped = os.path.isfile(report_file) and not attach_excel + + body = _build_body( + commit_sha=commit_sha, build_url=build_url, date_str=date_str, attachment_skipped=attachment_skipped + ) + subject = _build_subject(prefix=prefix, date_str=date_str) + + msg = MIMEMultipart() + msg["Subject"] = subject + msg["From"] = sender + msg["To"] = ", ".join(recipients) + msg.attach(MIMEText(body, "plain", "utf-8")) + + if attach_excel: + with open(report_file, "rb") as f: + part = MIMEApplication(f.read(), _subtype="vnd.openxmlformats-officedocument.spreadsheetml.sheet") + part.add_header("Content-Disposition", "attachment", filename=os.path.basename(report_file)) + msg.attach(part) + + if dry_run: + LOGGER.info("dry-run: not sending mail") + print("To:", recipients, file=sys.stderr) + print("Subject:", subject, file=sys.stderr) + print("Attachment:", "yes" if attach_excel else "no (size limit)", file=sys.stderr) + print("Body preview:", body[:300] + ("..." if len(body) > 300 else ""), file=sys.stderr) + return + + port = int(cfg[ENV_SMTP_PORT], 10) + last_err: Exception | None = None + for attempt in range(SMTP_RETRIES): + try: + with smtplib.SMTP(cfg[ENV_SMTP_HOST], port=port, timeout=30) as smtp: + smtp.starttls() + smtp.login(cfg[ENV_SMTP_USERNAME], cfg[ENV_SMTP_PASSWORD]) + smtp.sendmail(sender, recipients, msg.as_string()) + LOGGER.info("sent nightly perf email to %d recipient(s)", len(recipients)) + return + except Exception as e: + last_err = e + LOGGER.warning("SMTP attempt %d/%d failed: %s", attempt + 1, SMTP_RETRIES, e) + if attempt < SMTP_RETRIES - 1: + import time + + time.sleep(SMTP_RETRY_DELAY_SEC) + raise SystemExit(f"Failed to send email after {SMTP_RETRIES} attempts.") from last_err + + +def _date_from_filename(path: str) -> str: + """Try to derive a date string from report filename (e.g. nightly_perf_20260211-020704.xlsx -> 2026-02-11).""" + base = os.path.splitext(os.path.basename(path))[0] + if base.startswith("nightly_perf_") and len(base) > 13: + raw = base.replace("nightly_perf_", "")[:8] + if len(raw) == 8 and raw.isdigit(): + return f"{raw[:4]}-{raw[4:6]}-{raw[6:8]}" + return base or "unknown" + + +def _vllm_omni_root() -> str: + """Resolve vllm-omni repo root: directory that contains a 'tests' subdir (and usually 'tools').""" + path = os.path.dirname(os.path.abspath(__file__)) + while path and path != os.path.dirname(path): + if os.path.isdir(os.path.join(path, "tests")): + return path + path = os.path.dirname(path) + return os.path.normpath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")) + + +def _default_output_dir() -> str: + """Default: vllm-omni root / DEFAULT_OUTPUT_DIR (where performance .xlsx files live).""" + root = _vllm_omni_root() + return os.path.join(root, DEFAULT_OUTPUT_DIR) + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Send nightly performance Excel report by email (config from env).", + ) + parser.add_argument( + "--report-file", + type=str, + default=_default_output_dir(), + help="Folder/file path to the nightly_perf_*.xlsx file; default is DEFAULT_OUTPUT_DIR.", + ) + parser.add_argument( + "--date", + type=str, + default=None, + help="Date string for subject/body (default: inferred from report filename).", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Print recipient, subject, and body; do not send.", + ) + return parser.parse_args() + + +def main() -> None: + """Entrypoint.""" + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(name)s: %(message)s", + ) + args = parse_args() + + report_file = _get_latest_file(args.report_file) if os.path.isdir(args.report_file) else args.report_file + if not os.path.isfile(report_file): + raise SystemExit(f"Report file not found: {report_file}") + + date_str = args.date or _date_from_filename(report_file) + _send_mail(report_file=report_file, date_str=date_str, dry_run=args.dry_run) + + +if __name__ == "__main__": + main() From 806285bd72f94ce7070d7db6f4d61647d3a9f9e5 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Thu, 12 Mar 2026 15:18:45 +0800 Subject: [PATCH 13/26] fix import Signed-off-by: juboyu <767868009@qq.com> --- .../text_to_image/text_to_image.py | 2 +- vllm_omni/diffusion/quantization/int8.py | 59 +++++++++++-------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 1cc195f0b35..0d31e0758b9 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -134,7 +134,7 @@ def parse_args() -> argparse.Namespace: default=None, choices=["fp8", "int8", "gguf"], help="Quantization method for the transformer. " - "Options: 'fp8' (FP8 W8A8 on Ada/Hopper, weight-only on older GPUs), 'int8' (Int8 W8A8 on NPUs), 'gguf' (GGUF quantized weights)." + "Options: 'fp8' (FP8 W8A8 on Ada/Hopper, weight-only on older GPUs), 'int8' (Int8 W8A8), 'gguf' (GGUF quantized weights)." "Default: None (no quantization, uses BF16).", ) parser.add_argument( diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index a23fd5d9d78..224d1492576 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -6,6 +6,12 @@ from typing import TYPE_CHECKING, Any, Optional import torch + +try: + import torch_npu +except (ImportError, ModuleNotFoundError): + torch_npu = None + from torch.nn import Module from vllm import _custom_ops as ops from vllm.logger import init_logger @@ -44,10 +50,11 @@ logger = init_logger(__name__) -def create_int8_weight_parameter( +def create_weight_parameter( output_size_per_partition: int, input_size_per_partition: int, weight_loader: Callable | None, + params_dtype: torch.dtype, ) -> torch.nn.Parameter: """ Create int8 weight parameter. @@ -56,7 +63,7 @@ def create_int8_weight_parameter( data=torch.empty( output_size_per_partition, input_size_per_partition, - dtype=torch.int8, + dtype=params_dtype, ), input_dim=1, output_dim=0, @@ -147,25 +154,29 @@ def get_quant_method( layer: torch.nn.Module, prefix: str, ) -> Optional["QuantizeMethodBase"]: - if current_omni_platform.is_npu(): - if isinstance(layer, LinearBase): - if is_layer_skipped( - prefix=prefix, - ignored_layers=self.ignored_layers, - fused_mapping=self.packed_modules_mapping, - ): - return UnquantizedLinearMethod() - if not self.is_checkpoint_int8_serialized: - if current_omni_platform.is_cuda(): - online_method = Int8OnlineLinearMethod(self) - elif current_omni_platform.is_npu(): - online_method = NPUInt8OnlineLinearMethod(self) - return online_method + if isinstance(layer, LinearBase): + if is_layer_skipped( + prefix=prefix, + ignored_layers=self.ignored_layers, + fused_mapping=self.packed_modules_mapping, + ): + return UnquantizedLinearMethod() + if not self.is_checkpoint_int8_serialized: + if current_omni_platform.is_cuda(): + online_method = Int8OnlineLinearMethod(self) + elif current_omni_platform.is_npu(): + online_method = NPUInt8OnlineLinearMethod(self) else: + logger.warning("The current platform is not supported.") + return online_method + else: + if current_omni_platform.is_cuda(): offline_method = Int8LinearMethod(self) - return offline_method - else: - logger.warning("The current platform is not supported.") + elif current_omni_platform.is_npu(): + online_method = NPUInt8LinearMethod(self) + else: + logger.warning("The current platform is not supported.") + return offline_method return None @@ -199,10 +210,12 @@ def create_weights( layer.output_size_per_partition = output_size_per_partition layer.orig_dtype = params_dtype - weight = create_int8_weight_parameter( + params_dtype = torch.int8 if self.quant_config.is_checkpoint_int8_serialized else params_dtype + weight = create_weight_parameter( output_size_per_partition=output_size_per_partition, input_size_per_partition=input_size_per_partition, weight_loader=weight_loader, + params_dtype=params_dtype, ) layer.register_parameter("weight", weight) @@ -282,8 +295,6 @@ def apply( x: torch.Tensor, bias: torch.Tensor | None = None, ) -> torch.Tensor: - import torch_npu - ori_shape = x.shape ori_dtype = x.dtype @@ -309,7 +320,7 @@ class Int8OnlineLinearMethod(Int8LinearMethod): """ def process_weights_after_loading(self, layer: Module) -> None: - qweight, weight_scale = ops.scaled_int8_quant(layer.weight, scale=None) + qweight, weight_scale, _ = ops.scaled_int8_quant(layer.weight, scale=None) weight = qweight.t() # Update layer with new values. @@ -324,8 +335,6 @@ class NPUInt8OnlineLinearMethod(NPUInt8LinearMethod): """ def process_weights_after_loading(self, layer: Module) -> None: - import torch_npu - weight = layer.weight qweight, weight_scale = torch_npu.npu_dynamic_quant(weight) From 23b42522e9c7da51f1715de7fbc51c08f6ddcfa9 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Thu, 12 Mar 2026 15:23:45 +0800 Subject: [PATCH 14/26] raise error in int8 unsupported platfrom Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 224d1492576..32b09c3703a 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -167,7 +167,7 @@ def get_quant_method( elif current_omni_platform.is_npu(): online_method = NPUInt8OnlineLinearMethod(self) else: - logger.warning("The current platform is not supported.") + raise NotImplementedError("The current platform is not supported int8 online quant.") return online_method else: if current_omni_platform.is_cuda(): @@ -175,7 +175,7 @@ def get_quant_method( elif current_omni_platform.is_npu(): online_method = NPUInt8LinearMethod(self) else: - logger.warning("The current platform is not supported.") + raise NotImplementedError("The current platform is not supported int8 offline quant.") return offline_method return None From 03a7e47a0a2f1270d4415c31444913795bea6e61 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Fri, 13 Mar 2026 19:42:55 +0800 Subject: [PATCH 15/26] fix npu int8 process_weights_after_loading unclear & complete test_int8_config.py Signed-off-by: juboyu <767868009@qq.com> --- .../diffusion/quantization/overview.md | 7 ++ .../text_to_image/text_to_image.py | 2 +- .../quantization/test_int8_config.py | 69 +++++++++++++++++-- vllm_omni/diffusion/quantization/int8.py | 62 ++++++----------- 4 files changed, 95 insertions(+), 45 deletions(-) diff --git a/docs/user_guide/diffusion/quantization/overview.md b/docs/user_guide/diffusion/quantization/overview.md index b996c64de2c..a95afdbf498 100644 --- a/docs/user_guide/diffusion/quantization/overview.md +++ b/docs/user_guide/diffusion/quantization/overview.md @@ -17,3 +17,10 @@ vLLM-Omni supports quantization of DiT linear layers to reduce memory usage and | Ada/Hopper (SM 89+) | RTX 4090, H100, H200 | Full W8A8 with native hardware | Kernel selection is automatic. + +## Device Compatibility for Int8 + +| Device Type | Generation | Example | Int8 Mode | +|-------------|---------------|-------------------|----------| +| NVIDIA GPU | Ada/Hopper (SM 89+) | RTX 4090, H100, H200 | Full W8A8 with native hardware | +| Ascend NPU | Atlas A2/Atlas A3 | Atlas 800T A2/Atlas 900 A3 | Full W8A8 with native hardware | diff --git a/examples/offline_inference/text_to_image/text_to_image.py b/examples/offline_inference/text_to_image/text_to_image.py index 0d31e0758b9..b3918645347 100644 --- a/examples/offline_inference/text_to_image/text_to_image.py +++ b/examples/offline_inference/text_to_image/text_to_image.py @@ -134,7 +134,7 @@ def parse_args() -> argparse.Namespace: default=None, choices=["fp8", "int8", "gguf"], help="Quantization method for the transformer. " - "Options: 'fp8' (FP8 W8A8 on Ada/Hopper, weight-only on older GPUs), 'int8' (Int8 W8A8), 'gguf' (GGUF quantized weights)." + "Options: 'fp8' (FP8 W8A8 on Ada/Hopper, weight-only on older GPUs), 'int8' (Int8 W8A8), 'gguf' (GGUF quantized weights). " "Default: None (no quantization, uses BF16).", ) parser.add_argument( diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 72126cdb333..71c7254937e 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -132,7 +132,8 @@ def _fake_init(self, quant_config): prefix = "test_layer" # Mock the platform to be GPU - with patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False): + with (patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False)): method = vllm_config.get_quant_method(layer, prefix) assert isinstance(method, Int8OnlineLinearMethod) @@ -152,8 +153,9 @@ def test_get_npu_quant_method(): layer = MagicMock(spec=LinearBase) prefix = "test_layer" - # Mock the platform to be GPU - with patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True): + # Mock the platform to be NPU + with (patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True)): method = vllm_config.get_quant_method(layer, prefix) assert isinstance(method, NPUInt8OnlineLinearMethod) @@ -225,12 +227,13 @@ def mock_quant_config(self, mocker): def mock_deps(self, mocker): # mock kernel mock_kernel = mocker.Mock() + mock_kernel.layer_param_names = ("weight", "weight_scale", "input_scale", "input_zero_point", "azp_adj") mocker.patch("vllm_omni.diffusion.quantization.int8.init_int8_linear_kernel", return_value=mock_kernel) mocker.patch("vllm_omni.diffusion.quantization.int8.replace_parameter") # mock scaled_int8_quant return value mock_qweight = torch.ones((128, 64), dtype=torch.int8) - mock_scale = torch.tensor([0.5]) + mock_scale = torch.randn(128) mock_quant = mocker.patch( "vllm_omni.diffusion.quantization.int8.ops.scaled_int8_quant", return_value=(mock_qweight, mock_scale, None) ) @@ -244,3 +247,61 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config): layer.weight = Parameter(torch.randn(128, 64)) method.process_weights_after_loading(layer) mock_deps["quant"].assert_called_once_with(layer.weight, scale=None) + + +class TestNPUInt8LinearMethod: + qweight_mock = torch.randn((128,64)).to(dtype=torch.int8) + scale_mock = torch.randn(128) + out_mock = torch.randn((16,128)) + + @pytest.fixture + def mock_torch_npu(self, mocker): + torch_npu = MagicMock() + + mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu", + return_value=torch_npu) + mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu.npu_dynamic_quant", + return_value=(self.qweight_mock, self.scale_mock)) + mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu.npu_quant_matmul", + return_value=self.out_mock) + return torch_npu + + @pytest.fixture + def mock_quant_config(self, mocker): + return mocker.Mock() + + @pytest.fixture + def mock_layer(self, mocker): + layer = torch.nn.Module() + layer.weight = torch.nn.Parameter(self.qweight_mock, requires_grad=False) + layer.weight_scale = torch.nn.Parameter(self.scale_mock, requires_grad=False) + return layer + + def test_npu_int8_process_weights_after_loading(self, mock_layer, mock_quant_config, mock_torch_npu): + from vllm_omni.diffusion.quantization.int8 import NPUInt8LinearMethod + + method = NPUInt8LinearMethod(mock_quant_config) + ori_weight_shape = mock_layer.weight.shape + + method.process_weights_after_loading(mock_layer) + + assert mock_layer.weight.shape == ori_weight_shape[::-1] + assert mock_layer.weight.is_contiguous() + + def test_npu_int8_apply(self, mock_layer, mock_quant_config, mock_torch_npu): + from vllm_omni.diffusion.quantization.int8 import NPUInt8LinearMethod + + method = NPUInt8LinearMethod(mock_quant_config) + x = torch.randn(1, 16, 64) + + output = method.apply(mock_layer, x) + assert output.shape == (1, 16, 128) + + def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mock_torch_npu): + from vllm_omni.diffusion.quantization.int8 import NPUInt8OnlineLinearMethod + + method = NPUInt8OnlineLinearMethod(mock_quant_config) + method.process_weights_after_loading(mock_layer) + + assert mock_layer.weight.shape == (64, 128) + assert torch.equal(mock_layer.weight_scale, self.scale_mock) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 32b09c3703a..5c6c4bb1880 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -71,29 +71,6 @@ def create_weight_parameter( ) -def create_int8_scale_parameter( - parameter_type: torch.nn.Parameter, - output_partition_sizes: list[int], - input_size_per_partition: int, - block_size: list[int] | None, - weight_loader: Callable | None, - params_dtype: torch.dtype, -) -> torch.nn.Parameter: - """ - Create scale parameter based on quantization strategy - """ - if parameter_type == ChannelQuantScaleParameter: - scale = parameter_type( - data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), - output_dim=0, - weight_loader=weight_loader, - ) - else: - raise ValueError(f"Unknown parameter type: {parameter_type}") - - return scale - - class Int8Config(QuantizationConfig): """ Config class for Int8. @@ -124,7 +101,8 @@ def get_supported_act_dtypes(cls) -> list[torch.dtype]: @classmethod def get_min_capability(cls) -> int: - return 75 + # Have verified on A100 and H20, but not on oldest versions. + return 80 @classmethod def get_config_filenames(cls) -> list[str]: @@ -138,7 +116,7 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): def from_config(cls, config: dict[str, Any]) -> "Int8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_int8_serialized = "int8" in quant_method - activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"], "dynamic") ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) if not ignored_layers: @@ -173,7 +151,7 @@ def get_quant_method( if current_omni_platform.is_cuda(): offline_method = Int8LinearMethod(self) elif current_omni_platform.is_npu(): - online_method = NPUInt8LinearMethod(self) + offline_method = NPUInt8LinearMethod(self) else: raise NotImplementedError("The current platform is not supported int8 offline quant.") return offline_method @@ -220,13 +198,10 @@ def create_weights( layer.register_parameter("weight", weight) if self.quant_config.is_checkpoint_int8_serialized: - scale = create_int8_scale_parameter( - ChannelQuantScaleParameter, - output_partition_sizes, - input_size_per_partition, - None, - weight_loader, - params_dtype, + scale = ChannelQuantScaleParameter( + data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader, ) layer.register_parameter("weight_scale", scale) @@ -320,13 +295,24 @@ class Int8OnlineLinearMethod(Int8LinearMethod): """ def process_weights_after_loading(self, layer: Module) -> None: + w_q_name, w_s_name, i_s_name, i_zp_name, azp_adj_name = self.int8_linear.layer_param_names qweight, weight_scale, _ = ops.scaled_int8_quant(layer.weight, scale=None) - weight = qweight.t() # Update layer with new values. - replace_parameter(layer, "weight", weight) - replace_parameter(layer, "weight_scale", weight_scale) + replace_parameter( + layer, + w_q_name, + torch.nn.Parameter(qweight.t().data, requires_grad=False) + ) + replace_parameter( + layer, + w_s_name, + torch.nn.Parameter(weight_scale.data, requires_grad=False) + ) + setattr(layer, i_s_name, None) + setattr(layer, i_zp_name, None) + setattr(layer, azp_adj_name, None) class NPUInt8OnlineLinearMethod(NPUInt8LinearMethod): """ @@ -338,9 +324,6 @@ def process_weights_after_loading(self, layer: Module) -> None: weight = layer.weight qweight, weight_scale = torch_npu.npu_dynamic_quant(weight) - del weight - torch.npu.empty_cache() - qweight = qweight.t().contiguous() # Update layer with new values. @@ -355,7 +338,6 @@ class DiffusionInt8Config(DiffusionQuantizationConfig): Args: activation_scheme: Activation quantization scheme. - "dynamic": Per-token dynamic scaling (default, no calibration) - Format: [block_n, block_k]. If None, uses per-tensor scaling. ignored_layers: List of layer name patterns to skip quantization. """ From d6eec8c3a67e6ee89784175cf30dcd252a1fab49 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Fri, 13 Mar 2026 19:58:45 +0800 Subject: [PATCH 16/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- .../quantization/test_int8_config.py | 28 +++++++++++-------- vllm_omni/diffusion/quantization/int8.py | 13 ++------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 71c7254937e..9ba66de19d1 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -132,8 +132,10 @@ def _fake_init(self, quant_config): prefix = "test_layer" # Mock the platform to be GPU - with (patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False)): + with ( + patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False) + ): method = vllm_config.get_quant_method(layer, prefix) assert isinstance(method, Int8OnlineLinearMethod) @@ -154,8 +156,10 @@ def test_get_npu_quant_method(): prefix = "test_layer" # Mock the platform to be NPU - with (patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True)): + with ( + patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True) + ): method = vllm_config.get_quant_method(layer, prefix) assert isinstance(method, NPUInt8OnlineLinearMethod) @@ -250,20 +254,20 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config): class TestNPUInt8LinearMethod: - qweight_mock = torch.randn((128,64)).to(dtype=torch.int8) + qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8) scale_mock = torch.randn(128) - out_mock = torch.randn((16,128)) + out_mock = torch.randn((16, 128)) @pytest.fixture def mock_torch_npu(self, mocker): torch_npu = MagicMock() - mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu", - return_value=torch_npu) - mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu.npu_dynamic_quant", - return_value=(self.qweight_mock, self.scale_mock)) - mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu.npu_quant_matmul", - return_value=self.out_mock) + mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu", return_value=torch_npu) + mocker.patch( + "vllm_omni.diffusion.quantization.int8.torch_npu.npu_dynamic_quant", + return_value=(self.qweight_mock, self.scale_mock), + ) + mocker.patch("vllm_omni.diffusion.quantization.int8.torch_npu.npu_quant_matmul", return_value=self.out_mock) return torch_npu @pytest.fixture diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 5c6c4bb1880..8436b544fbc 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -299,21 +299,14 @@ def process_weights_after_loading(self, layer: Module) -> None: qweight, weight_scale, _ = ops.scaled_int8_quant(layer.weight, scale=None) # Update layer with new values. - replace_parameter( - layer, - w_q_name, - torch.nn.Parameter(qweight.t().data, requires_grad=False) - ) - replace_parameter( - layer, - w_s_name, - torch.nn.Parameter(weight_scale.data, requires_grad=False) - ) + replace_parameter(layer, w_q_name, torch.nn.Parameter(qweight.t().data, requires_grad=False)) + replace_parameter(layer, w_s_name, torch.nn.Parameter(weight_scale.data, requires_grad=False)) setattr(layer, i_s_name, None) setattr(layer, i_zp_name, None) setattr(layer, azp_adj_name, None) + class NPUInt8OnlineLinearMethod(NPUInt8LinearMethod): """ NPU Online version of Int8LinearMethod, loads the fp16/bf16 checkpoint From a17f9bd9ac40b452e070fd5d8cc4745e47ba3163 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Fri, 13 Mar 2026 20:02:09 +0800 Subject: [PATCH 17/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- tests/diffusion/quantization/test_int8_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 9ba66de19d1..17ea0716d27 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -134,7 +134,7 @@ def _fake_init(self, quant_config): # Mock the platform to be GPU with ( patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=True), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False) + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=False), ): method = vllm_config.get_quant_method(layer, prefix) assert isinstance(method, Int8OnlineLinearMethod) @@ -158,7 +158,7 @@ def test_get_npu_quant_method(): # Mock the platform to be NPU with ( patch("vllm_omni.platforms.current_omni_platform.is_cuda", return_value=False), - patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True) + patch("vllm_omni.platforms.current_omni_platform.is_npu", return_value=True), ): method = vllm_config.get_quant_method(layer, prefix) assert isinstance(method, NPUInt8OnlineLinearMethod) From 302f0b6b225b36ec814294e66409115e26b3fcf0 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Tue, 17 Mar 2026 17:41:46 +0800 Subject: [PATCH 18/26] add smoke test & lazy weight loading Signed-off-by: juboyu <767868009@qq.com> --- .../quantization/test_int8_config.py | 164 +++++++++++++++++- vllm_omni/diffusion/quantization/int8.py | 138 ++++++++++++++- 2 files changed, 295 insertions(+), 7 deletions(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 17ea0716d27..f329c0288df 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -14,6 +14,17 @@ get_diffusion_quant_config, get_vllm_quant_config_for_layers, ) +from vllm_omni.platforms import current_omni_platform + +npu_available = pytest.mark.skipif( + not current_omni_platform.is_npu(), + reason="NPU platform not available." +) + +cuda_available = pytest.mark.skipif( + not current_omni_platform.is_npu(), + reason="GPU platform not available." +) def test_int8_config_creation(): @@ -252,7 +263,7 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config): method.process_weights_after_loading(layer) mock_deps["quant"].assert_called_once_with(layer.weight, scale=None) - +@npu_available class TestNPUInt8LinearMethod: qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8) scale_mock = torch.randn(128) @@ -309,3 +320,154 @@ def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mo assert mock_layer.weight.shape == (64, 128) assert torch.equal(mock_layer.weight_scale, self.scale_mock) + +@pytest.fixture +def quant_config(): + """Shared quant config fixture for smoke tests.""" + from vllm_omni.diffusion.quantization.int8 import Int8Config + + return Int8Config( + is_checkpoint_int8_serialized=False, + activation_scheme="dynamic", + ) + +@npu_available +class TestNPUInt8Smoke: + """Smoke tests using real torch_npu, only run on NPU.""" + + @pytest.fixture + def real_layer(self): + """Create a real linear layer with fp16 weights on NPU""" + layer = torch.nn.Module() + layer.weight = torch.nn.Parameter( + torch.randn(128, 64, dtype=torch.float16, device="npu"), + requires_grad=False, + ) + layer.logical_widths = [128] + layer.input_size_per_partition = 64 + layer.output_size_per_partition = 128 + layer.orig_dtype = torch.float16 + return layer + + def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer): + """Smoke test: verify npu_dynamic_quant returns correct shapes.""" + import torch_npu + + from vllm_omni.diffusion.quantization.int8 import NPUInt8OnlineLinearMethod + + method = NPUInt8OnlineLinearMethod(quant_config) + + # Call real torch_npu.npu_dynamic_quant + weight = real_layer.weight + qweight, scale = torch_npu.npu_dynamic_quant(weight) + + assert qweight.shape == weight.shape + assert qweight.dtype == torch.int8 + assert scale.shape == (weight.shape[0],) + + def test_real_npu_online_process_weights_after_loading( + self, quant_config, real_layer + ): + """Smoke test: full process_weights_after_loading with real torch_npu.""" + from vllm_omni.diffusion.quantization.int8 import NPUInt8OnlineLinearMethod + + method = NPUInt8OnlineLinearMethod(quant_config) + + method.process_weights_after_loading(real_layer) + + assert real_layer.weight.shape == (64, 128) + assert real_layer.weight.dtype == torch.int8 + assert hasattr(real_layer, "weight_scale") + assert real_layer.weight_scale.shape == (128,) + + def test_real_npu_int8_apply_forward(self, quant_config): + """Smoke test: forward pass with real npu_quant_matmul.""" + import torch_npu + + from vllm_omni.diffusion.quantization.int8 import NPUInt8LinearMethod + + method = NPUInt8LinearMethod(quant_config) + + # Create layer with pre-processed weights on NPU + layer = torch.nn.Module() + weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="npu") + qweight, scale = torch_npu.npu_dynamic_quant(weight_fp16) + layer.weight = torch.nn.Parameter(qweight.t().contiguous(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter(scale.squeeze(), requires_grad=False) + + # Forward pass on NPU + x = torch.randn(2, 16, 64, dtype=torch.float16, device="npu") + output = method.apply(layer, x) + + assert output.shape == (2, 16, 128) + assert output.dtype == torch.float16 + + +@cuda_available +class TestCudaInt8Smoke: + """Smoke tests using real CUDA kernels, only on CUDA""" + + @pytest.fixture + def real_layer(self): + """Create a real linear layer with fp16 weights on CUDA""" + layer = torch.nn.Module() + layer.weight = torch.nn.Parameter( + torch.randn(128, 64, dtype=torch.float16, device="cuda"), + requires_grad=False, + ) + layer.logical_widths = [128] + layer.input_size_per_partition = 64 + layer.output_size_per_partition = 128 + layer.orig_dtype = torch.float16 + return layer + + def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config): + """Smoke test: verify scaled_int8_quant returns correct shapes.""" + from vllm import _custom_ops as ops + + weight = torch.randn(128, 64, dtype=torch.float16, device="cuda") + qweight, scale, _ = ops.scaled_int8_quant(weight, scale=None) + + assert qweight.shape == weight.shape + assert qweight.dtype == torch.int8 + assert scale.shape == (weight.shape[0], 1) + + def test_real_cuda_online_process_weights_after_loading( + self, quant_config, real_layer + ): + """Smoke test: full process_weights_after_loading with real CUDA ops.""" + from vllm_omni.diffusion.quantization.int8 import Int8OnlineLinearMethod + + method = Int8OnlineLinearMethod(quant_config) + + method.process_weights_after_loading(real_layer) + + assert real_layer.weight.shape == (64, 128) + assert real_layer.weight.dtype == torch.int8 + assert hasattr(real_layer, "weight_scale") + + def test_real_cuda_int8_apply_forward(self, quant_config): + """Smoke test: forward pass with real CUDA int8 kernel.""" + from vllm import _custom_ops as ops + from vllm_omni.diffusion.quantization.int8 import Int8LinearMethod + + method = Int8LinearMethod(quant_config) + + # Create layer with pre-processed weights + layer = torch.nn.Module() + weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="cuda") + qweight, scale, _ = ops.scaled_int8_quant(weight_fp16, scale=None) + layer.weight = torch.nn.Parameter(qweight.t(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter(scale, requires_grad=False) + + # Set required attributes for kernel + layer.input_scale = None + layer.input_zero_point = None + layer.azp_adj = None + + # Forward pass + x = torch.randn(2, 16, 64, dtype=torch.float16, device="cuda") + output = method.apply(layer, x) + + assert output.shape == (2, 16, 128) + assert output.dtype == torch.float16 diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 8436b544fbc..5960a39a8a5 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -7,12 +7,13 @@ import torch -try: +if current_omni_platform.is_npu(): import torch_npu -except (ImportError, ModuleNotFoundError): +else: torch_npu = None from torch.nn import Module +from torch.utils._python_dispatch import TorchDispatchMode from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( @@ -25,17 +26,22 @@ QuantizationConfig, QuantizeMethodBase, ) +from vllm.model_executor.layers.quantization.fp8 import ( + CopyNumelCounter, + _copy_missing_attrs +) from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( init_int8_linear_kernel, ) from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped, ) +from vllm.model_executor.model_loader.weight_utils import initialize_single_dummy_weight from vllm.model_executor.parameter import ( ChannelQuantScaleParameter, ModelWeightParameter, ) -from vllm.model_executor.utils import replace_parameter +from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm_omni.platforms import current_omni_platform @@ -116,7 +122,7 @@ def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): def from_config(cls, config: dict[str, Any]) -> "Int8Config": quant_method = cls.get_from_keys(config, ["quant_method"]) is_checkpoint_int8_serialized = "int8" in quant_method - activation_scheme = cls.get_from_keys(config, ["activation_scheme"], "dynamic") + activation_scheme = cls.get_from_keys_or(config, ["activation_scheme"], "dynamic") ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) if not ignored_layers: @@ -216,6 +222,92 @@ def apply( ) -> torch.Tensor: raise NotImplementedError("No BaseInt8LinearMethod apply implementation.") +class LazyWeightMixin: + """ + Mixin for lazy weight loading with meta device. + weighs are created on meta device and materialized just-in-time during loadding. + """ + uses_meta_device: bool = True + + def create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: list[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + output_size_per_partition = sum(output_partition_sizes) + weight_loader = extra_weight_attrs.get("weight_loader") + layer.logical_widths = output_partition_sizes + layer.input_size_per_partition = input_size_per_partition + layer.output_size_per_partition = output_size_per_partition + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + # WEIGHT + def patched_weight_loader(param, loaded_weight, *args, **kwargs): + # track how many elements we have updated + if not hasattr(layer, "_loaded_numel"): + layer._loaded_numel = 0 + + # when the first `loaded_weight` is about to be + # loaded to `param`, materialize `param` just-in-time + weight = ModelWeightParameter( + data=torch.empty_like(layer.weight, device=layer._load_device), + input_dim=1, + output_dim=0, + weight_loader=patched_weight_loader, + ) + _copy_missing_attrs(layer.weight, weight) + layer.register_parameter("weight", weight) + del layer._load_device + + # refresh the reference to `param` to reflect just-in-time + # materialization + param = layer.weight + + # load the current weight chunk + copy_numel_counter = CopyNumelCounter() + with copy_numel_counter: + res = weight_loader(param, loaded_weight, *args, **kwargs) # type: ignore[misc] + layer._loaded_numel += copy_numel_counter.copied_numel + + # if we have loaded all of the elements, call + # process_weights_after_loading + target_loaded_numel = layer.weight.numel() + if layer._loaded_numel == target_loaded_numel: + self.process_weights_after_loading(layer) + + # Prevent the usual `process_weights_after_loading` call from doing + # anything + layer._already_called_process_weights_after_loading = True + + # Note that we keep `layer._loaded_numel` around just in case + # there is logic added to vllm in the future which calls a + # weight loader twice - we do not want to re-initialize in + # that case. + + return res + + weight = ModelWeightParameter( + data=torch.empty( + output_size_per_partition, + input_size_per_partition, + # materialized just-in-time in `patched_weight_loader` + device="meta", + dtype=params_dtype, + ), + input_dim=1, + output_dim=0, + weight_loader=patched_weight_loader, + ) + # stash the correct device for `patched_weight_loader` + layer._load_device = torch.get_default_device() + layer.register_parameter("weight", weight) + class Int8LinearMethod(BaseInt8LinearMethod): """ @@ -288,13 +380,27 @@ def apply( return output -class Int8OnlineLinearMethod(Int8LinearMethod): +class Int8OnlineLinearMethod(LazyWeightMixin, Int8LinearMethod): """ Online version of Int8LinearMethod, loads the fp16/bf16 checkpoint and quantized the weights during loading. """ def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return + + if layer.weight.device == torch.device("meta"): + weight = ModelWeightParameter( + data=torch.empty_like(layer.weight, device=layer._load_device), + input_dim=1, + output_dim=0, + weight_loader=layer.weight.weight_loader, + ) + _copy_missing_attrs(layer.weight, weight) + layer.register_parameter("weight", weight) + initialize_single_dummy_weight(layer.weight) + w_q_name, w_s_name, i_s_name, i_zp_name, azp_adj_name = self.int8_linear.layer_param_names qweight, weight_scale, _ = ops.scaled_int8_quant(layer.weight, scale=None) @@ -306,14 +412,31 @@ def process_weights_after_loading(self, layer: Module) -> None: setattr(layer, i_zp_name, None) setattr(layer, azp_adj_name, None) + # Prevent duplicate processing (e.g., during weight reload) + layer._already_called_process_weights_after_loading = True -class NPUInt8OnlineLinearMethod(NPUInt8LinearMethod): + +class NPUInt8OnlineLinearMethod(LazyWeightMixin, NPUInt8LinearMethod): """ NPU Online version of Int8LinearMethod, loads the fp16/bf16 checkpoint and quantized the weights during loading. """ def process_weights_after_loading(self, layer: Module) -> None: + if getattr(layer, "_already_called_process_weights_after_loading", False): + return + + if layer.weight.device == torch.device("meta"): + weight = ModelWeightParameter( + data=torch.empty_like(layer.weight, device=layer._load_device), + input_dim=1, + output_dim=0, + weight_loader=layer.weight.weight_loader, + ) + _copy_missing_attrs(layer.weight, weight) + layer.register_parameter("weight", weight) + initialize_single_dummy_weight(layer.weight) + weight = layer.weight qweight, weight_scale = torch_npu.npu_dynamic_quant(weight) @@ -323,6 +446,9 @@ def process_weights_after_loading(self, layer: Module) -> None: replace_parameter(layer, "weight", qweight) replace_parameter(layer, "weight_scale", weight_scale) + # Prevent duplicate processing (e.g., during weight reload) + layer._already_called_process_weights_after_loading = True + class DiffusionInt8Config(DiffusionQuantizationConfig): """ From a3803986373a26225d85f3bfa58e5d737c19d558 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Tue, 17 Mar 2026 18:16:17 +0800 Subject: [PATCH 19/26] fix import torch_npu Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 5960a39a8a5..ac7570c979c 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -6,12 +6,6 @@ from typing import TYPE_CHECKING, Any, Optional import torch - -if current_omni_platform.is_npu(): - import torch_npu -else: - torch_npu = None - from torch.nn import Module from torch.utils._python_dispatch import TorchDispatchMode from vllm import _custom_ops as ops @@ -45,6 +39,11 @@ from vllm_omni.platforms import current_omni_platform +if current_omni_platform.is_npu(): + import torch_npu +else: + torch_npu = None + from .base import DiffusionQuantizationConfig if TYPE_CHECKING: From aed56475c0924fd84d8081162153c49cb1abd363 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Tue, 17 Mar 2026 18:56:00 +0800 Subject: [PATCH 20/26] fix pytest.mark.skipif Signed-off-by: juboyu <767868009@qq.com> --- tests/diffusion/quantization/test_int8_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index f329c0288df..5b9c82fe0ea 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -22,7 +22,7 @@ ) cuda_available = pytest.mark.skipif( - not current_omni_platform.is_npu(), + not current_omni_platform.is_cuda(), reason="GPU platform not available." ) From 624460ffe45ec9138769f4748d50d9ccaa881e93 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Tue, 17 Mar 2026 22:47:40 +0800 Subject: [PATCH 21/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- .../quantization/test_int8_config.py | 25 ++++++------------- vllm_omni/diffusion/quantization/int8.py | 13 +++++----- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 5b9c82fe0ea..7210d828771 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -16,15 +16,9 @@ ) from vllm_omni.platforms import current_omni_platform -npu_available = pytest.mark.skipif( - not current_omni_platform.is_npu(), - reason="NPU platform not available." -) +npu_available = pytest.mark.skipif(not current_omni_platform.is_npu(), reason="NPU platform not available.") -cuda_available = pytest.mark.skipif( - not current_omni_platform.is_cuda(), - reason="GPU platform not available." -) +cuda_available = pytest.mark.skipif(not current_omni_platform.is_cuda(), reason="GPU platform not available.") def test_int8_config_creation(): @@ -321,6 +315,7 @@ def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mo assert mock_layer.weight.shape == (64, 128) assert torch.equal(mock_layer.weight_scale, self.scale_mock) + @pytest.fixture def quant_config(): """Shared quant config fixture for smoke tests.""" @@ -331,6 +326,7 @@ def quant_config(): activation_scheme="dynamic", ) + @npu_available class TestNPUInt8Smoke: """Smoke tests using real torch_npu, only run on NPU.""" @@ -353,10 +349,6 @@ def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer): """Smoke test: verify npu_dynamic_quant returns correct shapes.""" import torch_npu - from vllm_omni.diffusion.quantization.int8 import NPUInt8OnlineLinearMethod - - method = NPUInt8OnlineLinearMethod(quant_config) - # Call real torch_npu.npu_dynamic_quant weight = real_layer.weight qweight, scale = torch_npu.npu_dynamic_quant(weight) @@ -365,9 +357,7 @@ def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer): assert qweight.dtype == torch.int8 assert scale.shape == (weight.shape[0],) - def test_real_npu_online_process_weights_after_loading( - self, quant_config, real_layer - ): + def test_real_npu_online_process_weights_after_loading(self, quant_config, real_layer): """Smoke test: full process_weights_after_loading with real torch_npu.""" from vllm_omni.diffusion.quantization.int8 import NPUInt8OnlineLinearMethod @@ -432,9 +422,7 @@ def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config): assert qweight.dtype == torch.int8 assert scale.shape == (weight.shape[0], 1) - def test_real_cuda_online_process_weights_after_loading( - self, quant_config, real_layer - ): + def test_real_cuda_online_process_weights_after_loading(self, quant_config, real_layer): """Smoke test: full process_weights_after_loading with real CUDA ops.""" from vllm_omni.diffusion.quantization.int8 import Int8OnlineLinearMethod @@ -449,6 +437,7 @@ def test_real_cuda_online_process_weights_after_loading( def test_real_cuda_int8_apply_forward(self, quant_config): """Smoke test: forward pass with real CUDA int8 kernel.""" from vllm import _custom_ops as ops + from vllm_omni.diffusion.quantization.int8 import Int8LinearMethod method = Int8LinearMethod(quant_config) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index ac7570c979c..c71cf1ef0d8 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -20,10 +20,7 @@ QuantizationConfig, QuantizeMethodBase, ) -from vllm.model_executor.layers.quantization.fp8 import ( - CopyNumelCounter, - _copy_missing_attrs -) +from vllm.model_executor.layers.quantization.fp8 import CopyNumelCounter, _copy_missing_attrs from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( init_int8_linear_kernel, ) @@ -35,7 +32,7 @@ ChannelQuantScaleParameter, ModelWeightParameter, ) -from vllm.model_executor.utils import replace_parameter, set_weight_attrs +from vllm.model_executor.utils import replace_parameter from vllm_omni.platforms import current_omni_platform @@ -221,11 +218,13 @@ def apply( ) -> torch.Tensor: raise NotImplementedError("No BaseInt8LinearMethod apply implementation.") + class LazyWeightMixin: """ Mixin for lazy weight loading with meta device. weighs are created on meta device and materialized just-in-time during loadding. """ + uses_meta_device: bool = True def create_weights( @@ -388,7 +387,7 @@ class Int8OnlineLinearMethod(LazyWeightMixin, Int8LinearMethod): def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return - + if layer.weight.device == torch.device("meta"): weight = ModelWeightParameter( data=torch.empty_like(layer.weight, device=layer._load_device), @@ -424,7 +423,7 @@ class NPUInt8OnlineLinearMethod(LazyWeightMixin, NPUInt8LinearMethod): def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return - + if layer.weight.device == torch.device("meta"): weight = ModelWeightParameter( data=torch.empty_like(layer.weight, device=layer._load_device), From 19dc8588fb9be54641cc5708f5c00e8499407656 Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Tue, 17 Mar 2026 23:52:04 +0800 Subject: [PATCH 22/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- tests/diffusion/quantization/test_int8_config.py | 1 + vllm_omni/diffusion/quantization/int8.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/diffusion/quantization/test_int8_config.py index 7210d828771..9b5d67fcbaf 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/diffusion/quantization/test_int8_config.py @@ -257,6 +257,7 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config): method.process_weights_after_loading(layer) mock_deps["quant"].assert_called_once_with(layer.weight, scale=None) + @npu_available class TestNPUInt8LinearMethod: qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index c71cf1ef0d8..128e994165f 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -7,7 +7,6 @@ import torch from torch.nn import Module -from torch.utils._python_dispatch import TorchDispatchMode from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.linear import ( From 141f715d0a60652da70b387e66b9dba7eaf1cc5e Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 18 Mar 2026 15:43:40 +0800 Subject: [PATCH 23/26] fix problem from path updates in the vllm operator Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 128e994165f..4198d4ad9b8 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -20,7 +20,7 @@ QuantizeMethodBase, ) from vllm.model_executor.layers.quantization.fp8 import CopyNumelCounter, _copy_missing_attrs -from vllm.model_executor.layers.quantization.kernels.scaled_mm import ( +from vllm.model_executor.kernels.linear import ( init_int8_linear_kernel, ) from vllm.model_executor.layers.quantization.utils.quant_utils import ( From 871952b0dc7d8379389f604b36afb2778f49820c Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Wed, 18 Mar 2026 15:45:52 +0800 Subject: [PATCH 24/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/quantization/int8.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/quantization/int8.py b/vllm_omni/diffusion/quantization/int8.py index 4198d4ad9b8..798038c9f7a 100644 --- a/vllm_omni/diffusion/quantization/int8.py +++ b/vllm_omni/diffusion/quantization/int8.py @@ -9,6 +9,9 @@ from torch.nn import Module from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.kernels.linear import ( + init_int8_linear_kernel, +) from vllm.model_executor.layers.linear import ( LinearBase, LinearMethodBase, @@ -20,9 +23,6 @@ QuantizeMethodBase, ) from vllm.model_executor.layers.quantization.fp8 import CopyNumelCounter, _copy_missing_attrs -from vllm.model_executor.kernels.linear import ( - init_int8_linear_kernel, -) from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped, ) From 084db20cac7de55a590b4cb26343a846c66e4d8a Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Thu, 19 Mar 2026 19:57:05 +0800 Subject: [PATCH 25/26] Fix the issue of quantization parameter passing, and add z_image as the prefix for quantization ignored_layers Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/models/z_image/z_image_transformer.py | 6 ++++++ vllm_omni/engine/async_omni_engine.py | 1 + 2 files changed, 7 insertions(+) diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index b68b6f70d94..0a90f0f30ef 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -279,6 +279,7 @@ def __init__( total_num_kv_heads=num_kv_heads, bias=False, quant_config=quant_config, + prefix="to_qkv", ) assert qk_norm is True @@ -297,6 +298,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix="to_out", ) ] ) @@ -361,6 +363,7 @@ def __init__( dim: int, hidden_dim: int, quant_config: "QuantizationConfig | None" = None, + prefix: str = "", ): super().__init__() self.w13 = MergedColumnParallelLinear( @@ -369,6 +372,7 @@ def __init__( bias=False, return_bias=False, quant_config=quant_config, + prefix=prefix, ) self.act = SiluAndMul() self.w2 = RowParallelLinear( @@ -378,6 +382,7 @@ def __init__( input_is_parallel=True, return_bias=False, quant_config=quant_config, + prefix=prefix, ) def forward(self, x): @@ -412,6 +417,7 @@ def __init__( dim=dim, hidden_dim=int(dim / 3 * 8), quant_config=quant_config, + prefix="feed_forward" ) self.layer_id = layer_id diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 4a20e4bc48a..36c15b4f7be 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -844,6 +844,7 @@ def _create_default_diffusion_stage_cfg(kwargs: dict[str, Any]) -> list: "enable_sleep_mode": kwargs.get("enable_sleep_mode", False), "enable_multithread_weight_load": kwargs.get("enable_multithread_weight_load", True), "num_weight_load_threads": kwargs.get("num_weight_load_threads", 4), + "quantization": kwargs.get("quantization", None), }, "final_output": True, "final_output_type": "image", From e1cfe8cc4f9b5c6a0c8a636a9613f1145af9030d Mon Sep 17 00:00:00 2001 From: juboyu <767868009@qq.com> Date: Thu, 19 Mar 2026 21:38:19 +0800 Subject: [PATCH 26/26] fix format Signed-off-by: juboyu <767868009@qq.com> --- vllm_omni/diffusion/models/z_image/z_image_transformer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vllm_omni/diffusion/models/z_image/z_image_transformer.py b/vllm_omni/diffusion/models/z_image/z_image_transformer.py index 0a90f0f30ef..b7367e9b324 100644 --- a/vllm_omni/diffusion/models/z_image/z_image_transformer.py +++ b/vllm_omni/diffusion/models/z_image/z_image_transformer.py @@ -414,10 +414,7 @@ def __init__( ) self.feed_forward = FeedForward( - dim=dim, - hidden_dim=int(dim / 3 * 8), - quant_config=quant_config, - prefix="feed_forward" + dim=dim, hidden_dim=int(dim / 3 * 8), quant_config=quant_config, prefix="feed_forward" ) self.layer_id = layer_id