From c2e3ae99ebd8eb0fcf415664eb444f5f9a145552 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 14 Apr 2026 21:59:26 +0800 Subject: [PATCH 01/12] support autoround w4a16 for wan2.2 Signed-off-by: lvliang-intel --- .../models/wan2_2/pipeline_wan2_2.py | 3 ++ .../models/wan2_2/pipeline_wan2_2_i2v.py | 15 ++++++- .../models/wan2_2/pipeline_wan2_2_vace.py | 3 ++ .../models/wan2_2/wan2_2_transformer.py | 7 ++- vllm_omni/diffusion/stage_diffusion_proc.py | 44 +++++++++++++++++++ 5 files changed, 69 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 3ee46ffb003..e93c3691697 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -124,6 +124,7 @@ def load_transformer_config(model_path: str, subfolder: str = "transformer", loc def create_transformer_from_config( config: dict, quant_config: QuantizationConfig | None = None, + prefix: str = "" ) -> WanTransformer3DModel: """Create WanTransformer3DModel from config dict.""" kwargs: dict = {} @@ -166,6 +167,8 @@ def create_transformer_from_config( if quant_config is not None: kwargs["quant_config"] = quant_config + if prefix: + kwargs["prefix"] = prefix return WanTransformer3DModel(**kwargs) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index 42c4eff6add..d18f607bd5b 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -231,10 +231,21 @@ def __init__( # Transformers (weights loaded via load_weights) # Load config from model directory or HF Hub to get correct in_channels for I2V models transformer_config = load_transformer_config(model, "transformer", local_files_only) - self.transformer = self._create_transformer(transformer_config) + self.transformer = create_transformer_from_config( + transformer_config, quant_config=od_config.quantization_config, + ) if self.has_transformer_2: transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only) - self.transformer_2 = self._create_transformer(transformer_2_config) + # transformer_2 may have its own quantization config (or none). + # Detect from its config.json rather than blindly reusing the + # primary transformer's quantization. + t2_quant = transformer_2_config.get("quantization_config") + if t2_quant is not None: + from vllm_omni.quantization.factory import build_quant_config + t2_quant = build_quant_config(t2_quant) + self.transformer_2 = create_transformer_from_config( + transformer_2_config, quant_config=t2_quant, + ) else: self.transformer_2 = None diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py index 75bdac27f2a..5ba2c6c690f 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py @@ -45,6 +45,7 @@ def create_vace_transformer_from_config( config: dict, quant_config: QuantizationConfig | None = None, + prefix: str = "", ) -> WanVACETransformer3DModel: """Create WanVACETransformer3DModel from config dict.""" kwargs = {} @@ -84,6 +85,8 @@ def create_vace_transformer_from_config( kwargs["vace_in_channels"] = config["vace_in_channels"] if quant_config is not None: kwargs["quant_config"] = quant_config + if prefix: + kwargs["prefix"] = prefix return WanVACETransformer3DModel(**kwargs) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 81889607a71..03a4a752c4d 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -3,7 +3,7 @@ import math from collections.abc import Iterable -from typing import Any +from typing import TYPE_CHECKING, Any import torch import torch.nn as nn @@ -46,6 +46,11 @@ from vllm_omni.diffusion.layers.rope import RotaryEmbeddingWan from vllm_omni.platforms import current_omni_platform +if TYPE_CHECKING: + from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig, + ) + logger = init_logger(__name__) diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index 871a29729f2..4ae685f9d6f 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -119,8 +119,52 @@ def initialize(self) -> None: logger.info("StageDiffusionProc initialized with model: %s", self._model) def _enrich_config(self) -> None: +<<<<<<< HEAD """Load model metadata from HuggingFace and populate od_config fields.""" self._od_config.enrich_config() +======= + """Load model metadata from HuggingFace and populate od_config fields. + + Diffusers-style models expose ``model_index.json`` with ``_class_name``. + Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, + so we fall back to reading that and mapping model_type manually. + """ + od_config = self._od_config + + try: + config_dict = get_hf_file_to_dict("model_index.json", od_config.model) + if config_dict is not None: + if od_config.model_class_name is None: + od_config.model_class_name = config_dict.get("_class_name", None) + od_config.update_multimodal_support() + + tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) + od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict)) + else: + raise FileNotFoundError("model_index.json not found") + except (AttributeError, OSError, ValueError, FileNotFoundError): + cfg = get_hf_file_to_dict("config.json", od_config.model) + if cfg is None: + raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") + + od_config.set_tf_model_config(TransformerConfig.from_dict(cfg)) + model_type = cfg.get("model_type") + architectures = cfg.get("architectures") or [] + + if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: + od_config.model_class_name = "BagelPipeline" + od_config.tf_model_config = TransformerConfig() + od_config.update_multimodal_support() + elif model_type == "nextstep": + if od_config.model_class_name is None: + od_config.model_class_name = "NextStep11Pipeline" + od_config.tf_model_config = TransformerConfig() + od_config.update_multimodal_support() + elif architectures and len(architectures) == 1: + od_config.model_class_name = architectures[0] + else: + raise +>>>>>>> e33fa8e7 (support autoround w4a16 for wan2.2) # ------------------------------------------------------------------ # Request processing From de58ea3c6b0bceeb3f7f81b31563500cd3d30ca3 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Mon, 20 Apr 2026 10:30:42 +0800 Subject: [PATCH 02/12] fix stage diffusion proc Signed-off-by: lvliang-intel --- vllm_omni/diffusion/stage_diffusion_proc.py | 44 --------------------- 1 file changed, 44 deletions(-) diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py index 4ae685f9d6f..871a29729f2 100644 --- a/vllm_omni/diffusion/stage_diffusion_proc.py +++ b/vllm_omni/diffusion/stage_diffusion_proc.py @@ -119,52 +119,8 @@ def initialize(self) -> None: logger.info("StageDiffusionProc initialized with model: %s", self._model) def _enrich_config(self) -> None: -<<<<<<< HEAD """Load model metadata from HuggingFace and populate od_config fields.""" self._od_config.enrich_config() -======= - """Load model metadata from HuggingFace and populate od_config fields. - - Diffusers-style models expose ``model_index.json`` with ``_class_name``. - Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``, - so we fall back to reading that and mapping model_type manually. - """ - od_config = self._od_config - - try: - config_dict = get_hf_file_to_dict("model_index.json", od_config.model) - if config_dict is not None: - if od_config.model_class_name is None: - od_config.model_class_name = config_dict.get("_class_name", None) - od_config.update_multimodal_support() - - tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model) - od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict)) - else: - raise FileNotFoundError("model_index.json not found") - except (AttributeError, OSError, ValueError, FileNotFoundError): - cfg = get_hf_file_to_dict("config.json", od_config.model) - if cfg is None: - raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}") - - od_config.set_tf_model_config(TransformerConfig.from_dict(cfg)) - model_type = cfg.get("model_type") - architectures = cfg.get("architectures") or [] - - if model_type == "bagel" or "BagelForConditionalGeneration" in architectures: - od_config.model_class_name = "BagelPipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif model_type == "nextstep": - if od_config.model_class_name is None: - od_config.model_class_name = "NextStep11Pipeline" - od_config.tf_model_config = TransformerConfig() - od_config.update_multimodal_support() - elif architectures and len(architectures) == 1: - od_config.model_class_name = architectures[0] - else: - raise ->>>>>>> e33fa8e7 (support autoround w4a16 for wan2.2) # ------------------------------------------------------------------ # Request processing From d6ef8cc351ab8ac9e2918ac3a8533e616690346e Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Mon, 20 Apr 2026 10:34:00 +0800 Subject: [PATCH 03/12] fix i2v Signed-off-by: lvliang-intel --- .../diffusion/models/wan2_2/pipeline_wan2_2_i2v.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index d18f607bd5b..1c1e24723d9 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -236,13 +236,14 @@ def __init__( ) if self.has_transformer_2: transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only) - # transformer_2 may have its own quantization config (or none). - # Detect from its config.json rather than blindly reusing the - # primary transformer's quantization. t2_quant = transformer_2_config.get("quantization_config") - if t2_quant is not None: + if isinstance(t2_quant, dict) and "quant_method" in t2_quant: from vllm_omni.quantization.factory import build_quant_config - t2_quant = build_quant_config(t2_quant) + method = t2_quant["quant_method"] + kwargs = {k: v for k, v in t2_quant.items() if k != "quant_method"} + t2_quant = build_quant_config(method, **kwargs) + else: + t2_quant = None self.transformer_2 = create_transformer_from_config( transformer_2_config, quant_config=t2_quant, ) From 6be4173a642c9ba092ec382ddcf108fa9c9003a6 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 29 Apr 2026 15:16:25 +0800 Subject: [PATCH 04/12] snapshot sys.modules before iteration to prevent RuntimeError Signed-off-by: lvliang-intel --- vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py index 1c1e24723d9..5ff3742051f 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py @@ -232,20 +232,23 @@ def __init__( # Load config from model directory or HF Hub to get correct in_channels for I2V models transformer_config = load_transformer_config(model, "transformer", local_files_only) self.transformer = create_transformer_from_config( - transformer_config, quant_config=od_config.quantization_config, + transformer_config, + quant_config=od_config.quantization_config, ) if self.has_transformer_2: transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only) t2_quant = transformer_2_config.get("quantization_config") if isinstance(t2_quant, dict) and "quant_method" in t2_quant: from vllm_omni.quantization.factory import build_quant_config + method = t2_quant["quant_method"] kwargs = {k: v for k, v in t2_quant.items() if k != "quant_method"} t2_quant = build_quant_config(method, **kwargs) else: t2_quant = None self.transformer_2 = create_transformer_from_config( - transformer_2_config, quant_config=t2_quant, + transformer_2_config, + quant_config=t2_quant, ) else: self.transformer_2 = None From 24ecb6dcc8e5039e2c778399e77eab7a85fbb886 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 5 May 2026 15:38:16 +0800 Subject: [PATCH 05/12] add test Signed-off-by: lvliang-intel --- .../test_wan22_quant_config_propagation.py | 301 ++++++++++++++++++ .../test_wan22_i2v_autoround_w4a16.py | 157 +++++++++ .../test_wan22_t2v_autoround_w4a16.py | 142 +++++++++ 3 files changed, 600 insertions(+) create mode 100644 tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py create mode 100644 tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py create mode 100644 tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py diff --git a/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py new file mode 100644 index 00000000000..e5442f912f8 --- /dev/null +++ b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py @@ -0,0 +1,301 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Unit tests for Wan2.2 quant_config propagation through transformer creation. + +Tests cover: +- create_transformer_from_config passes quant_config and prefix +- create_vace_transformer_from_config passes quant_config and prefix +- set_tf_model_config propagates quant_config to OmniDiffusionConfig +- patch_wan_rms_norm safely iterates sys.modules with concurrent modifications +- I2V transformer_2 quant_config is built from config dict +""" + +import sys +from types import SimpleNamespace +from unittest.mock import MagicMock + +import pytest + +import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 as wan22_module +import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace as wan22_vace_module +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import ( + create_transformer_from_config, +) +from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace import ( + create_vace_transformer_from_config, +) + +pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion] + + +# --------------------------------------------------------------------------- +# create_transformer_from_config: quant_config / prefix forwarding +# --------------------------------------------------------------------------- + + +class TestCreateTransformerQuant: + """Verify quant_config and prefix are forwarded to WanTransformer3DModel.""" + + def test_quant_config_passed_through(self, monkeypatch): + captured = {} + + class FakeTransformer: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + + fake_qc = MagicMock() + create_transformer_from_config( + {"patch_size": [1, 2, 2], "num_layers": 2}, + quant_config=fake_qc, + ) + assert captured.get("quant_config") is fake_qc + + def test_prefix_passed_through(self, monkeypatch): + captured = {} + + class FakeTransformer: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + + create_transformer_from_config( + {"patch_size": [1, 2, 2]}, + prefix="model.transformer.", + ) + assert captured.get("prefix") == "model.transformer." + + def test_quant_config_none_by_default(self, monkeypatch): + captured = {} + + class FakeTransformer: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + + create_transformer_from_config({"patch_size": [1, 2, 2]}) + # When quant_config is None and prefix is "", they are not added + assert "quant_config" not in captured or captured["quant_config"] is None + + def test_quant_config_and_prefix_together(self, monkeypatch): + captured = {} + + class FakeTransformer: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + + fake_qc = MagicMock() + create_transformer_from_config( + {"patch_size": [1, 2, 2], "num_attention_heads": 4}, + quant_config=fake_qc, + prefix="blocks.", + ) + assert captured["quant_config"] is fake_qc + assert captured["prefix"] == "blocks." + + +# --------------------------------------------------------------------------- +# create_vace_transformer_from_config: quant_config / prefix forwarding +# --------------------------------------------------------------------------- + + +class TestCreateVaceTransformerQuant: + """Verify quant_config and prefix are forwarded to WanVACETransformer3DModel.""" + + def test_quant_config_passed_through(self, monkeypatch): + captured = {} + + class FakeVACETransformer: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer) + + fake_qc = MagicMock() + create_vace_transformer_from_config( + {"patch_size": [1, 2, 2], "num_layers": 2}, + quant_config=fake_qc, + ) + assert captured.get("quant_config") is fake_qc + + def test_prefix_passed_through(self, monkeypatch): + captured = {} + + class FakeVACETransformer: + def __init__(self, **kwargs): + captured.update(kwargs) + + monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer) + + create_vace_transformer_from_config( + {"patch_size": [1, 2, 2]}, + prefix="vace.", + ) + assert captured.get("prefix") == "vace." + + +# --------------------------------------------------------------------------- +# set_tf_model_config: propagation of quant_config +# --------------------------------------------------------------------------- + + +class TestSetTfModelConfig: + """Test that set_tf_model_config propagates quant_config correctly.""" + + def _make_od_config(self): + """Create a minimal OmniDiffusionConfig-like object for testing.""" + from vllm_omni.diffusion.data import OmniDiffusionConfig + + cfg = object.__new__(OmniDiffusionConfig) + cfg.quantization_config = None + cfg.tf_model_config = None + return cfg + + def test_propagates_quant_config_when_none(self): + cfg = self._make_od_config() + fake_qc = MagicMock() + tf_config = SimpleNamespace(quant_config=fake_qc, quant_method="auto-round") + + cfg.set_tf_model_config(tf_config) + + assert cfg.tf_model_config is tf_config + assert cfg.quantization_config is fake_qc + + def test_does_not_overwrite_existing_quantization_config(self): + cfg = self._make_od_config() + existing_qc = MagicMock() + cfg.quantization_config = existing_qc + tf_config = SimpleNamespace(quant_config=MagicMock()) + + cfg.set_tf_model_config(tf_config) + + assert cfg.tf_model_config is tf_config + assert cfg.quantization_config is existing_qc # not overwritten + + def test_no_propagation_when_tf_quant_config_is_none(self): + cfg = self._make_od_config() + tf_config = SimpleNamespace(quant_config=None) + + cfg.set_tf_model_config(tf_config) + + assert cfg.tf_model_config is tf_config + assert cfg.quantization_config is None + + +# --------------------------------------------------------------------------- +# patch_wan_rms_norm: sys.modules snapshot safety +# --------------------------------------------------------------------------- + + +class TestPatchWanRmsNorm: + """Test that patch_wan_rms_norm doesn't raise on concurrent module registration.""" + + def test_patches_modules_with_wan_rms_norm(self): + from vllm_omni.diffusion.layers.norm import RMSNormVAE + from vllm_omni.diffusion.models.wan2_2.patch_diffusers import patch_wan_rms_norm + + # Create a fake module that has WanRMS_norm + fake_module = SimpleNamespace(WanRMS_norm=lambda x: x) + sys.modules["_test_fake_wan_module"] = fake_module + + try: + patch_wan_rms_norm() + assert fake_module.WanRMS_norm is RMSNormVAE + finally: + del sys.modules["_test_fake_wan_module"] + + def test_no_error_when_modules_change_during_iteration(self): + """Regression test: list() snapshot prevents RuntimeError.""" + from vllm_omni.diffusion.models.wan2_2.patch_diffusers import patch_wan_rms_norm + + # Simulate a module being added during iteration by a side effect + original_items = sys.modules.items + + def items_with_side_effect(): + # This would cause RuntimeError without list() snapshot + result = list(original_items()) + # Add a new module to simulate concurrent modification + sys.modules["_test_dynamic_module"] = SimpleNamespace() + return result + + try: + # The function uses list(sys.modules.items()) so it takes a snapshot + # Just verify it doesn't raise + patch_wan_rms_norm() + finally: + sys.modules.pop("_test_dynamic_module", None) + + +# --------------------------------------------------------------------------- +# I2V transformer_2 quant_config extraction +# --------------------------------------------------------------------------- + + +class TestI2VTransformer2QuantConfig: + """Test the transformer_2 quant_config build logic from pipeline_wan2_2_i2v.""" + + def test_transformer_2_quant_config_built_from_dict(self): + """When transformer_2 config has quantization_config dict, build_quant_config is called.""" + from vllm_omni.quantization.factory import build_quant_config + + t2_config = { + "patch_size": [1, 2, 2], + "num_layers": 2, + "quantization_config": { + "quant_method": "auto-round", + "bits": 4, + "group_size": 128, + "sym": True, + "packing_format": "auto_round:auto_gptq", + }, + } + + # Replicate the logic from pipeline_wan2_2_i2v.py + t2_quant = t2_config.get("quantization_config") + if isinstance(t2_quant, dict) and "quant_method" in t2_quant: + method = t2_quant["quant_method"] + kwargs = {k: v for k, v in t2_quant.items() if k != "quant_method"} + t2_quant = build_quant_config(method, **kwargs) + else: + t2_quant = None + + from vllm.model_executor.layers.quantization.inc import INCConfig + + assert isinstance(t2_quant, INCConfig) + assert t2_quant.weight_bits == 4 + assert t2_quant.group_size == 128 + + def test_transformer_2_quant_config_none_when_missing(self): + """When transformer_2 config has no quantization_config, result is None.""" + t2_config = { + "patch_size": [1, 2, 2], + "num_layers": 2, + } + + t2_quant = t2_config.get("quantization_config") + if isinstance(t2_quant, dict) and "quant_method" in t2_quant: + pass # won't enter + else: + t2_quant = None + + assert t2_quant is None + + def test_transformer_2_quant_config_none_when_dict_lacks_method(self): + """When quantization_config is a dict but missing quant_method, result is None.""" + t2_config = { + "patch_size": [1, 2, 2], + "quantization_config": {"bits": 4}, # no quant_method key + } + + t2_quant = t2_config.get("quantization_config") + if isinstance(t2_quant, dict) and "quant_method" in t2_quant: + pass + else: + t2_quant = None + + assert t2_quant is None diff --git a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py new file mode 100644 index 00000000000..6245995d3c2 --- /dev/null +++ b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py @@ -0,0 +1,157 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""E2E tests for Wan2.2-I2V-A14B AutoRound W4A16 quantized inference. + +These tests require: + - A CUDA GPU with sufficient memory (~36 GiB for quantized model) + - The quantized model checkpoint (Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound) +""" + +import gc +import os +import os as _os + +import numpy as np +import pytest +import torch +from PIL import Image +from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +QUANTIZED_MODEL = "Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound" +BASELINE_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers" + +# Allow overriding via environment for local testing +QUANTIZED_MODEL = _os.environ.get("WAN22_I2V_AUTOROUND_MODEL", QUANTIZED_MODEL) +BASELINE_MODEL = _os.environ.get("WAN22_I2V_BASELINE_MODEL", BASELINE_MODEL) + +# Small resolution to keep GPU memory & time manageable +HEIGHT = 480 +WIDTH = 640 +NUM_FRAMES = 5 # must satisfy num_frames % 4 == 1 for Wan2.2 +NUM_STEPS = 2 # minimal for smoke-test + + +def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image: + """Create a deterministic test image for I2V tests.""" + rng = np.random.RandomState(42) + arr = rng.randint(0, 256, (height, width, 3), dtype=np.uint8) + return Image.fromarray(arr) + + +def _generate_video( + model_name: str, **extra_kwargs +) -> tuple[object, float]: + """Load a Wan2.2 I2V model, generate one video, return (frames, peak_memory_mb).""" + gc.collect() + current_omni_platform.empty_cache() + device_index = current_omni_platform.current_device() + current_omni_platform.reset_peak_memory_stats() + monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + + image = _create_test_image() + + with OmniRunner( + model_name, + enforce_eager=True, + boundary_ratio=0.875, + flow_shift=12.0, + **extra_kwargs, + ) as runner: + current_omni_platform.reset_peak_memory_stats() + outputs = runner.omni.generate( + { + "prompt": "A cat sitting on a table, smooth motion", + "multi_modal_data": {"image": image}, + }, + sampling_params_list=OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_frames=NUM_FRAMES, + num_inference_steps=NUM_STEPS, + guidance_scale=5.0, + guidance_scale_2=6.0, + boundary_ratio=0.875, + generator=torch.Generator( + device=current_omni_platform.device_type + ).manual_seed(42), + ), + ) + + peak = monitor.peak_used_mb + monitor.stop() + + first_output = outputs[0] + assert first_output.final_output_type == "image" + + req_out = first_output.request_output + if isinstance(req_out, list): + req_out = req_out[0] + assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") + frames = req_out.images[0] + + gc.collect() + current_omni_platform.empty_cache() + + return frames, peak + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}) +def test_wan22_i2v_autoround_w4a16_generates_video(): + """Load the W4A16 quantized Wan2.2 I2V model and verify it produces a valid video.""" + frames, _ = _generate_video(QUANTIZED_MODEL) + + assert frames is not None, "Expected video frames output" + assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" + + # frames shape: (batch, num_frames, height, width, channels) + assert frames.shape[1] == NUM_FRAMES, ( + f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" + ) + assert frames.shape[2] == HEIGHT, ( + f"Expected height {HEIGHT}, got {frames.shape[2]}" + ) + assert frames.shape[3] == WIDTH, ( + f"Expected width {WIDTH}, got {frames.shape[3]}" + ) + + # Sanity: video should not be blank (frames are [0, 1] floats) + arr = np.asarray(frames) + assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)" + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}) +def test_wan22_i2v_autoround_w4a16_memory_savings(): + """Compare peak GPU memory of quantized vs BF16 baseline. + + The W4A16 model should use meaningfully less memory than the + BF16 baseline since weights are 4-bit instead of 16-bit. + """ + _, quant_peak = _generate_video(QUANTIZED_MODEL) + cleanup_dist_env_and_memory() + _, baseline_peak = _generate_video(BASELINE_MODEL) + + print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB") + print(f"Baseline (BF16) peak memory: {baseline_peak:.0f} MB") + print(f"Savings: {baseline_peak - quant_peak:.0f} MB") + + # Wan2.2 I2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB. + # Use a conservative threshold to account for activations and overhead. + min_savings_mb = 5000 + assert quant_peak + min_savings_mb < baseline_peak, ( + f"Quantized model ({quant_peak:.0f} MB) should use at least " + f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" + ) diff --git a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py new file mode 100644 index 00000000000..7a17f54c408 --- /dev/null +++ b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""E2E tests for Wan2.2-T2V-A14B AutoRound W4A16 quantized inference. + +These tests require: + - A CUDA GPU with sufficient memory (~36 GiB for quantized model) + - The quantized model checkpoint (Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound) +""" + +import gc +import os +import os as _os + +import numpy as np +import pytest +import torch +from vllm.distributed.parallel_state import cleanup_dist_env_and_memory + +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.mark import hardware_test +from tests.helpers.runtime import OmniRunner +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.outputs import OmniRequestOutput +from vllm_omni.platforms import current_omni_platform + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +QUANTIZED_MODEL = "Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound" +BASELINE_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers" + +# Allow overriding via environment for local testing +QUANTIZED_MODEL = _os.environ.get("WAN22_T2V_AUTOROUND_MODEL", QUANTIZED_MODEL) +BASELINE_MODEL = _os.environ.get("WAN22_T2V_BASELINE_MODEL", BASELINE_MODEL) + +# Small resolution to keep GPU memory & time manageable +HEIGHT = 480 +WIDTH = 640 +NUM_FRAMES = 5 # must satisfy num_frames % 4 == 1 for Wan2.2 +NUM_STEPS = 2 # minimal for smoke-test + + +def _generate_video( + model_name: str, **extra_kwargs +) -> tuple[object, float]: + """Load a Wan2.2 T2V model, generate one video, return (frames, peak_memory_mb).""" + gc.collect() + current_omni_platform.empty_cache() + device_index = current_omni_platform.current_device() + current_omni_platform.reset_peak_memory_stats() + monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + + with OmniRunner( + model_name, + enforce_eager=True, + boundary_ratio=0.875, + flow_shift=5.0, + **extra_kwargs, + ) as runner: + current_omni_platform.reset_peak_memory_stats() + outputs = runner.omni.generate( + prompts="A cat sitting on a table", + sampling_params_list=OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_frames=NUM_FRAMES, + num_inference_steps=NUM_STEPS, + guidance_scale=1.0, + generator=torch.Generator( + device=current_omni_platform.device_type + ).manual_seed(42), + ), + ) + + peak = monitor.peak_used_mb + monitor.stop() + + first_output = outputs[0] + assert first_output.final_output_type == "image" + + req_out = first_output.request_output + if isinstance(req_out, list): + req_out = req_out[0] + assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") + frames = req_out.images[0] + + gc.collect() + current_omni_platform.empty_cache() + + return frames, peak + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}) +def test_wan22_t2v_autoround_w4a16_generates_video(): + """Load the W4A16 quantized Wan2.2 T2V model and verify it produces a valid video.""" + frames, _ = _generate_video(QUANTIZED_MODEL) + + assert frames is not None, "Expected video frames output" + assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" + + # frames shape: (batch, num_frames, height, width, channels) + assert frames.shape[1] == NUM_FRAMES, ( + f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" + ) + assert frames.shape[2] == HEIGHT, ( + f"Expected height {HEIGHT}, got {frames.shape[2]}" + ) + assert frames.shape[3] == WIDTH, ( + f"Expected width {WIDTH}, got {frames.shape[3]}" + ) + + # Sanity: video should not be blank (frames are [0, 1] floats) + arr = np.asarray(frames) + assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)" + + +@pytest.mark.advanced_model +@pytest.mark.diffusion +@hardware_test(res={"cuda": "H100"}) +def test_wan22_t2v_autoround_w4a16_memory_savings(): + """Compare peak GPU memory of quantized vs BF16 baseline. + + The W4A16 model should use meaningfully less memory than the + BF16 baseline since weights are 4-bit instead of 16-bit. + """ + _, quant_peak = _generate_video(QUANTIZED_MODEL) + cleanup_dist_env_and_memory() + _, baseline_peak = _generate_video(BASELINE_MODEL) + + print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB") + print(f"Baseline (BF16) peak memory: {baseline_peak:.0f} MB") + print(f"Savings: {baseline_peak - quant_peak:.0f} MB") + + # Wan2.2 T2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB. + # Use a conservative threshold to account for activations and overhead. + min_savings_mb = 5000 + assert quant_peak + min_savings_mb < baseline_peak, ( + f"Quantized model ({quant_peak:.0f} MB) should use at least " + f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" + ) From 9b1737d460f9e0273ee805b199e00fa2e65cadf6 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 5 May 2026 21:15:42 +0800 Subject: [PATCH 06/12] fix pre-commit Signed-off-by: lvliang-intel --- .../test_wan22_i2v_autoround_w4a16.py | 20 +++++-------------- .../test_wan22_t2v_autoround_w4a16.py | 20 +++++-------------- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py index 6245995d3c2..163e5d5b030 100644 --- a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py @@ -47,9 +47,7 @@ def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image: return Image.fromarray(arr) -def _generate_video( - model_name: str, **extra_kwargs -) -> tuple[object, float]: +def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]: """Load a Wan2.2 I2V model, generate one video, return (frames, peak_memory_mb).""" gc.collect() current_omni_platform.empty_cache() @@ -81,9 +79,7 @@ def _generate_video( guidance_scale=5.0, guidance_scale_2=6.0, boundary_ratio=0.875, - generator=torch.Generator( - device=current_omni_platform.device_type - ).manual_seed(42), + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), ), ) @@ -116,15 +112,9 @@ def test_wan22_i2v_autoround_w4a16_generates_video(): assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" # frames shape: (batch, num_frames, height, width, channels) - assert frames.shape[1] == NUM_FRAMES, ( - f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" - ) - assert frames.shape[2] == HEIGHT, ( - f"Expected height {HEIGHT}, got {frames.shape[2]}" - ) - assert frames.shape[3] == WIDTH, ( - f"Expected width {WIDTH}, got {frames.shape[3]}" - ) + assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" + assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}" + assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}" # Sanity: video should not be blank (frames are [0, 1] floats) arr = np.asarray(frames) diff --git a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py index 7a17f54c408..8c96103562e 100644 --- a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py +++ b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py @@ -39,9 +39,7 @@ NUM_STEPS = 2 # minimal for smoke-test -def _generate_video( - model_name: str, **extra_kwargs -) -> tuple[object, float]: +def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]: """Load a Wan2.2 T2V model, generate one video, return (frames, peak_memory_mb).""" gc.collect() current_omni_platform.empty_cache() @@ -66,9 +64,7 @@ def _generate_video( num_frames=NUM_FRAMES, num_inference_steps=NUM_STEPS, guidance_scale=1.0, - generator=torch.Generator( - device=current_omni_platform.device_type - ).manual_seed(42), + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), ), ) @@ -101,15 +97,9 @@ def test_wan22_t2v_autoround_w4a16_generates_video(): assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" # frames shape: (batch, num_frames, height, width, channels) - assert frames.shape[1] == NUM_FRAMES, ( - f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" - ) - assert frames.shape[2] == HEIGHT, ( - f"Expected height {HEIGHT}, got {frames.shape[2]}" - ) - assert frames.shape[3] == WIDTH, ( - f"Expected width {WIDTH}, got {frames.shape[3]}" - ) + assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" + assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}" + assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}" # Sanity: video should not be blank (frames are [0, 1] floats) arr = np.asarray(frames) From 0d0719032b43c8210f0196022d19a69952913522 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 6 May 2026 09:48:45 +0800 Subject: [PATCH 07/12] update doc Signed-off-by: lvliang-intel --- docs/user_guide/quantization/autoround.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/user_guide/quantization/autoround.md b/docs/user_guide/quantization/autoround.md index 2261d79a57c..88fed3b62b3 100644 --- a/docs/user_guide/quantization/autoround.md +++ b/docs/user_guide/quantization/autoround.md @@ -32,7 +32,9 @@ guide. AutoRound is Intel-supported. |-------|------------|-------|--------|---------| | FLUX.1-dev | `vllm-project-org/FLUX.1-dev-AutoRound-w4a16` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend | | Qwen-Image | Not listed | Diffusion transformer | W4A16 | Not validated | -| Wan2.2 | Not listed | Diffusion transformer | W4A16 | Not validated | +| Wan2.2-I2V | `Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend | +| Wan2.2-T2V | `Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend | +| Wan2.2-TI2V | `Intel/Wan2.2-TI2V-5B-Diffusers-int4-AutoRound` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend | ### Multi-Stage Omni/TTS Model (Qwen3-Omni, Qwen3-TTS) From 5bc949a6b78cb480f0aacb20a68f5220d05b62e1 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 19 May 2026 11:30:32 +0800 Subject: [PATCH 08/12] remove unnecessary import Signed-off-by: lvliang-intel --- vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py index 03a4a752c4d..81889607a71 100644 --- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py +++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py @@ -3,7 +3,7 @@ import math from collections.abc import Iterable -from typing import TYPE_CHECKING, Any +from typing import Any import torch import torch.nn as nn @@ -46,11 +46,6 @@ from vllm_omni.diffusion.layers.rope import RotaryEmbeddingWan from vllm_omni.platforms import current_omni_platform -if TYPE_CHECKING: - from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig, - ) - logger = init_logger(__name__) From c3c2d71a2634a4e5ef1653ca23fa63f0172a10f8 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 19 May 2026 11:34:12 +0800 Subject: [PATCH 09/12] fix lint Signed-off-by: lvliang-intel --- vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index e93c3691697..2d8c752a4eb 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -122,9 +122,7 @@ def load_transformer_config(model_path: str, subfolder: str = "transformer", loc def create_transformer_from_config( - config: dict, - quant_config: QuantizationConfig | None = None, - prefix: str = "" + config: dict, quant_config: QuantizationConfig | None = None, prefix: str = "" ) -> WanTransformer3DModel: """Create WanTransformer3DModel from config dict.""" kwargs: dict = {} From 40dd76ac483908b3d726031217429c31d5430742 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Tue, 19 May 2026 22:38:22 +0800 Subject: [PATCH 10/12] adapt test code according to comments Signed-off-by: lvliang-intel --- .buildkite/test-nightly.yml | 2 +- .../test_wan22_quant_config_propagation.py | 44 +-- .../test_wan22_autoround_w4a16_expansion.py | 311 ++++++++++++++++++ .../test_wan22_i2v_autoround_w4a16.py | 147 --------- .../test_wan22_t2v_autoround_w4a16.py | 132 -------- .../models/wan2_2/pipeline_wan2_2.py | 1 + vllm_omni/quantization/factory.py | 37 +++ 7 files changed, 372 insertions(+), 302 deletions(-) create mode 100644 tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py delete mode 100644 tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py delete mode 100644 tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 1b61044affa..ae93d2353a3 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -959,7 +959,7 @@ steps: - label: ":full_moon: Diffusion X2V · Function Test" timeout_in_minutes: 90 commands: - - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "full_model and cuda" --run-level "full_model" + - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py -m "full_model and cuda" --run-level "full_model" agents: queue: "mithril-h100-pool" plugins: diff --git a/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py index e5442f912f8..b1405993312 100644 --- a/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py +++ b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py @@ -12,9 +12,9 @@ import sys from types import SimpleNamespace -from unittest.mock import MagicMock import pytest +from pytest_mock import MockerFixture import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 as wan22_module import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace as wan22_vace_module @@ -36,30 +36,30 @@ class TestCreateTransformerQuant: """Verify quant_config and prefix are forwarded to WanTransformer3DModel.""" - def test_quant_config_passed_through(self, monkeypatch): + def test_quant_config_passed_through(self, mocker: MockerFixture): captured = {} class FakeTransformer: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer) - fake_qc = MagicMock() + fake_qc = mocker.MagicMock() create_transformer_from_config( {"patch_size": [1, 2, 2], "num_layers": 2}, quant_config=fake_qc, ) assert captured.get("quant_config") is fake_qc - def test_prefix_passed_through(self, monkeypatch): + def test_prefix_passed_through(self, mocker: MockerFixture): captured = {} class FakeTransformer: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer) create_transformer_from_config( {"patch_size": [1, 2, 2]}, @@ -67,29 +67,29 @@ def __init__(self, **kwargs): ) assert captured.get("prefix") == "model.transformer." - def test_quant_config_none_by_default(self, monkeypatch): + def test_quant_config_none_by_default(self, mocker: MockerFixture): captured = {} class FakeTransformer: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer) create_transformer_from_config({"patch_size": [1, 2, 2]}) # When quant_config is None and prefix is "", they are not added assert "quant_config" not in captured or captured["quant_config"] is None - def test_quant_config_and_prefix_together(self, monkeypatch): + def test_quant_config_and_prefix_together(self, mocker: MockerFixture): captured = {} class FakeTransformer: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer) + mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer) - fake_qc = MagicMock() + fake_qc = mocker.MagicMock() create_transformer_from_config( {"patch_size": [1, 2, 2], "num_attention_heads": 4}, quant_config=fake_qc, @@ -107,30 +107,30 @@ def __init__(self, **kwargs): class TestCreateVaceTransformerQuant: """Verify quant_config and prefix are forwarded to WanVACETransformer3DModel.""" - def test_quant_config_passed_through(self, monkeypatch): + def test_quant_config_passed_through(self, mocker: MockerFixture): captured = {} class FakeVACETransformer: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer) + mocker.patch.object(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer) - fake_qc = MagicMock() + fake_qc = mocker.MagicMock() create_vace_transformer_from_config( {"patch_size": [1, 2, 2], "num_layers": 2}, quant_config=fake_qc, ) assert captured.get("quant_config") is fake_qc - def test_prefix_passed_through(self, monkeypatch): + def test_prefix_passed_through(self, mocker: MockerFixture): captured = {} class FakeVACETransformer: def __init__(self, **kwargs): captured.update(kwargs) - monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer) + mocker.patch.object(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer) create_vace_transformer_from_config( {"patch_size": [1, 2, 2]}, @@ -156,9 +156,9 @@ def _make_od_config(self): cfg.tf_model_config = None return cfg - def test_propagates_quant_config_when_none(self): + def test_propagates_quant_config_when_none(self, mocker: MockerFixture): cfg = self._make_od_config() - fake_qc = MagicMock() + fake_qc = mocker.MagicMock() tf_config = SimpleNamespace(quant_config=fake_qc, quant_method="auto-round") cfg.set_tf_model_config(tf_config) @@ -166,18 +166,18 @@ def test_propagates_quant_config_when_none(self): assert cfg.tf_model_config is tf_config assert cfg.quantization_config is fake_qc - def test_does_not_overwrite_existing_quantization_config(self): + def test_does_not_overwrite_existing_quantization_config(self, mocker: MockerFixture): cfg = self._make_od_config() - existing_qc = MagicMock() + existing_qc = mocker.MagicMock() cfg.quantization_config = existing_qc - tf_config = SimpleNamespace(quant_config=MagicMock()) + tf_config = SimpleNamespace(quant_config=mocker.MagicMock()) cfg.set_tf_model_config(tf_config) assert cfg.tf_model_config is tf_config assert cfg.quantization_config is existing_qc # not overwritten - def test_no_propagation_when_tf_quant_config_is_none(self): + def test_no_propagation_when_tf_quant_config_is_none(self, mocker: MockerFixture): cfg = self._make_od_config() tf_config = SimpleNamespace(quant_config=None) diff --git a/tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py b/tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py new file mode 100644 index 00000000000..044435cb571 --- /dev/null +++ b/tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py @@ -0,0 +1,311 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""E2E tests for Wan2.2 AutoRound W4A16 quantized inference. + +These tests cover I2V (image-to-video) and T2V (text-to-video) generation +with quantized weights. + +Requirements: + - CUDA GPU (H100 or equivalent, ~36 GiB for quantized model) + - The quantized model checkpoint (Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound, + Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound) +""" + +import gc +import os as _os + +import numpy as np +import pytest +import torch +from PIL import Image + +from tests.helpers.env import DeviceMemoryMonitor +from tests.helpers.mark import hardware_test +from vllm_omni.inputs.data import OmniDiffusionSamplingParams +from vllm_omni.platforms import current_omni_platform + +_os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +QUANTIZED_MODEL_I2V = "Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound" +BASELINE_MODEL_I2V = "Wan-AI/Wan2.2-I2V-A14B-Diffusers" +QUANTIZED_MODEL_T2V = "Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound" +BASELINE_MODEL_T2V = "Wan-AI/Wan2.2-T2V-A14B-Diffusers" + +QUANTIZED_MODEL_I2V = _os.environ.get("WAN22_I2V_AUTOROUND_MODEL", QUANTIZED_MODEL_I2V) +BASELINE_MODEL_I2V = _os.environ.get("WAN22_I2V_BASELINE_MODEL", BASELINE_MODEL_I2V) +QUANTIZED_MODEL_T2V = _os.environ.get("WAN22_T2V_AUTOROUND_MODEL", QUANTIZED_MODEL_T2V) +BASELINE_MODEL_T2V = _os.environ.get("WAN22_T2V_BASELINE_MODEL", BASELINE_MODEL_T2V) + +pytestmark = [ + pytest.mark.full_model, + pytest.mark.diffusion, +] + +# Small resolution to keep GPU memory & time manageable +HEIGHT = 480 +WIDTH = 640 +NUM_FRAMES = 5 # must satisfy num_frames % 4 == 1 for Wan2.2 +NUM_STEPS = 2 # minimal for smoke-test + +# Parametrise: (model, stage_config_path=None, extra_omni_kwargs) +# When stage_config_path is None, the engine auto-resolves from the model's own config. +quant_i2v_params = [(QUANTIZED_MODEL_I2V, None, {"enforce_eager": True})] +baseline_i2v_params = [(BASELINE_MODEL_I2V, None, {"enforce_eager": True})] +quant_t2v_params = [(QUANTIZED_MODEL_T2V, None, {"enforce_eager": True})] +baseline_t2v_params = [(BASELINE_MODEL_T2V, None, {"enforce_eager": True})] + +# Module-level storage for peak memory results across tests +_memory_results: dict[str, float] = {} + + +def _sampling_params_i2v() -> OmniDiffusionSamplingParams: + """Create sampling params for I2V generation.""" + return OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_frames=NUM_FRAMES, + num_inference_steps=NUM_STEPS, + guidance_scale=5.0, + guidance_scale_2=6.0, + boundary_ratio=0.875, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ) + + +def _sampling_params_t2v() -> OmniDiffusionSamplingParams: + """Create sampling params for T2V generation.""" + return OmniDiffusionSamplingParams( + height=HEIGHT, + width=WIDTH, + num_frames=NUM_FRAMES, + num_inference_steps=NUM_STEPS, + guidance_scale=4.0, + generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), + ) + + +def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image: + """Create a deterministic test image for I2V tests.""" + rng = np.random.RandomState(42) + arr = rng.randint(0, 256, (height, width, 3), dtype=np.uint8) + return Image.fromarray(arr) + + +def _generate_i2v_video(omni_runner_handler, prompt: str = "A cat sitting on a table, smooth motion") -> tuple: + """Generate one I2V video, return (frames, peak_memory_mb).""" + gc.collect() + current_omni_platform.empty_cache() + device_index = current_omni_platform.current_device() + current_omni_platform.reset_peak_memory_stats() + monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + + image = _create_test_image() + response = omni_runner_handler.send_diffusion_request( + { + "prompt": prompt, + "images": image, + "sampling_params": _sampling_params_i2v(), + }, + ) + + peak = monitor.peak_used_mb + monitor.stop() + + assert response.success, f"Request failed: {response.error_message}" + assert response.images is not None and len(response.images) > 0, "Expected image output" + frames = response.images[0] + + gc.collect() + current_omni_platform.empty_cache() + + return frames, peak + + +def _generate_t2v_video(omni_runner_handler, prompt: str = "A cat sitting on a table") -> tuple: + """Generate one T2V video, return (frames, peak_memory_mb).""" + gc.collect() + current_omni_platform.empty_cache() + device_index = current_omni_platform.current_device() + current_omni_platform.reset_peak_memory_stats() + monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) + monitor.start() + + response = omni_runner_handler.send_diffusion_request( + { + "prompt": prompt, + "sampling_params": _sampling_params_t2v(), + }, + ) + + peak = monitor.peak_used_mb + monitor.stop() + + assert response.success, f"Request failed: {response.error_message}" + assert response.images is not None and len(response.images) > 0, "Expected image output" + frames = response.images[0] + + gc.collect() + current_omni_platform.empty_cache() + + return frames, peak + + +# ------------------------------------------------------------------ +# Test: I2V quantized model generates valid video +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +@pytest.mark.parametrize("omni_runner", quant_i2v_params, indirect=True) +def test_wan22_i2v_autoround_w4a16_generates_video(omni_runner, omni_runner_handler): + """Load the W4A16 quantized Wan2.2 I2V model and verify it produces a valid video.""" + frames, _ = _generate_i2v_video(omni_runner_handler) + + assert frames is not None, "Expected video frames output" + assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" + + # frames shape: (batch, num_frames, height, width, channels) + assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" + assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}" + assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}" + + # Sanity: video should not be blank (frames are [0, 1] floats) + arr = np.asarray(frames) + assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)" + + +# ------------------------------------------------------------------ +# Test: T2V quantized model generates valid video +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +@pytest.mark.parametrize("omni_runner", quant_t2v_params, indirect=True) +def test_wan22_t2v_autoround_w4a16_generates_video(omni_runner, omni_runner_handler): + """Load the W4A16 quantized Wan2.2 T2V model and verify it produces a valid video.""" + frames, _ = _generate_t2v_video(omni_runner_handler) + + assert frames is not None, "Expected video frames output" + assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" + + assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" + assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}" + assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}" + + arr = np.asarray(frames) + assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)" + + +# ------------------------------------------------------------------ +# Test: I2V quantized peak memory +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +@pytest.mark.parametrize("omni_runner", quant_i2v_params, indirect=True) +def test_wan22_i2v_autoround_w4a16_quant_peak(omni_runner, omni_runner_handler): + """Measure peak GPU memory of W4A16 quantized I2V model.""" + frames, peak = _generate_i2v_video(omni_runner_handler) + + assert frames is not None, "Expected video frames output" + _memory_results["quant_i2v"] = peak + print(f"\nQuantized I2V (W4A16) peak memory: {peak:.0f} MB") + + +# ------------------------------------------------------------------ +# Test: I2V baseline peak memory +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +@pytest.mark.parametrize("omni_runner", baseline_i2v_params, indirect=True) +def test_wan22_i2v_autoround_w4a16_baseline_peak(omni_runner, omni_runner_handler): + """Measure peak GPU memory of BF16 baseline I2V model.""" + frames, peak = _generate_i2v_video(omni_runner_handler) + + assert frames is not None, "Expected video frames output" + _memory_results["baseline_i2v"] = peak + print(f"\nBaseline I2V (BF16) peak memory: {peak:.0f} MB") + + +# ------------------------------------------------------------------ +# Test: I2V memory savings +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +def test_wan22_i2v_autoround_w4a16_memory_savings(): + """Assert quantized I2V model uses meaningfully less memory than BF16 baseline.""" + quant_peak = _memory_results["quant_i2v"] + baseline_peak = _memory_results["baseline_i2v"] + + savings = baseline_peak - quant_peak + print(f"\nQuantized I2V (W4A16) peak memory: {quant_peak:.0f} MB") + print(f"Baseline I2V (BF16) peak memory: {baseline_peak:.0f} MB") + print(f"Savings: {savings:.0f} MB") + + # Wan2.2 I2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB. + # Use a conservative threshold to account for activations and overhead. + min_savings_mb = 5000 + assert quant_peak + min_savings_mb < baseline_peak, ( + f"Quantized model ({quant_peak:.0f} MB) should use at least " + f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" + ) + + +# ------------------------------------------------------------------ +# Test: T2V quantized peak memory +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +@pytest.mark.parametrize("omni_runner", quant_t2v_params, indirect=True) +def test_wan22_t2v_autoround_w4a16_quant_peak(omni_runner, omni_runner_handler): + """Measure peak GPU memory of W4A16 quantized T2V model.""" + frames, peak = _generate_t2v_video(omni_runner_handler) + + assert frames is not None, "Expected video frames output" + _memory_results["quant_t2v"] = peak + print(f"\nQuantized T2V (W4A16) peak memory: {peak:.0f} MB") + + +# ------------------------------------------------------------------ +# Test: T2V baseline peak memory +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +@pytest.mark.parametrize("omni_runner", baseline_t2v_params, indirect=True) +def test_wan22_t2v_autoround_w4a16_baseline_peak(omni_runner, omni_runner_handler): + """Measure peak GPU memory of BF16 baseline T2V model.""" + frames, peak = _generate_t2v_video(omni_runner_handler) + + assert frames is not None, "Expected video frames output" + _memory_results["baseline_t2v"] = peak + print(f"\nBaseline T2V (BF16) peak memory: {peak:.0f} MB") + + +# ------------------------------------------------------------------ +# Test: T2V memory savings +# ------------------------------------------------------------------ + + +@hardware_test(res={"cuda": "H100"}) +def test_wan22_t2v_autoround_w4a16_memory_savings(): + """Assert quantized T2V model uses meaningfully less memory than BF16 baseline.""" + quant_peak = _memory_results["quant_t2v"] + baseline_peak = _memory_results["baseline_t2v"] + + savings = baseline_peak - quant_peak + print(f"\nQuantized T2V (W4A16) peak memory: {quant_peak:.0f} MB") + print(f"Baseline T2V (BF16) peak memory: {baseline_peak:.0f} MB") + print(f"Savings: {savings:.0f} MB") + + # Wan2.2 T2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB. + # Use a conservative threshold to account for activations and overhead. + min_savings_mb = 5000 + assert quant_peak + min_savings_mb < baseline_peak, ( + f"Quantized model ({quant_peak:.0f} MB) should use at least " + f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" + ) diff --git a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py deleted file mode 100644 index 163e5d5b030..00000000000 --- a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""E2E tests for Wan2.2-I2V-A14B AutoRound W4A16 quantized inference. - -These tests require: - - A CUDA GPU with sufficient memory (~36 GiB for quantized model) - - The quantized model checkpoint (Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound) -""" - -import gc -import os -import os as _os - -import numpy as np -import pytest -import torch -from PIL import Image -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory - -from tests.helpers.env import DeviceMemoryMonitor -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -QUANTIZED_MODEL = "Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound" -BASELINE_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers" - -# Allow overriding via environment for local testing -QUANTIZED_MODEL = _os.environ.get("WAN22_I2V_AUTOROUND_MODEL", QUANTIZED_MODEL) -BASELINE_MODEL = _os.environ.get("WAN22_I2V_BASELINE_MODEL", BASELINE_MODEL) - -# Small resolution to keep GPU memory & time manageable -HEIGHT = 480 -WIDTH = 640 -NUM_FRAMES = 5 # must satisfy num_frames % 4 == 1 for Wan2.2 -NUM_STEPS = 2 # minimal for smoke-test - - -def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image: - """Create a deterministic test image for I2V tests.""" - rng = np.random.RandomState(42) - arr = rng.randint(0, 256, (height, width, 3), dtype=np.uint8) - return Image.fromarray(arr) - - -def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]: - """Load a Wan2.2 I2V model, generate one video, return (frames, peak_memory_mb).""" - gc.collect() - current_omni_platform.empty_cache() - device_index = current_omni_platform.current_device() - current_omni_platform.reset_peak_memory_stats() - monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) - monitor.start() - - image = _create_test_image() - - with OmniRunner( - model_name, - enforce_eager=True, - boundary_ratio=0.875, - flow_shift=12.0, - **extra_kwargs, - ) as runner: - current_omni_platform.reset_peak_memory_stats() - outputs = runner.omni.generate( - { - "prompt": "A cat sitting on a table, smooth motion", - "multi_modal_data": {"image": image}, - }, - sampling_params_list=OmniDiffusionSamplingParams( - height=HEIGHT, - width=WIDTH, - num_frames=NUM_FRAMES, - num_inference_steps=NUM_STEPS, - guidance_scale=5.0, - guidance_scale_2=6.0, - boundary_ratio=0.875, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) - - peak = monitor.peak_used_mb - monitor.stop() - - first_output = outputs[0] - assert first_output.final_output_type == "image" - - req_out = first_output.request_output - if isinstance(req_out, list): - req_out = req_out[0] - assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") - frames = req_out.images[0] - - gc.collect() - current_omni_platform.empty_cache() - - return frames, peak - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) -def test_wan22_i2v_autoround_w4a16_generates_video(): - """Load the W4A16 quantized Wan2.2 I2V model and verify it produces a valid video.""" - frames, _ = _generate_video(QUANTIZED_MODEL) - - assert frames is not None, "Expected video frames output" - assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" - - # frames shape: (batch, num_frames, height, width, channels) - assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" - assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}" - assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}" - - # Sanity: video should not be blank (frames are [0, 1] floats) - arr = np.asarray(frames) - assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)" - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) -def test_wan22_i2v_autoround_w4a16_memory_savings(): - """Compare peak GPU memory of quantized vs BF16 baseline. - - The W4A16 model should use meaningfully less memory than the - BF16 baseline since weights are 4-bit instead of 16-bit. - """ - _, quant_peak = _generate_video(QUANTIZED_MODEL) - cleanup_dist_env_and_memory() - _, baseline_peak = _generate_video(BASELINE_MODEL) - - print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB") - print(f"Baseline (BF16) peak memory: {baseline_peak:.0f} MB") - print(f"Savings: {baseline_peak - quant_peak:.0f} MB") - - # Wan2.2 I2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB. - # Use a conservative threshold to account for activations and overhead. - min_savings_mb = 5000 - assert quant_peak + min_savings_mb < baseline_peak, ( - f"Quantized model ({quant_peak:.0f} MB) should use at least " - f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" - ) diff --git a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py deleted file mode 100644 index 8c96103562e..00000000000 --- a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py +++ /dev/null @@ -1,132 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""E2E tests for Wan2.2-T2V-A14B AutoRound W4A16 quantized inference. - -These tests require: - - A CUDA GPU with sufficient memory (~36 GiB for quantized model) - - The quantized model checkpoint (Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound) -""" - -import gc -import os -import os as _os - -import numpy as np -import pytest -import torch -from vllm.distributed.parallel_state import cleanup_dist_env_and_memory - -from tests.helpers.env import DeviceMemoryMonitor -from tests.helpers.mark import hardware_test -from tests.helpers.runtime import OmniRunner -from vllm_omni.inputs.data import OmniDiffusionSamplingParams -from vllm_omni.outputs import OmniRequestOutput -from vllm_omni.platforms import current_omni_platform - -os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - -QUANTIZED_MODEL = "Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound" -BASELINE_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers" - -# Allow overriding via environment for local testing -QUANTIZED_MODEL = _os.environ.get("WAN22_T2V_AUTOROUND_MODEL", QUANTIZED_MODEL) -BASELINE_MODEL = _os.environ.get("WAN22_T2V_BASELINE_MODEL", BASELINE_MODEL) - -# Small resolution to keep GPU memory & time manageable -HEIGHT = 480 -WIDTH = 640 -NUM_FRAMES = 5 # must satisfy num_frames % 4 == 1 for Wan2.2 -NUM_STEPS = 2 # minimal for smoke-test - - -def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]: - """Load a Wan2.2 T2V model, generate one video, return (frames, peak_memory_mb).""" - gc.collect() - current_omni_platform.empty_cache() - device_index = current_omni_platform.current_device() - current_omni_platform.reset_peak_memory_stats() - monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02) - monitor.start() - - with OmniRunner( - model_name, - enforce_eager=True, - boundary_ratio=0.875, - flow_shift=5.0, - **extra_kwargs, - ) as runner: - current_omni_platform.reset_peak_memory_stats() - outputs = runner.omni.generate( - prompts="A cat sitting on a table", - sampling_params_list=OmniDiffusionSamplingParams( - height=HEIGHT, - width=WIDTH, - num_frames=NUM_FRAMES, - num_inference_steps=NUM_STEPS, - guidance_scale=1.0, - generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42), - ), - ) - - peak = monitor.peak_used_mb - monitor.stop() - - first_output = outputs[0] - assert first_output.final_output_type == "image" - - req_out = first_output.request_output - if isinstance(req_out, list): - req_out = req_out[0] - assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images") - frames = req_out.images[0] - - gc.collect() - current_omni_platform.empty_cache() - - return frames, peak - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) -def test_wan22_t2v_autoround_w4a16_generates_video(): - """Load the W4A16 quantized Wan2.2 T2V model and verify it produces a valid video.""" - frames, _ = _generate_video(QUANTIZED_MODEL) - - assert frames is not None, "Expected video frames output" - assert hasattr(frames, "shape"), "Expected frames to have a shape attribute" - - # frames shape: (batch, num_frames, height, width, channels) - assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}" - assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}" - assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}" - - # Sanity: video should not be blank (frames are [0, 1] floats) - arr = np.asarray(frames) - assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)" - - -@pytest.mark.advanced_model -@pytest.mark.diffusion -@hardware_test(res={"cuda": "H100"}) -def test_wan22_t2v_autoround_w4a16_memory_savings(): - """Compare peak GPU memory of quantized vs BF16 baseline. - - The W4A16 model should use meaningfully less memory than the - BF16 baseline since weights are 4-bit instead of 16-bit. - """ - _, quant_peak = _generate_video(QUANTIZED_MODEL) - cleanup_dist_env_and_memory() - _, baseline_peak = _generate_video(BASELINE_MODEL) - - print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB") - print(f"Baseline (BF16) peak memory: {baseline_peak:.0f} MB") - print(f"Savings: {baseline_peak - quant_peak:.0f} MB") - - # Wan2.2 T2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB. - # Use a conservative threshold to account for activations and overhead. - min_savings_mb = 5000 - assert quant_peak + min_savings_mb < baseline_peak, ( - f"Quantized model ({quant_peak:.0f} MB) should use at least " - f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)" - ) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 2d8c752a4eb..3a68a48d72a 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -37,6 +37,7 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.platforms import current_omni_platform +from vllm_omni.quantization.factory import build_quant_config, normalize_quant_method_alias logger = logging.getLogger(__name__) DEBUG_PERF = False diff --git a/vllm_omni/quantization/factory.py b/vllm_omni/quantization/factory.py index 955f97cef85..597980ad952 100644 --- a/vllm_omni/quantization/factory.py +++ b/vllm_omni/quantization/factory.py @@ -99,6 +99,43 @@ def _build_inc(**kw: Any) -> QuantizationConfig: SUPPORTED_QUANTIZATION_METHODS: list[str] = list(dict.fromkeys(QUANTIZATION_METHODS + list(_OVERRIDES.keys()))) +def _build_reverse_alias_map() -> dict[str, str]: + """Build a mapping from normalized method aliases to canonical names. + + All keys in _OVERRIDES that share the same builder function are considered + aliases of each other. The canonical name is the first key (in definition + order) that maps to a given builder — i.e. the one returned by + builder().get_name(). + """ + builder_to_first_key: dict[Callable[..., QuantizationConfig], str] = {} + for key in _OVERRIDES: + builder = _OVERRIDES[key] + if builder not in builder_to_first_key: + builder_to_first_key[builder] = key + + result: dict[str, str] = {} + for key, builder in _OVERRIDES.items(): + canonical = builder_to_first_key[builder] + result[key.lower().replace("-", "_")] = canonical + return result + + +_CACHED_ALIAS_MAP: dict[str, str] | None = None + + +def normalize_quant_method_alias(method: str | None) -> str | None: + """Map a method name (or any of its aliases) to its canonical internal name. + Returns the input unchanged if it is not a known alias. + """ + if method is None: + return None + global _CACHED_ALIAS_MAP + if _CACHED_ALIAS_MAP is None: + _CACHED_ALIAS_MAP = _build_reverse_alias_map() + normalized = method.lower().replace("-", "_") + return _CACHED_ALIAS_MAP.get(normalized, normalized) + + _MODEL_OPT_METHODS = { "modelopt", } From e261e3a9ac8edf4973e851cfd93b59e648eeb862 Mon Sep 17 00:00:00 2001 From: lvliang-intel Date: Wed, 20 May 2026 16:24:50 +0800 Subject: [PATCH 11/12] fix pre-commit Signed-off-by: lvliang-intel --- vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py | 1 - vllm_omni/quantization/factory.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py index 3a68a48d72a..2d8c752a4eb 100644 --- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py +++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py @@ -37,7 +37,6 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.inputs.data import OmniTextPrompt from vllm_omni.platforms import current_omni_platform -from vllm_omni.quantization.factory import build_quant_config, normalize_quant_method_alias logger = logging.getLogger(__name__) DEBUG_PERF = False diff --git a/vllm_omni/quantization/factory.py b/vllm_omni/quantization/factory.py index 597980ad952..3766e4596cd 100644 --- a/vllm_omni/quantization/factory.py +++ b/vllm_omni/quantization/factory.py @@ -123,7 +123,7 @@ def _build_reverse_alias_map() -> dict[str, str]: _CACHED_ALIAS_MAP: dict[str, str] | None = None -def normalize_quant_method_alias(method: str | None) -> str | None: +def _normalize_quant_method_alias(method: str | None) -> str | None: """Map a method name (or any of its aliases) to its canonical internal name. Returns the input unchanged if it is not a known alias. """ @@ -371,7 +371,9 @@ def resolve_quant_config_from_disk( ) return build_quant_config(qc_method, **qc_kwargs) - if quant_config.get_name() != qc_method: + active_method = _normalize_quant_method_alias(quant_config.get_name()) + disk_method = _normalize_quant_method_alias(qc_method) + if active_method != disk_method: raise ValueError( f"Checkpoint config.json declares quant_method={qc_method!r} but the " f"active quantization config is {quant_config.get_name()!r}. " From f75eaf3238a2b321ee8575f533a9f141c0536a9e Mon Sep 17 00:00:00 2001 From: hxhhhlalala Date: Wed, 20 May 2026 14:21:29 +0800 Subject: [PATCH 12/12] [NPU][Quant] Add W4A4 MXFP4 online & MXFP4 dual-scale online/offline quantization support for Wan2.2 T2V / I2V inference on Ascend NPU (#3578) Signed-off-by: hyh_hh Co-authored-by: hyh_hh Signed-off-by: lvliang-intel --- docs/user_guide/quantization/mxfp4.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/quantization/mxfp4.md b/docs/user_guide/quantization/mxfp4.md index 7463ada23ee..401a55ad4d8 100644 --- a/docs/user_guide/quantization/mxfp4.md +++ b/docs/user_guide/quantization/mxfp4.md @@ -397,7 +397,7 @@ names** discovered in Step 1. No code changes to the model are required. ```python omni = Omni( model="/path/to/your-model", - quantization_config={ + quantization={ "method": "mxfp4_dualscale", "ignored_layers": [ "blocks.0.attn1.to_qkv", # runtime name, not diffusers name