From c2e3ae99ebd8eb0fcf415664eb444f5f9a145552 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Tue, 14 Apr 2026 21:59:26 +0800
Subject: [PATCH 01/12] support autoround w4a16 for wan2.2

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 .../models/wan2_2/pipeline_wan2_2.py          |  3 ++
 .../models/wan2_2/pipeline_wan2_2_i2v.py      | 15 ++++++-
 .../models/wan2_2/pipeline_wan2_2_vace.py     |  3 ++
 .../models/wan2_2/wan2_2_transformer.py       |  7 ++-
 vllm_omni/diffusion/stage_diffusion_proc.py   | 44 +++++++++++++++++++
 5 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
index 3ee46ffb003..e93c3691697 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
@@ -124,6 +124,7 @@ def load_transformer_config(model_path: str, subfolder: str = "transformer", loc
 def create_transformer_from_config(
     config: dict,
     quant_config: QuantizationConfig | None = None,
+    prefix: str = ""
 ) -> WanTransformer3DModel:
     """Create WanTransformer3DModel from config dict."""
     kwargs: dict = {}
@@ -166,6 +167,8 @@ def create_transformer_from_config(
 
     if quant_config is not None:
         kwargs["quant_config"] = quant_config
+    if prefix:
+        kwargs["prefix"] = prefix
 
     return WanTransformer3DModel(**kwargs)
 
diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
index 42c4eff6add..d18f607bd5b 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
@@ -231,10 +231,21 @@ def __init__(
         # Transformers (weights loaded via load_weights)
         # Load config from model directory or HF Hub to get correct in_channels for I2V models
         transformer_config = load_transformer_config(model, "transformer", local_files_only)
-        self.transformer = self._create_transformer(transformer_config)
+        self.transformer = create_transformer_from_config(
+            transformer_config, quant_config=od_config.quantization_config,
+        )
         if self.has_transformer_2:
             transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only)
-            self.transformer_2 = self._create_transformer(transformer_2_config)
+            # transformer_2 may have its own quantization config (or none).
+            # Detect from its config.json rather than blindly reusing the
+            # primary transformer's quantization.
+            t2_quant = transformer_2_config.get("quantization_config")
+            if t2_quant is not None:
+                from vllm_omni.quantization.factory import build_quant_config
+                t2_quant = build_quant_config(t2_quant)
+            self.transformer_2 = create_transformer_from_config(
+                transformer_2_config, quant_config=t2_quant,
+            )
         else:
             self.transformer_2 = None
 
diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py
index 75bdac27f2a..5ba2c6c690f 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_vace.py
@@ -45,6 +45,7 @@
 def create_vace_transformer_from_config(
     config: dict,
     quant_config: QuantizationConfig | None = None,
+    prefix: str = "",
 ) -> WanVACETransformer3DModel:
     """Create WanVACETransformer3DModel from config dict."""
     kwargs = {}
@@ -84,6 +85,8 @@ def create_vace_transformer_from_config(
         kwargs["vace_in_channels"] = config["vace_in_channels"]
     if quant_config is not None:
         kwargs["quant_config"] = quant_config
+    if prefix:
+        kwargs["prefix"] = prefix
 
     return WanVACETransformer3DModel(**kwargs)
 
diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
index 81889607a71..03a4a752c4d 100644
--- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
+++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
@@ -3,7 +3,7 @@
 
 import math
 from collections.abc import Iterable
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import torch
 import torch.nn as nn
@@ -46,6 +46,11 @@
 from vllm_omni.diffusion.layers.rope import RotaryEmbeddingWan
 from vllm_omni.platforms import current_omni_platform
 
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.quantization.base_config import (
+        QuantizationConfig,
+    )
+
 logger = init_logger(__name__)
 
 
diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py
index 871a29729f2..4ae685f9d6f 100644
--- a/vllm_omni/diffusion/stage_diffusion_proc.py
+++ b/vllm_omni/diffusion/stage_diffusion_proc.py
@@ -119,8 +119,52 @@ def initialize(self) -> None:
         logger.info("StageDiffusionProc initialized with model: %s", self._model)
 
     def _enrich_config(self) -> None:
+<<<<<<< HEAD
         """Load model metadata from HuggingFace and populate od_config fields."""
         self._od_config.enrich_config()
+=======
+        """Load model metadata from HuggingFace and populate od_config fields.
+
+        Diffusers-style models expose ``model_index.json`` with ``_class_name``.
+        Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``,
+        so we fall back to reading that and mapping model_type manually.
+        """
+        od_config = self._od_config
+
+        try:
+            config_dict = get_hf_file_to_dict("model_index.json", od_config.model)
+            if config_dict is not None:
+                if od_config.model_class_name is None:
+                    od_config.model_class_name = config_dict.get("_class_name", None)
+                od_config.update_multimodal_support()
+
+                tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model)
+                od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict))
+            else:
+                raise FileNotFoundError("model_index.json not found")
+        except (AttributeError, OSError, ValueError, FileNotFoundError):
+            cfg = get_hf_file_to_dict("config.json", od_config.model)
+            if cfg is None:
+                raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
+
+            od_config.set_tf_model_config(TransformerConfig.from_dict(cfg))
+            model_type = cfg.get("model_type")
+            architectures = cfg.get("architectures") or []
+
+            if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
+                od_config.model_class_name = "BagelPipeline"
+                od_config.tf_model_config = TransformerConfig()
+                od_config.update_multimodal_support()
+            elif model_type == "nextstep":
+                if od_config.model_class_name is None:
+                    od_config.model_class_name = "NextStep11Pipeline"
+                od_config.tf_model_config = TransformerConfig()
+                od_config.update_multimodal_support()
+            elif architectures and len(architectures) == 1:
+                od_config.model_class_name = architectures[0]
+            else:
+                raise
+>>>>>>> e33fa8e7 (support autoround w4a16 for wan2.2)
 
     # ------------------------------------------------------------------
     # Request processing

From de58ea3c6b0bceeb3f7f81b31563500cd3d30ca3 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Mon, 20 Apr 2026 10:30:42 +0800
Subject: [PATCH 02/12] fix stage diffusion proc

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 vllm_omni/diffusion/stage_diffusion_proc.py | 44 ---------------------
 1 file changed, 44 deletions(-)

diff --git a/vllm_omni/diffusion/stage_diffusion_proc.py b/vllm_omni/diffusion/stage_diffusion_proc.py
index 4ae685f9d6f..871a29729f2 100644
--- a/vllm_omni/diffusion/stage_diffusion_proc.py
+++ b/vllm_omni/diffusion/stage_diffusion_proc.py
@@ -119,52 +119,8 @@ def initialize(self) -> None:
         logger.info("StageDiffusionProc initialized with model: %s", self._model)
 
     def _enrich_config(self) -> None:
-<<<<<<< HEAD
         """Load model metadata from HuggingFace and populate od_config fields."""
         self._od_config.enrich_config()
-=======
-        """Load model metadata from HuggingFace and populate od_config fields.
-
-        Diffusers-style models expose ``model_index.json`` with ``_class_name``.
-        Non-diffusers models (e.g. Bagel, NextStep) only have ``config.json``,
-        so we fall back to reading that and mapping model_type manually.
-        """
-        od_config = self._od_config
-
-        try:
-            config_dict = get_hf_file_to_dict("model_index.json", od_config.model)
-            if config_dict is not None:
-                if od_config.model_class_name is None:
-                    od_config.model_class_name = config_dict.get("_class_name", None)
-                od_config.update_multimodal_support()
-
-                tf_config_dict = get_hf_file_to_dict("transformer/config.json", od_config.model)
-                od_config.set_tf_model_config(TransformerConfig.from_dict(tf_config_dict))
-            else:
-                raise FileNotFoundError("model_index.json not found")
-        except (AttributeError, OSError, ValueError, FileNotFoundError):
-            cfg = get_hf_file_to_dict("config.json", od_config.model)
-            if cfg is None:
-                raise ValueError(f"Could not find config.json or model_index.json for model {od_config.model}")
-
-            od_config.set_tf_model_config(TransformerConfig.from_dict(cfg))
-            model_type = cfg.get("model_type")
-            architectures = cfg.get("architectures") or []
-
-            if model_type == "bagel" or "BagelForConditionalGeneration" in architectures:
-                od_config.model_class_name = "BagelPipeline"
-                od_config.tf_model_config = TransformerConfig()
-                od_config.update_multimodal_support()
-            elif model_type == "nextstep":
-                if od_config.model_class_name is None:
-                    od_config.model_class_name = "NextStep11Pipeline"
-                od_config.tf_model_config = TransformerConfig()
-                od_config.update_multimodal_support()
-            elif architectures and len(architectures) == 1:
-                od_config.model_class_name = architectures[0]
-            else:
-                raise
->>>>>>> e33fa8e7 (support autoround w4a16 for wan2.2)
 
     # ------------------------------------------------------------------
     # Request processing

From d6ef8cc351ab8ac9e2918ac3a8533e616690346e Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Mon, 20 Apr 2026 10:34:00 +0800
Subject: [PATCH 03/12] fix i2v

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 .../diffusion/models/wan2_2/pipeline_wan2_2_i2v.py    | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
index d18f607bd5b..1c1e24723d9 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
@@ -236,13 +236,14 @@ def __init__(
         )
         if self.has_transformer_2:
             transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only)
-            # transformer_2 may have its own quantization config (or none).
-            # Detect from its config.json rather than blindly reusing the
-            # primary transformer's quantization.
             t2_quant = transformer_2_config.get("quantization_config")
-            if t2_quant is not None:
+            if isinstance(t2_quant, dict) and "quant_method" in t2_quant:
                 from vllm_omni.quantization.factory import build_quant_config
-                t2_quant = build_quant_config(t2_quant)
+                method = t2_quant["quant_method"]
+                kwargs = {k: v for k, v in t2_quant.items() if k != "quant_method"}
+                t2_quant = build_quant_config(method, **kwargs)
+            else:
+                t2_quant = None
             self.transformer_2 = create_transformer_from_config(
                 transformer_2_config, quant_config=t2_quant,
             )

From 6be4173a642c9ba092ec382ddcf108fa9c9003a6 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Wed, 29 Apr 2026 15:16:25 +0800
Subject: [PATCH 04/12] snapshot sys.modules before iteration to prevent
 RuntimeError

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
index 1c1e24723d9..5ff3742051f 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2_i2v.py
@@ -232,20 +232,23 @@ def __init__(
         # Load config from model directory or HF Hub to get correct in_channels for I2V models
         transformer_config = load_transformer_config(model, "transformer", local_files_only)
         self.transformer = create_transformer_from_config(
-            transformer_config, quant_config=od_config.quantization_config,
+            transformer_config,
+            quant_config=od_config.quantization_config,
         )
         if self.has_transformer_2:
             transformer_2_config = load_transformer_config(model, "transformer_2", local_files_only)
             t2_quant = transformer_2_config.get("quantization_config")
             if isinstance(t2_quant, dict) and "quant_method" in t2_quant:
                 from vllm_omni.quantization.factory import build_quant_config
+
                 method = t2_quant["quant_method"]
                 kwargs = {k: v for k, v in t2_quant.items() if k != "quant_method"}
                 t2_quant = build_quant_config(method, **kwargs)
             else:
                 t2_quant = None
             self.transformer_2 = create_transformer_from_config(
-                transformer_2_config, quant_config=t2_quant,
+                transformer_2_config,
+                quant_config=t2_quant,
             )
         else:
             self.transformer_2 = None

From 24ecb6dcc8e5039e2c778399e77eab7a85fbb886 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Tue, 5 May 2026 15:38:16 +0800
Subject: [PATCH 05/12] add test

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 .../test_wan22_quant_config_propagation.py    | 301 ++++++++++++++++++
 .../test_wan22_i2v_autoround_w4a16.py         | 157 +++++++++
 .../test_wan22_t2v_autoround_w4a16.py         | 142 +++++++++
 3 files changed, 600 insertions(+)
 create mode 100644 tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py
 create mode 100644 tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
 create mode 100644 tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py

diff --git a/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py
new file mode 100644
index 00000000000..e5442f912f8
--- /dev/null
+++ b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py
@@ -0,0 +1,301 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Unit tests for Wan2.2 quant_config propagation through transformer creation.
+
+Tests cover:
+- create_transformer_from_config passes quant_config and prefix
+- create_vace_transformer_from_config passes quant_config and prefix
+- set_tf_model_config propagates quant_config to OmniDiffusionConfig
+- patch_wan_rms_norm safely iterates sys.modules with concurrent modifications
+- I2V transformer_2 quant_config is built from config dict
+"""
+
+import sys
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+
+import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 as wan22_module
+import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace as wan22_vace_module
+from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 import (
+    create_transformer_from_config,
+)
+from vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace import (
+    create_vace_transformer_from_config,
+)
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu, pytest.mark.diffusion]
+
+
+# ---------------------------------------------------------------------------
+# create_transformer_from_config: quant_config / prefix forwarding
+# ---------------------------------------------------------------------------
+
+
+class TestCreateTransformerQuant:
+    """Verify quant_config and prefix are forwarded to WanTransformer3DModel."""
+
+    def test_quant_config_passed_through(self, monkeypatch):
+        captured = {}  # constructor kwargs recorded by the fake transformer
+
+        class FakeTransformer:
+            def __init__(self, **kwargs):
+                captured.update(kwargs)
+
+        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+
+        fake_qc = MagicMock()
+        create_transformer_from_config(
+            {"patch_size": [1, 2, 2], "num_layers": 2},
+            quant_config=fake_qc,
+        )
+        assert captured.get("quant_config") is fake_qc
+
+    def test_prefix_passed_through(self, monkeypatch):
+        captured = {}  # constructor kwargs recorded by the fake transformer
+
+        class FakeTransformer:
+            def __init__(self, **kwargs):
+                captured.update(kwargs)
+
+        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+
+        create_transformer_from_config(
+            {"patch_size": [1, 2, 2]},
+            prefix="model.transformer.",
+        )
+        assert captured.get("prefix") == "model.transformer."
+
+    def test_quant_config_none_by_default(self, monkeypatch):
+        captured = {}  # constructor kwargs recorded by the fake transformer
+
+        class FakeTransformer:
+            def __init__(self, **kwargs):
+                captured.update(kwargs)
+
+        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+
+        create_transformer_from_config({"patch_size": [1, 2, 2]})
+        # Defaults: quant_config is absent or None, prefix is absent or empty.
+        assert captured.get("quant_config") is None and not captured.get("prefix")
+
+    def test_quant_config_and_prefix_together(self, monkeypatch):
+        captured = {}  # constructor kwargs recorded by the fake transformer
+
+        class FakeTransformer:
+            def __init__(self, **kwargs):
+                captured.update(kwargs)
+
+        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+
+        fake_qc = MagicMock()
+        create_transformer_from_config(
+            {"patch_size": [1, 2, 2], "num_attention_heads": 4},
+            quant_config=fake_qc,
+            prefix="blocks.",
+        )
+        assert captured["quant_config"] is fake_qc
+        assert captured["prefix"] == "blocks."
+
+
+# ---------------------------------------------------------------------------
+# create_vace_transformer_from_config: quant_config / prefix forwarding
+# ---------------------------------------------------------------------------
+
+
+class TestCreateVaceTransformerQuant:
+    """Check that the VACE factory hands quant_config and prefix to WanVACETransformer3DModel."""
+
+    def test_quant_config_passed_through(self, monkeypatch):
+        seen = {}
+
+        class RecordingVACE:
+            def __init__(self, **init_kwargs):
+                seen.update(init_kwargs)
+
+        monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", RecordingVACE)
+
+        quant_stub = MagicMock()
+        vace_config = {"patch_size": [1, 2, 2], "num_layers": 2}
+        create_vace_transformer_from_config(vace_config, quant_config=quant_stub)
+        # Identity, not equality: the same object must reach the constructor.
+        forwarded = seen.get("quant_config")
+        assert forwarded is quant_stub
+
+    def test_prefix_passed_through(self, monkeypatch):
+        seen = {}
+
+        class RecordingVACE:
+            def __init__(self, **init_kwargs):
+                seen.update(init_kwargs)
+
+        monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", RecordingVACE)
+
+        vace_config = {"patch_size": [1, 2, 2]}
+        create_vace_transformer_from_config(vace_config, prefix="vace.")
+        # The prefix string is expected to arrive unmodified.
+        forwarded = seen.get("prefix")
+        assert forwarded == "vace."
+
+
+# ---------------------------------------------------------------------------
+# set_tf_model_config: propagation of quant_config
+# ---------------------------------------------------------------------------
+
+
+class TestSetTfModelConfig:
+    """Exercise how set_tf_model_config fills OmniDiffusionConfig.quantization_config."""
+
+    def _make_od_config(self):
+        """Build a bare OmniDiffusionConfig instance without running its __init__."""
+        from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+        od_config = object.__new__(OmniDiffusionConfig)
+        od_config.quantization_config = None
+        od_config.tf_model_config = None
+        return od_config
+
+    def test_propagates_quant_config_when_none(self):
+        od_config = self._make_od_config()
+        quant_stub = MagicMock()
+        tf_stub = SimpleNamespace(quant_config=quant_stub, quant_method="auto-round")
+
+        od_config.set_tf_model_config(tf_stub)
+
+        assert od_config.tf_model_config is tf_stub
+        assert od_config.quantization_config is quant_stub
+
+    def test_does_not_overwrite_existing_quantization_config(self):
+        od_config = self._make_od_config()
+        preset_qc = MagicMock()
+        od_config.quantization_config = preset_qc
+        tf_stub = SimpleNamespace(quant_config=MagicMock())
+
+        od_config.set_tf_model_config(tf_stub)
+
+        assert od_config.tf_model_config is tf_stub
+        assert od_config.quantization_config is preset_qc  # the preset config is kept
+
+    def test_no_propagation_when_tf_quant_config_is_none(self):
+        od_config = self._make_od_config()
+        tf_stub = SimpleNamespace(quant_config=None)
+
+        od_config.set_tf_model_config(tf_stub)
+
+        assert od_config.tf_model_config is tf_stub
+        assert od_config.quantization_config is None
+
+
+# ---------------------------------------------------------------------------
+# patch_wan_rms_norm: sys.modules snapshot safety
+# ---------------------------------------------------------------------------
+
+
+class TestPatchWanRmsNorm:
+    """Test that patch_wan_rms_norm doesn't raise on concurrent module registration."""
+
+    def test_patches_modules_with_wan_rms_norm(self):
+        from vllm_omni.diffusion.layers.norm import RMSNormVAE
+        from vllm_omni.diffusion.models.wan2_2.patch_diffusers import patch_wan_rms_norm
+
+        # Create a fake module that has WanRMS_norm
+        fake_module = SimpleNamespace(WanRMS_norm=lambda x: x)
+        sys.modules["_test_fake_wan_module"] = fake_module
+
+        try:
+            patch_wan_rms_norm()
+            assert fake_module.WanRMS_norm is RMSNormVAE
+        finally:
+            del sys.modules["_test_fake_wan_module"]
+
+    def test_no_error_when_modules_change_during_iteration(self):
+        """Regression test: list() snapshot prevents RuntimeError."""
+        from vllm_omni.diffusion.models.wan2_2.patch_diffusers import patch_wan_rms_norm
+
+        class _RegisteringModule:
+            """Fake module that registers another module whenever it is probed."""
+
+            def __getattr__(self, name):
+                # Any failed attribute probe (e.g. hasattr(mod, "WanRMS_norm"))
+                # grows sys.modules while patch_wan_rms_norm is still looping.
+                sys.modules["_test_dynamic_module"] = SimpleNamespace()
+                raise AttributeError(name)
+
+        sys.modules["_test_registering_module"] = _RegisteringModule()
+        try:
+            patch_wan_rms_norm()  # must not raise "dictionary changed size"
+            assert "_test_dynamic_module" in sys.modules  # the probe really fired
+        finally:
+            sys.modules.pop("_test_registering_module", None)
+            sys.modules.pop("_test_dynamic_module", None)
+
+
+# ---------------------------------------------------------------------------
+# I2V transformer_2 quant_config extraction
+# ---------------------------------------------------------------------------
+
+
+class TestI2VTransformer2QuantConfig:
+    """Cover the transformer_2 quant_config selection used by pipeline_wan2_2_i2v."""
+
+    def test_transformer_2_quant_config_built_from_dict(self):
+        """A quantization_config dict naming a quant_method is built into a config object."""
+        from vllm_omni.quantization.factory import build_quant_config
+
+        quant_section = {
+            "quant_method": "auto-round",
+            "bits": 4,
+            "group_size": 128,
+            "sym": True,
+            "packing_format": "auto_round:auto_gptq",
+        }
+        t2_config = {
+            "patch_size": [1, 2, 2],
+            "num_layers": 2,
+            "quantization_config": quant_section,
+        }
+
+        # Same steps as pipeline_wan2_2_i2v.py: the method name goes in
+        # positionally, every other entry is passed as a keyword argument.
+        section = t2_config.get("quantization_config")
+        built = None
+        if isinstance(section, dict) and "quant_method" in section:
+            extras = {key: val for key, val in section.items() if key != "quant_method"}
+            built = build_quant_config(section["quant_method"], **extras)
+
+        from vllm.model_executor.layers.quantization.inc import INCConfig
+
+        assert isinstance(built, INCConfig)
+        assert built.weight_bits == 4
+        assert built.group_size == 128
+
+    def test_transformer_2_quant_config_none_when_missing(self):
+        """Without a quantization_config entry the result is None."""
+        t2_config = {
+            "patch_size": [1, 2, 2],
+            "num_layers": 2,
+        }
+
+        section = t2_config.get("quantization_config")
+        is_mapping = isinstance(section, dict)
+        if not (is_mapping and "quant_method" in section):
+            # Nothing usable to build from.
+            section = None
+
+        assert section is None
+
+    def test_transformer_2_quant_config_none_when_dict_lacks_method(self):
+        """A quantization_config dict without quant_method is treated as None."""
+        t2_config = {
+            "patch_size": [1, 2, 2],
+            "quantization_config": {"bits": 4},  # quant_method deliberately absent
+        }
+
+        section = t2_config.get("quantization_config")
+        is_mapping = isinstance(section, dict)
+        if not (is_mapping and "quant_method" in section):
+            # A dict without quant_method is discarded.
+            section = None
+
+        assert section is None
diff --git a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
new file mode 100644
index 00000000000..6245995d3c2
--- /dev/null
+++ b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
@@ -0,0 +1,157 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E tests for Wan2.2-I2V-A14B AutoRound W4A16 quantized inference.
+
+These tests require:
+  - A CUDA GPU with sufficient memory (~36 GiB for quantized model)
+  - The quantized model checkpoint (Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound)
+"""
+
+import gc
+import os
+import os as _os
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image
+from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+
+from tests.helpers.env import DeviceMemoryMonitor
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniRunner
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+QUANTIZED_MODEL = "Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound"
+BASELINE_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
+
+# Allow overriding via environment for local testing
+QUANTIZED_MODEL = _os.environ.get("WAN22_I2V_AUTOROUND_MODEL", QUANTIZED_MODEL)
+BASELINE_MODEL = _os.environ.get("WAN22_I2V_BASELINE_MODEL", BASELINE_MODEL)
+
+# Small resolution to keep GPU memory & time manageable
+HEIGHT = 480
+WIDTH = 640
+NUM_FRAMES = 5  # must satisfy num_frames % 4 == 1 for Wan2.2
+NUM_STEPS = 2  # minimal for smoke-test
+
+
+def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image:
+    """Return a fixed-seed, 3-channel uint8 noise image of the requested size for I2V tests."""
+    pixel_shape = (height, width, 3)
+    noise = np.random.RandomState(42).randint(0, 256, size=pixel_shape, dtype=np.uint8)
+    return Image.fromarray(noise)
+
+
+def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]:
+    """Load a Wan2.2 I2V model, generate one video, return (frames, peak_memory_mb)."""
+    gc.collect()
+    current_omni_platform.empty_cache()
+    device_index = current_omni_platform.current_device()
+    current_omni_platform.reset_peak_memory_stats()
+    monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02)
+    monitor.start()
+
+    # Stop the monitor on every exit path so a failed run does not leave it running.
+    try:
+        image = _create_test_image()
+
+        with OmniRunner(
+            model_name,
+            enforce_eager=True,
+            boundary_ratio=0.875,
+            flow_shift=12.0,
+            **extra_kwargs,
+        ) as runner:
+            current_omni_platform.reset_peak_memory_stats()
+            outputs = runner.omni.generate(
+                {
+                    "prompt": "A cat sitting on a table, smooth motion",
+                    "multi_modal_data": {"image": image},
+                },
+                sampling_params_list=OmniDiffusionSamplingParams(
+                    height=HEIGHT,
+                    width=WIDTH,
+                    num_frames=NUM_FRAMES,
+                    num_inference_steps=NUM_STEPS,
+                    guidance_scale=5.0,
+                    guidance_scale_2=6.0,
+                    boundary_ratio=0.875,
+                    generator=torch.Generator(
+                        device=current_omni_platform.device_type
+                    ).manual_seed(42),
+                ),
+            )
+        peak = monitor.peak_used_mb  # read before stop(), as the helper always did
+    finally:
+        monitor.stop()
+
+    first_output = outputs[0]
+    assert first_output.final_output_type == "image"
+
+    req_out = first_output.request_output
+    if isinstance(req_out, list):
+        req_out = req_out[0]
+    assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images")
+    frames = req_out.images[0]
+
+    gc.collect()
+    current_omni_platform.empty_cache()
+
+    return frames, peak
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100"})
+def test_wan22_i2v_autoround_w4a16_generates_video():
+    """Run the W4A16 quantized Wan2.2 I2V model once and sanity-check the returned frames."""
+    video, _peak = _generate_video(QUANTIZED_MODEL)
+
+    assert video is not None, "Expected video frames output"
+    assert hasattr(video, "shape"), "Expected frames to have a shape attribute"
+
+    # Output layout is (batch, num_frames, height, width, channels), so
+    # axes 1-3 must equal the generation size requested through
+    # OmniDiffusionSamplingParams in _generate_video.
+    shape = video.shape
+    got_frames = shape[1]
+    assert got_frames == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {got_frames}"
+    got_height = shape[2]
+    assert got_height == HEIGHT, f"Expected height {HEIGHT}, got {got_height}"
+    got_width = shape[3]
+    assert got_width == WIDTH, f"Expected width {WIDTH}, got {got_width}"
+
+    # Sanity: a blank video has (almost) no pixel variation (frames are [0, 1] floats)
+    pixel_spread = np.asarray(video).std()
+    assert pixel_spread > 0.01, "Generated video appears blank (std ≈ 0)"
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100"})
+def test_wan22_i2v_autoround_w4a16_memory_savings():
+    """Check that the W4A16 model peaks lower in device memory than the BF16 baseline.
+
+    Both checkpoints run the same small generation; the quantized run must
+    come in at least ``min_savings_mb`` below the baseline peak.
+    """
+    _frames, quant_peak = _generate_video(QUANTIZED_MODEL)
+    cleanup_dist_env_and_memory()
+    _frames, baseline_peak = _generate_video(BASELINE_MODEL)
+    print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB")
+    print(f"Baseline (BF16) peak memory:   {baseline_peak:.0f} MB")
+    print(f"Savings:                        {baseline_peak - quant_peak:.0f} MB")
+
+    # The Wan2.2 I2V A14B transformer is ~28 GB in BF16 and W4A16 should save ~20 GB;
+    # the threshold stays conservative to leave room for activations and overhead.
+    min_savings_mb = 5000
+    shortfall_message = (
+        f"Quantized model ({quant_peak:.0f} MB) should use at least "
+        f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)"
+    )
+    assert quant_peak + min_savings_mb < baseline_peak, shortfall_message
diff --git a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
new file mode 100644
index 00000000000..7a17f54c408
--- /dev/null
+++ b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
@@ -0,0 +1,142 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E tests for Wan2.2-T2V-A14B AutoRound W4A16 quantized inference.
+
+These tests require:
+  - A CUDA GPU with sufficient memory (~36 GiB for quantized model)
+  - The quantized model checkpoint (Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound)
+"""
+
+import gc
+import os
+import os as _os
+
+import numpy as np
+import pytest
+import torch
+from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
+
+from tests.helpers.env import DeviceMemoryMonitor
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniRunner
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.platforms import current_omni_platform
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+QUANTIZED_MODEL = "Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound"
+BASELINE_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
+
+# Allow overriding via environment for local testing
+QUANTIZED_MODEL = _os.environ.get("WAN22_T2V_AUTOROUND_MODEL", QUANTIZED_MODEL)
+BASELINE_MODEL = _os.environ.get("WAN22_T2V_BASELINE_MODEL", BASELINE_MODEL)
+
+# Small resolution to keep GPU memory & time manageable
+HEIGHT = 480
+WIDTH = 640
+NUM_FRAMES = 5  # must satisfy num_frames % 4 == 1 for Wan2.2
+NUM_STEPS = 2  # minimal for smoke-test
+
+
+def _generate_video(
+    model_name: str, **extra_kwargs
+) -> tuple[object, float]:
+    """Load a Wan2.2 T2V model, generate one video, return (frames, peak_memory_mb)."""
+    gc.collect()
+    current_omni_platform.empty_cache()
+    device_index = current_omni_platform.current_device()
+    current_omni_platform.reset_peak_memory_stats()
+    monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02)
+    monitor.start()
+
+    with OmniRunner(
+        model_name,
+        enforce_eager=True,
+        boundary_ratio=0.875,
+        flow_shift=5.0,
+        **extra_kwargs,
+    ) as runner:
+        current_omni_platform.reset_peak_memory_stats()
+        outputs = runner.omni.generate(
+            prompts="A cat sitting on a table",
+            sampling_params_list=OmniDiffusionSamplingParams(
+                height=HEIGHT,
+                width=WIDTH,
+                num_frames=NUM_FRAMES,
+                num_inference_steps=NUM_STEPS,
+                guidance_scale=1.0,
+                generator=torch.Generator(
+                    device=current_omni_platform.device_type
+                ).manual_seed(42),
+            ),
+        )
+
+    peak = monitor.peak_used_mb
+    monitor.stop()
+
+    first_output = outputs[0]
+    assert first_output.final_output_type == "image"
+
+    req_out = first_output.request_output
+    if isinstance(req_out, list):
+        req_out = req_out[0]
+    assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images")
+    frames = req_out.images[0]
+
+    gc.collect()
+    current_omni_platform.empty_cache()
+
+    return frames, peak
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100"})
+def test_wan22_t2v_autoround_w4a16_generates_video():
+    """Load the W4A16 quantized Wan2.2 T2V model and verify it produces a valid video."""
+    frames, _ = _generate_video(QUANTIZED_MODEL)
+
+    assert frames is not None, "Expected video frames output"
+    assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
+
+    # frames shape: (batch, num_frames, height, width, channels)
+    assert frames.shape[1] == NUM_FRAMES, (
+        f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
+    )
+    assert frames.shape[2] == HEIGHT, (
+        f"Expected height {HEIGHT}, got {frames.shape[2]}"
+    )
+    assert frames.shape[3] == WIDTH, (
+        f"Expected width {WIDTH}, got {frames.shape[3]}"
+    )
+
+    # Sanity: video should not be blank (frames are [0, 1] floats)
+    arr = np.asarray(frames)
+    assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)"
+
+
+@pytest.mark.advanced_model
+@pytest.mark.diffusion
+@hardware_test(res={"cuda": "H100"})
+def test_wan22_t2v_autoround_w4a16_memory_savings():
+    """Compare peak GPU memory of quantized vs BF16 baseline.
+
+    The W4A16 model should use meaningfully less memory than the
+    BF16 baseline since weights are 4-bit instead of 16-bit.
+    """
+    _, quant_peak = _generate_video(QUANTIZED_MODEL)
+    cleanup_dist_env_and_memory()
+    _, baseline_peak = _generate_video(BASELINE_MODEL)
+
+    print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB")
+    print(f"Baseline (BF16) peak memory:   {baseline_peak:.0f} MB")
+    print(f"Savings:                        {baseline_peak - quant_peak:.0f} MB")
+
+    # Wan2.2 T2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB.
+    # Use a conservative threshold to account for activations and overhead.
+    min_savings_mb = 5000
+    assert quant_peak + min_savings_mb < baseline_peak, (
+        f"Quantized model ({quant_peak:.0f} MB) should use at least "
+        f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)"
+    )

From 9b1737d460f9e0273ee805b199e00fa2e65cadf6 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Tue, 5 May 2026 21:15:42 +0800
Subject: [PATCH 06/12] fix pre-commit

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 .../test_wan22_i2v_autoround_w4a16.py         | 20 +++++--------------
 .../test_wan22_t2v_autoround_w4a16.py         | 20 +++++--------------
 2 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
index 6245995d3c2..163e5d5b030 100644
--- a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
+++ b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
@@ -47,9 +47,7 @@ def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image:
     return Image.fromarray(arr)
 
 
-def _generate_video(
-    model_name: str, **extra_kwargs
-) -> tuple[object, float]:
+def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]:
     """Load a Wan2.2 I2V model, generate one video, return (frames, peak_memory_mb)."""
     gc.collect()
     current_omni_platform.empty_cache()
@@ -81,9 +79,7 @@ def _generate_video(
                 guidance_scale=5.0,
                 guidance_scale_2=6.0,
                 boundary_ratio=0.875,
-                generator=torch.Generator(
-                    device=current_omni_platform.device_type
-                ).manual_seed(42),
+                generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
             ),
         )
 
@@ -116,15 +112,9 @@ def test_wan22_i2v_autoround_w4a16_generates_video():
     assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
 
     # frames shape: (batch, num_frames, height, width, channels)
-    assert frames.shape[1] == NUM_FRAMES, (
-        f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
-    )
-    assert frames.shape[2] == HEIGHT, (
-        f"Expected height {HEIGHT}, got {frames.shape[2]}"
-    )
-    assert frames.shape[3] == WIDTH, (
-        f"Expected width {WIDTH}, got {frames.shape[3]}"
-    )
+    assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
+    assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}"
+    assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}"
 
     # Sanity: video should not be blank (frames are [0, 1] floats)
     arr = np.asarray(frames)
diff --git a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
index 7a17f54c408..8c96103562e 100644
--- a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
+++ b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
@@ -39,9 +39,7 @@
 NUM_STEPS = 2  # minimal for smoke-test
 
 
-def _generate_video(
-    model_name: str, **extra_kwargs
-) -> tuple[object, float]:
+def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]:
     """Load a Wan2.2 T2V model, generate one video, return (frames, peak_memory_mb)."""
     gc.collect()
     current_omni_platform.empty_cache()
@@ -66,9 +64,7 @@ def _generate_video(
                 num_frames=NUM_FRAMES,
                 num_inference_steps=NUM_STEPS,
                 guidance_scale=1.0,
-                generator=torch.Generator(
-                    device=current_omni_platform.device_type
-                ).manual_seed(42),
+                generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
             ),
         )
 
@@ -101,15 +97,9 @@ def test_wan22_t2v_autoround_w4a16_generates_video():
     assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
 
     # frames shape: (batch, num_frames, height, width, channels)
-    assert frames.shape[1] == NUM_FRAMES, (
-        f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
-    )
-    assert frames.shape[2] == HEIGHT, (
-        f"Expected height {HEIGHT}, got {frames.shape[2]}"
-    )
-    assert frames.shape[3] == WIDTH, (
-        f"Expected width {WIDTH}, got {frames.shape[3]}"
-    )
+    assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
+    assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}"
+    assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}"
 
     # Sanity: video should not be blank (frames are [0, 1] floats)
     arr = np.asarray(frames)

From 0d0719032b43c8210f0196022d19a69952913522 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Wed, 6 May 2026 09:48:45 +0800
Subject: [PATCH 07/12] update doc

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 docs/user_guide/quantization/autoround.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/user_guide/quantization/autoround.md b/docs/user_guide/quantization/autoround.md
index 2261d79a57c..88fed3b62b3 100644
--- a/docs/user_guide/quantization/autoround.md
+++ b/docs/user_guide/quantization/autoround.md
@@ -32,7 +32,9 @@ guide. AutoRound is Intel-supported.
 |-------|------------|-------|--------|---------|
 | FLUX.1-dev | `vllm-project-org/FLUX.1-dev-AutoRound-w4a16` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend |
 | Qwen-Image | Not listed | Diffusion transformer | W4A16 | Not validated |
-| Wan2.2 | Not listed | Diffusion transformer | W4A16 | Not validated |
+| Wan2.2-I2V | `Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend |
+| Wan2.2-T2V | `Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend |
+| Wan2.2-TI2V | `Intel/Wan2.2-TI2V-5B-Diffusers-int4-AutoRound` | Diffusion transformer | W4A16 | GPTQ-Marlin or Intel-supported AutoRound backend |
 
 ### Multi-Stage Omni/TTS Model (Qwen3-Omni, Qwen3-TTS)
 

From 5bc949a6b78cb480f0aacb20a68f5220d05b62e1 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Tue, 19 May 2026 11:30:32 +0800
Subject: [PATCH 08/12] remove unnecessary import

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
index 03a4a752c4d..81889607a71 100644
--- a/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
+++ b/vllm_omni/diffusion/models/wan2_2/wan2_2_transformer.py
@@ -3,7 +3,7 @@
 
 import math
 from collections.abc import Iterable
-from typing import TYPE_CHECKING, Any
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -46,11 +46,6 @@
 from vllm_omni.diffusion.layers.rope import RotaryEmbeddingWan
 from vllm_omni.platforms import current_omni_platform
 
-if TYPE_CHECKING:
-    from vllm.model_executor.layers.quantization.base_config import (
-        QuantizationConfig,
-    )
-
 logger = init_logger(__name__)
 
 

From c3c2d71a2634a4e5ef1653ca23fa63f0172a10f8 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Tue, 19 May 2026 11:34:12 +0800
Subject: [PATCH 09/12] fix lint

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
index e93c3691697..2d8c752a4eb 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
@@ -122,9 +122,7 @@ def load_transformer_config(model_path: str, subfolder: str = "transformer", loc
 
 
 def create_transformer_from_config(
-    config: dict,
-    quant_config: QuantizationConfig | None = None,
-    prefix: str = ""
+    config: dict, quant_config: QuantizationConfig | None = None, prefix: str = ""
 ) -> WanTransformer3DModel:
     """Create WanTransformer3DModel from config dict."""
     kwargs: dict = {}

From 40dd76ac483908b3d726031217429c31d5430742 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Tue, 19 May 2026 22:38:22 +0800
Subject: [PATCH 10/12] adapt test code according to comments

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 .buildkite/test-nightly.yml                   |   2 +-
 .../test_wan22_quant_config_propagation.py    |  44 +--
 .../test_wan22_autoround_w4a16_expansion.py   | 311 ++++++++++++++++++
 .../test_wan22_i2v_autoround_w4a16.py         | 147 ---------
 .../test_wan22_t2v_autoround_w4a16.py         | 132 --------
 .../models/wan2_2/pipeline_wan2_2.py          |   1 +
 vllm_omni/quantization/factory.py             |  37 +++
 7 files changed, 372 insertions(+), 302 deletions(-)
 create mode 100644 tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py
 delete mode 100644 tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
 delete mode 100644 tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 1b61044affa..ae93d2353a3 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -959,7 +959,7 @@ steps:
       - label: ":full_moon: Diffusion X2V · Function Test"
         timeout_in_minutes: 90
         commands:
-          - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py  -m "full_model and cuda" --run-level "full_model"
+          - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py -m "full_model and cuda" --run-level "full_model"
         agents:
           queue: "mithril-h100-pool"
         plugins:
diff --git a/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py
index e5442f912f8..b1405993312 100644
--- a/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py
+++ b/tests/diffusion/models/wan2_2/test_wan22_quant_config_propagation.py
@@ -12,9 +12,9 @@
 
 import sys
 from types import SimpleNamespace
-from unittest.mock import MagicMock
 
 import pytest
+from pytest_mock import MockerFixture
 
 import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2 as wan22_module
 import vllm_omni.diffusion.models.wan2_2.pipeline_wan2_2_vace as wan22_vace_module
@@ -36,30 +36,30 @@
 class TestCreateTransformerQuant:
     """Verify quant_config and prefix are forwarded to WanTransformer3DModel."""
 
-    def test_quant_config_passed_through(self, monkeypatch):
+    def test_quant_config_passed_through(self, mocker: MockerFixture):
         captured = {}
 
         class FakeTransformer:
             def __init__(self, **kwargs):
                 captured.update(kwargs)
 
-        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+        mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer)
 
-        fake_qc = MagicMock()
+        fake_qc = mocker.MagicMock()
         create_transformer_from_config(
             {"patch_size": [1, 2, 2], "num_layers": 2},
             quant_config=fake_qc,
         )
         assert captured.get("quant_config") is fake_qc
 
-    def test_prefix_passed_through(self, monkeypatch):
+    def test_prefix_passed_through(self, mocker: MockerFixture):
         captured = {}
 
         class FakeTransformer:
             def __init__(self, **kwargs):
                 captured.update(kwargs)
 
-        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+        mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer)
 
         create_transformer_from_config(
             {"patch_size": [1, 2, 2]},
@@ -67,29 +67,29 @@ def __init__(self, **kwargs):
         )
         assert captured.get("prefix") == "model.transformer."
 
-    def test_quant_config_none_by_default(self, monkeypatch):
+    def test_quant_config_none_by_default(self, mocker: MockerFixture):
         captured = {}
 
         class FakeTransformer:
             def __init__(self, **kwargs):
                 captured.update(kwargs)
 
-        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+        mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer)
 
         create_transformer_from_config({"patch_size": [1, 2, 2]})
         # When quant_config is None and prefix is "", they are not added
         assert "quant_config" not in captured or captured["quant_config"] is None
 
-    def test_quant_config_and_prefix_together(self, monkeypatch):
+    def test_quant_config_and_prefix_together(self, mocker: MockerFixture):
         captured = {}
 
         class FakeTransformer:
             def __init__(self, **kwargs):
                 captured.update(kwargs)
 
-        monkeypatch.setattr(wan22_module, "WanTransformer3DModel", FakeTransformer)
+        mocker.patch.object(wan22_module, "WanTransformer3DModel", FakeTransformer)
 
-        fake_qc = MagicMock()
+        fake_qc = mocker.MagicMock()
         create_transformer_from_config(
             {"patch_size": [1, 2, 2], "num_attention_heads": 4},
             quant_config=fake_qc,
@@ -107,30 +107,30 @@ def __init__(self, **kwargs):
 class TestCreateVaceTransformerQuant:
     """Verify quant_config and prefix are forwarded to WanVACETransformer3DModel."""
 
-    def test_quant_config_passed_through(self, monkeypatch):
+    def test_quant_config_passed_through(self, mocker: MockerFixture):
         captured = {}
 
         class FakeVACETransformer:
             def __init__(self, **kwargs):
                 captured.update(kwargs)
 
-        monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer)
+        mocker.patch.object(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer)
 
-        fake_qc = MagicMock()
+        fake_qc = mocker.MagicMock()
         create_vace_transformer_from_config(
             {"patch_size": [1, 2, 2], "num_layers": 2},
             quant_config=fake_qc,
         )
         assert captured.get("quant_config") is fake_qc
 
-    def test_prefix_passed_through(self, monkeypatch):
+    def test_prefix_passed_through(self, mocker: MockerFixture):
         captured = {}
 
         class FakeVACETransformer:
             def __init__(self, **kwargs):
                 captured.update(kwargs)
 
-        monkeypatch.setattr(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer)
+        mocker.patch.object(wan22_vace_module, "WanVACETransformer3DModel", FakeVACETransformer)
 
         create_vace_transformer_from_config(
             {"patch_size": [1, 2, 2]},
@@ -156,9 +156,9 @@ def _make_od_config(self):
         cfg.tf_model_config = None
         return cfg
 
-    def test_propagates_quant_config_when_none(self):
+    def test_propagates_quant_config_when_none(self, mocker: MockerFixture):
         cfg = self._make_od_config()
-        fake_qc = MagicMock()
+        fake_qc = mocker.MagicMock()
         tf_config = SimpleNamespace(quant_config=fake_qc, quant_method="auto-round")
 
         cfg.set_tf_model_config(tf_config)
@@ -166,18 +166,18 @@ def test_propagates_quant_config_when_none(self):
         assert cfg.tf_model_config is tf_config
         assert cfg.quantization_config is fake_qc
 
-    def test_does_not_overwrite_existing_quantization_config(self):
+    def test_does_not_overwrite_existing_quantization_config(self, mocker: MockerFixture):
         cfg = self._make_od_config()
-        existing_qc = MagicMock()
+        existing_qc = mocker.MagicMock()
         cfg.quantization_config = existing_qc
-        tf_config = SimpleNamespace(quant_config=MagicMock())
+        tf_config = SimpleNamespace(quant_config=mocker.MagicMock())
 
         cfg.set_tf_model_config(tf_config)
 
         assert cfg.tf_model_config is tf_config
         assert cfg.quantization_config is existing_qc  # not overwritten
 
-    def test_no_propagation_when_tf_quant_config_is_none(self):
+    def test_no_propagation_when_tf_quant_config_is_none(self, mocker: MockerFixture):
         cfg = self._make_od_config()
         tf_config = SimpleNamespace(quant_config=None)
 
diff --git a/tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py b/tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py
new file mode 100644
index 00000000000..044435cb571
--- /dev/null
+++ b/tests/e2e/offline_inference/test_wan22_autoround_w4a16_expansion.py
@@ -0,0 +1,311 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E tests for Wan2.2 AutoRound W4A16 quantized inference.
+
+These tests cover I2V (image-to-video) and T2V (text-to-video) generation
+with quantized weights.
+
+Requirements:
+  - CUDA GPU (H100 or equivalent, ~36 GiB for quantized model)
+  - The quantized model checkpoint (Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound,
+    Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound)
+"""
+
+import gc
+import os as _os
+
+import numpy as np
+import pytest
+import torch
+from PIL import Image
+
+from tests.helpers.env import DeviceMemoryMonitor
+from tests.helpers.mark import hardware_test
+from vllm_omni.inputs.data import OmniDiffusionSamplingParams
+from vllm_omni.platforms import current_omni_platform
+
+_os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+QUANTIZED_MODEL_I2V = "Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound"
+BASELINE_MODEL_I2V = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
+QUANTIZED_MODEL_T2V = "Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound"
+BASELINE_MODEL_T2V = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
+# Each default above can be overridden through the matching WAN22_* environment variable.
+QUANTIZED_MODEL_I2V = _os.environ.get("WAN22_I2V_AUTOROUND_MODEL", QUANTIZED_MODEL_I2V)
+BASELINE_MODEL_I2V = _os.environ.get("WAN22_I2V_BASELINE_MODEL", BASELINE_MODEL_I2V)
+QUANTIZED_MODEL_T2V = _os.environ.get("WAN22_T2V_AUTOROUND_MODEL", QUANTIZED_MODEL_T2V)
+BASELINE_MODEL_T2V = _os.environ.get("WAN22_T2V_BASELINE_MODEL", BASELINE_MODEL_T2V)
+
+pytestmark = [
+    pytest.mark.full_model,
+    pytest.mark.diffusion,
+]
+
+# Small resolution to keep GPU memory & time manageable
+HEIGHT = 480
+WIDTH = 640
+NUM_FRAMES = 5  # must satisfy num_frames % 4 == 1 for Wan2.2
+NUM_STEPS = 2  # minimal for smoke-test
+
+# Parametrise: (model, stage_config_path=None, extra_omni_kwargs)
+# When stage_config_path is None, the engine auto-resolves from the model's own config.
+quant_i2v_params = [(QUANTIZED_MODEL_I2V, None, {"enforce_eager": True})]
+baseline_i2v_params = [(BASELINE_MODEL_I2V, None, {"enforce_eager": True})]
+quant_t2v_params = [(QUANTIZED_MODEL_T2V, None, {"enforce_eager": True})]
+baseline_t2v_params = [(BASELINE_MODEL_T2V, None, {"enforce_eager": True})]
+
+# Module-level storage for peak memory results across tests
+_memory_results: dict[str, float] = {}
+
+
+def _sampling_params_i2v() -> OmniDiffusionSamplingParams:
+    """Build I2V sampling params from the module test constants with a generator seeded to 42."""
+    return OmniDiffusionSamplingParams(
+        height=HEIGHT,
+        width=WIDTH,
+        num_frames=NUM_FRAMES,
+        num_inference_steps=NUM_STEPS,
+        guidance_scale=5.0,
+        guidance_scale_2=6.0,
+        boundary_ratio=0.875,
+        generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
+    )
+
+
+def _sampling_params_t2v() -> OmniDiffusionSamplingParams:
+    """Build T2V sampling params from the module test constants with a generator seeded to 42."""
+    return OmniDiffusionSamplingParams(
+        height=HEIGHT,
+        width=WIDTH,
+        num_frames=NUM_FRAMES,
+        num_inference_steps=NUM_STEPS,
+        guidance_scale=4.0,
+        generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
+    )
+
+
+def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image:
+    """Return a reproducible image of seeded (42) uint8 noise with shape (height, width, 3)."""
+    rng = np.random.RandomState(42)
+    arr = rng.randint(0, 256, (height, width, 3), dtype=np.uint8)
+    return Image.fromarray(arr)
+
+
+def _generate_i2v_video(omni_runner_handler, prompt: str = "A cat sitting on a table, smooth motion") -> tuple:
+    """Generate one I2V video, return (frames, peak_memory_mb); the monitor is stopped even on failure."""
+    gc.collect()
+    current_omni_platform.empty_cache()
+    current_omni_platform.reset_peak_memory_stats()
+    monitor = DeviceMemoryMonitor(device_index=current_omni_platform.current_device(), interval=0.02)
+    monitor.start()
+    try:
+        image = _create_test_image()
+        response = omni_runner_handler.send_diffusion_request(
+            {
+                "prompt": prompt,
+                "images": image,
+                "sampling_params": _sampling_params_i2v(),
+            },
+        )
+        peak = monitor.peak_used_mb
+    finally:
+        # Stop monitoring even when the request raises, so no monitor outlives this call.
+        monitor.stop()
+
+    assert response.success, f"Request failed: {response.error_message}"
+    assert response.images is not None and len(response.images) > 0, "Expected image output"
+    frames = response.images[0]
+
+    gc.collect()
+    current_omni_platform.empty_cache()
+
+    return frames, peak
+
+
+def _generate_t2v_video(omni_runner_handler, prompt: str = "A cat sitting on a table") -> tuple:
+    """Generate one T2V video, return (frames, peak_memory_mb); the monitor is stopped even on failure."""
+    gc.collect()
+    current_omni_platform.empty_cache()
+    current_omni_platform.reset_peak_memory_stats()
+    monitor = DeviceMemoryMonitor(device_index=current_omni_platform.current_device(), interval=0.02)
+    monitor.start()
+    try:
+        response = omni_runner_handler.send_diffusion_request(
+            {
+                "prompt": prompt,
+                "sampling_params": _sampling_params_t2v(),
+            },
+        )
+        peak = monitor.peak_used_mb
+    finally:
+        # Stop monitoring even when the request raises, so no monitor outlives this call.
+        monitor.stop()
+
+    assert response.success, f"Request failed: {response.error_message}"
+    assert response.images is not None and len(response.images) > 0, "Expected image output"
+    frames = response.images[0]
+
+    gc.collect()
+    current_omni_platform.empty_cache()
+
+    return frames, peak
+
+
+# ------------------------------------------------------------------
+# Test: I2V quantized model generates valid video
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+@pytest.mark.parametrize("omni_runner", quant_i2v_params, indirect=True)
+def test_wan22_i2v_autoround_w4a16_generates_video(omni_runner, omni_runner_handler):
+    """Generate with the W4A16 Wan2.2 I2V model and check frame count, frame size and non-blank output."""
+    frames, _ = _generate_i2v_video(omni_runner_handler)
+
+    assert frames is not None, "Expected video frames output"
+    assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
+
+    # Expected frames layout: (batch, num_frames, height, width, channels)
+    assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
+    assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}"
+    assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}"
+
+    # Sanity: video should not be blank (the 0.01 threshold assumes frames are [0, 1] floats)
+    arr = np.asarray(frames)
+    assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)"
+
+
+# ------------------------------------------------------------------
+# Test: T2V quantized model generates valid video
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+@pytest.mark.parametrize("omni_runner", quant_t2v_params, indirect=True)
+def test_wan22_t2v_autoround_w4a16_generates_video(omni_runner, omni_runner_handler):
+    """Generate with the W4A16 Wan2.2 T2V model and check frame count, frame size and non-blank output."""
+    frames, _ = _generate_t2v_video(omni_runner_handler)
+
+    assert frames is not None, "Expected video frames output"
+    assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
+
+    assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
+    assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}"
+    assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}"
+
+    arr = np.asarray(frames)
+    assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)"
+
+
+# ------------------------------------------------------------------
+# Test: I2V quantized peak memory
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+@pytest.mark.parametrize("omni_runner", quant_i2v_params, indirect=True)
+def test_wan22_i2v_autoround_w4a16_quant_peak(omni_runner, omni_runner_handler):
+    """Measure the W4A16 I2V peak memory and store it under ``_memory_results["quant_i2v"]``."""
+    frames, peak = _generate_i2v_video(omni_runner_handler)
+
+    assert frames is not None, "Expected video frames output"
+    _memory_results["quant_i2v"] = peak
+    print(f"\nQuantized I2V (W4A16) peak memory: {peak:.0f} MB")
+
+
+# ------------------------------------------------------------------
+# Test: I2V baseline peak memory
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+@pytest.mark.parametrize("omni_runner", baseline_i2v_params, indirect=True)
+def test_wan22_i2v_autoround_w4a16_baseline_peak(omni_runner, omni_runner_handler):
+    """Measure the BF16 I2V peak memory and store it under ``_memory_results["baseline_i2v"]``."""
+    frames, peak = _generate_i2v_video(omni_runner_handler)
+
+    assert frames is not None, "Expected video frames output"
+    _memory_results["baseline_i2v"] = peak
+    print(f"\nBaseline I2V (BF16) peak memory: {peak:.0f} MB")
+
+
+# ------------------------------------------------------------------
+# Test: I2V memory savings
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+def test_wan22_i2v_autoround_w4a16_memory_savings():
+    """Assert quantized I2V model uses meaningfully less memory than BF16 baseline (skip if unmeasured)."""
+    if not {"quant_i2v", "baseline_i2v"}.issubset(_memory_results):
+        pytest.skip("I2V peak memory not recorded; run the I2V quant_peak and baseline_peak tests first")
+    quant_peak, baseline_peak = _memory_results["quant_i2v"], _memory_results["baseline_i2v"]
+
+    print(f"\nQuantized I2V (W4A16) peak memory: {quant_peak:.0f} MB")
+    print(f"Baseline I2V (BF16) peak memory:   {baseline_peak:.0f} MB")
+    print(f"Savings:                            {baseline_peak - quant_peak:.0f} MB")
+
+    # Wan2.2 I2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB.
+    # Use a conservative threshold to account for activations and overhead.
+    min_savings_mb = 5000
+    assert quant_peak + min_savings_mb < baseline_peak, (
+        f"Quantized model ({quant_peak:.0f} MB) should use at least "
+        f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)"
+    )
+
+
+# ------------------------------------------------------------------
+# Test: T2V quantized peak memory
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+@pytest.mark.parametrize("omni_runner", quant_t2v_params, indirect=True)
+def test_wan22_t2v_autoround_w4a16_quant_peak(omni_runner, omni_runner_handler):
+    """Measure the W4A16 T2V peak memory and store it under ``_memory_results["quant_t2v"]``."""
+    frames, peak = _generate_t2v_video(omni_runner_handler)
+
+    assert frames is not None, "Expected video frames output"
+    _memory_results["quant_t2v"] = peak
+    print(f"\nQuantized T2V (W4A16) peak memory: {peak:.0f} MB")
+
+
+# ------------------------------------------------------------------
+# Test: T2V baseline peak memory
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+@pytest.mark.parametrize("omni_runner", baseline_t2v_params, indirect=True)
+def test_wan22_t2v_autoround_w4a16_baseline_peak(omni_runner, omni_runner_handler):
+    """Measure the BF16 T2V peak memory and store it under ``_memory_results["baseline_t2v"]``."""
+    frames, peak = _generate_t2v_video(omni_runner_handler)
+
+    assert frames is not None, "Expected video frames output"
+    _memory_results["baseline_t2v"] = peak
+    print(f"\nBaseline T2V (BF16) peak memory: {peak:.0f} MB")
+
+
+# ------------------------------------------------------------------
+# Test: T2V memory savings
+# ------------------------------------------------------------------
+
+
+@hardware_test(res={"cuda": "H100"})
+def test_wan22_t2v_autoround_w4a16_memory_savings():
+    """Assert quantized T2V model uses meaningfully less memory than BF16 baseline (skip if unmeasured)."""
+    if not {"quant_t2v", "baseline_t2v"}.issubset(_memory_results):
+        pytest.skip("T2V peak memory not recorded; run the T2V quant_peak and baseline_peak tests first")
+    quant_peak, baseline_peak = _memory_results["quant_t2v"], _memory_results["baseline_t2v"]
+
+    print(f"\nQuantized T2V (W4A16) peak memory: {quant_peak:.0f} MB")
+    print(f"Baseline T2V (BF16) peak memory:   {baseline_peak:.0f} MB")
+    print(f"Savings:                            {baseline_peak - quant_peak:.0f} MB")
+
+    # Wan2.2 T2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB.
+    # Use a conservative threshold to account for activations and overhead.
+    min_savings_mb = 5000
+    assert quant_peak + min_savings_mb < baseline_peak, (
+        f"Quantized model ({quant_peak:.0f} MB) should use at least "
+        f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)"
+    )
diff --git a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
deleted file mode 100644
index 163e5d5b030..00000000000
--- a/tests/e2e/offline_inference/test_wan22_i2v_autoround_w4a16.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""E2E tests for Wan2.2-I2V-A14B AutoRound W4A16 quantized inference.
-
-These tests require:
-  - A CUDA GPU with sufficient memory (~36 GiB for quantized model)
-  - The quantized model checkpoint (Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound)
-"""
-
-import gc
-import os
-import os as _os
-
-import numpy as np
-import pytest
-import torch
-from PIL import Image
-from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-from tests.helpers.env import DeviceMemoryMonitor
-from tests.helpers.mark import hardware_test
-from tests.helpers.runtime import OmniRunner
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-from vllm_omni.outputs import OmniRequestOutput
-from vllm_omni.platforms import current_omni_platform
-
-os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-QUANTIZED_MODEL = "Intel/Wan2.2-I2V-A14B-Diffusers-int4-AutoRound"
-BASELINE_MODEL = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
-
-# Allow overriding via environment for local testing
-QUANTIZED_MODEL = _os.environ.get("WAN22_I2V_AUTOROUND_MODEL", QUANTIZED_MODEL)
-BASELINE_MODEL = _os.environ.get("WAN22_I2V_BASELINE_MODEL", BASELINE_MODEL)
-
-# Small resolution to keep GPU memory & time manageable
-HEIGHT = 480
-WIDTH = 640
-NUM_FRAMES = 5  # must satisfy num_frames % 4 == 1 for Wan2.2
-NUM_STEPS = 2  # minimal for smoke-test
-
-
-def _create_test_image(width: int = WIDTH, height: int = HEIGHT) -> Image.Image:
-    """Create a deterministic test image for I2V tests."""
-    rng = np.random.RandomState(42)
-    arr = rng.randint(0, 256, (height, width, 3), dtype=np.uint8)
-    return Image.fromarray(arr)
-
-
-def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]:
-    """Load a Wan2.2 I2V model, generate one video, return (frames, peak_memory_mb)."""
-    gc.collect()
-    current_omni_platform.empty_cache()
-    device_index = current_omni_platform.current_device()
-    current_omni_platform.reset_peak_memory_stats()
-    monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02)
-    monitor.start()
-
-    image = _create_test_image()
-
-    with OmniRunner(
-        model_name,
-        enforce_eager=True,
-        boundary_ratio=0.875,
-        flow_shift=12.0,
-        **extra_kwargs,
-    ) as runner:
-        current_omni_platform.reset_peak_memory_stats()
-        outputs = runner.omni.generate(
-            {
-                "prompt": "A cat sitting on a table, smooth motion",
-                "multi_modal_data": {"image": image},
-            },
-            sampling_params_list=OmniDiffusionSamplingParams(
-                height=HEIGHT,
-                width=WIDTH,
-                num_frames=NUM_FRAMES,
-                num_inference_steps=NUM_STEPS,
-                guidance_scale=5.0,
-                guidance_scale_2=6.0,
-                boundary_ratio=0.875,
-                generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
-            ),
-        )
-
-    peak = monitor.peak_used_mb
-    monitor.stop()
-
-    first_output = outputs[0]
-    assert first_output.final_output_type == "image"
-
-    req_out = first_output.request_output
-    if isinstance(req_out, list):
-        req_out = req_out[0]
-    assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images")
-    frames = req_out.images[0]
-
-    gc.collect()
-    current_omni_platform.empty_cache()
-
-    return frames, peak
-
-
-@pytest.mark.advanced_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
-def test_wan22_i2v_autoround_w4a16_generates_video():
-    """Load the W4A16 quantized Wan2.2 I2V model and verify it produces a valid video."""
-    frames, _ = _generate_video(QUANTIZED_MODEL)
-
-    assert frames is not None, "Expected video frames output"
-    assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
-
-    # frames shape: (batch, num_frames, height, width, channels)
-    assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
-    assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}"
-    assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}"
-
-    # Sanity: video should not be blank (frames are [0, 1] floats)
-    arr = np.asarray(frames)
-    assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)"
-
-
-@pytest.mark.advanced_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
-def test_wan22_i2v_autoround_w4a16_memory_savings():
-    """Compare peak GPU memory of quantized vs BF16 baseline.
-
-    The W4A16 model should use meaningfully less memory than the
-    BF16 baseline since weights are 4-bit instead of 16-bit.
-    """
-    _, quant_peak = _generate_video(QUANTIZED_MODEL)
-    cleanup_dist_env_and_memory()
-    _, baseline_peak = _generate_video(BASELINE_MODEL)
-
-    print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB")
-    print(f"Baseline (BF16) peak memory:   {baseline_peak:.0f} MB")
-    print(f"Savings:                        {baseline_peak - quant_peak:.0f} MB")
-
-    # Wan2.2 I2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB.
-    # Use a conservative threshold to account for activations and overhead.
-    min_savings_mb = 5000
-    assert quant_peak + min_savings_mb < baseline_peak, (
-        f"Quantized model ({quant_peak:.0f} MB) should use at least "
-        f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)"
-    )
diff --git a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py b/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
deleted file mode 100644
index 8c96103562e..00000000000
--- a/tests/e2e/offline_inference/test_wan22_t2v_autoround_w4a16.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""E2E tests for Wan2.2-T2V-A14B AutoRound W4A16 quantized inference.
-
-These tests require:
-  - A CUDA GPU with sufficient memory (~36 GiB for quantized model)
-  - The quantized model checkpoint (Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound)
-"""
-
-import gc
-import os
-import os as _os
-
-import numpy as np
-import pytest
-import torch
-from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
-
-from tests.helpers.env import DeviceMemoryMonitor
-from tests.helpers.mark import hardware_test
-from tests.helpers.runtime import OmniRunner
-from vllm_omni.inputs.data import OmniDiffusionSamplingParams
-from vllm_omni.outputs import OmniRequestOutput
-from vllm_omni.platforms import current_omni_platform
-
-os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-
-QUANTIZED_MODEL = "Intel/Wan2.2-T2V-A14B-Diffusers-int4-AutoRound"
-BASELINE_MODEL = "Wan-AI/Wan2.2-T2V-A14B-Diffusers"
-
-# Allow overriding via environment for local testing
-QUANTIZED_MODEL = _os.environ.get("WAN22_T2V_AUTOROUND_MODEL", QUANTIZED_MODEL)
-BASELINE_MODEL = _os.environ.get("WAN22_T2V_BASELINE_MODEL", BASELINE_MODEL)
-
-# Small resolution to keep GPU memory & time manageable
-HEIGHT = 480
-WIDTH = 640
-NUM_FRAMES = 5  # must satisfy num_frames % 4 == 1 for Wan2.2
-NUM_STEPS = 2  # minimal for smoke-test
-
-
-def _generate_video(model_name: str, **extra_kwargs) -> tuple[object, float]:
-    """Load a Wan2.2 T2V model, generate one video, return (frames, peak_memory_mb)."""
-    gc.collect()
-    current_omni_platform.empty_cache()
-    device_index = current_omni_platform.current_device()
-    current_omni_platform.reset_peak_memory_stats()
-    monitor = DeviceMemoryMonitor(device_index=device_index, interval=0.02)
-    monitor.start()
-
-    with OmniRunner(
-        model_name,
-        enforce_eager=True,
-        boundary_ratio=0.875,
-        flow_shift=5.0,
-        **extra_kwargs,
-    ) as runner:
-        current_omni_platform.reset_peak_memory_stats()
-        outputs = runner.omni.generate(
-            prompts="A cat sitting on a table",
-            sampling_params_list=OmniDiffusionSamplingParams(
-                height=HEIGHT,
-                width=WIDTH,
-                num_frames=NUM_FRAMES,
-                num_inference_steps=NUM_STEPS,
-                guidance_scale=1.0,
-                generator=torch.Generator(device=current_omni_platform.device_type).manual_seed(42),
-            ),
-        )
-
-    peak = monitor.peak_used_mb
-    monitor.stop()
-
-    first_output = outputs[0]
-    assert first_output.final_output_type == "image"
-
-    req_out = first_output.request_output
-    if isinstance(req_out, list):
-        req_out = req_out[0]
-    assert isinstance(req_out, OmniRequestOutput) and hasattr(req_out, "images")
-    frames = req_out.images[0]
-
-    gc.collect()
-    current_omni_platform.empty_cache()
-
-    return frames, peak
-
-
-@pytest.mark.advanced_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
-def test_wan22_t2v_autoround_w4a16_generates_video():
-    """Load the W4A16 quantized Wan2.2 T2V model and verify it produces a valid video."""
-    frames, _ = _generate_video(QUANTIZED_MODEL)
-
-    assert frames is not None, "Expected video frames output"
-    assert hasattr(frames, "shape"), "Expected frames to have a shape attribute"
-
-    # frames shape: (batch, num_frames, height, width, channels)
-    assert frames.shape[1] == NUM_FRAMES, f"Expected {NUM_FRAMES} frames, got {frames.shape[1]}"
-    assert frames.shape[2] == HEIGHT, f"Expected height {HEIGHT}, got {frames.shape[2]}"
-    assert frames.shape[3] == WIDTH, f"Expected width {WIDTH}, got {frames.shape[3]}"
-
-    # Sanity: video should not be blank (frames are [0, 1] floats)
-    arr = np.asarray(frames)
-    assert arr.std() > 0.01, "Generated video appears blank (std ≈ 0)"
-
-
-@pytest.mark.advanced_model
-@pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
-def test_wan22_t2v_autoround_w4a16_memory_savings():
-    """Compare peak GPU memory of quantized vs BF16 baseline.
-
-    The W4A16 model should use meaningfully less memory than the
-    BF16 baseline since weights are 4-bit instead of 16-bit.
-    """
-    _, quant_peak = _generate_video(QUANTIZED_MODEL)
-    cleanup_dist_env_and_memory()
-    _, baseline_peak = _generate_video(BASELINE_MODEL)
-
-    print(f"Quantized (W4A16) peak memory: {quant_peak:.0f} MB")
-    print(f"Baseline (BF16) peak memory:   {baseline_peak:.0f} MB")
-    print(f"Savings:                        {baseline_peak - quant_peak:.0f} MB")
-
-    # Wan2.2 T2V A14B transformer is ~28 GB in BF16; W4A16 should save ~20 GB.
-    # Use a conservative threshold to account for activations and overhead.
-    min_savings_mb = 5000
-    assert quant_peak + min_savings_mb < baseline_peak, (
-        f"Quantized model ({quant_peak:.0f} MB) should use at least "
-        f"{min_savings_mb} MB less than baseline ({baseline_peak:.0f} MB)"
-    )
diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
index 2d8c752a4eb..3a68a48d72a 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
@@ -37,6 +37,7 @@
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.inputs.data import OmniTextPrompt
 from vllm_omni.platforms import current_omni_platform
+from vllm_omni.quantization.factory import build_quant_config, normalize_quant_method_alias
 
 logger = logging.getLogger(__name__)
 DEBUG_PERF = False
diff --git a/vllm_omni/quantization/factory.py b/vllm_omni/quantization/factory.py
index 955f97cef85..597980ad952 100644
--- a/vllm_omni/quantization/factory.py
+++ b/vllm_omni/quantization/factory.py
@@ -99,6 +99,43 @@ def _build_inc(**kw: Any) -> QuantizationConfig:
 SUPPORTED_QUANTIZATION_METHODS: list[str] = list(dict.fromkeys(QUANTIZATION_METHODS + list(_OVERRIDES.keys())))
 
 
+def _build_reverse_alias_map() -> dict[str, str]:
+    """Return a lookup from normalized ``_OVERRIDES`` keys to canonical method names.
+
+    Keys of ``_OVERRIDES`` that point at the same builder are treated as
+    aliases of one another, and the canonical name of such a group is the key
+    that appears first in ``_OVERRIDES``. Lookup keys are lower-cased with
+    hyphens turned into underscores; canonical names are kept as written.
+    """
+    canonical_by_builder: dict[Callable[..., QuantizationConfig], str] = {}
+    alias_map: dict[str, str] = {}
+
+    for name, builder in _OVERRIDES.items():
+        # ``setdefault`` only stores ``name`` the first time a builder is
+        # seen, so every later alias resolves to that first-seen key.
+        canonical = canonical_by_builder.setdefault(builder, name)
+        # Only the lookup key is normalized; the canonical value is not.
+        alias_map[name.lower().replace("-", "_")] = canonical
+
+    return alias_map
+
+
+_CACHED_ALIAS_MAP: dict[str, str] | None = None
+
+
+def normalize_quant_method_alias(method: str | None) -> str | None:
+    """Map a method name (or any of its aliases) to its canonical internal name.
+    Returns the input unchanged if it is not a known alias.
+    """
+    if method is None:
+        return None
+    global _CACHED_ALIAS_MAP
+    if _CACHED_ALIAS_MAP is None:  # build the alias table lazily, on first use
+        _CACHED_ALIAS_MAP = _build_reverse_alias_map()
+    normalized = method.lower().replace("-", "_")  # same normalization as the table's keys
+    return _CACHED_ALIAS_MAP.get(normalized, normalized)  # NOTE: a non-alias is returned normalized, not verbatim
+
+
 _MODEL_OPT_METHODS = {
     "modelopt",
 }

From e261e3a9ac8edf4973e851cfd93b59e648eeb862 Mon Sep 17 00:00:00 2001
From: lvliang-intel <liang1.lv@intel.com>
Date: Wed, 20 May 2026 16:24:50 +0800
Subject: [PATCH 11/12] fix pre-commit

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py | 1 -
 vllm_omni/quantization/factory.py                    | 6 ++++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
index 3a68a48d72a..2d8c752a4eb 100644
--- a/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
+++ b/vllm_omni/diffusion/models/wan2_2/pipeline_wan2_2.py
@@ -37,7 +37,6 @@
 from vllm_omni.diffusion.request import OmniDiffusionRequest
 from vllm_omni.inputs.data import OmniTextPrompt
 from vllm_omni.platforms import current_omni_platform
-from vllm_omni.quantization.factory import build_quant_config, normalize_quant_method_alias
 
 logger = logging.getLogger(__name__)
 DEBUG_PERF = False
diff --git a/vllm_omni/quantization/factory.py b/vllm_omni/quantization/factory.py
index 597980ad952..3766e4596cd 100644
--- a/vllm_omni/quantization/factory.py
+++ b/vllm_omni/quantization/factory.py
@@ -123,7 +123,7 @@ def _build_reverse_alias_map() -> dict[str, str]:
 _CACHED_ALIAS_MAP: dict[str, str] | None = None
 
 
-def normalize_quant_method_alias(method: str | None) -> str | None:
+def _normalize_quant_method_alias(method: str | None) -> str | None:
     """Map a method name (or any of its aliases) to its canonical internal name.
     Returns the input unchanged if it is not a known alias.
     """
@@ -371,7 +371,9 @@ def resolve_quant_config_from_disk(
         )
         return build_quant_config(qc_method, **qc_kwargs)
 
-    if quant_config.get_name() != qc_method:
+    active_method = _normalize_quant_method_alias(quant_config.get_name())
+    disk_method = _normalize_quant_method_alias(qc_method)
+    if active_method != disk_method:
         raise ValueError(
             f"Checkpoint config.json declares quant_method={qc_method!r} but the "
             f"active quantization config is {quant_config.get_name()!r}. "

From f75eaf3238a2b321ee8575f533a9f141c0536a9e Mon Sep 17 00:00:00 2001
From: hxhhhlalala <hyh_hh@163.com>
Date: Wed, 20 May 2026 14:21:29 +0800
Subject: [PATCH 12/12] [NPU][Quant] Add W4A4 MXFP4 online & MXFP4 dual-scale
 online/offline quantization support for Wan2.2 T2V / I2V inference on Ascend
 NPU (#3578)

Signed-off-by: hyh_hh <huyinghong1@huawei.com>
Co-authored-by: hyh_hh <huyinghong1@huawei.com>
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
---
 docs/user_guide/quantization/mxfp4.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/user_guide/quantization/mxfp4.md b/docs/user_guide/quantization/mxfp4.md
index 7463ada23ee..401a55ad4d8 100644
--- a/docs/user_guide/quantization/mxfp4.md
+++ b/docs/user_guide/quantization/mxfp4.md
@@ -397,7 +397,7 @@ names** discovered in Step 1. No code changes to the model are required.
 ```python
 omni = Omni(
     model="/path/to/your-model",
-    quantization_config={
+    quantization={
         "method": "mxfp4_dualscale",
         "ignored_layers": [
             "blocks.0.attn1.to_qkv",   # runtime name, not diffusers name