vllm-project · pjh4993 · Apr 9, 2026 · Apr 11, 2026 · Apr 11, 2026 · Apr 13, 2026
@@ -110,6 +110,46 @@ steps:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
 
+      - label: ":full_moon: Diffusion · Other · Quantization Quality Test"
+        timeout_in_minutes: 60  # Buildkite time limit for this job, in minutes
+        if: *nightly_or_pr_label  # alias; the anchor it refers to is defined outside this hunk
+        commands:  # install lpips (LPIPS metric used by the quality test), then run only the z_image / flux cases
+          - pip install lpips
+          - pytest -s -v tests/quantization/test_quantization_quality.py -k "z_image or flux" -m "advanced_model" --run-level "advanced_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 2  # the container is limited to two GPUs
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm  # backed by the memory-medium emptyDir declared under volumes
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface  # same path as the hf-cache volumeMount above
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:  # value is read from a Kubernetes Secret rather than written in this file
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm  # schedule only on nodes carrying this instance-type label
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory  # memory-backed (tmpfs) volume
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate  # the directory is created on the node if it does not exist
+
   - group: ":card_index_dividers: Wan Series Model Test"
     key: nightly-wan-model-test-group
     steps:
@@ -362,3 +402,44 @@ steps:
                     hostPath:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
+
+      - label: ":full_moon: Diffusion · Qwen-Image · Quantization Quality Test"
+        timeout_in_minutes: 60  # Buildkite time limit for this job, in minutes
+        if: *nightly_or_pr_label  # alias; the anchor it refers to is defined outside this hunk
+        commands:  # install lpips (LPIPS metric used by the quality test), export the env var, then run only the qwen_image cases
+          - pip install lpips
+          - export VLLM_TEST_CLEAN_GPU_MEMORY=1  # NOTE(review): presumably read by the test harness to free GPU memory between cases - confirm
+          - pytest -s -v tests/quantization/test_quantization_quality.py -k "qwen_image" -m "advanced_model" --run-level "advanced_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 2  # the container is limited to two GPUs
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm  # backed by the memory-medium emptyDir declared under volumes
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface  # same path as the hf-cache volumeMount above
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:  # value is read from a Kubernetes Secret rather than written in this file
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm  # schedule only on nodes carrying this instance-type label
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory  # memory-backed (tmpfs) volume
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate  # the directory is created on the node if it does not exist
@@ -263,7 +263,7 @@ outputs = omni.generate(
 
 ### Quality Gate Test (LPIPS)
 
-We provide a pytest-based quality gate at `tests/diffusion/quantization/test_quantization_quality.py`.
+We provide a pytest-based quality gate at `tests/quantization/test_quantization_quality.py`.
 It generates outputs with both BF16 and your quantized method using the same seed, computes
 [LPIPS](https://github.com/richzhang/PerceptualSimilarity) perceptual distance, and **fails if it
 exceeds a threshold**. This is the recommended way to validate that a quantization method does not
@@ -290,13 +290,13 @@ QualityTestConfig(
 pip install lpips
 
 # Run all quality tests
-pytest tests/diffusion/quantization/test_quantization_quality.py -v -m ""
+pytest tests/quantization/test_quantization_quality.py -v -m ""
 
 # Run only your method
-pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "int8"
+pytest tests/quantization/test_quantization_quality.py -v -m "" -k "int8"
 
 # Run a specific model
-pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "z_image"
+pytest tests/quantization/test_quantization_quality.py -v -m "" -k "z_image"
 ```
 
 **Step 3: Read the output:**
@@ -364,7 +364,7 @@ aggressive quantization methods (Int8, NVFP4).
 
 1. Run the quality gate test with all layers quantized:
     ```bash
-    pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "your_model"
+    pytest tests/quantization/test_quantization_quality.py -v -m "" -k "your_model"
     ```
 
 2. If LPIPS exceeds your threshold, try skipping common sensitive layers one at a time:

@@ -5,7 +5,7 @@
 import pytest
 from torch import nn
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_build_quant_config_fp8():

@@ -18,7 +18,7 @@
     dequant_gemm_gguf,
 )
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_gguf_config_creation_and_delegation():

@@ -4,7 +4,7 @@
 
 import pytest
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_build_quant_config_autoround():

@@ -12,11 +12,7 @@
 from vllm_omni.quantization import build_quant_config
 from vllm_omni.quantization.factory import SUPPORTED_QUANTIZATION_METHODS
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
-
-npu_available = pytest.mark.skipif(not current_omni_platform.is_npu(), reason="NPU platform not available.")
-
-cuda_available = pytest.mark.skipif(not current_omni_platform.is_cuda(), reason="GPU platform not available.")
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_int8_config_creation():
@@ -229,208 +225,3 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config):
         layer.weight = Parameter(torch.randn(128, 64))
         method.process_weights_after_loading(layer)
         mock_deps["quant"].assert_called_once_with(layer.weight, scale=None)
-
-
-@npu_available
-class TestNPUInt8LinearMethod:
-    qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8)
-    scale_mock = torch.randn(128)
-    out_mock = torch.randn((16, 128))
-
-    @pytest.fixture
-    def mock_torch_npu(self, mocker):
-        torch_npu = mocker.MagicMock()
-
-        mocker.patch("vllm_omni.quantization.int8_config.torch_npu", return_value=torch_npu)
-        mocker.patch(
-            "vllm_omni.quantization.int8_config.torch_npu.npu_dynamic_quant",
-            return_value=(self.qweight_mock, self.scale_mock),
-        )
-        mocker.patch("vllm_omni.quantization.int8_config.torch_npu.npu_quant_matmul", return_value=self.out_mock)
-        return torch_npu
-
-    @pytest.fixture
-    def mock_quant_config(self, mocker):
-        return mocker.Mock()
-
-    @pytest.fixture
-    def mock_layer(self, mocker):
-        layer = torch.nn.Module()
-        layer.weight = torch.nn.Parameter(self.qweight_mock, requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(self.scale_mock, requires_grad=False)
-        return layer
-
-    def test_npu_int8_process_weights_after_loading(self, mock_layer, mock_quant_config, mock_torch_npu):
-        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
-
-        method = NPUInt8LinearMethod(mock_quant_config)
-        ori_weight_shape = mock_layer.weight.shape
-
-        method.process_weights_after_loading(mock_layer)
-
-        assert mock_layer.weight.shape == ori_weight_shape[::-1]
-        assert mock_layer.weight.is_contiguous()
-
-    def test_npu_int8_apply(self, mock_layer, mock_quant_config, mock_torch_npu):
-        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
-
-        method = NPUInt8LinearMethod(mock_quant_config)
-        x = torch.randn(1, 16, 64)
-
-        output = method.apply(mock_layer, x)
-        assert output.shape == (1, 16, 128)
-
-    def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mock_torch_npu):
-        from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod
-
-        method = NPUInt8OnlineLinearMethod(mock_quant_config)
-        method.process_weights_after_loading(mock_layer)
-
-        assert mock_layer.weight.shape == (64, 128)
-        assert torch.equal(mock_layer.weight_scale, self.scale_mock)
-
-
-@pytest.fixture
-def quant_config():
-    """Shared quant config fixture for smoke tests."""
-    from vllm_omni.quantization.int8_config import DiffusionInt8Config
-
-    return DiffusionInt8Config(
-        is_checkpoint_int8_serialized=False,
-        activation_scheme="dynamic",
-    )
-
-
-@npu_available
-class TestNPUInt8Smoke:
-    """Smoke tests using real torch_npu, only run on NPU."""
-
-    @pytest.fixture
-    def real_layer(self):
-        """Create a real linear layer with fp16 weights on NPU"""
-        layer = torch.nn.Module()
-        layer.weight = torch.nn.Parameter(
-            torch.randn(128, 64, dtype=torch.float16, device="npu"),
-            requires_grad=False,
-        )
-        layer.logical_widths = [128]
-        layer.input_size_per_partition = 64
-        layer.output_size_per_partition = 128
-        layer.orig_dtype = torch.float16
-        return layer
-
-    def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer):
-        """Smoke test: verify npu_dynamic_quant returns correct shapes."""
-        import torch_npu
-
-        # Call real torch_npu.npu_dynamic_quant
-        weight = real_layer.weight
-        qweight, scale = torch_npu.npu_dynamic_quant(weight)
-
-        assert qweight.shape == weight.shape
-        assert qweight.dtype == torch.int8
-        assert scale.shape == (weight.shape[0],)
-
-    def test_real_npu_online_process_weights_after_loading(self, quant_config, real_layer):
-        """Smoke test: full process_weights_after_loading with real torch_npu."""
-        from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod
-
-        method = NPUInt8OnlineLinearMethod(quant_config)
-
-        method.process_weights_after_loading(real_layer)
-
-        assert real_layer.weight.shape == (64, 128)
-        assert real_layer.weight.dtype == torch.int8
-        assert hasattr(real_layer, "weight_scale")
-        assert real_layer.weight_scale.shape == (128,)
-
-    def test_real_npu_int8_apply_forward(self, quant_config):
-        """Smoke test: forward pass with real npu_quant_matmul."""
-        import torch_npu
-
-        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
-
-        method = NPUInt8LinearMethod(quant_config)
-
-        # Create layer with pre-processed weights on NPU
-        layer = torch.nn.Module()
-        weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="npu")
-        qweight, scale = torch_npu.npu_dynamic_quant(weight_fp16)
-        layer.weight = torch.nn.Parameter(qweight.t().contiguous(), requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(scale.squeeze(), requires_grad=False)
-
-        # Forward pass on NPU
-        x = torch.randn(2, 16, 64, dtype=torch.float16, device="npu")
-        output = method.apply(layer, x)
-
-        assert output.shape == (2, 16, 128)
-        assert output.dtype == torch.float16
-
-
-@cuda_available
-class TestCudaInt8Smoke:
-    """Smoke tests using real CUDA kernels, only on CUDA"""
-
-    @pytest.fixture
-    def real_layer(self):
-        """Create a real linear layer with fp16 weights on CUDA"""
-        layer = torch.nn.Module()
-        layer.weight = torch.nn.Parameter(
-            torch.randn(128, 64, dtype=torch.float16, device="cuda"),
-            requires_grad=False,
-        )
-        layer.logical_widths = [128]
-        layer.input_size_per_partition = 64
-        layer.output_size_per_partition = 128
-        layer.orig_dtype = torch.float16
-        return layer
-
-    def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config):
-        """Smoke test: verify scaled_int8_quant returns correct shapes."""
-        from vllm import _custom_ops as ops
-
-        weight = torch.randn(128, 64, dtype=torch.float16, device="cuda")
-        qweight, scale, _ = ops.scaled_int8_quant(weight, scale=None)
-
-        assert qweight.shape == weight.shape
-        assert qweight.dtype == torch.int8
-        assert scale.shape == (weight.shape[0], 1)
-
-    def test_real_cuda_online_process_weights_after_loading(self, quant_config, real_layer):
-        """Smoke test: full process_weights_after_loading with real CUDA ops."""
-        from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod
-
-        method = Int8OnlineLinearMethod(quant_config)
-
-        method.process_weights_after_loading(real_layer)
-
-        assert real_layer.weight.shape == (64, 128)
-        assert real_layer.weight.dtype == torch.int8
-        assert hasattr(real_layer, "weight_scale")
-
-    def test_real_cuda_int8_apply_forward(self, quant_config):
-        """Smoke test: forward pass with real CUDA int8 kernel."""
-        from vllm import _custom_ops as ops
-
-        from vllm_omni.quantization.int8_config import Int8LinearMethod
-
-        method = Int8LinearMethod(quant_config)
-
-        # Create layer with pre-processed weights
-        layer = torch.nn.Module()
-        weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="cuda")
-        qweight, scale, _ = ops.scaled_int8_quant(weight_fp16, scale=None)
-        layer.weight = torch.nn.Parameter(qweight.t(), requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(scale, requires_grad=False)
-
-        # Set required attributes for kernel
-        layer.input_scale = None
-        layer.input_zero_point = None
-        layer.azp_adj = None
-
-        # Forward pass
-        x = torch.randn(2, 16, 64, dtype=torch.float16, device="cuda")
-        output = method.apply(layer, x)
-
-        assert output.shape == (2, 16, 128)
-        assert output.dtype == torch.float16