diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml
index 04b99c0a837..468747f5f5e 100644
--- a/.buildkite/test-nightly-diffusion.yml
+++ b/.buildkite/test-nightly-diffusion.yml
@@ -110,6 +110,46 @@ steps:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
 
+      - label: ":full_moon: Diffusion · Other · Quantization Quality Test"
+        timeout_in_minutes: 60
+        if: *nightly_or_pr_label
+        commands:
+          - pip install lpips
+          - pytest -s -v tests/quantization/test_quantization_quality.py -k "z_image or flux" -m "advanced_model" --run-level "advanced_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 2
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate
+
   - group: ":card_index_dividers: Wan Series Model Test"
     key: nightly-wan-model-test-group
     steps:
@@ -362,3 +402,44 @@ steps:
                     hostPath:
                       path: /mnt/hf-cache
                       type: DirectoryOrCreate
+
+      - label: ":full_moon: Diffusion · Qwen-Image · Quantization Quality Test"
+        timeout_in_minutes: 60
+        if: *nightly_or_pr_label
+        commands:
+          - pip install lpips
+          - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+          - pytest -s -v tests/quantization/test_quantization_quality.py -k "qwen_image" -m "advanced_model" --run-level "advanced_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 2
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate
diff --git a/docs/contributing/model/adding_quantization_model.md b/docs/contributing/model/adding_quantization_model.md
index f2731888846..1db91314e94 100644
--- a/docs/contributing/model/adding_quantization_model.md
+++ b/docs/contributing/model/adding_quantization_model.md
@@ -263,7 +263,7 @@ outputs = omni.generate(
 
 ### Quality Gate Test (LPIPS)
 
-We provide a pytest-based quality gate at `tests/diffusion/quantization/test_quantization_quality.py`.
+We provide a pytest-based quality gate at `tests/quantization/test_quantization_quality.py`.
 It generates outputs with both BF16 and your quantized method using the same seed, computes
 [LPIPS](https://github.com/richzhang/PerceptualSimilarity) perceptual distance, and **fails if it
 exceeds a threshold**. This is the recommended way to validate that a quantization method does not
@@ -290,13 +290,13 @@ QualityTestConfig(
 pip install lpips
 
 # Run all quality tests
-pytest tests/diffusion/quantization/test_quantization_quality.py -v -m ""
+pytest tests/quantization/test_quantization_quality.py -v -m ""
 
 # Run only your method
-pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "int8"
+pytest tests/quantization/test_quantization_quality.py -v -m "" -k "int8"
 
 # Run a specific model
-pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "z_image"
+pytest tests/quantization/test_quantization_quality.py -v -m "" -k "z_image"
 ```
 
 **Step 3: Read the output:**
@@ -364,7 +364,7 @@ aggressive quantization methods (Int8, NVFP4).
 
 1. Run the quality gate test with all layers quantized:
     ```bash
-    pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "your_model"
+    pytest tests/quantization/test_quantization_quality.py -v -m "" -k "your_model"
     ```
 
 2. If LPIPS exceeds your threshold, try skipping common sensitive layers one at a time:
diff --git a/tests/diffusion/quantization/__init__.py b/tests/quantization/__init__.py
similarity index 100%
rename from tests/diffusion/quantization/__init__.py
rename to tests/quantization/__init__.py
diff --git a/tests/diffusion/quantization/test_fp8_config.py b/tests/quantization/test_fp8_config.py
similarity index 99%
rename from tests/diffusion/quantization/test_fp8_config.py
rename to tests/quantization/test_fp8_config.py
index 574af7a6699..36bfef4696b 100644
--- a/tests/diffusion/quantization/test_fp8_config.py
+++ b/tests/quantization/test_fp8_config.py
@@ -5,7 +5,7 @@
 import pytest
 from torch import nn
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_build_quant_config_fp8():
diff --git a/tests/diffusion/quantization/test_gguf_config.py b/tests/quantization/test_gguf_config.py
similarity index 98%
rename from tests/diffusion/quantization/test_gguf_config.py
rename to tests/quantization/test_gguf_config.py
index 1dd401d59c5..7a1f9a13099 100644
--- a/tests/diffusion/quantization/test_gguf_config.py
+++ b/tests/quantization/test_gguf_config.py
@@ -18,7 +18,7 @@
     dequant_gemm_gguf,
 )
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_gguf_config_creation_and_delegation():
diff --git a/tests/diffusion/quantization/test_inc_config.py b/tests/quantization/test_inc_config.py
similarity index 98%
rename from tests/diffusion/quantization/test_inc_config.py
rename to tests/quantization/test_inc_config.py
index a7aabf7f620..8d03d51757d 100644
--- a/tests/diffusion/quantization/test_inc_config.py
+++ b/tests/quantization/test_inc_config.py
@@ -4,7 +4,7 @@
 
 import pytest
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_build_quant_config_autoround():
diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/quantization/test_int8_config.py
similarity index 51%
rename from tests/diffusion/quantization/test_int8_config.py
rename to tests/quantization/test_int8_config.py
index 875277ece42..182a68956d1 100644
--- a/tests/diffusion/quantization/test_int8_config.py
+++ b/tests/quantization/test_int8_config.py
@@ -12,11 +12,7 @@
 from vllm_omni.quantization import build_quant_config
 from vllm_omni.quantization.factory import SUPPORTED_QUANTIZATION_METHODS
 
-pytestmark = [pytest.mark.core_model, pytest.mark.diffusion]
-
-npu_available = pytest.mark.skipif(not current_omni_platform.is_npu(), reason="NPU platform not available.")
-
-cuda_available = pytest.mark.skipif(not current_omni_platform.is_cuda(), reason="GPU platform not available.")
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
 
 
 def test_int8_config_creation():
@@ -229,208 +225,3 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config):
         layer.weight = Parameter(torch.randn(128, 64))
         method.process_weights_after_loading(layer)
         mock_deps["quant"].assert_called_once_with(layer.weight, scale=None)
-
-
-@npu_available
-class TestNPUInt8LinearMethod:
-    qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8)
-    scale_mock = torch.randn(128)
-    out_mock = torch.randn((16, 128))
-
-    @pytest.fixture
-    def mock_torch_npu(self, mocker):
-        torch_npu = mocker.MagicMock()
-
-        mocker.patch("vllm_omni.quantization.int8_config.torch_npu", return_value=torch_npu)
-        mocker.patch(
-            "vllm_omni.quantization.int8_config.torch_npu.npu_dynamic_quant",
-            return_value=(self.qweight_mock, self.scale_mock),
-        )
-        mocker.patch("vllm_omni.quantization.int8_config.torch_npu.npu_quant_matmul", return_value=self.out_mock)
-        return torch_npu
-
-    @pytest.fixture
-    def mock_quant_config(self, mocker):
-        return mocker.Mock()
-
-    @pytest.fixture
-    def mock_layer(self, mocker):
-        layer = torch.nn.Module()
-        layer.weight = torch.nn.Parameter(self.qweight_mock, requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(self.scale_mock, requires_grad=False)
-        return layer
-
-    def test_npu_int8_process_weights_after_loading(self, mock_layer, mock_quant_config, mock_torch_npu):
-        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
-
-        method = NPUInt8LinearMethod(mock_quant_config)
-        ori_weight_shape = mock_layer.weight.shape
-
-        method.process_weights_after_loading(mock_layer)
-
-        assert mock_layer.weight.shape == ori_weight_shape[::-1]
-        assert mock_layer.weight.is_contiguous()
-
-    def test_npu_int8_apply(self, mock_layer, mock_quant_config, mock_torch_npu):
-        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
-
-        method = NPUInt8LinearMethod(mock_quant_config)
-        x = torch.randn(1, 16, 64)
-
-        output = method.apply(mock_layer, x)
-        assert output.shape == (1, 16, 128)
-
-    def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mock_torch_npu):
-        from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod
-
-        method = NPUInt8OnlineLinearMethod(mock_quant_config)
-        method.process_weights_after_loading(mock_layer)
-
-        assert mock_layer.weight.shape == (64, 128)
-        assert torch.equal(mock_layer.weight_scale, self.scale_mock)
-
-
-@pytest.fixture
-def quant_config():
-    """Shared quant config fixture for smoke tests."""
-    from vllm_omni.quantization.int8_config import DiffusionInt8Config
-
-    return DiffusionInt8Config(
-        is_checkpoint_int8_serialized=False,
-        activation_scheme="dynamic",
-    )
-
-
-@npu_available
-class TestNPUInt8Smoke:
-    """Smoke tests using real torch_npu, only run on NPU."""
-
-    @pytest.fixture
-    def real_layer(self):
-        """Create a real linear layer with fp16 weights on NPU"""
-        layer = torch.nn.Module()
-        layer.weight = torch.nn.Parameter(
-            torch.randn(128, 64, dtype=torch.float16, device="npu"),
-            requires_grad=False,
-        )
-        layer.logical_widths = [128]
-        layer.input_size_per_partition = 64
-        layer.output_size_per_partition = 128
-        layer.orig_dtype = torch.float16
-        return layer
-
-    def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer):
-        """Smoke test: verify npu_dynamic_quant returns correct shapes."""
-        import torch_npu
-
-        # Call real torch_npu.npu_dynamic_quant
-        weight = real_layer.weight
-        qweight, scale = torch_npu.npu_dynamic_quant(weight)
-
-        assert qweight.shape == weight.shape
-        assert qweight.dtype == torch.int8
-        assert scale.shape == (weight.shape[0],)
-
-    def test_real_npu_online_process_weights_after_loading(self, quant_config, real_layer):
-        """Smoke test: full process_weights_after_loading with real torch_npu."""
-        from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod
-
-        method = NPUInt8OnlineLinearMethod(quant_config)
-
-        method.process_weights_after_loading(real_layer)
-
-        assert real_layer.weight.shape == (64, 128)
-        assert real_layer.weight.dtype == torch.int8
-        assert hasattr(real_layer, "weight_scale")
-        assert real_layer.weight_scale.shape == (128,)
-
-    def test_real_npu_int8_apply_forward(self, quant_config):
-        """Smoke test: forward pass with real npu_quant_matmul."""
-        import torch_npu
-
-        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
-
-        method = NPUInt8LinearMethod(quant_config)
-
-        # Create layer with pre-processed weights on NPU
-        layer = torch.nn.Module()
-        weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="npu")
-        qweight, scale = torch_npu.npu_dynamic_quant(weight_fp16)
-        layer.weight = torch.nn.Parameter(qweight.t().contiguous(), requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(scale.squeeze(), requires_grad=False)
-
-        # Forward pass on NPU
-        x = torch.randn(2, 16, 64, dtype=torch.float16, device="npu")
-        output = method.apply(layer, x)
-
-        assert output.shape == (2, 16, 128)
-        assert output.dtype == torch.float16
-
-
-@cuda_available
-class TestCudaInt8Smoke:
-    """Smoke tests using real CUDA kernels, only on CUDA"""
-
-    @pytest.fixture
-    def real_layer(self):
-        """Create a real linear layer with fp16 weights on CUDA"""
-        layer = torch.nn.Module()
-        layer.weight = torch.nn.Parameter(
-            torch.randn(128, 64, dtype=torch.float16, device="cuda"),
-            requires_grad=False,
-        )
-        layer.logical_widths = [128]
-        layer.input_size_per_partition = 64
-        layer.output_size_per_partition = 128
-        layer.orig_dtype = torch.float16
-        return layer
-
-    def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config):
-        """Smoke test: verify scaled_int8_quant returns correct shapes."""
-        from vllm import _custom_ops as ops
-
-        weight = torch.randn(128, 64, dtype=torch.float16, device="cuda")
-        qweight, scale, _ = ops.scaled_int8_quant(weight, scale=None)
-
-        assert qweight.shape == weight.shape
-        assert qweight.dtype == torch.int8
-        assert scale.shape == (weight.shape[0], 1)
-
-    def test_real_cuda_online_process_weights_after_loading(self, quant_config, real_layer):
-        """Smoke test: full process_weights_after_loading with real CUDA ops."""
-        from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod
-
-        method = Int8OnlineLinearMethod(quant_config)
-
-        method.process_weights_after_loading(real_layer)
-
-        assert real_layer.weight.shape == (64, 128)
-        assert real_layer.weight.dtype == torch.int8
-        assert hasattr(real_layer, "weight_scale")
-
-    def test_real_cuda_int8_apply_forward(self, quant_config):
-        """Smoke test: forward pass with real CUDA int8 kernel."""
-        from vllm import _custom_ops as ops
-
-        from vllm_omni.quantization.int8_config import Int8LinearMethod
-
-        method = Int8LinearMethod(quant_config)
-
-        # Create layer with pre-processed weights
-        layer = torch.nn.Module()
-        weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="cuda")
-        qweight, scale, _ = ops.scaled_int8_quant(weight_fp16, scale=None)
-        layer.weight = torch.nn.Parameter(qweight.t(), requires_grad=False)
-        layer.weight_scale = torch.nn.Parameter(scale, requires_grad=False)
-
-        # Set required attributes for kernel
-        layer.input_scale = None
-        layer.input_zero_point = None
-        layer.azp_adj = None
-
-        # Forward pass
-        x = torch.randn(2, 16, 64, dtype=torch.float16, device="cuda")
-        output = method.apply(layer, x)
-
-        assert output.shape == (2, 16, 128)
-        assert output.dtype == torch.float16
diff --git a/tests/quantization/test_int8_smoke.py b/tests/quantization/test_int8_smoke.py
new file mode 100644
index 00000000000..a0038ab7324
--- /dev/null
+++ b/tests/quantization/test_int8_smoke.py
@@ -0,0 +1,230 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Int8 quantization tests that need a CUDA or NPU platform (skipped otherwise).
+
+The *Smoke classes call the real kernels; TestNPUInt8LinearMethod mocks torch_npu.
+For pure config/factory unit tests, see test_int8_config.py.
+"""
+
+import pytest
+import torch
+
+from vllm_omni.platforms import current_omni_platform
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cuda, pytest.mark.L4]
+
+npu_available = pytest.mark.skipif(
+    not current_omni_platform.is_npu(),
+    reason="NPU platform not available.",
+)
+
+cuda_available = pytest.mark.skipif(
+    not current_omni_platform.is_cuda(),
+    reason="GPU platform not available.",
+)
+
+
+@pytest.fixture
+def quant_config():
+    """Shared DiffusionInt8Config: checkpoint not int8-serialized, "dynamic" activation scheme."""
+    from vllm_omni.quantization.int8_config import DiffusionInt8Config
+
+    return DiffusionInt8Config(
+        is_checkpoint_int8_serialized=False,
+        activation_scheme="dynamic",
+    )
+
+
+@npu_available
+class TestNPUInt8LinearMethod:
+    qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8)
+    scale_mock = torch.randn(128)
+    out_mock = torch.randn((16, 128))
+
+    @pytest.fixture
+    def mock_torch_npu(self, mocker):
+        """Replace ``int8_config.torch_npu`` with a stub and return that stub.
+
+        The stub's ``npu_dynamic_quant`` and ``npu_quant_matmul`` return the
+        class-level canned tensors. The stub is installed with ``new=`` so the
+        object handed back to the test is the very object the module under
+        test sees; ``return_value=`` would only set what *calling* the
+        patched module returns and leave the returned mock disconnected.
+        """
+        torch_npu = mocker.MagicMock()
+        torch_npu.npu_dynamic_quant.return_value = (self.qweight_mock, self.scale_mock)
+        torch_npu.npu_quant_matmul.return_value = self.out_mock
+
+        # pytest-mock undoes the patch automatically at fixture teardown.
+        mocker.patch("vllm_omni.quantization.int8_config.torch_npu", new=torch_npu)
+        return torch_npu
+
+    @pytest.fixture
+    def mock_quant_config(self, mocker):
+        return mocker.Mock()
+
+    @pytest.fixture
+    def mock_layer(self, mocker):
+        layer = torch.nn.Module()
+        layer.weight = torch.nn.Parameter(self.qweight_mock, requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(self.scale_mock, requires_grad=False)
+        return layer
+
+    def test_npu_int8_process_weights_after_loading(self, mock_layer, mock_quant_config, mock_torch_npu):
+        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
+
+        method = NPUInt8LinearMethod(mock_quant_config)
+        ori_weight_shape = mock_layer.weight.shape
+
+        method.process_weights_after_loading(mock_layer)
+
+        assert mock_layer.weight.shape == ori_weight_shape[::-1]
+        assert mock_layer.weight.is_contiguous()
+
+    def test_npu_int8_apply(self, mock_layer, mock_quant_config, mock_torch_npu):
+        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
+
+        method = NPUInt8LinearMethod(mock_quant_config)
+        x = torch.randn(1, 16, 64)
+
+        output = method.apply(mock_layer, x)
+        assert output.shape == (1, 16, 128)
+
+    def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mock_torch_npu):
+        from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod
+
+        method = NPUInt8OnlineLinearMethod(mock_quant_config)
+        method.process_weights_after_loading(mock_layer)
+
+        assert mock_layer.weight.shape == (64, 128)
+        assert torch.equal(mock_layer.weight_scale, self.scale_mock)
+
+
+@npu_available
+class TestNPUInt8Smoke:
+    """Smoke tests using real torch_npu, only run on NPU."""
+
+    @pytest.fixture
+    def real_layer(self):
+        """Create a real linear layer with fp16 weights on NPU"""
+        layer = torch.nn.Module()
+        layer.weight = torch.nn.Parameter(
+            torch.randn(128, 64, dtype=torch.float16, device="npu"),
+            requires_grad=False,
+        )
+        layer.logical_widths = [128]
+        layer.input_size_per_partition = 64
+        layer.output_size_per_partition = 128
+        layer.orig_dtype = torch.float16
+        return layer
+
+    def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer):
+        """Smoke test: verify npu_dynamic_quant returns correct shapes."""
+        import torch_npu
+
+        weight = real_layer.weight
+        qweight, scale = torch_npu.npu_dynamic_quant(weight)
+
+        assert qweight.shape == weight.shape
+        assert qweight.dtype == torch.int8
+        assert scale.shape == (weight.shape[0],)
+
+    def test_real_npu_online_process_weights_after_loading(self, quant_config, real_layer):
+        """Smoke test: full process_weights_after_loading with real torch_npu."""
+        from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod
+
+        method = NPUInt8OnlineLinearMethod(quant_config)
+
+        method.process_weights_after_loading(real_layer)
+
+        assert real_layer.weight.shape == (64, 128)
+        assert real_layer.weight.dtype == torch.int8
+        assert hasattr(real_layer, "weight_scale")
+        assert real_layer.weight_scale.shape == (128,)
+
+    def test_real_npu_int8_apply_forward(self, quant_config):
+        """Smoke test: forward pass with real npu_quant_matmul."""
+        import torch_npu
+
+        from vllm_omni.quantization.int8_config import NPUInt8LinearMethod
+
+        method = NPUInt8LinearMethod(quant_config)
+
+        layer = torch.nn.Module()
+        weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="npu")
+        qweight, scale = torch_npu.npu_dynamic_quant(weight_fp16)
+        layer.weight = torch.nn.Parameter(qweight.t().contiguous(), requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(scale.squeeze(), requires_grad=False)
+
+        x = torch.randn(2, 16, 64, dtype=torch.float16, device="npu")
+        output = method.apply(layer, x)
+
+        assert output.shape == (2, 16, 128)
+        assert output.dtype == torch.float16
+
+
+@cuda_available
+class TestCudaInt8Smoke:
+    """Smoke tests using real CUDA kernels, only on CUDA"""
+
+    @pytest.fixture
+    def real_layer(self):
+        """Create a real linear layer with fp16 weights on CUDA"""
+        layer = torch.nn.Module()
+        layer.weight = torch.nn.Parameter(
+            torch.randn(128, 64, dtype=torch.float16, device="cuda"),
+            requires_grad=False,
+        )
+        layer.logical_widths = [128]
+        layer.input_size_per_partition = 64
+        layer.output_size_per_partition = 128
+        layer.orig_dtype = torch.float16
+        return layer
+
+    def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config):
+        """Smoke test: verify scaled_int8_quant returns correct shapes."""
+        from vllm import _custom_ops as ops
+
+        weight = torch.randn(128, 64, dtype=torch.float16, device="cuda")
+        qweight, scale, _ = ops.scaled_int8_quant(weight, scale=None)
+
+        assert qweight.shape == weight.shape
+        assert qweight.dtype == torch.int8
+        assert scale.shape == (weight.shape[0], 1)
+
+    def test_real_cuda_online_process_weights_after_loading(self, quant_config, real_layer):
+        """Smoke test: full process_weights_after_loading with real CUDA ops."""
+        from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod
+
+        method = Int8OnlineLinearMethod(quant_config)
+
+        method.process_weights_after_loading(real_layer)
+
+        assert real_layer.weight.shape == (64, 128)
+        assert real_layer.weight.dtype == torch.int8
+        assert hasattr(real_layer, "weight_scale")
+
+    def test_real_cuda_int8_apply_forward(self, quant_config):
+        """Smoke test: forward pass with real CUDA int8 kernel."""
+        from vllm import _custom_ops as ops
+
+        from vllm_omni.quantization.int8_config import Int8LinearMethod
+
+        method = Int8LinearMethod(quant_config)
+
+        layer = torch.nn.Module()
+        weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="cuda")
+        qweight, scale, _ = ops.scaled_int8_quant(weight_fp16, scale=None)
+        layer.weight = torch.nn.Parameter(qweight.t(), requires_grad=False)
+        layer.weight_scale = torch.nn.Parameter(scale, requires_grad=False)
+
+        layer.input_scale = None
+        layer.input_zero_point = None
+        layer.azp_adj = None
+
+        x = torch.randn(2, 16, 64, dtype=torch.float16, device="cuda")
+        output = method.apply(layer, x)
+
+        assert output.shape == (2, 16, 128)
+        assert output.dtype == torch.float16
diff --git a/tests/diffusion/quantization/test_quantization_quality.py b/tests/quantization/test_quantization_quality.py
similarity index 96%
rename from tests/diffusion/quantization/test_quantization_quality.py
rename to tests/quantization/test_quantization_quality.py
index 3d8f1873698..612e93cdeab 100644
--- a/tests/diffusion/quantization/test_quantization_quality.py
+++ b/tests/quantization/test_quantization_quality.py
@@ -7,7 +7,7 @@
 Developers adding a new quantization method should:
 1. Add their method + model to QUALITY_CONFIGS below
 2. Set a max_lpips threshold (use 0.15 for image, 0.20 for video as defaults)
-3. Run: pytest tests/diffusion/quantization/test_quantization_quality.py -v -m ""
+3. Run: pytest tests/quantization/test_quantization_quality.py -v -m ""
 4. Paste the output table into their PR description
 
 The test generates outputs with both BF16 and the quantized method using the
@@ -17,10 +17,10 @@
     pip install lpips
 
 Example — run only FP8 tests:
-    pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "fp8"
+    pytest tests/quantization/test_quantization_quality.py -v -m "" -k "fp8"
 
 Example — run a specific model:
-    pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "z_image"
+    pytest tests/quantization/test_quantization_quality.py -v -m "" -k "z_image"
 """
 
 from __future__ import annotations