diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml index 04b99c0a837..468747f5f5e 100644 --- a/.buildkite/test-nightly-diffusion.yml +++ b/.buildkite/test-nightly-diffusion.yml @@ -110,6 +110,46 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: ":full_moon: Diffusion · Other · Quantization Quality Test" + timeout_in_minutes: 60 + if: *nightly_or_pr_label + commands: + - pip install lpips + - pytest -s -v tests/quantization/test_quantization_quality.py -k "z_image or flux" -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - group: ":card_index_dividers: Wan Series Model Test" key: nightly-wan-model-test-group steps: @@ -362,3 +402,44 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + + - label: ":full_moon: Diffusion · Qwen-Image · Quantization Quality Test" + timeout_in_minutes: 60 + if: *nightly_or_pr_label + commands: + - pip install lpips + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -s -v tests/quantization/test_quantization_quality.py -k "qwen_image" -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate diff --git a/docs/contributing/model/adding_quantization_model.md b/docs/contributing/model/adding_quantization_model.md index f2731888846..1db91314e94 100644 --- a/docs/contributing/model/adding_quantization_model.md +++ b/docs/contributing/model/adding_quantization_model.md @@ -263,7 +263,7 @@ outputs = omni.generate( ### Quality Gate Test (LPIPS) -We provide a pytest-based quality gate at `tests/diffusion/quantization/test_quantization_quality.py`. +We provide a pytest-based quality gate at `tests/quantization/test_quantization_quality.py`. It generates outputs with both BF16 and your quantized method using the same seed, computes [LPIPS](https://github.com/richzhang/PerceptualSimilarity) perceptual distance, and **fails if it exceeds a threshold**. This is the recommended way to validate that a quantization method does not @@ -290,13 +290,13 @@ QualityTestConfig( pip install lpips # Run all quality tests -pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" +pytest tests/quantization/test_quantization_quality.py -v -m "" # Run only your method -pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "int8" +pytest tests/quantization/test_quantization_quality.py -v -m "" -k "int8" # Run a specific model -pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "z_image" +pytest tests/quantization/test_quantization_quality.py -v -m "" -k "z_image" ``` **Step 3: Read the output:** @@ -364,7 +364,7 @@ aggressive quantization methods (Int8, NVFP4). 1. Run the quality gate test with all layers quantized: ```bash - pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "your_model" + pytest tests/quantization/test_quantization_quality.py -v -m "" -k "your_model" ``` 2. If LPIPS exceeds your threshold, try skipping common sensitive layers one at a time: diff --git a/tests/diffusion/quantization/__init__.py b/tests/quantization/__init__.py similarity index 100% rename from tests/diffusion/quantization/__init__.py rename to tests/quantization/__init__.py diff --git a/tests/diffusion/quantization/test_fp8_config.py b/tests/quantization/test_fp8_config.py similarity index 99% rename from tests/diffusion/quantization/test_fp8_config.py rename to tests/quantization/test_fp8_config.py index 574af7a6699..36bfef4696b 100644 --- a/tests/diffusion/quantization/test_fp8_config.py +++ b/tests/quantization/test_fp8_config.py @@ -5,7 +5,7 @@ import pytest from torch import nn -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def test_build_quant_config_fp8(): diff --git a/tests/diffusion/quantization/test_gguf_config.py b/tests/quantization/test_gguf_config.py similarity index 98% rename from tests/diffusion/quantization/test_gguf_config.py rename to tests/quantization/test_gguf_config.py index 1dd401d59c5..7a1f9a13099 100644 --- a/tests/diffusion/quantization/test_gguf_config.py +++ b/tests/quantization/test_gguf_config.py @@ -18,7 +18,7 @@ dequant_gemm_gguf, ) -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion, pytest.mark.cpu] +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def test_gguf_config_creation_and_delegation(): diff --git a/tests/diffusion/quantization/test_inc_config.py b/tests/quantization/test_inc_config.py similarity index 98% rename from tests/diffusion/quantization/test_inc_config.py rename to tests/quantization/test_inc_config.py index a7aabf7f620..8d03d51757d 100644 --- a/tests/diffusion/quantization/test_inc_config.py +++ b/tests/quantization/test_inc_config.py @@ -4,7 +4,7 @@ import pytest -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion] +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def test_build_quant_config_autoround(): diff --git a/tests/diffusion/quantization/test_int8_config.py b/tests/quantization/test_int8_config.py similarity index 51% rename from tests/diffusion/quantization/test_int8_config.py rename to tests/quantization/test_int8_config.py index 875277ece42..182a68956d1 100644 --- a/tests/diffusion/quantization/test_int8_config.py +++ b/tests/quantization/test_int8_config.py @@ -12,11 +12,7 @@ from vllm_omni.quantization import build_quant_config from vllm_omni.quantization.factory import SUPPORTED_QUANTIZATION_METHODS -pytestmark = [pytest.mark.core_model, pytest.mark.diffusion] - -npu_available = pytest.mark.skipif(not current_omni_platform.is_npu(), reason="NPU platform not available.") - -cuda_available = pytest.mark.skipif(not current_omni_platform.is_cuda(), reason="GPU platform not available.") +pytestmark = [pytest.mark.core_model, pytest.mark.cpu] def test_int8_config_creation(): @@ -229,208 +225,3 @@ def test_process_weights_after_loading(self, mock_deps, mock_quant_config): layer.weight = Parameter(torch.randn(128, 64)) method.process_weights_after_loading(layer) mock_deps["quant"].assert_called_once_with(layer.weight, scale=None) - - -@npu_available -class TestNPUInt8LinearMethod: - qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8) - scale_mock = torch.randn(128) - out_mock = torch.randn((16, 128)) - - @pytest.fixture - def mock_torch_npu(self, mocker): - torch_npu = mocker.MagicMock() - - mocker.patch("vllm_omni.quantization.int8_config.torch_npu", return_value=torch_npu) - mocker.patch( - "vllm_omni.quantization.int8_config.torch_npu.npu_dynamic_quant", - return_value=(self.qweight_mock, self.scale_mock), - ) - mocker.patch("vllm_omni.quantization.int8_config.torch_npu.npu_quant_matmul", return_value=self.out_mock) - return torch_npu - - @pytest.fixture - def mock_quant_config(self, mocker): - return mocker.Mock() - - @pytest.fixture - def mock_layer(self, mocker): - layer = torch.nn.Module() - layer.weight = torch.nn.Parameter(self.qweight_mock, requires_grad=False) - layer.weight_scale = torch.nn.Parameter(self.scale_mock, requires_grad=False) - return layer - - def test_npu_int8_process_weights_after_loading(self, mock_layer, mock_quant_config, mock_torch_npu): - from vllm_omni.quantization.int8_config import NPUInt8LinearMethod - - method = NPUInt8LinearMethod(mock_quant_config) - ori_weight_shape = mock_layer.weight.shape - - method.process_weights_after_loading(mock_layer) - - assert mock_layer.weight.shape == ori_weight_shape[::-1] - assert mock_layer.weight.is_contiguous() - - def test_npu_int8_apply(self, mock_layer, mock_quant_config, mock_torch_npu): - from vllm_omni.quantization.int8_config import NPUInt8LinearMethod - - method = NPUInt8LinearMethod(mock_quant_config) - x = torch.randn(1, 16, 64) - - output = method.apply(mock_layer, x) - assert output.shape == (1, 16, 128) - - def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mock_torch_npu): - from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod - - method = NPUInt8OnlineLinearMethod(mock_quant_config) - method.process_weights_after_loading(mock_layer) - - assert mock_layer.weight.shape == (64, 128) - assert torch.equal(mock_layer.weight_scale, self.scale_mock) - - -@pytest.fixture -def quant_config(): - """Shared quant config fixture for smoke tests.""" - from vllm_omni.quantization.int8_config import DiffusionInt8Config - - return DiffusionInt8Config( - is_checkpoint_int8_serialized=False, - activation_scheme="dynamic", - ) - - -@npu_available -class TestNPUInt8Smoke: - """Smoke tests using real torch_npu, only run on NPU.""" - - @pytest.fixture - def real_layer(self): - """Create a real linear layer with fp16 weights on NPU""" - layer = torch.nn.Module() - layer.weight = torch.nn.Parameter( - torch.randn(128, 64, dtype=torch.float16, device="npu"), - requires_grad=False, - ) - layer.logical_widths = [128] - layer.input_size_per_partition = 64 - layer.output_size_per_partition = 128 - layer.orig_dtype = torch.float16 - return layer - - def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer): - """Smoke test: verify npu_dynamic_quant returns correct shapes.""" - import torch_npu - - # Call real torch_npu.npu_dynamic_quant - weight = real_layer.weight - qweight, scale = torch_npu.npu_dynamic_quant(weight) - - assert qweight.shape == weight.shape - assert qweight.dtype == torch.int8 - assert scale.shape == (weight.shape[0],) - - def test_real_npu_online_process_weights_after_loading(self, quant_config, real_layer): - """Smoke test: full process_weights_after_loading with real torch_npu.""" - from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod - - method = NPUInt8OnlineLinearMethod(quant_config) - - method.process_weights_after_loading(real_layer) - - assert real_layer.weight.shape == (64, 128) - assert real_layer.weight.dtype == torch.int8 - assert hasattr(real_layer, "weight_scale") - assert real_layer.weight_scale.shape == (128,) - - def test_real_npu_int8_apply_forward(self, quant_config): - """Smoke test: forward pass with real npu_quant_matmul.""" - import torch_npu - - from vllm_omni.quantization.int8_config import NPUInt8LinearMethod - - method = NPUInt8LinearMethod(quant_config) - - # Create layer with pre-processed weights on NPU - layer = torch.nn.Module() - weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="npu") - qweight, scale = torch_npu.npu_dynamic_quant(weight_fp16) - layer.weight = torch.nn.Parameter(qweight.t().contiguous(), requires_grad=False) - layer.weight_scale = torch.nn.Parameter(scale.squeeze(), requires_grad=False) - - # Forward pass on NPU - x = torch.randn(2, 16, 64, dtype=torch.float16, device="npu") - output = method.apply(layer, x) - - assert output.shape == (2, 16, 128) - assert output.dtype == torch.float16 - - -@cuda_available -class TestCudaInt8Smoke: - """Smoke tests using real CUDA kernels, only on CUDA""" - - @pytest.fixture - def real_layer(self): - """Create a real linear layer with fp16 weights on CUDA""" - layer = torch.nn.Module() - layer.weight = torch.nn.Parameter( - torch.randn(128, 64, dtype=torch.float16, device="cuda"), - requires_grad=False, - ) - layer.logical_widths = [128] - layer.input_size_per_partition = 64 - layer.output_size_per_partition = 128 - layer.orig_dtype = torch.float16 - return layer - - def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config): - """Smoke test: verify scaled_int8_quant returns correct shapes.""" - from vllm import _custom_ops as ops - - weight = torch.randn(128, 64, dtype=torch.float16, device="cuda") - qweight, scale, _ = ops.scaled_int8_quant(weight, scale=None) - - assert qweight.shape == weight.shape - assert qweight.dtype == torch.int8 - assert scale.shape == (weight.shape[0], 1) - - def test_real_cuda_online_process_weights_after_loading(self, quant_config, real_layer): - """Smoke test: full process_weights_after_loading with real CUDA ops.""" - from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod - - method = Int8OnlineLinearMethod(quant_config) - - method.process_weights_after_loading(real_layer) - - assert real_layer.weight.shape == (64, 128) - assert real_layer.weight.dtype == torch.int8 - assert hasattr(real_layer, "weight_scale") - - def test_real_cuda_int8_apply_forward(self, quant_config): - """Smoke test: forward pass with real CUDA int8 kernel.""" - from vllm import _custom_ops as ops - - from vllm_omni.quantization.int8_config import Int8LinearMethod - - method = Int8LinearMethod(quant_config) - - # Create layer with pre-processed weights - layer = torch.nn.Module() - weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="cuda") - qweight, scale, _ = ops.scaled_int8_quant(weight_fp16, scale=None) - layer.weight = torch.nn.Parameter(qweight.t(), requires_grad=False) - layer.weight_scale = torch.nn.Parameter(scale, requires_grad=False) - - # Set required attributes for kernel - layer.input_scale = None - layer.input_zero_point = None - layer.azp_adj = None - - # Forward pass - x = torch.randn(2, 16, 64, dtype=torch.float16, device="cuda") - output = method.apply(layer, x) - - assert output.shape == (2, 16, 128) - assert output.dtype == torch.float16 diff --git a/tests/quantization/test_int8_smoke.py b/tests/quantization/test_int8_smoke.py new file mode 100644 index 00000000000..a0038ab7324 --- /dev/null +++ b/tests/quantization/test_int8_smoke.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Smoke tests for Int8 quantization on real hardware (CUDA / NPU). + +These tests exercise the actual quantization kernels and require a GPU. +For pure config/factory unit tests, see test_int8_config.py. +""" + +import pytest +import torch + +from vllm_omni.platforms import current_omni_platform + +pytestmark = [pytest.mark.core_model, pytest.mark.cuda, pytest.mark.L4] + +npu_available = pytest.mark.skipif( + not current_omni_platform.is_npu(), + reason="NPU platform not available.", +) + +cuda_available = pytest.mark.skipif( + not current_omni_platform.is_cuda(), + reason="GPU platform not available.", +) + + +@pytest.fixture +def quant_config(): + """Shared quant config fixture for smoke tests.""" + from vllm_omni.quantization.int8_config import DiffusionInt8Config + + return DiffusionInt8Config( + is_checkpoint_int8_serialized=False, + activation_scheme="dynamic", + ) + + +@npu_available +class TestNPUInt8LinearMethod: + qweight_mock = torch.randn((128, 64)).to(dtype=torch.int8) + scale_mock = torch.randn(128) + out_mock = torch.randn((16, 128)) + + @pytest.fixture + def mock_torch_npu(self, mocker): + torch_npu = mocker.MagicMock() + + mocker.patch( + "vllm_omni.quantization.int8_config.torch_npu", + return_value=torch_npu, + ) + mocker.patch( + "vllm_omni.quantization.int8_config.torch_npu.npu_dynamic_quant", + return_value=(self.qweight_mock, self.scale_mock), + ) + mocker.patch( + "vllm_omni.quantization.int8_config.torch_npu.npu_quant_matmul", + return_value=self.out_mock, + ) + return torch_npu + + @pytest.fixture + def mock_quant_config(self, mocker): + return mocker.Mock() + + @pytest.fixture + def mock_layer(self, mocker): + layer = torch.nn.Module() + layer.weight = torch.nn.Parameter(self.qweight_mock, requires_grad=False) + layer.weight_scale = torch.nn.Parameter(self.scale_mock, requires_grad=False) + return layer + + def test_npu_int8_process_weights_after_loading(self, mock_layer, mock_quant_config, mock_torch_npu): + from vllm_omni.quantization.int8_config import NPUInt8LinearMethod + + method = NPUInt8LinearMethod(mock_quant_config) + ori_weight_shape = mock_layer.weight.shape + + method.process_weights_after_loading(mock_layer) + + assert mock_layer.weight.shape == ori_weight_shape[::-1] + assert mock_layer.weight.is_contiguous() + + def test_npu_int8_apply(self, mock_layer, mock_quant_config, mock_torch_npu): + from vllm_omni.quantization.int8_config import NPUInt8LinearMethod + + method = NPUInt8LinearMethod(mock_quant_config) + x = torch.randn(1, 16, 64) + + output = method.apply(mock_layer, x) + assert output.shape == (1, 16, 128) + + def test_npu_int8_online_process_weights(self, mock_layer, mock_quant_config, mock_torch_npu): + from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod + + method = NPUInt8OnlineLinearMethod(mock_quant_config) + method.process_weights_after_loading(mock_layer) + + assert mock_layer.weight.shape == (64, 128) + assert torch.equal(mock_layer.weight_scale, self.scale_mock) + + +@npu_available +class TestNPUInt8Smoke: + """Smoke tests using real torch_npu, only run on NPU.""" + + @pytest.fixture + def real_layer(self): + """Create a real linear layer with fp16 weights on NPU""" + layer = torch.nn.Module() + layer.weight = torch.nn.Parameter( + torch.randn(128, 64, dtype=torch.float16, device="npu"), + requires_grad=False, + ) + layer.logical_widths = [128] + layer.input_size_per_partition = 64 + layer.output_size_per_partition = 128 + layer.orig_dtype = torch.float16 + return layer + + def test_real_npu_dynamic_quant_shape_contract(self, quant_config, real_layer): + """Smoke test: verify npu_dynamic_quant returns correct shapes.""" + import torch_npu + + weight = real_layer.weight + qweight, scale = torch_npu.npu_dynamic_quant(weight) + + assert qweight.shape == weight.shape + assert qweight.dtype == torch.int8 + assert scale.shape == (weight.shape[0],) + + def test_real_npu_online_process_weights_after_loading(self, quant_config, real_layer): + """Smoke test: full process_weights_after_loading with real torch_npu.""" + from vllm_omni.quantization.int8_config import NPUInt8OnlineLinearMethod + + method = NPUInt8OnlineLinearMethod(quant_config) + + method.process_weights_after_loading(real_layer) + + assert real_layer.weight.shape == (64, 128) + assert real_layer.weight.dtype == torch.int8 + assert hasattr(real_layer, "weight_scale") + assert real_layer.weight_scale.shape == (128,) + + def test_real_npu_int8_apply_forward(self, quant_config): + """Smoke test: forward pass with real npu_quant_matmul.""" + import torch_npu + + from vllm_omni.quantization.int8_config import NPUInt8LinearMethod + + method = NPUInt8LinearMethod(quant_config) + + layer = torch.nn.Module() + weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="npu") + qweight, scale = torch_npu.npu_dynamic_quant(weight_fp16) + layer.weight = torch.nn.Parameter(qweight.t().contiguous(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter(scale.squeeze(), requires_grad=False) + + x = torch.randn(2, 16, 64, dtype=torch.float16, device="npu") + output = method.apply(layer, x) + + assert output.shape == (2, 16, 128) + assert output.dtype == torch.float16 + + +@cuda_available +class TestCudaInt8Smoke: + """Smoke tests using real CUDA kernels, only on CUDA""" + + @pytest.fixture + def real_layer(self): + """Create a real linear layer with fp16 weights on CUDA""" + layer = torch.nn.Module() + layer.weight = torch.nn.Parameter( + torch.randn(128, 64, dtype=torch.float16, device="cuda"), + requires_grad=False, + ) + layer.logical_widths = [128] + layer.input_size_per_partition = 64 + layer.output_size_per_partition = 128 + layer.orig_dtype = torch.float16 + return layer + + def test_real_cuda_scaled_int8_quant_shape_contract(self, quant_config): + """Smoke test: verify scaled_int8_quant returns correct shapes.""" + from vllm import _custom_ops as ops + + weight = torch.randn(128, 64, dtype=torch.float16, device="cuda") + qweight, scale, _ = ops.scaled_int8_quant(weight, scale=None) + + assert qweight.shape == weight.shape + assert qweight.dtype == torch.int8 + assert scale.shape == (weight.shape[0], 1) + + def test_real_cuda_online_process_weights_after_loading(self, quant_config, real_layer): + """Smoke test: full process_weights_after_loading with real CUDA ops.""" + from vllm_omni.quantization.int8_config import Int8OnlineLinearMethod + + method = Int8OnlineLinearMethod(quant_config) + + method.process_weights_after_loading(real_layer) + + assert real_layer.weight.shape == (64, 128) + assert real_layer.weight.dtype == torch.int8 + assert hasattr(real_layer, "weight_scale") + + def test_real_cuda_int8_apply_forward(self, quant_config): + """Smoke test: forward pass with real CUDA int8 kernel.""" + from vllm import _custom_ops as ops + + from vllm_omni.quantization.int8_config import Int8LinearMethod + + method = Int8LinearMethod(quant_config) + + layer = torch.nn.Module() + weight_fp16 = torch.randn(128, 64, dtype=torch.float16, device="cuda") + qweight, scale, _ = ops.scaled_int8_quant(weight_fp16, scale=None) + layer.weight = torch.nn.Parameter(qweight.t(), requires_grad=False) + layer.weight_scale = torch.nn.Parameter(scale, requires_grad=False) + + layer.input_scale = None + layer.input_zero_point = None + layer.azp_adj = None + + x = torch.randn(2, 16, 64, dtype=torch.float16, device="cuda") + output = method.apply(layer, x) + + assert output.shape == (2, 16, 128) + assert output.dtype == torch.float16 diff --git a/tests/diffusion/quantization/test_quantization_quality.py b/tests/quantization/test_quantization_quality.py similarity index 96% rename from tests/diffusion/quantization/test_quantization_quality.py rename to tests/quantization/test_quantization_quality.py index 3d8f1873698..612e93cdeab 100644 --- a/tests/diffusion/quantization/test_quantization_quality.py +++ b/tests/quantization/test_quantization_quality.py @@ -7,7 +7,7 @@ Developers adding a new quantization method should: 1. Add their method + model to QUALITY_CONFIGS below 2. Set a max_lpips threshold (use 0.15 for image, 0.20 for video as defaults) -3. Run: pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" +3. Run: pytest tests/quantization/test_quantization_quality.py -v -m "" 4. Paste the output table into their PR description The test generates outputs with both BF16 and the quantized method using the @@ -17,10 +17,10 @@ pip install lpips Example — run only FP8 tests: - pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "fp8" + pytest tests/quantization/test_quantization_quality.py -v -m "" -k "fp8" Example — run a specific model: - pytest tests/diffusion/quantization/test_quantization_quality.py -v -m "" -k "z_image" + pytest tests/quantization/test_quantization_quality.py -v -m "" -k "z_image" """ from __future__ import annotations