diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 4cacc2710f10..1e84c8f01277 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -327,8 +327,7 @@ apply_rocm_test_overrides() {
     cmds="${cmds} \
     --ignore=kernels/moe/test_moe.py \
-    --ignore=kernels/moe/test_cutlass_moe.py \
-    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+    --ignore=kernels/moe/test_cutlass_moe.py"
     fi
 
     # --- Entrypoint ignores ---
     if [[ $cmds == *" entrypoints/openai "* ]]; then
diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py
deleted file mode 100644
index 6858062b9183..000000000000
--- a/tests/quantization/test_ptpc_fp8.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests whether PTPC w8a8 FP8 computation is enabled correctly.
-
-Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
-"""
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
-from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
-from vllm.platforms import current_platform
-
-
-@pytest.fixture(scope="function", autouse=True)
-def enable_pickle(monkeypatch):
-    """`LLM.apply_model` requires pickling a function."""
-    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-
-
-@pytest.mark.skipif(
-    not is_quant_method_supported("ptpc_fp8"),
-    reason="PTPC FP8 is not supported on this GPU type.",
-)
-@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
-    llm = vllm_runner(
-        "facebook/opt-125m",
-        dtype=dtype,
-        quantization="ptpc_fp8",
-        enforce_eager=True,
-        kv_cache_dtype=kv_cache_dtype,
-        allow_deprecated_quantization=True,
-    )
-
-    with llm:
-
-        def check_model(model):
-            fc1 = model.model.decoder.layers[0].fc1
-            assert isinstance(fc1.quant_method, PTPCFp8LinearMethod)
-            if kv_cache_dtype == "ptpc_fp8":
-                attn = model.model.decoder.layers[0].self_attn.attn
-                assert isinstance(attn.quant_method, Fp8KVCacheMethod)
-                assert attn._k_scale == 1.0
-                assert attn._v_scale == 1.0
-
-            # For GPUs with hardware support, we keep weights in fp8
-            if current_platform.has_device_capability(94):
-                assert fc1.weight.dtype == current_platform.fp8_dtype()
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        assert output
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index e08a6456aba7..9aceb3be054d 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -12,7 +12,6 @@
 QuantizationMethods = Literal[
     "awq",
     "fp8",
-    "ptpc_fp8",
     "fbgemm_fp8",
     "fp_quant",
     "modelopt",
@@ -39,7 +38,6 @@
 DEPRECATED_QUANTIZATION_METHODS = [
     "tpu_int8",
-    "ptpc_fp8",
     "fbgemm_fp8",
     "fp_quant",
     "experts_int8",
@@ -132,7 +130,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .mxfp4 import Mxfp4Config
     from .mxfp8 import Mxfp8Config
     from .petit import PetitNvFp4Config
-    from .ptpc_fp8 import PTPCFp8Config
     from .torchao import TorchAOConfig
 
     method_to_config: dict[str, type[QuantizationConfig]] = {
@@ -150,7 +147,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "gptq": GPTQConfig,
         "compressed-tensors": CompressedTensorsConfig,
         "bitsandbytes": BitsAndBytesConfig,
-        "ptpc_fp8": PTPCFp8Config,
         "experts_int8": ExpertsInt8Config,
         "quark": QuarkConfig,
         "moe_wna16": MoeWNA16Config,
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
deleted file mode 100644
index 5d7b7b54adc8..000000000000
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Any
-
-import torch
-from torch.nn.parameter import Parameter
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.kernels.linear import (
-    init_fp8_linear_kernel,
-)
-from vllm.model_executor.layers.attention import Attention
-from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
-from vllm.model_executor.layers.quantization.fp8 import (
-    Fp8Config,
-    Fp8KVCacheMethod,
-    Fp8LinearMethod,
-)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    is_layer_skipped,
-    kFp8DynamicTokenSym,
-)
-from vllm.platforms import current_platform
-
-
-class PTPCFp8Config(Fp8Config):
-    """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
-
-    def __init__(
-        self,
-        activation_scheme: str = "dynamic",
-        ignored_layers: list[str] | None = None,
-    ) -> None:
-        if not current_platform.is_rocm():
-            raise ValueError("ptpc_fp8 quantization is supported only on ROCm.")
-
-        if not current_platform.has_device_capability(94):
-            raise ValueError(
-                "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."  # noqa: E501
-            )
-        if activation_scheme == "static":
-            raise ValueError("ptpc_fp8 as of now only support dynamic quantization.")
-
-        super().__init__(
-            is_checkpoint_fp8_serialized=False,
-            activation_scheme=activation_scheme,
-            ignored_layers=ignored_layers,
-        )
-
-    @classmethod
-    def get_name(cls) -> QuantizationMethods:
-        return "ptpc_fp8"
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config":
-        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
-        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
-        return cls(activation_scheme=activation_scheme, ignored_layers=ignored_layers)
-
-    def get_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> "QuantizeMethodBase | None":
-        if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix, self.ignored_layers):
-                return UnquantizedLinearMethod()
-            return PTPCFp8LinearMethod(self)
-        elif isinstance(layer, Attention):
-            return Fp8KVCacheMethod(self)
-        return None
-
-
-class PTPCFp8LinearMethod(Fp8LinearMethod):
-    """Linear method for Per-Token and Per-Channel FP8 Quantization.
-    Only supports loading quantized BF16 model checkpoints with dynamic
-    activation scaling. To load FP16 model checkpoints, user must specify
-    to convert the FP16 model weight loading into BF16.
-    The weight scaling factor will be initialized after
-    the model weights are loaded.
-
-    Limitations:
-    1. Only support float8_e4m3fnuz data type due to the limitation of
-       torch._scaled_mm (https://github.com/ROCm/pytorch/blob/8c0504d7f3fb0ee4c278c096a5c3caedb01129fa/aten/src/ATen/native/cuda/Blas.cpp#L1041)
-
-    Args:
-        quant_config: The quantization config.
-    """
-
-    def __init__(self, quant_config: PTPCFp8Config):
-        assert current_platform.is_rocm(), (
-            "PTPCFp8LinearMethod is only supported on ROCm."
-        )
-        super().__init__(quant_config=quant_config)
-        # Force weight quantization
-        self.fp8_linear = init_fp8_linear_kernel(
-            activation_quant_key=kFp8DynamicTokenSym,
-            weight_quant_key=kFp8DynamicTokenSym,
-            out_dtype=torch.get_default_dtype(),
-            module_name=self.__class__.__name__,
-        )
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        assert layer.weight.data.dtype not in (torch.float16, torch.float32), (
-            "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support "
-            f"output dtype of bfloat16. {layer.weight.data.dtype} is specified."
-        )
-
-        if layer.weight.data.dtype == torch.bfloat16:
-            # Quantize the weights.
-            qweight, weight_scale = ops.scaled_fp8_quant(
-                layer.weight, scale=None, use_per_token_if_dynamic=True
-            )
-
-            # Update the layer with the new values.
-            layer.weight = Parameter(
-                qweight.t(), requires_grad=False
-            )  # Pretranspose the weight
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-        else:
-            assert layer.weight.data.dtype == current_platform.fp8_dtype()
-            assert getattr(layer, "weight_scale", None) is not None
-        layer.input_scale = None
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        return self.fp8_linear.apply_weights(layer, x, bias)
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 46d83564d476..29d7d5ce8592 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -378,7 +378,6 @@ class RocmPlatform(Platform):
         "fbgemm_fp8",
         "gguf",
         "quark",
-        "ptpc_fp8",
         "mxfp4",
         "petit_nvfp4",
         "torchao",