diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index c5baa66cbeb0..7fe90c881177 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None): TEST_MODELS.append( ( "alexm-nm/tinyllama-24-marlin24-4bit-g128", - {"quantization": "gptq_marlin_24"}, + { + "quantization": "gptq_marlin_24", + "allow_deprecated_quantization": True, + }, ) ) diff --git a/tests/models/quantization/test_gptq_marlin_24.py b/tests/models/quantization/test_gptq_marlin_24.py index 85426ee5b089..43d1d35fa7e3 100644 --- a/tests/models/quantization/test_gptq_marlin_24.py +++ b/tests/models/quantization/test_gptq_marlin_24.py @@ -63,7 +63,10 @@ def test_models( num_logprobs: int, ) -> None: with vllm_runner( - model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24" + model_pair.model_marlin, + dtype=dtype, + quantization="gptq_marlin_24", + allow_deprecated_quantization=True, ) as marlin_24_model: marlin_24_outputs = marlin_24_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs diff --git a/tests/quantization/test_auto_round.py b/tests/quantization/test_auto_round.py index 9f5db8219501..a2a1ebc014cb 100644 --- a/tests/quantization/test_auto_round.py +++ b/tests/quantization/test_auto_round.py @@ -26,7 +26,9 @@ ) @pytest.mark.parametrize("model", MODELS) def test_auto_round(vllm_runner, model): - with vllm_runner(model, enforce_eager=True) as llm: + with vllm_runner( + model, enforce_eager=True, allow_deprecated_quantization=True + ) as llm: output = llm.generate_greedy(["The capital of France is"], max_tokens=8) assert output print(f"{output[0][1]}") diff --git a/tests/quantization/test_experts_int8.py b/tests/quantization/test_experts_int8.py index b992e976ac30..22edb9c58daf 100644 --- a/tests/quantization/test_experts_int8.py +++ b/tests/quantization/test_experts_int8.py @@ 
-34,6 +34,10 @@ def test_model_experts_int8_startup( model_info.check_transformers_version(on_fail="skip") with vllm_runner( - model, dtype=dtype, enforce_eager=True, quantization="experts_int8" + model, + dtype=dtype, + enforce_eager=True, + quantization="experts_int8", + allow_deprecated_quantization=True, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tests/quantization/test_rtn.py b/tests/quantization/test_rtn.py index 195f1fbbdfc0..8468eec6f2e3 100644 --- a/tests/quantization/test_rtn.py +++ b/tests/quantization/test_rtn.py @@ -30,6 +30,10 @@ def test_model_rtn_startup( max_tokens: int, ) -> None: with vllm_runner( - model, enforce_eager=True, dtype=dtype, quantization="rtn" + model, + enforce_eager=True, + dtype=dtype, + quantization="rtn", + allow_deprecated_quantization=True, ) as vllm_model: vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/vllm/config/model.py b/vllm/config/model.py index 3c89658f0723..c8b677695473 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -191,6 +191,8 @@ class ModelConfig: `quantization_config` attribute in the model config file. If that is `None`, we assume the model weights are not quantized and use `dtype` to determine the data type of the weights.""" + allow_deprecated_quantization: bool = False + """Whether to allow deprecated quantization methods.""" enforce_eager: bool = False """Whether to always use eager-mode PyTorch. If True, we will disable CUDA graph and always execute the model in eager mode. 
If False, we will use @@ -940,6 +942,21 @@ def _verify_quantization(self) -> None: current_platform.verify_quantization(self.quantization) + if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS: + if self.allow_deprecated_quantization: + logger.warning( + "The quantization method %s is deprecated " + "and will be removed in future versions of vLLM.", + self.quantization, + ) + else: + raise ValueError( + f"The quantization method {self.quantization} " + "is deprecated and will be removed in future " + "versions of vLLM. To bypass, set " + "`--allow-deprecated-quantization`." + ) + def _verify_cuda_graph(self) -> None: # CUDAGraph capture not supported for encoder-decoder models on ROCm unsupported_rocm = self.is_encoder_decoder diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index a6a4f780a5a3..2fd3073cdd55 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -451,6 +451,7 @@ class EngineArgs: hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") tokenizer_revision: str | None = ModelConfig.tokenizer_revision quantization: QuantizationMethods | None = ModelConfig.quantization + allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization enforce_eager: bool = ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field( @@ -648,6 +649,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: ) model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"]) model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"]) + model_group.add_argument( + "--allow-deprecated-quantization", + **model_kwargs["allow_deprecated_quantization"], + ) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"]) model_group.add_argument("--logprobs-mode",
**model_kwargs["logprobs_mode"]) @@ -1225,6 +1230,7 @@ def create_model_config(self) -> ModelConfig: tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, + allow_deprecated_quantization=self.allow_deprecated_quantization, enforce_eager=self.enforce_eager, max_logprobs=self.max_logprobs, logprobs_mode=self.logprobs_mode, diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index c240a2d7a33c..1c1587ebee71 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -41,6 +41,23 @@ ] QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods)) +DEPRECATED_QUANTIZATION_METHODS = [ + "deepspeedfp", + "tpu_int8", + "ptpc_fp8", + "fbgemm_fp8", + "fp_quant", + "bitblas", + "gptq_marlin_24", + "gptq_bitblas", + "hqq", + "experts_int8", + "ipex", + "auto-round", + "rtn", + "petit_nvfp4", +] + # The customized quantization methods which will be added to this dict. _CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}