vllm-project · robertgshaw2-redhat · Jan 9, 2026 · Jan 4, 2026 · Jan 4, 2026 · Jan 4, 2026
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
@@ -62,7 +62,10 @@ def models_list(*, all: bool = True, keywords: list[str] | None = None):
             TEST_MODELS.append(
                 (
                     "alexm-nm/tinyllama-24-marlin24-4bit-g128",
-                    {"quantization": "gptq_marlin_24"},
+                    {
+                        "quantization": "gptq_marlin_24",
+                        "allow_deprecated_quantization": True,
+                    },
                 )
             )
 

@@ -63,7 +63,10 @@ def test_models(
     num_logprobs: int,
 ) -> None:
     with vllm_runner(
-        model_pair.model_marlin, dtype=dtype, quantization="gptq_marlin_24"
+        model_pair.model_marlin,
+        dtype=dtype,
+        quantization="gptq_marlin_24",
+        allow_deprecated_quantization=True,
     ) as marlin_24_model:
         marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
             example_prompts, max_tokens, num_logprobs

@@ -26,7 +26,9 @@
 )
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model, enforce_eager=True) as llm:
+    with vllm_runner(
+        model, enforce_eager=True, allow_deprecated_quantization=True
+    ) as llm:
         output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
     assert output
     print(f"{output[0][1]}")
@@ -34,6 +34,10 @@ def test_model_experts_int8_startup(
     model_info.check_transformers_version(on_fail="skip")
 
     with vllm_runner(
-        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+        model,
+        dtype=dtype,
+        enforce_eager=True,
+        quantization="experts_int8",
+        allow_deprecated_quantization=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -30,6 +30,10 @@ def test_model_rtn_startup(
     max_tokens: int,
 ) -> None:
     with vllm_runner(
-        model, enforce_eager=True, dtype=dtype, quantization="rtn"
+        model,
+        enforce_eager=True,
+        dtype=dtype,
+        quantization="rtn",
+        allow_deprecated_quantization=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -191,6 +191,8 @@ class ModelConfig:
     `quantization_config` attribute in the model config file. If that is
     `None`, we assume the model weights are not quantized and use `dtype` to
     determine the data type of the weights."""
+    allow_deprecated_quantization: bool = False
+    """Whether to allow deprecated quantization methods."""
     enforce_eager: bool = False
     """Whether to always use eager-mode PyTorch. If True, we will disable CUDA
     graph and always execute the model in eager mode. If False, we will use
@@ -940,6 +942,21 @@ def _verify_quantization(self) -> None:
 
             current_platform.verify_quantization(self.quantization)
 
+        # Deprecated methods are opt-in: warn when allowed, otherwise fail fast.
+        if self.quantization in me_quant.DEPRECATED_QUANTIZATION_METHODS:
+            if self.allow_deprecated_quantization:
+                logger.warning(
+                    "The quantization method %s is deprecated "
+                    "and will be removed in future versions of vLLM.",
+                    self.quantization,
+                )
+            else:
+                raise ValueError(
+                    f"The quantization method {self.quantization} is deprecated "
+                    "and will be removed in future versions of vLLM. To bypass, "
+                    "set `--allow-deprecated-quantization`."
+                )
+
     def _verify_cuda_graph(self) -> None:
         # CUDAGraph capture not supported for encoder-decoder models on ROCm
         unsupported_rocm = self.is_encoder_decoder

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -451,6 +451,7 @@ class EngineArgs:
     hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
     tokenizer_revision: str | None = ModelConfig.tokenizer_revision
     quantization: QuantizationMethods | None = ModelConfig.quantization
+    allow_deprecated_quantization: bool = ModelConfig.allow_deprecated_quantization
     enforce_eager: bool = ModelConfig.enforce_eager
     disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
     limit_mm_per_prompt: dict[str, int | dict[str, int]] = get_field(
@@ -648,6 +649,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         )
         model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
         model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
+        model_group.add_argument(
+            "--allow-deprecated-quantization",
+            **model_kwargs["allow_deprecated_quantization"],
+        )
         model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
         model_group.add_argument("--max-logprobs", **model_kwargs["max_logprobs"])
         model_group.add_argument("--logprobs-mode", **model_kwargs["logprobs_mode"])
@@ -1225,6 +1230,7 @@ def create_model_config(self) -> ModelConfig:
             tokenizer_revision=self.tokenizer_revision,
             max_model_len=self.max_model_len,
             quantization=self.quantization,
+            allow_deprecated_quantization=self.allow_deprecated_quantization,
             enforce_eager=self.enforce_eager,
             max_logprobs=self.max_logprobs,
             logprobs_mode=self.logprobs_mode,

@@ -41,6 +41,23 @@
 ]
 QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
 
+DEPRECATED_QUANTIZATION_METHODS: list[str] = [
+    "deepspeedfp",
+    "tpu_int8",
+    "ptpc_fp8",
+    "fbgemm_fp8",
+    "fp_quant",
+    "bitblas",
+    "gptq_marlin_24",
+    "gptq_bitblas",
+    "hqq",
+    "experts_int8",
+    "ipex",
+    "auto-round",
+    "rtn",
+    "petit_nvfp4",
+]
+
 # The customized quantization methods which will be added to this dict.
 _CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}