40 changes: 35 additions & 5 deletions vllm/config/vllm.py
@@ -389,11 +389,41 @@ def _get_quantization_config(
                     )
             supported_dtypes = quant_config.get_supported_act_dtypes()
             if model_config.dtype not in supported_dtypes:
-                raise ValueError(
-                    f"{model_config.dtype} is not supported for quantization "
-                    f"method {model_config.quantization}. Supported dtypes: "
-                    f"{supported_dtypes}"
-                )
+                # Handle dtype conflict between model restrictions and
+                # quantization restrictions (e.g., Gemma3 GGUF on Blackwell
+                # where Gemma3 blocks float16 and GGUF blocks bfloat16)
+                from vllm.config.model import _is_valid_dtype
+
+                model_type = getattr(model_config.hf_config, "model_type", None)
+                compatible_dtypes = [
+                    d
+                    for d in supported_dtypes
+                    if model_type is None or _is_valid_dtype(model_type, d)
+                ]
+                if compatible_dtypes:
+                    # Prefer float16 > bfloat16 > float32 for performance
+                    import torch
+
+                    dtype_preference = [torch.float16, torch.bfloat16, torch.float32]
+                    for preferred in dtype_preference:
+                        if preferred in compatible_dtypes:
+                            logger.warning(
+                                "dtype=%s is not supported for quantization "
+                                "method %s with model type %s. "
+                                "Automatically selecting %s as compatible dtype.",
+                                model_config.dtype,
+                                model_config.quantization,
+                                model_type,
+                                preferred,
+                            )
+                            model_config.dtype = preferred
+                            break
+                else:
+                    raise ValueError(
+                        f"{model_config.dtype} is not supported for quantization "
+                        f"method {model_config.quantization}. Supported dtypes: "
+                        f"{supported_dtypes}"
+                    )
             quant_config.maybe_update_config(model_config.model)
             return quant_config
         return None
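To make the fallback concrete, here is a minimal, self-contained sketch of the selection logic above, run against the exact case the comment cites (Gemma3 GGUF on Blackwell). The `_is_valid_dtype` stub is a hypothetical stand-in for the real `vllm.config.model._is_valid_dtype`, reduced to the one rule relevant here; the real function covers many more model types.

```python
import torch

# Hypothetical stand-in for vllm.config.model._is_valid_dtype, reduced to
# the single rule that matters for this scenario: Gemma3 blocks float16.
def _is_valid_dtype(model_type: str, dtype: torch.dtype) -> bool:
    if model_type == "gemma3" and dtype == torch.float16:
        return False
    return True

# What GGUF reports on SM 120+ (see the gguf.py hunk below): no bfloat16.
supported_dtypes = [torch.half, torch.float32]
model_type = "gemma3"

compatible_dtypes = [d for d in supported_dtypes if _is_valid_dtype(model_type, d)]

# Same preference order as the hunk above: float16 > bfloat16 > float32.
for preferred in (torch.float16, torch.bfloat16, torch.float32):
    if preferred in compatible_dtypes:
        print("falling back to", preferred)  # -> falling back to torch.float32
        break
else:
    raise ValueError("no dtype satisfies both the model and the quant method")
```

With the model blocking float16 and the quant method blocking bfloat16, the intersection leaves only float32, which the loop selects; the ValueError path is reached only when the intersection is empty.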
9 changes: 9 additions & 0 deletions vllm/model_executor/layers/quantization/gguf.py
@@ -33,6 +33,7 @@
 )
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
 
 logger = init_logger(__name__)
@@ -52,6 +53,14 @@ def get_name(self) -> QuantizationMethods:
         return "gguf"
 
     def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        # GGUF dequantization kernels use half precision (fp16) internally.
+        # bfloat16 has precision issues on SM 120+ devices (Blackwell).
+        if current_platform.has_device_capability(120):
+            logger.warning_once(
+                "GGUF has precision issues with bfloat16 on SM 120+ devices. "
+                "bfloat16 is unavailable for Blackwell devices."
+            )
+            return [torch.half, torch.float32]
         return [torch.half, torch.bfloat16, torch.float32]
 
     @classmethod
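For reference, a rough sketch of how the capability gate behaves. The `has_device_capability` helper below is a simplified approximation of `vllm.platforms.current_platform.has_device_capability` (an assumption for illustration): vLLM encodes the CUDA compute capability as major * 10 + minor, so SM 12.0 (Blackwell) compares as 120.

```python
import torch

# Simplified approximation of current_platform.has_device_capability:
# compute capability encoded as major * 10 + minor, so SM 12.0 -> 120.
# (Assumption for illustration, not the exact vLLM implementation.)
def has_device_capability(threshold: int) -> bool:
    if not torch.cuda.is_available():
        return False
    major, minor = torch.cuda.get_device_capability()
    return major * 10 + minor >= threshold

def get_supported_act_dtypes() -> list[torch.dtype]:
    if has_device_capability(120):
        # Blackwell and newer: GGUF dequant kernels run in fp16 internally,
        # and bfloat16 has shown precision issues, so it is dropped here.
        return [torch.half, torch.float32]
    return [torch.half, torch.bfloat16, torch.float32]

print(get_supported_act_dtypes())
```

On a pre-Blackwell GPU this prints all three dtypes; on SM 120+ the bfloat16 entry disappears, which is what triggers the fallback path in the vllm/config/vllm.py hunk above.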