diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index bcdfafb50fc5..6e3cf6dcb672 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -33,6 +33,7 @@
 )
 from vllm.model_executor.models.utils import WeightsMapper
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import direct_register_custom_op
 
 logger = init_logger(__name__)
@@ -52,6 +53,14 @@ def get_name(self) -> QuantizationMethods:
         return "gguf"
 
     def get_supported_act_dtypes(self) -> list[torch.dtype]:
+        # GGUF dequantization kernels use half precision (fp16) internally.
+        # bfloat16 has precision issues on SM 120+ (Blackwell), so it is not offered.
+        if current_platform.has_device_capability(120):
+            logger.warning_once(
+                "GGUF has precision issues with bfloat16 on SM 120+ devices. "
+                "bfloat16 is unavailable for Blackwell devices."
+            )
+            return [torch.half, torch.float32]
         return [torch.half, torch.bfloat16, torch.float32]
 
     @classmethod