vllm/v1/spec_decode/eagle.py: 11 additions & 0 deletions
@@ -892,9 +892,20 @@ def load_model(self, target_model: nn.Module) -> None:
         from vllm.compilation.backends import set_model_tag
 
         with set_model_tag("eagle_head"):
+            if self.vllm_config.quant_config is not None:
+                target_model_quant_config = self.vllm_config.quant_config
+                self.vllm_config.quant_config = None
+                logger.warning(
+                    "Quantization is not supported for the draft model; "
+                    "disabling quantization for the draft model"
+                )
+            else:
+                target_model_quant_config = None
             self.model = get_model(
                 vllm_config=self.vllm_config, model_config=draft_model_config
             )
+            # restore the quant config
+            self.vllm_config.quant_config = target_model_quant_config
 
         draft_attn_layer_names = (
             get_layers_from_vllm_config(self.vllm_config, Attention).keys()
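
The added lines save the target model's quant_config, clear it so get_model() builds the EAGLE head unquantized, then assign the saved value back. Because the restore is a plain assignment after the call, an exception raised inside get_model() would leave quant_config set to None. Below is a minimal sketch of an exception-safe variant using a context manager; the helper name suppressed_quant_config is hypothetical and not part of vLLM, and the only assumption is that vllm_config exposes a mutable quant_config attribute.

    from contextlib import contextmanager

    @contextmanager
    def suppressed_quant_config(vllm_config):
        # Hypothetical helper (not in the PR): temporarily clear
        # vllm_config.quant_config and guarantee it is restored.
        saved = vllm_config.quant_config  # may already be None
        vllm_config.quant_config = None
        try:
            yield vllm_config
        finally:
            # Runs even if model loading raises, unlike a bare
            # assignment placed after the get_model() call.
            vllm_config.quant_config = saved

    # Sketch of how the load site could use it:
    # with suppressed_quant_config(self.vllm_config):
    #     self.model = get_model(
    #         vllm_config=self.vllm_config, model_config=draft_model_config
    #     )

The try/finally also makes the no-quantization case uniform: when quant_config is already None, saving and restoring None is a harmless no-op, so the if/else split in the diff becomes unnecessary.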