diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index f62e4468ef17..2b6899418376 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -3522,6 +3522,10 @@ def __post_init__(self):
             disable_chunked_prefill_reasons.append(
                 "Only \"last\" pooling supports chunked "
                 "prefill and prefix caching; disabling both.")
+            if not getattr(self.model_config.hf_config, "is_causal", True):
+                disable_chunked_prefill_reasons.append(
+                    "Only models using causal attention support chunked "
+                    "prefill and prefix caching; disabling both.")
         elif self.model_config.is_encoder_decoder:
             self.scheduler_config.max_num_encoder_input_tokens = \
                 MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
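
For context, here is a minimal standalone sketch (a hypothetical helper, not vLLM code) of how the getattr guard in the diff behaves: a missing is_causal attribute on the HF config defaults to True (causal), so only models that explicitly declare is_causal=False trigger the disable path.

    # Sketch only: chunked_prefill_disable_reasons is an illustrative
    # helper name, not part of the vLLM API.
    from types import SimpleNamespace

    def chunked_prefill_disable_reasons(hf_config) -> list[str]:
        reasons = []
        # getattr(..., "is_causal", True) treats an absent attribute as
        # causal attention, matching the guard added in the diff above.
        if not getattr(hf_config, "is_causal", True):
            reasons.append(
                "Only models using causal attention support chunked "
                "prefill and prefix caching; disabling both.")
        return reasons

    # A config that explicitly sets is_causal=False (e.g. a
    # bidirectional-attention embedding model) gets the feature disabled:
    print(chunked_prefill_disable_reasons(SimpleNamespace(is_causal=False)))
    # A config without the attribute is treated as causal and left alone:
    print(chunked_prefill_disable_reasons(SimpleNamespace()))  # []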