commit 25bb9e8 (1 parent: a1213fa)
vllm/config/__init__.py
@@ -3558,6 +3558,10 @@ def __post_init__(self):
             disable_chunked_prefill_reasons.append(
                 "Only \"last\" pooling supports chunked "
                 "prefill and prefix caching; disabling both.")
+            if not getattr(self.model_config.hf_config, "is_causal", True):
+                disable_chunked_prefill_reasons.append(
+                    "Only models using causal attention support chunked "
+                    "prefill and prefix caching; disabling both.")
         elif self.model_config.is_encoder_decoder:
             self.scheduler_config.max_num_encoder_input_tokens = \
                 MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)