diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index 0622a54db1bc..db994fa51146 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -206,6 +206,8 @@ def forward(
                 output = out.view_as(query)
             else:
                 # prefix-enabled attention
+                assert self.num_kv_heads == self.num_heads, (
+                    "Prefix caching is currently not supported with MQA/GQA")
                 output = torch.empty_like(query)
                 context_attention_fwd(
                     query,
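
For context, a minimal sketch of the head-count relationship the new assert guards. In MQA/GQA, `num_kv_heads < num_heads` (several query heads share one KV head), while the prefix-enabled path assumes one KV head per query head, so such configurations are rejected up front. The `AttentionConfig` class, method name, and head counts below are illustrative assumptions, not vLLM's actual API:

```python
# Illustrative sketch (hypothetical class, not vLLM's implementation) of
# why the added assert fires for MQA/GQA model configurations.

class AttentionConfig:
    def __init__(self, num_heads: int, num_kv_heads: int):
        # MHA: every query head has its own KV head (num_kv_heads == num_heads).
        # GQA: groups of query heads share a KV head (1 < num_kv_heads < num_heads).
        # MQA: all query heads share a single KV head (num_kv_heads == 1).
        assert num_heads % num_kv_heads == 0, "query heads must divide evenly"
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads

    def prefix_attention_supported(self) -> bool:
        # Mirrors the guard added in the diff: the prefix-caching kernel
        # assumes a one-to-one mapping of query heads to KV heads.
        return self.num_kv_heads == self.num_heads


mha = AttentionConfig(num_heads=32, num_kv_heads=32)  # standard MHA
gqa = AttentionConfig(num_heads=32, num_kv_heads=8)   # GQA: 4 query heads per KV head
print(mha.prefix_attention_supported())  # True  -> prefix path allowed
print(gqa.prefix_attention_supported())  # False -> would trip the new assert
```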