47 | 47 | is_pin_memory_available, round_up, supports_dynamo) |
48 | 48 | from vllm.v1.attention.backends.mamba_selectors import get_mamba_attn_backend |
49 | 49 | from vllm.v1.attention.backends.utils import ( |
50 | | - AttentionMetadataBuilder, CommonAttentionMetadata, |
| 50 | + AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, |
51 | 51 | make_kv_sharing_fast_prefill_attention_metadata, |
52 | 52 | make_local_attention_virtual_batches) |
53 | 53 | from vllm.v1.core.encoder_cache_manager import compute_encoder_budget |
@@ -2619,12 +2619,22 @@ def _initialize_single_attn_backend( |
2619 | 2619 | self.device, |
2620 | 2620 | ) |
2621 | 2621 |
2622 | | - if (self.full_cuda_graph |
2623 | | - and not attn_metadata_builder_i.full_cudagraph_supported): |
2624 | | - raise ValueError( |
2625 | | - f"Full CUDAGraph not supported for " |
2626 | | - f"{attn_backend_i.__name__}. Turn off CompilationConfig." |
2627 | | - f"full_cuda_graph or use a different attention backend.") |
| 2622 | + if self.full_cuda_graph: |
| 2623 | + if attn_metadata_builder_i.attn_cudagraph_support == \ |
| 2624 | + AttentionCGSupport.NEVER: |
| 2625 | + raise ValueError(f"Full CUDAGraph not supported for " |
| 2626 | + f"{attn_backend_i.__name__}. Turn off " |
| 2627 | + f"CompilationConfig.full_cuda_graph or use a " |
| 2628 | + f"different attention backend.")
| 2629 | + if attn_metadata_builder_i.attn_cudagraph_support == \ |
| 2630 | + AttentionCGSupport.PURE_DECODE_ONLY: |
| 2631 | + # Limit the max cudagraph size to the max number of |
| 2632 | + # sequences for pure decode only cudagraph backend, |
| 2633 | + # whose max_query_len is 1. |
| 2634 | + self.cudagraph_batch_sizes = [ |
| 2635 | + size for size in self.cudagraph_batch_sizes |
| 2636 | + if size <= self.scheduler_config.max_num_seqs |
| 2637 | + ] |
2628 | 2638 | return attn_backend_i, attn_metadata_builder_i |
2629 | 2639 |
2630 | 2640 | def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: |
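The new branch distinguishes the `AttentionCGSupport` levels used above: `NEVER` rejects full CUDA graphs outright, while `PURE_DECODE_ONLY` keeps them but trims the capture sizes to at most `max_num_seqs`, since a pure-decode batch has `max_query_len == 1` and thus at most one token per sequence. Below is a minimal standalone sketch of that gating; the `ALWAYS` member, the `resolve_cudagraph_batch_sizes` helper, and the example values are illustrative assumptions, not vLLM's actual API.

```python
from enum import Enum, auto


class AttentionCGSupport(Enum):
    # NEVER and PURE_DECODE_ONLY mirror the members used in the diff above;
    # ALWAYS is an assumed third level for backends with no restriction.
    NEVER = auto()
    PURE_DECODE_ONLY = auto()
    ALWAYS = auto()


def resolve_cudagraph_batch_sizes(support: AttentionCGSupport,
                                  backend_name: str,
                                  cudagraph_batch_sizes: list[int],
                                  max_num_seqs: int) -> list[int]:
    """Hypothetical helper re-expressing the gating logic from the diff."""
    if support is AttentionCGSupport.NEVER:
        raise ValueError(
            f"Full CUDAGraph not supported for {backend_name}. Turn off "
            f"CompilationConfig.full_cuda_graph or use a different "
            f"attention backend.")
    if support is AttentionCGSupport.PURE_DECODE_ONLY:
        # A pure-decode batch has max_query_len == 1, so it can never hold
        # more than max_num_seqs tokens; larger capture sizes would be dead.
        return [s for s in cudagraph_batch_sizes if s <= max_num_seqs]
    return cudagraph_batch_sizes


# Example: with max_num_seqs = 256, the 512 capture bucket is dropped.
print(resolve_cudagraph_batch_sizes(AttentionCGSupport.PURE_DECODE_ONLY,
                                    "SomeDecodeOnlyBackend",
                                    [1, 2, 4, 8, 16, 256, 512],
                                    max_num_seqs=256))
# -> [1, 2, 4, 8, 16, 256]
```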