From d1c01197dc0ae3e5bb632d3031b4415fd02c0b54 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 29 Jan 2026 17:49:06 +0000 Subject: [PATCH 1/3] fix whisper fa2 Signed-off-by: Lucas Wilkinson --- vllm/v1/attention/backends/flash_attn.py | 12 ------------ vllm/v1/worker/gpu_model_runner.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index ef9c2676d755..9275725314e4 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -263,18 +263,6 @@ def get_cudagraph_support( vllm_config: "VllmConfig", kv_cache_spec: "AttentionSpec", ) -> AttentionCGSupport: - # FA2 does not support CUDA graphs with encoder-decoder models due to - # accuracy issues reported in https://github.com/vllm-project/vllm/issues/33091 - if ( - vllm_config.model_config.is_encoder_decoder - and get_flash_attn_version() == 2 - ): - logger.warning_once( - "FlashAttention2 does not support CUDA graphs with " - "encoder-decoder models due to accuracy issues reported in #33091. " - "Disabling CUDA graph." - ) - return AttentionCGSupport.NEVER return cls._cudagraph_support def __init__( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8e21dea6900a..1a71098a5a18 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1390,12 +1390,14 @@ def _get_encoder_seq_lens( num_scheduled_tokens: dict[str, int], kv_cache_spec: KVCacheSpec, num_reqs: int, + for_cudagraph_capture: bool = False, ) -> tuple[torch.Tensor | None, np.ndarray | None]: if not isinstance(kv_cache_spec, CrossAttentionSpec): return None, None # Zero out buffer for padding requests that are not actually scheduled (CGs) self.encoder_seq_lens.np[:num_reqs] = 0 + # Build encoder_seq_lens array mapping request indices to # encoder lengths for inputs scheduled in this batch for req_id in num_scheduled_tokens: @@ -1412,6 +1414,13 @@ def _get_encoder_seq_lens( feature.mm_position.length for feature in req_state.mm_features ) self.encoder_seq_lens.np[req_index] = encoder_input_tokens + if for_cudagraph_capture: + # During CUDA graph capture, we need to use realistic encoder lengths + # so that max_seqlen_k is captured with the correct value. + max_encoder_len = getattr( + self.model_config.hf_config, "max_source_positions", self.max_model_len + ) + self.encoder_seq_lens.np[:num_reqs] = max_encoder_len self.encoder_seq_lens.copy_to_gpu(num_reqs) encoder_seq_lens = self.encoder_seq_lens.gpu[:num_reqs] @@ -1829,6 +1838,7 @@ def _build_attn_group_metadata( num_scheduled_tokens or {}, kv_cache_group.kv_cache_spec, num_reqs_padded, + for_cudagraph_capture=for_cudagraph_capture, ) if kv_cache_gid > 0: cm.block_table_tensor = _get_block_table(kv_cache_gid) From 4f9796a65e09120fffe2d4fbd2aacbe19e731030 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 29 Jan 2026 10:52:50 -0700 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1a71098a5a18..cbe6046ff55d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1418,7 +1418,7 @@ def _get_encoder_seq_lens( # During CUDA graph capture, we need to use realistic encoder lengths # so that max_seqlen_k is captured with the correct value. max_encoder_len = getattr( - self.model_config.hf_config, "max_source_positions", self.max_model_len + self.model_config.hf_config, "max_source_positions", self.max_encoder_len ) self.encoder_seq_lens.np[:num_reqs] = max_encoder_len From 5f9e1bdcb8bb745db1616a9998abc86148cdbcc9 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 29 Jan 2026 18:33:56 +0000 Subject: [PATCH 3/3] fix Signed-off-by: Lucas Wilkinson --- vllm/v1/worker/gpu_model_runner.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index cbe6046ff55d..b6138dd76901 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1418,7 +1418,9 @@ def _get_encoder_seq_lens( # During CUDA graph capture, we need to use realistic encoder lengths # so that max_seqlen_k is captured with the correct value. max_encoder_len = getattr( - self.model_config.hf_config, "max_source_positions", self.max_encoder_len + self.model_config.hf_config, + "max_source_positions", + self.max_encoder_len, ) self.encoder_seq_lens.np[:num_reqs] = max_encoder_len