From d1c01197dc0ae3e5bb632d3031b4415fd02c0b54 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Thu, 29 Jan 2026 17:49:06 +0000
Subject: [PATCH 1/3] Fix Whisper with FA2: capture CUDA graphs with max encoder lengths

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/v1/attention/backends/flash_attn.py | 12 ------------
 vllm/v1/worker/gpu_model_runner.py       | 10 ++++++++++
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index ef9c2676d755..9275725314e4 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -263,18 +263,6 @@ def get_cudagraph_support(
         vllm_config: "VllmConfig",
         kv_cache_spec: "AttentionSpec",
     ) -> AttentionCGSupport:
-        # FA2 does not support CUDA graphs with encoder-decoder models due to
-        # accuracy issues reported in https://github.com/vllm-project/vllm/issues/33091
-        if (
-            vllm_config.model_config.is_encoder_decoder
-            and get_flash_attn_version() == 2
-        ):
-            logger.warning_once(
-                "FlashAttention2 does not support CUDA graphs with "
-                "encoder-decoder models due to accuracy issues reported in #33091. "
-                "Disabling CUDA graph."
-            )
-            return AttentionCGSupport.NEVER
         return cls._cudagraph_support
 
     def __init__(
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 8e21dea6900a..1a71098a5a18 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1390,12 +1390,14 @@ def _get_encoder_seq_lens(
         num_scheduled_tokens: dict[str, int],
         kv_cache_spec: KVCacheSpec,
         num_reqs: int,
+        for_cudagraph_capture: bool = False,
     ) -> tuple[torch.Tensor | None, np.ndarray | None]:
         if not isinstance(kv_cache_spec, CrossAttentionSpec):
             return None, None
 
         # Zero out buffer for padding requests that are not actually scheduled (CGs)
         self.encoder_seq_lens.np[:num_reqs] = 0
+
         # Build encoder_seq_lens array mapping request indices to
         # encoder lengths for inputs scheduled in this batch
         for req_id in num_scheduled_tokens:
@@ -1412,6 +1414,13 @@ def _get_encoder_seq_lens(
                 feature.mm_position.length for feature in req_state.mm_features
             )
             self.encoder_seq_lens.np[req_index] = encoder_input_tokens
+        if for_cudagraph_capture:
+            # During CUDA graph capture, we need to use realistic encoder lengths
+            # so that max_seqlen_k is captured with the correct value.
+            max_encoder_len = getattr(
+                self.model_config.hf_config, "max_source_positions", self.max_model_len
+            )
+            self.encoder_seq_lens.np[:num_reqs] = max_encoder_len
 
         self.encoder_seq_lens.copy_to_gpu(num_reqs)
         encoder_seq_lens = self.encoder_seq_lens.gpu[:num_reqs]
@@ -1829,6 +1838,7 @@ def _build_attn_group_metadata(
                 num_scheduled_tokens or {},
                 kv_cache_group.kv_cache_spec,
                 num_reqs_padded,
+                for_cudagraph_capture=for_cudagraph_capture,
             )
             if kv_cache_gid > 0:
                 cm.block_table_tensor = _get_block_table(kv_cache_gid)

From 4f9796a65e09120fffe2d4fbd2aacbe19e731030 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Date: Thu, 29 Jan 2026 10:52:50 -0700
Subject: [PATCH 2/3] Apply suggestions from code review

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 1a71098a5a18..cbe6046ff55d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1418,7 +1418,7 @@ def _get_encoder_seq_lens(
             # During CUDA graph capture, we need to use realistic encoder lengths
             # so that max_seqlen_k is captured with the correct value.
             max_encoder_len = getattr(
-                self.model_config.hf_config, "max_source_positions", self.max_model_len
+                self.model_config.hf_config, "max_source_positions", self.max_encoder_len
             )
             self.encoder_seq_lens.np[:num_reqs] = max_encoder_len
 

From 5f9e1bdcb8bb745db1616a9998abc86148cdbcc9 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Thu, 29 Jan 2026 18:33:56 +0000
Subject: [PATCH 3/3] Wrap getattr arguments onto separate lines

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
---
 vllm/v1/worker/gpu_model_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index cbe6046ff55d..b6138dd76901 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1418,7 +1418,9 @@ def _get_encoder_seq_lens(
             # During CUDA graph capture, we need to use realistic encoder lengths
             # so that max_seqlen_k is captured with the correct value.
             max_encoder_len = getattr(
-                self.model_config.hf_config, "max_source_positions", self.max_encoder_len
+                self.model_config.hf_config,
+                "max_source_positions",
+                self.max_encoder_len,
             )
             self.encoder_seq_lens.np[:num_reqs] = max_encoder_len