Commit 1928556

fix IMA
Signed-off-by: fhl2000 <[email protected]>
1 parent 9db6e4d commit 1928556

File tree

3 files changed: +19 additions, -15 deletions

vllm/v1/attention/backends/utils.py

Lines changed: 2 additions & 0 deletions

@@ -63,6 +63,7 @@ class CommonAttentionMetadata:
 
 M = TypeVar("M")
 
+
 class AttentionCGSupport(enum.Enum):
     """ Constants for the cudagraph support of the attention backend
     Here we do not consider the cascade attention, as currently
@@ -76,6 +77,7 @@ class AttentionCGSupport(enum.Enum):
     ALWAYS = 2
     """Cudagraph always supported"""
 
+
 class AttentionMetadataBuilder(abc.ABC, Generic[M]):
     # Does this backend/builder support CUDA Graphs for attention.
     attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
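For context, the enum and ClassVar touched by this hunk are how a backend advertises its CUDA graph support level. Below is a minimal, self-contained sketch (not taken from this commit) of how a metadata builder might declare PURE_DECODE_ONLY support; the subclass name and the NEVER default are illustrative assumptions.

import abc
import enum
from typing import ClassVar, Generic, TypeVar

M = TypeVar("M")


class AttentionCGSupport(enum.Enum):
    """Constants for the cudagraph support of the attention backend."""
    NEVER = 0
    PURE_DECODE_ONLY = 1
    ALWAYS = 2


class AttentionMetadataBuilder(abc.ABC, Generic[M]):
    # Assumed default: a backend opts in by overriding this ClassVar.
    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
        AttentionCGSupport.NEVER


class HypotheticalDecodeOnlyBuilder(AttentionMetadataBuilder[dict]):
    # Illustrative backend that only supports CUDA graphs for pure decode.
    attn_cudagraph_support: ClassVar[AttentionCGSupport] = \
        AttentionCGSupport.PURE_DECODE_ONLY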

vllm/v1/worker/gpu_model_runner.py

Lines changed: 14 additions & 15 deletions

@@ -2369,14 +2369,6 @@ def capture_model(self) -> None:
         # can reuse the memory pool allocated for the large shapes.
         with graph_capture(device=self.device):
             full_cg = self.full_cuda_graph
-            # for full cg on pure decode only, do not capture size lager than
-            # max_num_seqs
-            if full_cg and self.attn_metadata_builders[0].attn_cudagraph_support\
-                == AttentionCGSupport.PURE_DECODE_ONLY:
-                max_num_seqs = self.scheduler_config.max_num_seqs
-                self.cudagraph_batch_sizes = [
-                    size for size in self.cudagraph_batch_sizes
-                    if size <= max_num_seqs]
 
             # Only rank 0 should print progress bar during capture
             compilation_cases = reversed(self.cudagraph_batch_sizes)
@@ -2446,13 +2438,20 @@ def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None:
                 self.device,
             )
 
-            if (self.full_cuda_graph
-                    and attn_metadata_builder_i.attn_cudagraph_support == \
-                    AttentionCGSupport.NEVER):
-                raise ValueError(
-                    f"Full CUDAGraph not supported for "
-                    f"{attn_backend_i.__name__}. Turn off CompilationConfig."
-                    f"full_cuda_graph or use a different attention backend.")
+            if self.full_cuda_graph:
+                if attn_metadata_builder_i.attn_cudagraph_support == \
+                        AttentionCGSupport.NEVER:
+                    raise ValueError(
+                        f"Full CUDAGraph not supported for "
+                        f"{attn_backend_i.__name__}. Turn off "
+                        f"CompilationConfig.full_cuda_graph or use a "
+                        f"different attention backend.")
+                if attn_metadata_builder_i.attn_cudagraph_support == \
+                        AttentionCGSupport.PURE_DECODE_ONLY:
+                    self.cudagraph_batch_sizes = [
+                        size for size in self.cudagraph_batch_sizes
+                        if size <= self.scheduler_config.max_num_seqs
+                    ]
 
             self.attn_backends.append(attn_backend_i)
             self.attn_metadata_builders.append(attn_metadata_builder_i)
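The net effect of this hunk is that the PURE_DECODE_ONLY clamp of the capture sizes now happens in initialize_attn_backend, before any dummy run or graph capture, instead of inside capture_model. A minimal sketch of that filtering step follows; the standalone function name is an illustrative stand-in, not vLLM's API.

from typing import List


def clamp_cudagraph_batch_sizes(batch_sizes: List[int],
                                max_num_seqs: int) -> List[int]:
    # For a pure-decode-only backend, a captured batch never contains more
    # tokens than there are requests, so sizes above max_num_seqs are dropped.
    return [size for size in batch_sizes if size <= max_num_seqs]


# Example: with max_num_seqs = 128, the larger capture sizes are removed.
assert clamp_cudagraph_batch_sizes(
    [1, 2, 4, 8, 16, 32, 64, 128, 256, 512], 128) == \
    [1, 2, 4, 8, 16, 32, 64, 128]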

vllm/v1/worker/gpu_worker.py

Lines changed: 3 additions & 0 deletions

@@ -292,9 +292,12 @@ def compile_or_warm_up_model(self) -> None:
                                self.scheduler_config.max_num_batched_tokens)
 
             # We skip EPLB here since we don't want to record dummy metrics
+            # Always activate creating attn_cudagraphs for dummy run to avoid
+            # illegal memory access for full cudagraph.
             hidden_states, last_hidden_states = \
                 self.model_runner._dummy_run(
                     num_tokens=max_num_reqs,
+                    capture_attn_cudagraph=True,
                     skip_eplb=True,
                 )
             if self.model_runner.is_pooling_model:
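A hedged sketch of the warm-up flow this hunk changes: the dummy run now also builds the attention state used for CUDA graph capture, which is what avoids the illegal memory access ("IMA") in the commit title. The ModelRunnerLike protocol and warm_up helper below are illustrative stand-ins, not vLLM's API; only the keyword arguments mirror the diff.

from typing import Any, Protocol, Tuple


class ModelRunnerLike(Protocol):
    def _dummy_run(self, num_tokens: int, capture_attn_cudagraph: bool = False,
                   skip_eplb: bool = False) -> Tuple[Any, Any]:
        ...


def warm_up(runner: ModelRunnerLike, max_num_seqs: int,
            max_num_batched_tokens: int) -> None:
    max_num_reqs = min(max_num_seqs, max_num_batched_tokens)
    # Always build the attention cudagraph state during the dummy run so a
    # later full-cudagraph capture replays buffers that were actually set up.
    runner._dummy_run(num_tokens=max_num_reqs,
                      capture_attn_cudagraph=True,
                      skip_eplb=True)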
