Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
82494de
Add cudagraph memory profiling
MatthewBonanni Dec 11, 2025
a7ffd72
Compare only FULL size
MatthewBonanni Dec 11, 2025
845c345
Use freeze_gc context manager
MatthewBonanni Dec 11, 2025
b41ab58
Factor out freeze_gc
MatthewBonanni Dec 11, 2025
bacd7c0
Include piecewise
MatthewBonanni Dec 11, 2025
ff34c42
Warm up separately
MatthewBonanni Dec 11, 2025
94095ef
Don't count first capture
MatthewBonanni Dec 11, 2025
b2a1165
Handle FULL cudagraph first pass allocations, e.g. FA split buffer
MatthewBonanni Dec 12, 2025
ac4cb05
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Dec 15, 2025
401006e
Remove dangerous line
MatthewBonanni Dec 16, 2025
2eea04d
Empty cache when necessary and edit log message
MatthewBonanni Dec 16, 2025
74caa0b
Add empty_cache to cut down on memory fragmentation
MatthewBonanni Dec 16, 2025
5a4dd22
Add warmups
MatthewBonanni Dec 17, 2025
93a387f
Make sure self.cache_config.num_gpu_blocks is set
MatthewBonanni Dec 17, 2025
013611e
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Dec 18, 2025
06cf23b
Bugfix
MatthewBonanni Dec 18, 2025
b9d0319
Don't increment counter for profiling captures
MatthewBonanni Dec 18, 2025
c6f00b3
Allocate enough blocks for mamba
MatthewBonanni Dec 18, 2025
868dd88
Bugfix
MatthewBonanni Dec 19, 2025
4bf0bf9
Clean up
MatthewBonanni Dec 19, 2025
7d2f264
PIECEWISE graphs are reused
MatthewBonanni Dec 19, 2025
e5ea128
Decrease max_model_len for test
MatthewBonanni Dec 19, 2025
febc94c
Clear references to kv cache in attention layers to allow garbage collection
MatthewBonanni Dec 19, 2025
41c4808
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Jan 5, 2026
82d2c26
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Jan 12, 2026
71e97f2
Add missing import
MatthewBonanni Jan 12, 2026
b929219
Add env variable, default false
MatthewBonanni Jan 12, 2026
a744491
Use set_current_vllm_config
MatthewBonanni Jan 12, 2026
4f05a7b
Clean up, fix edge case
MatthewBonanni Jan 12, 2026
5a0b6f0
Clean up
MatthewBonanni Jan 12, 2026
d2d5ca1
Handle FULL mode properly
MatthewBonanni Jan 12, 2026
925bb59
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Jan 26, 2026
c946aae
Remove unused full_graph_memory_bytes
MatthewBonanni Jan 26, 2026
041c1fc
Cross-platform
MatthewBonanni Jan 26, 2026
8ec55af
Cleanup
MatthewBonanni Feb 18, 2026
50fc6d6
Simplify
MatthewBonanni Feb 18, 2026
9c9d5cf
Assume at least two capture sizes
MatthewBonanni Feb 18, 2026
acc816b
Cleanup
MatthewBonanni Feb 18, 2026
e33df31
Cleanup
MatthewBonanni Feb 18, 2026
10f319b
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Feb 18, 2026
e9932c1
Reorder arguments
MatthewBonanni Feb 18, 2026
2b5fe67
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Feb 23, 2026
4ce160f
Merge branch 'main' into cg_memory_profiling
LucasWilkinson Feb 24, 2026
afcedd3
Clean up using get_capture_descs
MatthewBonanni Feb 24, 2026
829ca46
Clean up LoRAs
MatthewBonanni Feb 24, 2026
70e7380
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Feb 26, 2026
e89fdee
Refactor profile_cudagraph_memory to use capture_descs loop
LucasWilkinson Feb 26, 2026
0ddead6
Merge pull request #1 from neuralmagic/cg_memory_profiling_cleanup
MatthewBonanni Mar 3, 2026
0a85aaa
Fix
MatthewBonanni Mar 3, 2026
de21a1f
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 3, 2026
493d656
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 4, 2026
3fcb57e
Use temporary pool for profiling
MatthewBonanni Mar 4, 2026
d9d3cf8
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 5, 2026
aeb4c4a
Add transition info logging
MatthewBonanni Mar 5, 2026
9c19a67
Fix
MatthewBonanni Mar 5, 2026
7c67862
Move log info
MatthewBonanni Mar 5, 2026
0197626
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 5, 2026
87a67c6
Fix pre-commit
MatthewBonanni Mar 5, 2026
ec96d3f
Always warm up
MatthewBonanni Mar 5, 2026
ec346c9
Remove unnecessary line
MatthewBonanni Mar 5, 2026
196bfd2
Fix bad merge
MatthewBonanni Mar 5, 2026
14c3227
Don't double count
MatthewBonanni Mar 5, 2026
6d30ce2
Clear piecewise graphs too
MatthewBonanni Mar 5, 2026
423d498
Fix count
MatthewBonanni Mar 6, 2026
e9f9451
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 6, 2026
3cb44f3
Use temporary pools for all graphs
MatthewBonanni Mar 6, 2026
d5555d6
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 6, 2026
0038b17
Merge branch 'main' into cg_memory_profiling
MatthewBonanni Mar 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion vllm/compilation/cuda_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import dataclasses
import weakref
from collections import Counter
from collections.abc import Callable
from contextlib import ExitStack
from typing import Any
from typing import Any, ClassVar
from unittest.mock import patch

import torch
Expand Down Expand Up @@ -162,6 +163,14 @@ class CUDAGraphWrapper:
guaranteed when VLLM_LOGGING_LEVEL == "DEBUG".
"""

_all_instances: ClassVar[weakref.WeakSet["CUDAGraphWrapper"]] = weakref.WeakSet()

@classmethod
def clear_all_graphs(cls) -> None:
    """Drop the captured graphs held by every live CUDAGraphWrapper."""
    # Snapshot the WeakSet before iterating: members can be garbage
    # collected (and silently removed from the set) at any time.
    for wrapper in tuple(cls._all_instances):
        wrapper.clear_graphs()

def __init__(
self,
runnable: Callable[..., Any],
Expand Down Expand Up @@ -192,6 +201,8 @@ def __init__(
# cudagraphs for.
self.concrete_cudagraph_entries: dict[BatchDescriptor, CUDAGraphEntry] = {}

CUDAGraphWrapper._all_instances.add(self)

def __getattr__(self, key: str) -> Any:
# allow accessing the attributes of the runnable.
if hasattr(self.runnable, key):
Expand All @@ -205,6 +216,13 @@ def unwrap(self) -> Callable[..., Any]:
# in case we need to access the original runnable.
return self.runnable

@property
def cudagraph_wrapper(self) -> "CUDAGraphWrapper":
    # Identity accessor — presumably so callers can ask any runnable for
    # its wrapper without checking whether it is already a
    # CUDAGraphWrapper; verify against call sites.
    return self

def clear_graphs(self) -> None:
    """Forget all captured per-batch-descriptor cudagraph entries."""
    # Clears the dict in place (rather than rebinding) so any external
    # references to `concrete_cudagraph_entries` remain valid.
    self.concrete_cudagraph_entries.clear()

def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
forward_context = get_forward_context()
batch_descriptor = forward_context.batch_descriptor
Expand Down
7 changes: 7 additions & 0 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@
VLLM_CUDA_COMPATIBILITY_PATH: str | None = None
VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: bool = False
VLLM_ELASTIC_EP_DRAIN_REQUESTS: bool = False
VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS: bool = False


def get_default_cache_root():
Expand Down Expand Up @@ -1628,6 +1629,12 @@ def _get_or_set_default() -> str:
"VLLM_ELASTIC_EP_DRAIN_REQUESTS": lambda: bool(
int(os.getenv("VLLM_ELASTIC_EP_DRAIN_REQUESTS", "0"))
),
# If set to 1, enable CUDA graph memory estimation during memory profiling.
# This profiles CUDA graph memory usage to provide more accurate KV cache
# memory allocation. Disabled by default to preserve existing behavior.
"VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS": lambda: bool(
int(os.getenv("VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS", "0"))
),
}


Expand Down
7 changes: 5 additions & 2 deletions vllm/v1/cudagraph_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,8 +334,11 @@ def get_capture_descs(self) -> list[tuple[CUDAGraphMode, list[BatchDescriptor]]]
for mode in [CUDAGraphMode.PIECEWISE, CUDAGraphMode.FULL]:
descs = list(self.cudagraph_keys[mode])
if descs:
# Sort by num_tokens descending (largest first)
descs.sort(key=lambda d: d.num_tokens, reverse=True)
# Sort by (num_tokens, num_active_loras) descending
descs.sort(
key=lambda d: (d.num_tokens, d.num_active_loras),
reverse=True,
)
result.append((mode, descs))

return result
Loading