vllm/compilation/cuda_graph.py (7 additions & 2 deletions)
@@ -290,9 +290,14 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
  # across layers will make the cudagraph capture very slow.
  # therefore, we only run gc for the first graph,
  # and disable gc for the rest of the graphs.
- stack.enter_context(patch("gc.collect", lambda: None))
  stack.enter_context(
-     patch("torch.accelerator.empty_cache", lambda: None)
+     patch("gc.collect", lambda *args, **kwargs: None)
Contributor: This seems somehow unrelated to this PR?

Collaborator (Author): Yeah, it is a leftover from previous bugs. Will remove it.
+ )
+ stack.enter_context(
+     patch(
+         "torch.accelerator.empty_cache",
+         lambda *args, **kwargs: None,
+     )
  )

  if self.graph_pool is not None:
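Note on the pattern above: `unittest.mock.patch` replaces `gc.collect` (and `torch.accelerator.empty_cache`) with a no-op only while the `ExitStack` is open, so every capture after the first skips the expensive collection and cache flush. The following is a minimal, hedged sketch of that idea, not the actual vLLM code; `capture_one_graph`, `capture_fn`, and `is_first_graph` are illustrative names.

import contextlib
import gc
from unittest.mock import patch

def capture_one_graph(capture_fn, is_first_graph: bool):
    # Illustrative helper only, not vLLM's implementation.
    with contextlib.ExitStack() as stack:
        if not is_first_graph:
            # The stub accepts any arguments, like the PR's new lambda, so a
            # caller passing e.g. gc.collect(0) does not hit a TypeError.
            stack.enter_context(
                patch("gc.collect", lambda *args, **kwargs: None)
            )
        gc.collect()          # real collection only for the first graph
        return capture_fn()   # stand-in for the CUDA graph capture itself

The switch from `lambda: None` to `lambda *args, **kwargs: None` presumably hardens the stubs against callers that pass arguments (for example `gc.collect(generation)`), which would otherwise raise a TypeError against a zero-argument lambda.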
vllm/envs.py (5 additions & 0 deletions)
@@ -241,6 +241,7 @@
  VLLM_DEBUG_WORKSPACE: bool = False
  VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
  VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
+ VLLM_DISABLE_INDEXER_STREAM: bool = False
  VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
  VLLM_USE_V2_MODEL_RUNNER: bool = False
  VLLM_LOG_MODEL_INSPECTION: bool = False
@@ -1629,6 +1630,10 @@ def _get_or_set_default() -> str:
"VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD": lambda: int(
int(os.getenv("VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD", 256))
),
# Disables parallel execution of indexer q_b_proj via separate cuda stream
"VLLM_DISABLE_INDEXER_STREAM": lambda: bool(
int(os.getenv("VLLM_DISABLE_INDEXER_STREAM", "0"))
),
# Format for saving torch.compile cache artifacts
# - "binary": saves as binary file
# Safe for multiple vllm serve processes accessing the same torch compile cache.
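The new flag follows the existing envs.py pattern: the raw string is parsed with `int(...)` and coerced to `bool`, so the unset/"0" default keeps the separate indexer stream enabled and `VLLM_DISABLE_INDEXER_STREAM=1` disables it. A hedged sketch of reading the flag (the helper name `use_separate_indexer_stream` is illustrative, not part of vLLM):

import os

def use_separate_indexer_stream() -> bool:
    # Illustrative helper only. Mirrors the envs.py parsing: unset or "0"
    # keeps the stream enabled; any non-zero integer disables it.
    disabled = bool(int(os.getenv("VLLM_DISABLE_INDEXER_STREAM", "0")))
    return not disabled

Note that `bool(int(...))` only accepts integer strings, so a value such as `VLLM_DISABLE_INDEXER_STREAM=true` would raise a ValueError rather than being treated as truthy.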