vllm/compilation/cuda_graph.py (7 additions & 2 deletions)
@@ -290,9 +290,14 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any | None:
  # across layers will make the cudagraph capture very slow.
  # therefore, we only run gc for the first graph,
  # and disable gc for the rest of the graphs.
- stack.enter_context(patch("gc.collect", lambda: None))
  stack.enter_context(
-     patch("torch.accelerator.empty_cache", lambda: None)
+     patch("gc.collect", lambda *args, **kwargs: None)
Contributor: This seems somehow unrelated to this PR?

Collaborator (Author): Yeah, it is a leftover from previous bugs. Will remove it.
+ )
+ stack.enter_context(
+     patch(
+         "torch.accelerator.empty_cache",
+         lambda *args, **kwargs: None,
+     )
  )

  if self.graph_pool is not None:
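Note on the pattern above: `unittest.mock.patch` replaces `gc.collect` (and `torch.accelerator.empty_cache`) with a no-op only while the `ExitStack` is open, so every capture after the first skips the expensive collection and cache flush. The following is a minimal, hedged sketch of that idea, not the actual vLLM code; `capture_one_graph`, `capture_fn`, and `is_first_graph` are illustrative names.

import contextlib
import gc
from unittest.mock import patch

def capture_one_graph(capture_fn, is_first_graph: bool):
    # Illustrative helper only, not vLLM's implementation.
    with contextlib.ExitStack() as stack:
        if not is_first_graph:
            # The stub accepts any arguments, like the PR's new lambda, so a
            # caller passing e.g. gc.collect(0) does not hit a TypeError.
            stack.enter_context(
                patch("gc.collect", lambda *args, **kwargs: None)
            )
        gc.collect()          # real collection only for the first graph
        return capture_fn()   # stand-in for the CUDA graph capture itself

The switch from `lambda: None` to `lambda *args, **kwargs: None` presumably hardens the stubs against callers that pass arguments (for example `gc.collect(generation)`), which would otherwise raise a TypeError against a zero-argument lambda.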
vllm/envs.py (5 additions & 0 deletions)
@@ -241,6 +241,7 @@
  VLLM_DEBUG_WORKSPACE: bool = False
  VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
  VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD: int = 256
+ VLLM_DISABLE_INDEXER_STREAM: bool = False
  VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
  VLLM_USE_V2_MODEL_RUNNER: bool = False
  VLLM_LOG_MODEL_INSPECTION: bool = False
@@ -1629,6 +1630,10 @@ def _get_or_set_default() -> str:
"VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD": lambda: int(
int(os.getenv("VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD", 256))
),
# Disables parallel execution of indexer q_b_proj via separate cuda stream
"VLLM_DISABLE_INDEXER_STREAM": lambda: bool(
int(os.getenv("VLLM_DISABLE_INDEXER_STREAM", "0"))
),
# Format for saving torch.compile cache artifacts
# - "binary": saves as binary file
# Safe for multiple vllm serve processes accessing the same torch compile cache.
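The new flag follows the existing envs.py pattern: the raw string is parsed with `int(...)` and coerced to `bool`, so the unset/"0" default keeps the separate indexer stream enabled and `VLLM_DISABLE_INDEXER_STREAM=1` disables it. A hedged sketch of reading the flag (the helper name `use_separate_indexer_stream` is illustrative, not part of vLLM):

import os

def use_separate_indexer_stream() -> bool:
    # Illustrative helper only. Mirrors the envs.py parsing: unset or "0"
    # keeps the stream enabled; any non-zero integer disables it.
    disabled = bool(int(os.getenv("VLLM_DISABLE_INDEXER_STREAM", "0")))
    return not disabled

Note that `bool(int(...))` only accepts integer strings, so a value such as `VLLM_DISABLE_INDEXER_STREAM=true` would raise a ValueError rather than being treated as truthy.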