Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions vllm/v1/worker/gpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,10 @@ def determine_available_memory(self) -> int:
)

# Profile CUDA graph memory if graphs will be captured.
# Skip on ROCm/HIP as graph pool handles and mem_get_info behave
# differently and can produce incorrect/negative estimates.
cudagraph_memory_estimate = 0
- if not self.model_config.enforce_eager:
+ if not self.model_config.enforce_eager and not current_platform.is_rocm():
cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory()

# Use the pre-cudagraph torch peak to avoid double-counting.
Expand All @@ -406,6 +408,8 @@ def determine_available_memory(self) -> int:
+ profile_result.weights_memory
)

# On ROCm, cudagraph_memory_estimate is always 0 so this is a no-op.
# On CUDA, respect the opt-in flag as originally designed.
cudagraph_memory_estimate_applied = (
cudagraph_memory_estimate
if envs.VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS
Expand Down Expand Up @@ -517,7 +521,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:

def update_max_model_len(self, max_model_len: int) -> None:
"""Update max_model_len after auto-fit to GPU memory.

This is called when max_model_len=-1 is used and the engine
automatically determines the maximum context length that fits
in GPU memory. Workers need to update their cached max_model_len
Expand Down
Loading