From d389ca52861c1713dd60c752a97cb7d98395ee4e Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Wed, 18 Mar 2026 20:25:52 +0800 Subject: [PATCH 1/3] fix CUDAGraph memory being counted twice Signed-off-by: Peter Pan --- vllm/v1/worker/gpu_worker.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 58e28e694055..926e4c2db348 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -635,9 +635,16 @@ def compile_or_warm_up_model(self) -> float: # slightly underestimate the memory consumption. # So leave a small buffer (=150MiB) to avoid OOM. redundancy_buffer_memory = 150 * (1 << 20) + + cudagraph_memory_estimate_applied = ( + self.cudagraph_memory_estimate + if envs.VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS + else 0 + ) + non_kv_cache_memory = ( self.model_runner.model_memory_usage - + self.peak_activation_memory + + (self.peak_activation_memory - cudagraph_memory_estimate_applied) + self.non_torch_memory + cuda_graph_memory_bytes ) From b26bc5608db56efeaaa1412092f3162ca745c6a9 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Fri, 20 Mar 2026 10:38:02 +0800 Subject: [PATCH 2/3] Update vllm/v1/worker/gpu_worker.py Co-authored-by: Matthew Bonanni Signed-off-by: Peter Pan --- vllm/v1/worker/gpu_worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0704f3ff5710..463680078955 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -419,7 +419,7 @@ def determine_available_memory(self) -> int: self.non_torch_memory = profile_result.non_torch_increase self.peak_activation_memory = ( profile_result.torch_peak_increase + cudagraph_memory_estimate_applied - ) +self.peak_activation_memory = profile_result.torch_peak_increase self.cudagraph_memory_estimate = cudagraph_memory_estimate free_gpu_memory = profile_result.after_profile.free_memory From 
5554db41d71fc437425967b7304d472a19ad175c Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Fri, 20 Mar 2026 10:45:04 +0800 Subject: [PATCH 3/3] Fix according to Mat's idea Signed-off-by: Peter Pan --- vllm/v1/worker/gpu_worker.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 463680078955..39374db5b640 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -417,9 +417,7 @@ def determine_available_memory(self) -> int: ) self.non_torch_memory = profile_result.non_torch_increase - self.peak_activation_memory = ( - profile_result.torch_peak_increase + cudagraph_memory_estimate_applied -self.peak_activation_memory = profile_result.torch_peak_increase + self.peak_activation_memory = profile_result.torch_peak_increase self.cudagraph_memory_estimate = cudagraph_memory_estimate free_gpu_memory = profile_result.after_profile.free_memory @@ -639,15 +637,9 @@ def compile_or_warm_up_model(self) -> float: # So leave a small buffer (=150MiB) to avoid OOM. redundancy_buffer_memory = 150 * (1 << 20) - cudagraph_memory_estimate_applied = ( - self.cudagraph_memory_estimate - if envs.VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS - else 0 - ) - non_kv_cache_memory = ( self.model_runner.model_memory_usage - + (self.peak_activation_memory - cudagraph_memory_estimate_applied) + + self.peak_activation_memory + self.non_torch_memory + cuda_graph_memory_bytes )