diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 58e2d658c42b..6d117175b1ad 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -392,8 +392,10 @@ def determine_available_memory(self) -> int: ) # Profile CUDA graph memory if graphs will be captured. + # Skip on ROCm/HIP, where graph pool handles and mem_get_info behave + # differently than on CUDA and can yield incorrect/negative estimates. cudagraph_memory_estimate = 0 - if not self.model_config.enforce_eager: + if not self.model_config.enforce_eager and not current_platform.is_rocm(): cudagraph_memory_estimate = self.model_runner.profile_cudagraph_memory() # Use the pre-cudagraph torch peak to avoid double-counting. @@ -406,6 +408,8 @@ def determine_available_memory(self) -> int: + profile_result.weights_memory ) + # On ROCm, cudagraph_memory_estimate is always 0, so this is a no-op. + # On CUDA, the opt-in env flag below decides whether it is applied. cudagraph_memory_estimate_applied = ( cudagraph_memory_estimate if envs.VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS @@ -517,7 +521,6 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: def update_max_model_len(self, max_model_len: int) -> None: """Update max_model_len after auto-fit to GPU memory. - This is called when max_model_len=-1 is used and the engine automatically determines the maximum context length that fits in GPU memory. Workers need to update their cached max_model_len