diff --git a/verl/utils/debug/performance.py b/verl/utils/debug/performance.py
index 615475a66a5..869fdb498e7 100644
--- a/verl/utils/debug/performance.py
+++ b/verl/utils/debug/performance.py
@@ -19,8 +19,11 @@
 def log_gpu_memory_usage(head: str, logger: logging.Logger = None, level=logging.DEBUG, rank: int = 0):
     if (not dist.is_initialized()) or (rank is None) or (dist.get_rank() == rank):
-        memory_allocated = torch.cuda.memory_allocated() / 1024**3
-        memory_reserved = torch.cuda.memory_reserved() / 1024**3
+        # take care of vllm sleep mem, see https://github.com/volcengine/verl/pull/1101
+        free_mem, total_mem = torch.cuda.mem_get_info()
+        vllm_sleep_mem = torch.cuda.memory_reserved() - (total_mem - free_mem)
+        memory_allocated = (torch.cuda.memory_allocated() - vllm_sleep_mem) / 1024**3
+        memory_reserved = (torch.cuda.memory_reserved() - vllm_sleep_mem) / 1024**3
 
         message = f'{head}, memory allocated (GB): {memory_allocated}, memory reserved (GB): {memory_reserved}'