diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py index dd91400f2b8a..2cda557964d5 100644 --- a/vllm/utils/mem_utils.py +++ b/vllm/utils/mem_utils.py @@ -14,6 +14,10 @@ from .mem_constants import GiB_bytes +def format_gib(b: int) -> float: + return round(b / GiB_bytes, 2) + + @cache def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -146,6 +150,18 @@ def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot": auto_measure=False, ) + def __repr__(self) -> str: + return ( + f"torch_peak={format_gib(self.torch_peak)}GiB, " + f"free_memory={format_gib(self.free_memory)}GiB, " + f"total_memory={format_gib(self.total_memory)}GiB, " + f"cuda_memory={format_gib(self.cuda_memory)}GiB, " + f"torch_memory={format_gib(self.torch_memory)}GiB, " + f"non_torch_memory={format_gib(self.non_torch_memory)}GiB, " + f"timestamp={self.timestamp}, " + f"auto_measure={self.auto_measure}" + ) + @dataclass class MemoryProfilingResult: @@ -168,12 +184,12 @@ def __repr__(self) -> str: return ( f"Memory profiling takes {self.profile_time:.2f} seconds. " f"Total non KV cache memory: " - f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; " + f"{format_gib(self.non_kv_cache_memory)}GiB; " f"torch peak memory increase: " - f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; " + f"{format_gib(self.torch_peak_increase)}GiB; " f"non-torch forward increase memory: " - f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; " - f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB." + f"{format_gib(self.non_torch_increase)}GiB; " + f"weights memory: {format_gib(self.weights_memory)}GiB." 
) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index fd4ee596c30e..01fc00f3d11e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -40,8 +40,7 @@ from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask -from vllm.utils.mem_constants import GiB_bytes -from vllm.utils.mem_utils import MemorySnapshot, memory_profiling +from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling from vllm.utils.torch_utils import set_random_seed from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType @@ -132,9 +131,9 @@ def sleep(self, level: int = 1) -> None: used_bytes = total - free_bytes_after_sleep assert freed_bytes >= 0, "Memory usage increased after sleeping." logger.info( - "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.", - freed_bytes / GiB_bytes, - used_bytes / GiB_bytes, + "Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.", + format_gib(freed_bytes), + format_gib(used_bytes), ) def wake_up(self, tags: list[str] | None = None) -> None: @@ -239,6 +238,10 @@ def init_device(self): # take current memory snapshot self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device) self.requested_memory = request_memory(init_snapshot, self.cache_config) + logger.debug("worker init memory snapshot: %r", self.init_snapshot) + logger.debug( + "worker requested memory: %sGiB", format_gib(self.requested_memory) + ) else: raise RuntimeError(f"Not support device type: {self.device_config.device}") @@ -293,15 +296,14 @@ def determine_available_memory(self) -> int: You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameter. 
""" - GiB = lambda b: b / GiB_bytes if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes: # still need a profile run which compiles the model for # max_num_batched_tokens self.model_runner.profile_run() msg = ( - f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} " - f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for " + f"Initial free memory {format_gib(self.init_snapshot.free_memory)} " + f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for " "KV Cache as specified by kv_cache_memory_bytes config and " "skipped memory profiling. This does not respect the " "gpu_memory_utilization config. Only use kv_cache_memory_bytes " @@ -333,8 +335,8 @@ def determine_available_memory(self) -> int: # GPU did not change their memory usage during the profiling. assert self.init_snapshot.free_memory > free_gpu_memory, ( "Error in memory profiling. " - f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, " - f"current free memory {GiB(free_gpu_memory)} GiB. " + f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, " + f"current free memory {format_gib(free_gpu_memory)} GiB. " "This happens when other processes sharing the same container " "release GPU memory while vLLM is profiling during initialization. 
" "To fix this, ensure consistent GPU memory allocation or " @@ -346,21 +348,20 @@ unrequested_memory = self.init_snapshot.free_memory - self.requested_memory logger.debug( - "Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB", - GiB(self.init_snapshot.free_memory), + "Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB", + format_gib(self.init_snapshot.free_memory), self.cache_config.gpu_memory_utilization, - GiB(self.requested_memory), + format_gib(self.requested_memory), ) logger.debug( - "Free memory after profiling: %.2f GiB (total), " - "%.2f GiB (within requested)", - GiB(free_gpu_memory), - GiB(free_gpu_memory - unrequested_memory), + "Free memory after profiling: %.2f GiB (total), %.2f GiB (within requested)", + format_gib(free_gpu_memory), + format_gib(free_gpu_memory - unrequested_memory), ) logger.debug(profile_result) logger.info_once( - "Available KV cache memory: %.2f GiB", - GiB(self.available_kv_cache_memory_bytes), + "Available KV cache memory: %.2f GiB", + format_gib(self.available_kv_cache_memory_bytes), scope="local", ) gc.collect() @@ -467,7 +468,6 @@ def compile_or_warm_up_model(self) -> None: # CUDAGraph memory size and may not utilize all gpu memory. # Users may want fine-grained control to specify kv cache # memory size. - GiB = lambda b: round(b / GiB_bytes, 2) # empirically observed that the memory profiling may # slightly underestimate the memory consumption. @@ -492,24 +492,24 @@ msg = ( f"Free memory on device " - f"({GiB(self.init_snapshot.free_memory)}/" - f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. " + f"({format_gib(self.init_snapshot.free_memory)}/" + f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. " f"Desired GPU memory utilization is " f"({self.cache_config.gpu_memory_utilization}, " - f"{GiB(self.requested_memory)} GiB). 
" - f"Actual usage is {GiB(self.model_runner.model_memory_usage)} " - f"GiB for weight, {GiB(self.peak_activation_memory)} GiB " - f"for peak activation, {GiB(self.non_torch_memory)} GiB " - f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} " + f"{format_gib(self.requested_memory)} GiB). " + f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} " + f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB " + f"for peak activation, {format_gib(self.non_torch_memory)} GiB " + f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} " f"GiB for CUDAGraph memory. Replace gpu_memory_utilization " f"config with `--kv-cache-memory=" f"{kv_cache_memory_bytes_to_requested_limit}` " - f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit " + f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit " f"into requested memory, or `--kv-cache-memory=" f"{kv_cache_memory_bytes_to_gpu_limit}` " - f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully " + f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully " f"utilize gpu memory. Current kv cache memory in use is " - f"{GiB(self.available_kv_cache_memory_bytes)} GiB." + f"{format_gib(self.available_kv_cache_memory_bytes)} GiB." 
) logger.debug(msg) diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 31ccf7f15746..bfe90572e232 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math from collections import defaultdict from dataclasses import dataclass, field @@ -15,8 +16,7 @@ from vllm.multimodal.cache import processor_only_cache_from_config from vllm.multimodal.registry import MultiModalRegistry from vllm.platforms import current_platform -from vllm.utils.mem_constants import GiB_bytes -from vllm.utils.mem_utils import MemorySnapshot +from vllm.utils.mem_utils import MemorySnapshot, format_gib from vllm.v1.attention.backends.utils import AttentionMetadataBuilder from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec @@ -250,22 +250,23 @@ def gather_mm_placeholders( return placeholders[is_embed] -def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float: +def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int: """ Calculate the amount of memory required by vLLM, then validate that the current amount of free memory is sufficient for that. 
""" - requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization + requested_memory = math.ceil( + init_snapshot.total_memory * cache_config.gpu_memory_utilization + ) if init_snapshot.free_memory < requested_memory: - GiB = lambda b: round(b / GiB_bytes, 2) raise ValueError( f"Free memory on device {init_snapshot.device_} " - f"({GiB(init_snapshot.free_memory)}/" - f"{GiB(init_snapshot.total_memory)} GiB) on startup " + f"({format_gib(init_snapshot.free_memory)}/" + f"{format_gib(init_snapshot.total_memory)} GiB) on startup " f"is less than desired GPU memory utilization " f"({cache_config.gpu_memory_utilization}, " - f"{GiB(requested_memory)} GiB). Decrease GPU memory " + f"{format_gib(requested_memory)} GiB). Decrease GPU memory " f"utilization or reduce GPU memory used by other processes." )