Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions vllm/utils/mem_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
from .mem_constants import GiB_bytes


def format_gib(b: int) -> float:
    """Convert a raw byte count into GiB, rounded to two decimal places."""
    gib = b / GiB_bytes
    return round(gib, 2)


@cache
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
Expand Down Expand Up @@ -146,6 +150,18 @@ def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
auto_measure=False,
)

def __repr__(self) -> str:
    """Summarize the snapshot, rendering all byte-valued fields in GiB."""
    # Fields holding raw byte counts, in the order they are displayed.
    byte_fields = (
        "torch_peak",
        "free_memory",
        "total_memory",
        "cuda_memory",
        "torch_memory",
        "non_torch_memory",
    )
    parts = [f"{name}={format_gib(getattr(self, name))}GiB" for name in byte_fields]
    parts.append(f"timestamp={self.timestamp}")
    parts.append(f"auto_measure={self.auto_measure}")
    return ", ".join(parts)


@dataclass
class MemoryProfilingResult:
Expand All @@ -168,12 +184,12 @@ def __repr__(self) -> str:
return (
f"Memory profiling takes {self.profile_time:.2f} seconds. "
f"Total non KV cache memory: "
f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
f"{format_gib(self.non_kv_cache_memory)}GiB; "
f"torch peak memory increase: "
f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
f"{format_gib(self.torch_peak_increase)}GiB; "
f"non-torch forward increase memory: "
f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
f"{format_gib(self.non_torch_increase)}GiB; "
f"weights memory: {format_gib(self.weights_memory)}GiB."
)


Expand Down
60 changes: 30 additions & 30 deletions vllm/v1/worker/gpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
Expand Down Expand Up @@ -132,9 +131,9 @@ def sleep(self, level: int = 1) -> None:
used_bytes = total - free_bytes_after_sleep
assert freed_bytes >= 0, "Memory usage increased after sleeping."
logger.info(
"Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
freed_bytes / GiB_bytes,
used_bytes / GiB_bytes,
"Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
format_gib(freed_bytes),
format_gib(used_bytes),
)

def wake_up(self, tags: list[str] | None = None) -> None:
Expand Down Expand Up @@ -239,6 +238,10 @@ def init_device(self):
# take current memory snapshot
self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
self.requested_memory = request_memory(init_snapshot, self.cache_config)
logger.debug("worker init memory snapshot: %r", self.init_snapshot)
logger.debug(
"worker requested memory: %sGiB", format_gib(self.requested_memory)
)
else:
raise RuntimeError(f"Not support device type: {self.device_config.device}")

Expand Down Expand Up @@ -293,15 +296,14 @@ def determine_available_memory(self) -> int:
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
GiB = lambda b: b / GiB_bytes
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for
# max_num_batched_tokens
self.model_runner.profile_run()

msg = (
f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
Expand Down Expand Up @@ -333,8 +335,8 @@ def determine_available_memory(self) -> int:
# GPU did not change their memory usage during the profiling.
assert self.init_snapshot.free_memory > free_gpu_memory, (
"Error in memory profiling. "
f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
f"current free memory {GiB(free_gpu_memory)} GiB. "
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
f"current free memory {format_gib(free_gpu_memory)} GiB. "
"This happens when other processes sharing the same container "
"release GPU memory while vLLM is profiling during initialization. "
"To fix this, ensure consistent GPU memory allocation or "
Expand All @@ -346,21 +348,20 @@ def determine_available_memory(self) -> int:

unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
logger.debug(
"Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
GiB(self.init_snapshot.free_memory),
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
format_gib(self.init_snapshot.free_memory),
self.cache_config.gpu_memory_utilization,
GiB(self.requested_memory),
format_gib(self.requested_memory),
)
logger.debug(
"Free memory after profiling: %.2f GiB (total), "
"%.2f GiB (within requested)",
GiB(free_gpu_memory),
GiB(free_gpu_memory - unrequested_memory),
"Free memory after profiling: %f GiB (total), %f GiB (within requested)",
format_gib(free_gpu_memory),
format_gib(free_gpu_memory - unrequested_memory),
)
logger.debug(profile_result)
logger.info_once(
"Available KV cache memory: %.2f GiB",
GiB(self.available_kv_cache_memory_bytes),
"Available KV cache memory: %f GiB",
format_gib(self.available_kv_cache_memory_bytes),
scope="local",
)
gc.collect()
Expand Down Expand Up @@ -467,7 +468,6 @@ def compile_or_warm_up_model(self) -> None:
# CUDAGraph memory size and may not utilize all gpu memory.
# Users may want fine-grained control to specify kv cache
# memory size.
GiB = lambda b: round(b / GiB_bytes, 2)

# empirically observed that the memory profiling may
# slightly underestimate the memory consumption.
Expand All @@ -492,24 +492,24 @@ def compile_or_warm_up_model(self) -> None:

msg = (
f"Free memory on device "
f"({GiB(self.init_snapshot.free_memory)}/"
f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
f"({format_gib(self.init_snapshot.free_memory)}/"
f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
f"Desired GPU memory utilization is "
f"({self.cache_config.gpu_memory_utilization}, "
f"{GiB(self.requested_memory)} GiB). "
f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
f"for peak activation, {GiB(self.non_torch_memory)} GiB "
f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
f"{format_gib(self.requested_memory)} GiB). "
f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
f"config with `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_requested_limit}` "
f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"into requested memory, or `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_gpu_limit}` "
f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"utilize gpu memory. Current kv cache memory in use is "
f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
)

logger.debug(msg)
Expand Down
17 changes: 9 additions & 8 deletions vllm/v1/worker/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections import defaultdict
from dataclasses import dataclass, field

Expand All @@ -15,8 +16,7 @@
from vllm.multimodal.cache import processor_only_cache_from_config
from vllm.multimodal.registry import MultiModalRegistry
from vllm.platforms import current_platform
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot
from vllm.utils.mem_utils import MemorySnapshot, format_gib
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
Expand Down Expand Up @@ -250,22 +250,23 @@ def gather_mm_placeholders(
return placeholders[is_embed]


def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
"""
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
"""
requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
requested_memory = math.ceil(
init_snapshot.total_memory * cache_config.gpu_memory_utilization
)

if init_snapshot.free_memory < requested_memory:
GiB = lambda b: round(b / GiB_bytes, 2)
raise ValueError(
f"Free memory on device {init_snapshot.device_} "
f"({GiB(init_snapshot.free_memory)}/"
f"{GiB(init_snapshot.total_memory)} GiB) on startup "
f"({format_gib(init_snapshot.free_memory)}/"
f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
f"is less than desired GPU memory utilization "
f"({cache_config.gpu_memory_utilization}, "
f"{GiB(requested_memory)} GiB). Decrease GPU memory "
f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
f"utilization or reduce GPU memory used by other processes."
)

Expand Down