Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions vllm/utils/mem_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
from .mem_constants import GiB_bytes


def format_gib(b: int) -> float:
    """Convert a raw byte count into GiB, rounded to two decimal places."""
    gib = b / GiB_bytes
    return round(gib, 2)


@cache
def get_max_shared_memory_bytes(gpu: int = 0) -> int:
"""Returns the maximum shared memory per thread block in bytes."""
Expand Down Expand Up @@ -146,6 +150,18 @@ def __sub__(self, other: "MemorySnapshot") -> "MemorySnapshot":
auto_measure=False,
)

def __repr__(self) -> str:
    """Summarize the snapshot, rendering all byte-valued fields in GiB."""
    # Fields holding raw byte counts, in the order they are displayed.
    byte_fields = (
        "torch_peak",
        "free_memory",
        "total_memory",
        "cuda_memory",
        "torch_memory",
        "non_torch_memory",
    )
    parts = [f"{name}={format_gib(getattr(self, name))}GiB" for name in byte_fields]
    parts.append(f"timestamp={self.timestamp}")
    parts.append(f"auto_measure={self.auto_measure}")
    return ", ".join(parts)


@dataclass
class MemoryProfilingResult:
Expand All @@ -168,12 +184,12 @@ def __repr__(self) -> str:
return (
f"Memory profiling takes {self.profile_time:.2f} seconds. "
f"Total non KV cache memory: "
f"{(self.non_kv_cache_memory / GiB_bytes):.2f}GiB; "
f"{format_gib(self.non_kv_cache_memory)}GiB; "
f"torch peak memory increase: "
f"{(self.torch_peak_increase / GiB_bytes):.2f}GiB; "
f"{format_gib(self.torch_peak_increase)}GiB; "
f"non-torch forward increase memory: "
f"{(self.non_torch_increase / GiB_bytes):.2f}GiB; "
f"weights memory: {(self.weights_memory / GiB_bytes):.2f}GiB."
f"{format_gib(self.non_torch_increase)}GiB; "
f"weights memory: {format_gib(self.weights_memory)}GiB."
)


Expand Down
60 changes: 30 additions & 30 deletions vllm/v1/worker/gpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@
from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper
from vllm.sequence import IntermediateTensors
from vllm.tasks import SupportedTask
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
from vllm.utils.mem_utils import MemorySnapshot, format_gib, memory_profiling
from vllm.utils.torch_utils import set_random_seed
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType
Expand Down Expand Up @@ -132,9 +131,9 @@ def sleep(self, level: int = 1) -> None:
used_bytes = total - free_bytes_after_sleep
assert freed_bytes >= 0, "Memory usage increased after sleeping."
logger.info(
"Sleep mode freed %.2f GiB memory, %.2f GiB memory is still in use.",
freed_bytes / GiB_bytes,
used_bytes / GiB_bytes,
"Sleep mode freed %f GiB memory, %f GiB memory is still in use.",
format_gib(freed_bytes),
format_gib(used_bytes),
)

def wake_up(self, tags: list[str] | None = None) -> None:
Expand Down Expand Up @@ -239,6 +238,10 @@ def init_device(self):
# take current memory snapshot
self.init_snapshot = init_snapshot = MemorySnapshot(device=self.device)
self.requested_memory = request_memory(init_snapshot, self.cache_config)
logger.debug("worker init memory snapshot: %r", self.init_snapshot)
logger.debug(
"worker requested memory: %sGiB", format_gib(self.requested_memory)
)
else:
raise RuntimeError(f"Not support device type: {self.device_config.device}")

Expand Down Expand Up @@ -293,15 +296,14 @@ def determine_available_memory(self) -> int:
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
GiB = lambda b: b / GiB_bytes
if kv_cache_memory_bytes := self.cache_config.kv_cache_memory_bytes:
# still need a profile run which compiles the model for
# max_num_batched_tokens
self.model_runner.profile_run()

msg = (
f"Initial free memory {GiB(self.init_snapshot.free_memory):.2f} "
f"GiB, reserved {GiB(kv_cache_memory_bytes):.2f} GiB memory for "
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} "
f"GiB, reserved {format_gib(kv_cache_memory_bytes)} GiB memory for "
"KV Cache as specified by kv_cache_memory_bytes config and "
"skipped memory profiling. This does not respect the "
"gpu_memory_utilization config. Only use kv_cache_memory_bytes "
Expand Down Expand Up @@ -333,8 +335,8 @@ def determine_available_memory(self) -> int:
# GPU did not change their memory usage during the profiling.
assert self.init_snapshot.free_memory > free_gpu_memory, (
"Error in memory profiling. "
f"Initial free memory {GiB(self.init_snapshot.free_memory)} GiB, "
f"current free memory {GiB(free_gpu_memory)} GiB. "
f"Initial free memory {format_gib(self.init_snapshot.free_memory)} GiB, "
f"current free memory {format_gib(free_gpu_memory)} GiB. "
"This happens when other processes sharing the same container "
"release GPU memory while vLLM is profiling during initialization. "
"To fix this, ensure consistent GPU memory allocation or "
Expand All @@ -346,21 +348,20 @@ def determine_available_memory(self) -> int:

unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
logger.debug(
"Initial free memory: %.2f GiB; Requested memory: %.2f (util), %.2f GiB",
GiB(self.init_snapshot.free_memory),
"Initial free memory: %f GiB; Requested memory: %f (util), %f GiB",
format_gib(self.init_snapshot.free_memory),
self.cache_config.gpu_memory_utilization,
GiB(self.requested_memory),
format_gib(self.requested_memory),
)
logger.debug(
"Free memory after profiling: %.2f GiB (total), "
"%.2f GiB (within requested)",
GiB(free_gpu_memory),
GiB(free_gpu_memory - unrequested_memory),
"Free memory after profiling: %f GiB (total), %f GiB (within requested)",
format_gib(free_gpu_memory),
format_gib(free_gpu_memory - unrequested_memory),
)
logger.debug(profile_result)
logger.info_once(
"Available KV cache memory: %.2f GiB",
GiB(self.available_kv_cache_memory_bytes),
"Available KV cache memory: %f GiB",
format_gib(self.available_kv_cache_memory_bytes),
scope="local",
)
gc.collect()
Expand Down Expand Up @@ -467,7 +468,6 @@ def compile_or_warm_up_model(self) -> None:
# CUDAGraph memory size and may not utilize all gpu memory.
# Users may want fine-grained control to specify kv cache
# memory size.
GiB = lambda b: round(b / GiB_bytes, 2)

# empirically observed that the memory profiling may
# slightly underestimate the memory consumption.
Expand All @@ -492,24 +492,24 @@ def compile_or_warm_up_model(self) -> None:

msg = (
f"Free memory on device "
f"({GiB(self.init_snapshot.free_memory)}/"
f"{GiB(self.init_snapshot.total_memory)} GiB) on startup. "
f"({format_gib(self.init_snapshot.free_memory)}/"
f"{format_gib(self.init_snapshot.total_memory)} GiB) on startup. "
f"Desired GPU memory utilization is "
f"({self.cache_config.gpu_memory_utilization}, "
f"{GiB(self.requested_memory)} GiB). "
f"Actual usage is {GiB(self.model_runner.model_memory_usage)} "
f"GiB for weight, {GiB(self.peak_activation_memory)} GiB "
f"for peak activation, {GiB(self.non_torch_memory)} GiB "
f"for non-torch memory, and {GiB(cuda_graph_memory_bytes)} "
f"{format_gib(self.requested_memory)} GiB). "
f"Actual usage is {format_gib(self.model_runner.model_memory_usage)} "
f"GiB for weight, {format_gib(self.peak_activation_memory)} GiB "
f"for peak activation, {format_gib(self.non_torch_memory)} GiB "
f"for non-torch memory, and {format_gib(cuda_graph_memory_bytes)} "
f"GiB for CUDAGraph memory. Replace gpu_memory_utilization "
f"config with `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_requested_limit}` "
f"({GiB(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"({format_gib(kv_cache_memory_bytes_to_requested_limit)} GiB) to fit "
f"into requested memory, or `--kv-cache-memory="
f"{kv_cache_memory_bytes_to_gpu_limit}` "
f"({GiB(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"({format_gib(kv_cache_memory_bytes_to_gpu_limit)} GiB) to fully "
f"utilize gpu memory. Current kv cache memory in use is "
f"{GiB(self.available_kv_cache_memory_bytes)} GiB."
f"{format_gib(self.available_kv_cache_memory_bytes)} GiB."
)

logger.debug(msg)
Expand Down
17 changes: 9 additions & 8 deletions vllm/v1/worker/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import math
from collections import defaultdict
from dataclasses import dataclass, field

Expand All @@ -15,8 +16,7 @@
from vllm.multimodal.cache import processor_only_cache_from_config
from vllm.multimodal.registry import MultiModalRegistry
from vllm.platforms import current_platform
from vllm.utils.mem_constants import GiB_bytes
from vllm.utils.mem_utils import MemorySnapshot
from vllm.utils.mem_utils import MemorySnapshot, format_gib
from vllm.v1.attention.backends.utils import AttentionMetadataBuilder
from vllm.v1.core.encoder_cache_manager import compute_mm_encoder_budget
from vllm.v1.kv_cache_interface import KVCacheGroupSpec, KVCacheSpec
Expand Down Expand Up @@ -250,22 +250,23 @@ def gather_mm_placeholders(
return placeholders[is_embed]


def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> float:
def request_memory(init_snapshot: MemorySnapshot, cache_config: CacheConfig) -> int:
"""
Calculate the amount of memory required by vLLM, then validate
that the current amount of free memory is sufficient for that.
"""
requested_memory = init_snapshot.total_memory * cache_config.gpu_memory_utilization
requested_memory = math.ceil(
init_snapshot.total_memory * cache_config.gpu_memory_utilization
)

if init_snapshot.free_memory < requested_memory:
GiB = lambda b: round(b / GiB_bytes, 2)
raise ValueError(
f"Free memory on device {init_snapshot.device_} "
f"({GiB(init_snapshot.free_memory)}/"
f"{GiB(init_snapshot.total_memory)} GiB) on startup "
f"({format_gib(init_snapshot.free_memory)}/"
f"{format_gib(init_snapshot.total_memory)} GiB) on startup "
f"is less than desired GPU memory utilization "
f"({cache_config.gpu_memory_utilization}, "
f"{GiB(requested_memory)} GiB). Decrease GPU memory "
f"{format_gib(requested_memory)} GiB). Decrease GPU memory "
f"utilization or reduce GPU memory used by other processes."
)

Expand Down