4 changes: 2 additions & 2 deletions benchmarks/attention_benchmarks/runner.py
@@ -418,8 +418,8 @@ def _run_single_benchmark(
         mem_stats = {}
         if config.profile_memory:
             mem_stats = {
-                "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
-                "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
+                "allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
+                "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
             }

         return times, mem_stats

7 changes: 5 additions & 2 deletions benchmarks/benchmark_topk_topp.py
@@ -95,13 +95,16 @@ def create_logits(
 def measure_memory() -> tuple[int, int]:
     """Return (allocated, reserved) memory in bytes."""
     torch.accelerator.synchronize()
-    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
+    return (
+        torch.accelerator.memory_allocated(),
+        torch.accelerator.max_memory_allocated(),
+    )


 def reset_memory_stats():
     """Reset peak memory statistics."""
     reset_buffer_cache()
-    torch.cuda.reset_peak_memory_stats()
+    torch.accelerator.reset_peak_memory_stats()
     torch.accelerator.empty_cache()
     gc.collect()

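For context, a minimal usage sketch of these two helpers around a sampling workload. This is illustrative only and not taken from the benchmark script; `run_sampling` is a hypothetical placeholder for whatever top-k/top-p kernel the benchmark times.

    # Hedged sketch: `run_sampling` stands in for the measured workload.
    def run_sampling():
        ...

    reset_memory_stats()                    # clear peak stats and cached blocks
    run_sampling()
    allocated, peak = measure_memory()      # current and peak allocated bytes
    print(f"current={allocated / 1024**2:.1f} MiB, peak={peak / 1024**2:.1f} MiB")
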
2 changes: 1 addition & 1 deletion tests/test_regression.py
@@ -55,7 +55,7 @@ def test_gc():
     # The memory allocated for model and KV cache should be released.
     # The memory allocated for PyTorch and others should be less than 50MB.
     # Usually, it's around 10MB.
-    allocated = torch.cuda.memory_allocated()
+    allocated = torch.accelerator.memory_allocated()
     assert allocated < 50 * 1024 * 1024

2 changes: 1 addition & 1 deletion tests/utils_/test_mem_utils.py
@@ -29,7 +29,7 @@ def test_memory_profiling():
     def measure_current_non_torch():
         free, total = torch.cuda.mem_get_info()
         current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
+        current_torch = torch.accelerator.memory_reserved()
         current_non_torch = current_used - current_torch
         return current_non_torch

2 changes: 1 addition & 1 deletion tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|reset_peak_memory_stats|memory_stats|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

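As a quick illustration of what the expanded pattern now catches, the snippet below exercises the regex shown above. It is not part of the PR; it just demonstrates that the newly added memory APIs are flagged while `torch.accelerator.*` calls pass.

    import re

    pattern = re.compile(
        r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device"
        r"|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved"
        r"|reset_peak_memory_stats|memory_stats|set_device|device\()\b"
    )

    # Newly covered memory APIs are now rejected...
    assert pattern.search("peak = torch.cuda.max_memory_allocated()")
    assert pattern.search("torch.cuda.reset_peak_memory_stats()")
    # ...while the device-agnostic replacement is allowed.
    assert not pattern.search("x = torch.accelerator.memory_allocated()")
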
2 changes: 1 addition & 1 deletion vllm/model_executor/model_loader/base_loader.py
@@ -64,7 +64,7 @@ def load_model(
         # Log peak GPU memory after loading weights. This is needed
         # to have test coverage on peak memory for online quantization.
         if current_platform.is_cuda():
-            peak_memory = torch.cuda.max_memory_allocated()
+            peak_memory = torch.accelerator.max_memory_allocated()
Contributor (high):

The change to `torch.accelerator.max_memory_allocated()` is correct, but it is inside an `if current_platform.is_cuda():` block on line 66. Since `torch.accelerator` is designed to be device-agnostic (working on CUDA, ROCm, etc.), this condition is now too restrictive and will prevent peak-memory logging on other GPU platforms such as ROCm.

To ensure this logging works on all supported GPU-like devices, consider broadening the condition. For example:

    if current_platform.is_cuda_alike():

Collaborator (author):

I think we should use `if not current_platform.is_cpu()` here.

             logger.debug_once(
                 "Peak GPU memory after loading weights: %s GiB",
                 format_gib(peak_memory),

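One way the block could look if the author's suggested guard is adopted. This is a sketch only, not necessarily the change that was merged, and it assumes the same `current_platform`, `logger`, and `format_gib` helpers already used in base_loader.py.

    # Sketch: broaden the guard so peak-memory logging also runs on
    # non-CUDA accelerators (e.g. ROCm), while still skipping CPU-only runs.
    if not current_platform.is_cpu():
        peak_memory = torch.accelerator.max_memory_allocated()
        logger.debug_once(
            "Peak GPU memory after loading weights: %s GiB",
            format_gib(peak_memory),
        )
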
16 changes: 8 additions & 8 deletions vllm/utils/mem_utils.py
@@ -93,11 +93,11 @@ def measure(self) -> None:
         device = self.device_

         # we measure the torch peak memory usage via allocated_bytes,
-        # rather than `torch.cuda.memory_reserved()` .
-        # After `torch.cuda.reset_peak_memory_stats()`,
-        # `torch.cuda.memory_reserved()` will keep growing, and only shrink
+        # rather than `torch.accelerator.memory_reserved()` .
+        # After `torch.accelerator.reset_peak_memory_stats()`,
+        # `torch.accelerator.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.accelerator.empty_cache()` or OOM happens.
-        self.torch_peak = current_platform.memory_stats(device).get(
+        self.torch_peak = torch.accelerator.memory_stats(device).get(
             "allocated_bytes.all.peak", 0
         )

@@ -123,10 +123,10 @@ def measure(self) -> None:
         self.cuda_memory = self.total_memory - self.free_memory

-        # torch.cuda.memory_reserved() is how many bytes
+        # torch.accelerator.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
         # this is used to measure the non-torch memory usage
-        self.torch_memory = current_platform.memory_reserved(device)
+        self.torch_memory = torch.accelerator.memory_reserved(device)

         self.non_torch_memory = self.cuda_memory - self.torch_memory
         self.timestamp = time.time()
@@ -243,15 +243,15 @@ def memory_profiling(
     The memory used for loading weights (a.) is directly given from the
     argument `weights_memory`.

-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]`
+    The increase of `torch.accelerator.memory_stats()["allocated_bytes.all.peak"]`
     during profiling gives (b.).

     The increase of `non_torch_memory` from creating the current vLLM instance
     until after profiling to get (c.).
     """
     gc.collect()
     torch.accelerator.empty_cache()
-    current_platform.reset_peak_memory_stats(baseline_snapshot.device_)
+    torch.accelerator.reset_peak_memory_stats(baseline_snapshot.device_)

     result = MemoryProfilingResult(
         before_create=baseline_snapshot,

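To make the measurement pattern described in those comments concrete, here is a small standalone sketch. It assumes a PyTorch build that provides the `torch.accelerator` memory APIs this PR relies on; `workload` and `device` are placeholders, and the real logic lives in `MemorySnapshot.measure()` and `memory_profiling()` above.

    import gc

    import torch


    def torch_peak_allocated(device) -> int:
        # Peak bytes handed out by the caching allocator since the last reset.
        # memory_reserved() is not suitable here: it only shrinks on
        # empty_cache() or OOM, so it keeps growing after a reset.
        return torch.accelerator.memory_stats(device).get(
            "allocated_bytes.all.peak", 0
        )


    def profile_torch_peak(workload, device) -> int:
        gc.collect()
        torch.accelerator.empty_cache()
        torch.accelerator.reset_peak_memory_stats(device)
        workload()
        torch.accelerator.synchronize()
        return torch_peak_allocated(device)
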
2 changes: 1 addition & 1 deletion vllm/v1/worker/gpu_worker.py
@@ -387,7 +387,7 @@ def determine_available_memory(self) -> int:
         ) as profile_result:
             self.model_runner.profile_run()

-        profile_torch_peak = current_platform.memory_stats(self.device).get(
+        profile_torch_peak = torch.accelerator.memory_stats(self.device).get(
             "allocated_bytes.all.peak", 0
         )

(hmellor marked a conversation on this line as resolved.)