From a601b479032947c700635793a47fa42329969a54 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 14 Mar 2026 10:23:17 +0800
Subject: [PATCH 1/6] replace torch.cuda.memory_reserved

Signed-off-by: Kunshang Ji
---
 benchmarks/attention_benchmarks/runner.py | 2 +-
 tests/utils_/test_mem_utils.py            | 2 +-
 tools/pre_commit/check_torch_cuda.py      | 2 +-
 vllm/utils/mem_utils.py                   | 8 ++++----
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py
index 52286186d61d..d906545762a5 100644
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -419,7 +419,7 @@ def _run_single_benchmark(
     if config.profile_memory:
         mem_stats = {
             "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
-            "reserved_mb": torch.cuda.memory_reserved(device) / 1024**2,
+            "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
         }

     return times, mem_stats
diff --git a/tests/utils_/test_mem_utils.py b/tests/utils_/test_mem_utils.py
index 4b1058be412d..4067b0257811 100644
--- a/tests/utils_/test_mem_utils.py
+++ b/tests/utils_/test_mem_utils.py
@@ -29,7 +29,7 @@ def test_memory_profiling():
     def measure_current_non_torch():
         free, total = torch.cuda.mem_get_info()
         current_used = total - free
-        current_torch = torch.cuda.memory_reserved()
+        current_torch = torch.accelerator.memory_reserved()
         current_non_torch = current_used - current_torch
         return current_non_torch

diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index 4099c315e6eb..756b8a5f41c9 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index 30e38b0bf4e3..50838f334bdf 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -93,9 +93,9 @@ def measure(self) -> None:
         device = self.device_

         # we measure the torch peak memory usage via allocated_bytes,
-        # rather than `torch.cuda.memory_reserved()` .
+        # rather than `torch.accelerator.memory_reserved()`.
         # After `torch.cuda.reset_peak_memory_stats()`,
-        # `torch.cuda.memory_reserved()` will keep growing, and only shrink
+        # `torch.accelerator.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.accelerator.empty_cache()` or OOM happens.
         self.torch_peak = current_platform.memory_stats(device).get(
             "allocated_bytes.all.peak", 0
@@ -123,10 +123,10 @@ def measure(self) -> None:

         self.cuda_memory = self.total_memory - self.free_memory

-        # torch.cuda.memory_reserved() is how many bytes
+        # torch.accelerator.memory_reserved() is how many bytes
         # PyTorch gets from cuda (by calling cudaMalloc, etc.)
         # this is used to measure the non-torch memory usage
-        self.torch_memory = current_platform.memory_reserved(device)
+        self.torch_memory = torch.accelerator.memory_reserved(device)
         self.non_torch_memory = self.cuda_memory - self.torch_memory

         self.timestamp = time.time()

From eb45d487a08b089892b9d83249f26884f923fff4 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 14 Mar 2026 10:34:19 +0800
Subject: [PATCH 2/6] replace torch.cuda.memory_allocated

Signed-off-by: Kunshang Ji
---
 benchmarks/attention_benchmarks/runner.py | 2 +-
 benchmarks/benchmark_topk_topp.py         | 2 +-
 tests/test_regression.py                  | 2 +-
 tools/pre_commit/check_torch_cuda.py      | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/attention_benchmarks/runner.py b/benchmarks/attention_benchmarks/runner.py
index d906545762a5..6af56e0e94f5 100644
--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -418,7 +418,7 @@ def _run_single_benchmark(
     mem_stats = {}
     if config.profile_memory:
         mem_stats = {
-            "allocated_mb": torch.cuda.memory_allocated(device) / 1024**2,
+            "allocated_mb": torch.accelerator.memory_allocated(device) / 1024**2,
             "reserved_mb": torch.accelerator.memory_reserved(device) / 1024**2,
         }

diff --git a/benchmarks/benchmark_topk_topp.py b/benchmarks/benchmark_topk_topp.py
index f1d59cbde834..213901cb6860 100644
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -95,7 +95,7 @@ def create_logits(


 def measure_memory() -> tuple[int, int]:
     """Return (allocated, reserved) memory in bytes."""
     torch.accelerator.synchronize()
-    return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
+    return torch.accelerator.memory_allocated(), torch.cuda.max_memory_allocated()

diff --git a/tests/test_regression.py b/tests/test_regression.py
index ac82206f7160..978e0783919d 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -55,7 +55,7 @@ def test_gc():

     # The memory allocated for model and KV cache should be released.
     # The memory allocated for PyTorch and others should be less than 50MB.
     # Usually, it's around 10MB.
-    allocated = torch.cuda.memory_allocated()
+    allocated = torch.accelerator.memory_allocated()
     assert allocated < 50 * 1024 * 1024

diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index 756b8a5f41c9..4deef85c89b8 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

From 4f4e9488f26335e56c7c88ef5fed807c3455a044 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 14 Mar 2026 10:36:17 +0800
Subject: [PATCH 3/6] replace torch.cuda.max_memory_allocated

Signed-off-by: Kunshang Ji
---
 benchmarks/benchmark_topk_topp.py               | 5 ++++-
 tools/pre_commit/check_torch_cuda.py            | 2 +-
 vllm/model_executor/model_loader/base_loader.py | 2 +-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_topk_topp.py b/benchmarks/benchmark_topk_topp.py
index 213901cb6860..0c5ef4c956ec 100644
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -95,7 +95,10 @@ def create_logits(


 def measure_memory() -> tuple[int, int]:
     """Return (allocated, reserved) memory in bytes."""
     torch.accelerator.synchronize()
-    return torch.accelerator.memory_allocated(), torch.cuda.max_memory_allocated()
+    return (
+        torch.accelerator.memory_allocated(),
+        torch.accelerator.max_memory_allocated(),
+    )

diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index 4deef85c89b8..8dca542a09fa 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index 77fbb41f0371..e3b965db8aaf 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -64,7 +64,7 @@ def load_model(
         # Log peak GPU memory after loading weights. This is needed
        # to have test coverage on peak memory for online quantization.
         if current_platform.is_cuda():
-            peak_memory = torch.cuda.max_memory_allocated()
+            peak_memory = torch.accelerator.max_memory_allocated()
             logger.debug_once(
                 "Peak GPU memory after loading weights: %s GiB",
                 format_gib(peak_memory),

From a5823446e1452886d1fbdf03dab9278d4f2a4b36 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 14 Mar 2026 10:36:52 +0800
Subject: [PATCH 4/6] replace torch.cuda.max_memory_reserved

Signed-off-by: Kunshang Ji
---
 tools/pre_commit/check_torch_cuda.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index 8dca542a09fa..af140fbc61d1 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

From 949618f955bef22808d03afce7d51697ba006377 Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 14 Mar 2026 10:37:54 +0800
Subject: [PATCH 5/6] replace torch.cuda.reset_peak_memory_stats

Signed-off-by: Kunshang Ji
---
 benchmarks/benchmark_topk_topp.py    | 2 +-
 tools/pre_commit/check_torch_cuda.py | 2 +-
 vllm/utils/mem_utils.py              | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmarks/benchmark_topk_topp.py b/benchmarks/benchmark_topk_topp.py
index 0c5ef4c956ec..f727f16ea29c 100644
--- a/benchmarks/benchmark_topk_topp.py
+++ b/benchmarks/benchmark_topk_topp.py
@@ -104,7 +104,7 @@ def measure_memory() -> tuple[int, int]:
 def reset_memory_stats():
     """Reset peak memory statistics."""
     reset_buffer_cache()
-    torch.cuda.reset_peak_memory_stats()
+    torch.accelerator.reset_peak_memory_stats()
     torch.accelerator.empty_cache()
     gc.collect()

diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index af140fbc61d1..045c87456652 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|reset_peak_memory_stats|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index 50838f334bdf..27312ce40bd1 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -94,7 +94,7 @@ def measure(self) -> None:

         # we measure the torch peak memory usage via allocated_bytes,
         # rather than `torch.accelerator.memory_reserved()`.
-        # After `torch.cuda.reset_peak_memory_stats()`,
+        # After `torch.accelerator.reset_peak_memory_stats()`,
         # `torch.accelerator.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.accelerator.empty_cache()` or OOM happens.
         self.torch_peak = current_platform.memory_stats(device).get(
@@ -251,7 +251,7 @@ def memory_profiling(
     """
     gc.collect()
     torch.accelerator.empty_cache()
-    current_platform.reset_peak_memory_stats(baseline_snapshot.device_)
+    torch.accelerator.reset_peak_memory_stats(baseline_snapshot.device_)

     result = MemoryProfilingResult(
         before_create=baseline_snapshot,

From eea94a35fa88410b1528b4b2347050054c03f0ad Mon Sep 17 00:00:00 2001
From: Kunshang Ji
Date: Sat, 14 Mar 2026 10:40:26 +0800
Subject: [PATCH 6/6] replace torch.cuda.memory_stats

Signed-off-by: Kunshang Ji
---
 tools/pre_commit/check_torch_cuda.py | 2 +-
 vllm/utils/mem_utils.py              | 4 ++--
 vllm/v1/worker/gpu_worker.py         | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/pre_commit/check_torch_cuda.py b/tools/pre_commit/check_torch_cuda.py
index 045c87456652..ea84618a0882 100644
--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -8,7 +8,7 @@
 # Regex: match `torch.cuda.xxx` but allow `torch.accelerator.xxx`
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
-    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|reset_peak_memory_stats|set_device|device\()\b",
+    r"\btorch\.cuda\.(empty_cache|synchronize|device_count|current_device|memory_reserved|memory_allocated|max_memory_allocated|max_memory_reserved|reset_peak_memory_stats|memory_stats|set_device|device\()\b",
     r"\bwith\storch\.cuda\.device\b",
 ]

diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index 27312ce40bd1..e6a60a0c1377 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -97,7 +97,7 @@ def measure(self) -> None:
         # After `torch.accelerator.reset_peak_memory_stats()`,
         # `torch.accelerator.memory_reserved()` will keep growing, and only shrink
         # when we call `torch.accelerator.empty_cache()` or OOM happens.
-        self.torch_peak = current_platform.memory_stats(device).get(
+        self.torch_peak = torch.accelerator.memory_stats(device).get(
             "allocated_bytes.all.peak", 0
         )

@@ -243,7 +243,7 @@ def memory_profiling(
     The memory used for loading weights (a.) is directly given from the argument
     `weights_memory`.

-    The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]`
+    The increase of `torch.accelerator.memory_stats()["allocated_bytes.all.peak"]`
     during profiling gives (b.).

     The increase of `non_torch_memory` from creating the current vLLM instance
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 58e28e694055..58e2d658c42b 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -387,7 +387,7 @@ def determine_available_memory(self) -> int:
         ) as profile_result:
             self.model_runner.profile_run()

-        profile_torch_peak = current_platform.memory_stats(self.device).get(
+        profile_torch_peak = torch.accelerator.memory_stats(self.device).get(
             "allocated_bytes.all.peak", 0
         )
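
The net effect of the series is the device-agnostic accounting pattern below.
This is a minimal standalone sketch, assuming a PyTorch build where
torch.accelerator exposes these memory APIs (as the patches require);
current_accelerator() is not used by the series itself, and mem_get_info()
has no torch.accelerator counterpart here, so it stays on torch.cuda exactly
as in tests/utils_/test_mem_utils.py:

import torch

dev = torch.accelerator.current_accelerator()


def non_torch_memory_bytes() -> int:
    # mem_get_info() reports (free, total) for the current device; it is the
    # one call the series leaves on torch.cuda.
    free, total = torch.cuda.mem_get_info()
    used = total - free
    # memory_reserved() is what the caching allocator holds, so the remainder
    # belongs to non-torch components (NCCL, cuDNN workspaces, ...).
    return used - torch.accelerator.memory_reserved(dev)


def torch_peak_bytes() -> int:
    # The "allocated_bytes.all.peak" counter read by vllm/utils/mem_utils.py:
    # unlike memory_reserved(), it is reset by reset_peak_memory_stats()
    # rather than only shrinking on empty_cache() or OOM.
    return torch.accelerator.memory_stats(dev).get("allocated_bytes.all.peak", 0)


torch.accelerator.reset_peak_memory_stats()
x = torch.empty(1024, 1024, device=dev)  # ~4 MiB fp32 allocation
torch.accelerator.synchronize()
print(f"torch peak: {torch_peak_bytes()} B, non-torch: {non_torch_memory_bytes()} B")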