Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions vllm/model_executor/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -760,6 +760,19 @@ def safetensors_weights_iterator(
param = f.get_tensor(name)
yield name, param

# Advise the kernel to drop the page-cache pages of the safetensors file
# that was just loaded (best effort; POSIX_FADV_DONTNEED is only a hint).
# Otherwise the cached file contents stay resident in system RAM, which on
# UMA systems lowers the reported free memory and, with it, the KV budget.
if hasattr(os, "posix_fadvise"):
try:
fd = os.open(st_file, os.O_RDONLY)
try:
os.posix_fadvise(fd, 0, 0, os.POSIX_FADV_DONTNEED)
finally:
os.close(fd)
except OSError:
pass
Comment thread
EmilHaase marked this conversation as resolved.


def multi_thread_safetensors_weights_iterator(
hf_weights_files: list[str],
Expand Down
12 changes: 12 additions & 0 deletions vllm/utils/mem_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,18 @@ def memory_profiling(
diff_from_create = result.after_profile - result.before_create
result.torch_peak_increase = diff_profile.torch_peak
result.non_torch_increase = diff_from_create.non_torch_memory

device = baseline_snapshot.device_
shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1)) # Orin, Thor, Spark
if (
current_platform.is_cuda()
and current_platform.get_device_capability(device.index)
in shared_sysmem_device_mem_sms
):
# On UMA systems (device memory shared with system memory), force the
# non-torch increase to 0 so that concurrent engine memory allocations
# are not counted as this profile's local overhead.
result.non_torch_increase = 0

result.profile_time = diff_profile.timestamp

non_torch_memory = result.non_torch_increase
Expand Down