diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index 24b2f61b8671..a33fbce9d1ce 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -760,6 +760,19 @@ def safetensors_weights_iterator(
                 param = f.get_tensor(name)
                 yield name, param
 
+        # Free the Linux page cache for the safetensors file just loaded.
+        # This prevents the OS from keeping the file in system RAM, which
+        # artificially deflates the UMA free memory metric and KV budget.
+        if hasattr(os, "posix_fadvise"):
+            try:
+                fd = os.open(st_file, os.O_RDONLY)
+                try:
+                    os.posix_fadvise(fd, 0, 0, os.POSIX_FADV_DONTNEED)
+                finally:
+                    os.close(fd)
+            except OSError:
+                pass
+
 
 def multi_thread_safetensors_weights_iterator(
     hf_weights_files: list[str],
diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
index 0b3971126fad..97b092716f37 100644
--- a/vllm/utils/mem_utils.py
+++ b/vllm/utils/mem_utils.py
@@ -272,6 +272,18 @@ def memory_profiling(
     diff_from_create = result.after_profile - result.before_create
     result.torch_peak_increase = diff_profile.torch_peak
     result.non_torch_increase = diff_from_create.non_torch_memory
+
+    device = baseline_snapshot.device_
+    shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
+    if (
+        current_platform.is_cuda()
+        and current_platform.get_device_capability(device.index)
+        in shared_sysmem_device_mem_sms
+    ):
+        # Force non-torch increase to 0 on UMA systems so concurrent engine
+        # memory allocations do not register as local overhead.
+        result.non_torch_increase = 0
+
     result.profile_time = diff_profile.timestamp
 
     non_torch_memory = result.non_torch_increase