vllm-project · EmilHaase · Mar 3, 2026
@@ -760,6 +760,19 @@ def safetensors_weights_iterator(
                     param = f.get_tensor(name)
                     yield name, param
 
+        # Ask the kernel to drop page-cache pages for the safetensors file just
+        # loaded (advisory; best effort). Otherwise the cached file occupies system
+        # RAM, which on UMA devices lowers reported free memory and the KV budget.
+        if hasattr(os, "posix_fadvise"):
+            try:
+                fd = os.open(st_file, os.O_RDONLY)
+                try:
+                    os.posix_fadvise(fd, 0, 0, os.POSIX_FADV_DONTNEED)
+                finally:
+                    os.close(fd)
+            except OSError:
+                pass
+
 
 def multi_thread_safetensors_weights_iterator(
     hf_weights_files: list[str],

diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py
@@ -272,6 +272,18 @@ def memory_profiling(
     diff_from_create = result.after_profile - result.before_create
     result.torch_peak_increase = diff_profile.torch_peak
     result.non_torch_increase = diff_from_create.non_torch_memory
+
+    device = baseline_snapshot.device_
+    shared_sysmem_device_mem_sms = ((8, 7), (11, 0), (12, 1))  # Orin, Thor, Spark
+    if (
+        current_platform.is_cuda()
+        and current_platform.get_device_capability(device.index)
+        in shared_sysmem_device_mem_sms
+    ):
+        # Force non-torch increase to 0 on UMA systems so concurrent engine
+        # memory allocations do not register as local overhead.
+        result.non_torch_increase = 0
+
     result.profile_time = diff_profile.timestamp
 
     non_torch_memory = result.non_torch_increase