13 changes: 8 additions & 5 deletions vllm/v1/worker/gpu_worker.py
@@ -65,6 +65,11 @@

logger = init_logger(__name__)

+# It has been empirically observed that memory profiling may
+# slightly underestimate actual memory consumption, so leave
+# a small buffer (150 MiB) to avoid OOM.
+REDUNDANCY_BUFFER_MEMORY_BYTES = 150 * (1 << 20)
+
if TYPE_CHECKING:
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.v1.worker.gpu_model_runner import GPUModelRunner
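
Arithmetic check: 1 << 20 is 2**20 = 1,048,576 bytes (1 MiB), so the new constant works out to 157,286,400 bytes. A minimal sketch verifying this, not code from the PR:

    # 1 MiB = 1 << 20 = 1,048,576 bytes
    REDUNDANCY_BUFFER_MEMORY_BYTES = 150 * (1 << 20)
    assert REDUNDANCY_BUFFER_MEMORY_BYTES == 157_286_400  # 150 MiB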
@@ -418,6 +423,7 @@ def determine_available_memory(self) -> int:
            self.requested_memory
            - profile_result.non_kv_cache_memory
            - cudagraph_memory_estimate_applied
+           - REDUNDANCY_BUFFER_MEMORY_BYTES
        )

        unrequested_memory = self.init_snapshot.free_memory - self.requested_memory
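
The net effect of the added term is that the buffer is subtracted once from the memory reported as available for the KV cache. Below is a minimal standalone sketch of that computation; the free function and its parameter names are assumptions standing in for the method and attributes shown in the hunk above:

    REDUNDANCY_BUFFER_MEMORY_BYTES = 150 * (1 << 20)  # 150 MiB safety margin

    def available_kv_cache_memory(requested_memory: int,
                                  non_kv_cache_memory: int,
                                  cudagraph_memory_estimate: int) -> int:
        # Memory left for the KV cache: the user-requested budget,
        # minus non-KV-cache usage and the CUDA-graph estimate,
        # minus the empirical safety buffer added by this change.
        return (requested_memory
                - non_kv_cache_memory
                - cudagraph_memory_estimate
                - REDUNDANCY_BUFFER_MEMORY_BYTES)

    # E.g. a 20 GiB budget with 6 GiB of non-KV-cache memory and a
    # 1 GiB CUDA-graph estimate leaves ~12.85 GiB for the KV cache.
    print(available_kv_cache_memory(20 << 30, 6 << 30, 1 << 30))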
@@ -614,11 +620,8 @@ def compile_or_warm_up_model(self) -> float:
        # Users may want fine-grained control to specify kv cache
        # memory size.

-       # empirically observed that the memory profiling may
-       # slightly underestimate the memory consumption.
-       # So leave a small buffer (=150MiB) to avoid OOM.
-       redundancy_buffer_memory = 150 * (1 << 20)
-
+       redundancy_buffer_memory = REDUNDANCY_BUFFER_MEMORY_BYTES
+
        non_kv_cache_memory = (
            self.model_runner.model_memory_usage
            + self.peak_activation_memory
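
This half of the change simply replaces the duplicated literal with the shared module-level constant, so both call sites stay in sync if the buffer size is ever tuned. Since the tail of this hunk is truncated, only the first two components of non_kv_cache_memory are visible; the sketch below shows the shape of that sum, with other_overheads standing in, as an assumption, for whatever terms the truncated view elides:

    def estimate_non_kv_cache_memory(model_memory_usage: int,
                                     peak_activation_memory: int,
                                     other_overheads: int = 0) -> int:
        # Model weights plus the profiled activation peak, as in the
        # visible context; remaining terms are not shown in the diff.
        return model_memory_usage + peak_activation_memory + other_overheads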