vllm-project · zhuohan123 · Jun 26, 2023 · Jun 26, 2023
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
@@ -128,7 +128,7 @@ def _init_cache(self) -> None:
         logger.info(f'# GPU blocks: {num_gpu_blocks}, '
                     f'# CPU blocks: {num_cpu_blocks}')
 
-        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+        if num_gpu_blocks <= 0:
             raise ValueError("No available memory for the cache blocks. "
                              "Try increasing `gpu_memory_utilization` when "
                              "initializing the engine.")

diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
@@ -113,6 +113,8 @@ def profile_num_available_blocks(
         num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
                               - peak_memory) // cache_block_size)
         num_cpu_blocks = int(cpu_swap_space // cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
         torch.cuda.empty_cache()
 
         # Reset the seed to ensure that the random state is not affected by