diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index 61fe44d251b3..91ccdb578359 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -138,6 +138,9 @@ def _init_device_properties(self) -> None:
     def _sync_device(self) -> None:
         pass
 
+    def _empty_cache(self) -> None:
+        pass
+
     def _zero_block_ids(self, block_ids: list[int]) -> None:
         # CPU attention assigns -INF to logits at invalid positions,
         # so stale KV cache data never affects computation.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index a0ba47f945a7..ed8c26414c29 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1051,6 +1051,9 @@ def _init_device_properties(self) -> None:
     def _sync_device(self) -> None:
         torch.accelerator.synchronize()
 
+    def _empty_cache(self) -> None:
+        torch.accelerator.empty_cache()
+
     def _get_or_create_async_output_copy_stream(self) -> torch.cuda.Stream:
         stream = self.async_output_copy_stream
         if stream is None:
@@ -5903,7 +5906,6 @@ def shutdown(self) -> None:
         from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
         from vllm.v1.worker.workspace import reset_workspace_manager
 
-        # Calls torch.accelerator.synchronize()
         self._cleanup_profiling_kv_cache()
         self.compilation_config.static_forward_context.clear()
         self.model = None  # type: ignore[assignment]
@@ -5912,7 +5914,7 @@ def shutdown(self) -> None:
         reset_workspace_manager()
 
     def _cleanup_profiling_kv_cache(self) -> None:
-        torch.accelerator.synchronize()
+        self._sync_device()
         if hasattr(self, "kv_caches") and self.kv_caches:
             for i in range(len(self.kv_caches)):
                 self.kv_caches[i] = None  # type: ignore
@@ -5941,7 +5943,7 @@ def _cleanup_profiling_kv_cache(self) -> None:
                     layer.impl._v_scale_cache = None
 
         gc.collect()
-        torch.accelerator.empty_cache()
+        self._empty_cache()
         logger.debug("Cleaned up profiling KV cache and CUDA graphs")
 
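
Taken together, the hunks replace the hard-coded torch.accelerator.* calls in _cleanup_profiling_kv_cache with overridable hooks, so backends without a device cache can neutralize them. Below is a minimal sketch of the resulting pattern, assuming the CPU runner subclasses the GPU runner (which the matching _sync_device/_empty_cache overrides above suggest); the flat, standalone class layout is illustrative, not vLLM's actual hierarchy, and the elided cleanup steps are marked as such.

# Minimal sketch of the device-hook pattern introduced by the diff.
# Method names and bodies come from the hunks above; the class layout
# here is illustrative, not vLLM's real module structure.
import gc

import torch


class GPUModelRunner:
    def _sync_device(self) -> None:
        # Requires a torch accelerator backend to be available.
        torch.accelerator.synchronize()

    def _empty_cache(self) -> None:
        torch.accelerator.empty_cache()

    def _cleanup_profiling_kv_cache(self) -> None:
        # Device-specific work is routed through the hooks, so the
        # cleanup logic itself stays backend-agnostic.
        self._sync_device()
        # ... drop KV-cache tensors and per-layer scale caches ...
        gc.collect()
        self._empty_cache()


class CPUModelRunner(GPUModelRunner):
    # No device stream to synchronize and no device cache to release.
    def _sync_device(self) -> None:
        pass

    def _empty_cache(self) -> None:
        pass

With this in place, shutdown() can call _cleanup_profiling_kv_cache() on either runner without branching on the device type.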