diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 28b3d364a7e2..df6889a25af2 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -2696,6 +2696,10 @@ def _patch_prev_output(self):
         delayed_output = self.cached_step_outputs.pop(0).cpu().squeeze(
             -1).tolist()
         ctx = model_input.async_callback.keywords["ctx"]  # type: ignore
+        # There may be no output to patch with; this is usually the case when
+        # we're starting a new request after all previous requests completed.
+        if len(ctx.output_queue) == 0:
+            return
         assert len(
             ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
         output_data = ctx.output_queue[0]
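
For readers outside the vLLM codebase, here is a minimal standalone sketch of the guard this hunk introduces. `OutputContext` and `patch_prev_output` are hypothetical stand-ins, not the actual vLLM classes; the point is only that an empty output queue means there is nothing to patch, so the function returns early instead of tripping the single-entry assertion.

from collections import deque


class OutputContext:
    """Hypothetical stand-in for the async-callback ctx referenced in the diff."""

    def __init__(self):
        self.output_queue = deque()


def patch_prev_output(ctx, delayed_output):
    # Nothing queued to patch, e.g. a fresh request arriving after all
    # previous requests have completed: return early instead of asserting.
    if len(ctx.output_queue) == 0:
        return
    assert len(ctx.output_queue) == 1, 'There should be exactly 1 output waiting!'
    output_data = ctx.output_queue[0]
    print(f"patching {output_data} with delayed tokens {delayed_output}")


if __name__ == "__main__":
    ctx = OutputContext()
    patch_prev_output(ctx, delayed_output=[7])      # empty queue: no-op
    ctx.output_queue.append({"token_ids": [0]})
    patch_prev_output(ctx, delayed_output=[7])      # one entry: gets patched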