From 07183a4c6938f7b9f6865b9f0dba53a1163d3ca9 Mon Sep 17 00:00:00 2001
From: Michal Adamczyk
Date: Tue, 8 Apr 2025 10:59:11 +0300
Subject: [PATCH 1/2] Fix async callback ordering

---
 vllm/worker/hpu_model_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 7f4b3c25b75d..77344249468c 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -2704,6 +2704,8 @@ def try_revert_dummy_output_tokens():
 
                 if use_delayed_sampling:
                     fake_output = self._delayed_sampler_outputs(model_input)
+                elif model_input.async_callback is not None:
+                    model_input.async_callback()
 
                 with self.profiler.record_event(
                         'internal', ('sample_'
@@ -2725,7 +2727,7 @@ def try_revert_dummy_output_tokens():
                     self.cached_step_outputs.append(output)
                     self.cached_step_inputs.append(model_input)
                 htorch.core.mark_step()
-                if model_input.async_callback is not None:
+                if use_delayed_sampling and model_input.async_callback is not None:
                     model_input.async_callback()
                 if i < num_steps - 1:
                     if i == 0:

From f248cd87e706d41729d101bea9ea9e2f3b84cb8b Mon Sep 17 00:00:00 2001
From: Michal Adamczyk
Date: Tue, 8 Apr 2025 12:36:28 +0300
Subject: [PATCH 2/2] Ruff

---
 vllm/worker/hpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 77344249468c..bd67dedf2b2d 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -2727,7 +2727,8 @@ def try_revert_dummy_output_tokens():
                     self.cached_step_outputs.append(output)
                     self.cached_step_inputs.append(model_input)
                 htorch.core.mark_step()
-                if use_delayed_sampling and model_input.async_callback is not None:
+                if use_delayed_sampling \
+                    and model_input.async_callback is not None:
                     model_input.async_callback()
                 if i < num_steps - 1:
                     if i == 0:
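
For context, here is a minimal, self-contained sketch of the ordering these two patches establish; it is not the actual HPUModelRunner.execute_model code. Without delayed sampling, the async callback now fires before sampling; with delayed sampling, it still fires only after the step's outputs have been cached. Only `use_delayed_sampling` and the async callback correspond to names in the diff; `run_step`, `sample`, `cb`, and `fake_sample` are hypothetical helpers invented for illustration, and the real sampling, profiling, and caching logic is collapsed into placeholders.

```python
from typing import Callable, Optional


def run_step(use_delayed_sampling: bool,
             async_callback: Optional[Callable[[], None]],
             sample: Callable[[], str]) -> str:
    """Toy model of one execution step's callback/sampling ordering."""
    if use_delayed_sampling:
        # Delayed sampling path: a placeholder output is produced now and the
        # async callback is deferred until after the step's outputs are cached.
        output = "fake_output"
    elif async_callback is not None:
        # Non-delayed path: PATCH 1/2 moves the callback here, *before*
        # sampling, instead of after it.
        async_callback()
        output = sample()
    else:
        output = sample()

    # ... in the real runner the output is cached and mark_step() runs here ...

    if use_delayed_sampling and async_callback is not None:
        # With delayed sampling the callback still fires at this later point.
        async_callback()
    return output


if __name__ == "__main__":
    def cb() -> None:
        print("async callback fired")

    def fake_sample() -> str:
        print("sampling")
        return "token"

    # Prints the callback before sampling (non-delayed), then the
    # delayed-sampling case where the callback comes last.
    run_step(use_delayed_sampling=False, async_callback=cb, sample=fake_sample)
    run_step(use_delayed_sampling=True, async_callback=cb, sample=fake_sample)
```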