From 96c6dba10de97998d4560c2b6f9c4a8dd38433b1 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Tue, 19 May 2026 13:42:57 -0700 Subject: [PATCH] Fix HPU prompt_token_ids device placement for penalty sampling Move prompt_token_ids to self.device in selective sampling metadata creation for both skip_copy paths. This keeps prompt and output penalty masks on the same device and prevents runtime device mismatch errors during repetition/presence/frequency penalty application. Signed-off-by: Yeonsil Yoon --- vllm_gaudi/v1/worker/hpu_input_batch.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vllm_gaudi/v1/worker/hpu_input_batch.py b/vllm_gaudi/v1/worker/hpu_input_batch.py index bf3207cf4e..c2ac2f8f68 100644 --- a/vllm_gaudi/v1/worker/hpu_input_batch.py +++ b/vllm_gaudi/v1/worker/hpu_input_batch.py @@ -640,12 +640,14 @@ def make_selective_sampling_metadata( # The prompt tokens are used only for applying penalties during # the sampling process. Hence copy these tensors only when # there are requests which need penalties to be applied. - prompt_token_ids = self._make_prompt_token_ids_cpu_tensor()[req_indices] + prompt_token_ids = self._make_prompt_token_ids_cpu_tensor()[req_indices].to(device=self.device, + non_blocking=True) else: # Even with skip_copy=True, we need prompt_token_ids for penalties if not self.no_penalties: cached_tensor = self._get_cached_prompt_token_ids() - prompt_token_ids = cached_tensor[req_indices] if cached_tensor is not None else None + prompt_token_ids = cached_tensor[req_indices].to( + device=self.device, non_blocking=True) if cached_tensor is not None else None else: prompt_token_ids = None