From 96c6dba10de97998d4560c2b6f9c4a8dd38433b1 Mon Sep 17 00:00:00 2001
From: Yeonsil Yoon <yeon.sil.yoon@intel.com>
Date: Tue, 19 May 2026 13:42:57 -0700
Subject: [PATCH] Fix HPU prompt_token_ids device placement for penalty
 sampling

Move prompt_token_ids to self.device when creating selective sampling metadata, on both skip_copy paths: the freshly built CPU tensor and the cached tensor used when skip_copy=True.
This keeps the prompt and output penalty masks on the same device and prevents runtime device-mismatch errors when repetition, presence, or frequency penalties are applied.

Signed-off-by: Yeonsil Yoon <yeon.sil.yoon@intel.com>
---
 vllm_gaudi/v1/worker/hpu_input_batch.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_input_batch.py b/vllm_gaudi/v1/worker/hpu_input_batch.py
index bf3207cf4e..c2ac2f8f68 100644
--- a/vllm_gaudi/v1/worker/hpu_input_batch.py
+++ b/vllm_gaudi/v1/worker/hpu_input_batch.py
@@ -640,12 +640,14 @@ def make_selective_sampling_metadata(
                 # The prompt tokens are used only for applying penalties during
                 # the sampling process. Hence copy these tensors only when
                 # there are requests which need penalties to be applied.
-                prompt_token_ids = self._make_prompt_token_ids_cpu_tensor()[req_indices]
+                prompt_token_ids = self._make_prompt_token_ids_cpu_tensor()[req_indices].to(device=self.device,
+                                                                                            non_blocking=True)
         else:
             # Even with skip_copy=True, we need prompt_token_ids for penalties
             if not self.no_penalties:
                 cached_tensor = self._get_cached_prompt_token_ids()
-                prompt_token_ids = cached_tensor[req_indices] if cached_tensor is not None else None
+                prompt_token_ids = cached_tensor[req_indices].to(
+                    device=self.device, non_blocking=True) if cached_tensor is not None else None
             else:
                 prompt_token_ids = None