1 change: 0 additions & 1 deletion vllm/model_executor/layers/pooler.py
@@ -253,7 +253,6 @@ def extract_states(
offset += prompt_len
pooled_data.append(pooled_data_i)

pooled_data = []
returned_token_ids = self.returned_token_ids
step_tag_id = self.step_tag_id

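The pooler.py change is a one-line bugfix: the removed `pooled_data = []` re-initializes the list right after the per-prompt loop has appended to it, so the extracted states would be thrown away. Below is a minimal standalone sketch of that pattern (simplified signature and hypothetical names, not the actual `StepPooler.extract_states` implementation), with a comment marking where the stray reset used to sit.

```python
import torch


def extract_step_states(hidden_states: torch.Tensor,
                        prompt_lens: list[int],
                        prompt_token_ids: list[torch.Tensor],
                        step_tag_id: int) -> list[torch.Tensor]:
    # Split the flattened hidden states back into per-prompt chunks.
    pooled_data: list[torch.Tensor] = []
    offset = 0
    for prompt_len in prompt_lens:
        pooled_data_i = hidden_states[offset:offset + prompt_len]
        offset += prompt_len
        pooled_data.append(pooled_data_i)

    # The deleted line re-initialized `pooled_data = []` at this point,
    # silently discarding every chunk collected above.

    # Keep only the positions whose token id matches the step tag.
    return [
        data_i[token_ids_i == step_tag_id]
        for data_i, token_ids_i in zip(pooled_data, prompt_token_ids)
    ]
```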
29 changes: 18 additions & 11 deletions vllm/v1/worker/gpu_input_batch.py
@@ -59,21 +59,23 @@ def get_token_id(self, idx: int) -> int:
class InputBatch:

def __init__(
self,
max_num_reqs: int,
max_model_len: int,
max_num_batched_tokens: int,
device: torch.device,
pin_memory: bool,
vocab_size: int,
block_sizes: list[int], # The block_size of each kv cache group
self,
max_num_reqs: int,
max_model_len: int,
max_num_batched_tokens: int,
device: torch.device,
pin_memory: bool,
vocab_size: int,
block_sizes: list[int], # The block_size of each kv cache group
sampling_needs_token_ids: bool = False,
):
self.max_num_reqs = max_num_reqs
self.max_model_len = max_model_len
self.max_num_batched_tokens = max_num_batched_tokens
self.device = device
self.pin_memory = pin_memory
self.vocab_size = vocab_size
self.sampling_needs_token_ids = sampling_needs_token_ids

self._req_ids: list[Optional[str]] = []
self.req_id_to_index: dict[str, int] = {}
@@ -579,9 +581,14 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
copy_slice(self.repetition_penalties_cpu_tensor,
self.repetition_penalties, num_reqs)

# The prompt tokens are used only for applying penalties during
# the sampling process. Hence copy these tensors only when
# there are requests which need penalties to be applied.
needs_prompt_token_ids = (not self.no_penalties
or (self.num_reqs > 0
and self.sampling_needs_token_ids))
if needs_prompt_token_ids:
# The prompt tokens are used only for applying penalties or
# step pooling during the sampling/pooling process.
# Hence copy these tensors only when there are requests which
# need penalties/step_pooler to be applied.
prompt_token_ids = self._make_prompt_token_ids_tensor()
else:
prompt_token_ids = None
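The second hunk in gpu_input_batch.py generalizes the condition that guards the prompt-token-id copy: the tensor is now materialized when penalties are in play, or when the batch was built with `sampling_needs_token_ids=True` (the step-pooling case). A toy restatement of just that predicate, lifted out as a free function for illustration (the real logic lives inline in `_make_sampling_metadata`):

```python
def needs_prompt_token_ids(no_penalties: bool,
                           num_reqs: int,
                           sampling_needs_token_ids: bool) -> bool:
    # Mirrors the condition added in the diff: copy prompt token ids when
    # any penalty is active, or when a non-empty batch needs raw token ids
    # for step pooling.
    return (not no_penalties
            or (num_reqs > 0 and sampling_needs_token_ids))


# Step pooler present and batch non-empty -> copy the prompt token ids.
assert needs_prompt_token_ids(no_penalties=True, num_reqs=2,
                              sampling_needs_token_ids=True)
# No penalties and no step pooler -> skip the copy.
assert not needs_prompt_token_ids(no_penalties=True, num_reqs=2,
                                  sampling_needs_token_ids=False)
```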
5 changes: 5 additions & 0 deletions vllm/v1/worker/gpu_model_runner.py
@@ -122,6 +122,9 @@ def __init__(

self.is_multimodal_model = model_config.is_multimodal_model
self.is_pooling_model = model_config.pooler_config is not None
self.is_step_pooler = (self.is_pooling_model
and model_config.pooler_config.pooling_type
== "STEP")
self.max_model_len = model_config.max_model_len
self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs
@@ -202,6 +205,7 @@ def __init__(
pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(),
block_sizes=[self.cache_config.block_size],
sampling_needs_token_ids=self.is_step_pooler,
)

self.use_cuda_graph = (
@@ -2301,6 +2305,7 @@ def may_reinitialize_input_batch(self,
pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(),
block_sizes=block_sizes,
sampling_needs_token_ids=self.is_step_pooler,
)

def _allocate_kv_cache_tensors(
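The gpu_model_runner.py hunks derive `is_step_pooler` from the pooler config and thread it into both `InputBatch` constructions as `sampling_needs_token_ids`. A self-contained sketch of that derivation, using simplified stand-in config classes rather than the real vLLM `ModelConfig`/`PoolerConfig` objects:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PoolerConfig:
    pooling_type: str


@dataclass
class ModelConfig:
    pooler_config: Optional[PoolerConfig] = None


def is_step_pooler(model_config: ModelConfig) -> bool:
    # A model is a step pooler when it is a pooling model whose pooler
    # config requests STEP pooling, matching the check added in __init__.
    is_pooling_model = model_config.pooler_config is not None
    return (is_pooling_model
            and model_config.pooler_config.pooling_type == "STEP")


# STEP pooling forces prompt token ids to be kept around for the pooler.
assert is_step_pooler(ModelConfig(PoolerConfig("STEP")))
assert not is_step_pooler(ModelConfig())
```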