1 change: 0 additions & 1 deletion vllm/model_executor/layers/pooler.py
@@ -253,7 +253,6 @@ def extract_states(
offset += prompt_len
pooled_data.append(pooled_data_i)

pooled_data = []
returned_token_ids = self.returned_token_ids
step_tag_id = self.step_tag_id

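The pooler.py change is a one-line bugfix: the removed `pooled_data = []` re-initializes the list right after the per-prompt loop has appended to it, so the extracted states would be thrown away. Below is a minimal standalone sketch of that pattern (simplified signature and hypothetical names, not the actual `StepPooler.extract_states` implementation), with a comment marking where the stray reset used to sit.

```python
import torch


def extract_step_states(hidden_states: torch.Tensor,
                        prompt_lens: list[int],
                        prompt_token_ids: list[torch.Tensor],
                        step_tag_id: int) -> list[torch.Tensor]:
    # Split the flattened hidden states back into per-prompt chunks.
    pooled_data: list[torch.Tensor] = []
    offset = 0
    for prompt_len in prompt_lens:
        pooled_data_i = hidden_states[offset:offset + prompt_len]
        offset += prompt_len
        pooled_data.append(pooled_data_i)

    # The deleted line re-initialized `pooled_data = []` at this point,
    # silently discarding every chunk collected above.

    # Keep only the positions whose token id matches the step tag.
    return [
        data_i[token_ids_i == step_tag_id]
        for data_i, token_ids_i in zip(pooled_data, prompt_token_ids)
    ]
```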
29 changes: 18 additions & 11 deletions vllm/v1/worker/gpu_input_batch.py
@@ -59,21 +59,23 @@ def get_token_id(self, idx: int) -> int:
class InputBatch:

def __init__(
self,
max_num_reqs: int,
max_model_len: int,
max_num_batched_tokens: int,
device: torch.device,
pin_memory: bool,
vocab_size: int,
block_sizes: list[int], # The block_size of each kv cache group
self,
max_num_reqs: int,
max_model_len: int,
max_num_batched_tokens: int,
device: torch.device,
pin_memory: bool,
vocab_size: int,
block_sizes: list[int], # The block_size of each kv cache group
sampling_needs_token_ids: bool = False,
):
self.max_num_reqs = max_num_reqs
self.max_model_len = max_model_len
self.max_num_batched_tokens = max_num_batched_tokens
self.device = device
self.pin_memory = pin_memory
self.vocab_size = vocab_size
self.sampling_needs_token_ids = sampling_needs_token_ids

self._req_ids: list[Optional[str]] = []
self.req_id_to_index: dict[str, int] = {}
@@ -579,9 +581,14 @@ def _make_sampling_metadata(self) -> SamplingMetadata:
copy_slice(self.repetition_penalties_cpu_tensor,
self.repetition_penalties, num_reqs)

# The prompt tokens are used only for applying penalties during
# the sampling process. Hence copy these tensors only when
# there are requests which need penalties to be applied.
needs_prompt_token_ids = (not self.no_penalties
or (self.num_reqs > 0
and self.sampling_needs_token_ids))
if needs_prompt_token_ids:
# The prompt tokens are used only for applying penalties or
# step pooling during the sampling/pooling process.
# Hence copy these tensors only when there are requests which
# need penalties/step_pooler to be applied.
prompt_token_ids = self._make_prompt_token_ids_tensor()
else:
prompt_token_ids = None
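The second hunk in gpu_input_batch.py generalizes the condition that guards the prompt-token-id copy: the tensor is now materialized when penalties are in play, or when the batch was built with `sampling_needs_token_ids=True` (the step-pooling case). A toy restatement of just that predicate, lifted out as a free function for illustration (the real logic lives inline in `_make_sampling_metadata`):

```python
def needs_prompt_token_ids(no_penalties: bool,
                           num_reqs: int,
                           sampling_needs_token_ids: bool) -> bool:
    # Mirrors the condition added in the diff: copy prompt token ids when
    # any penalty is active, or when a non-empty batch needs raw token ids
    # for step pooling.
    return (not no_penalties
            or (num_reqs > 0 and sampling_needs_token_ids))


# Step pooler present and batch non-empty -> copy the prompt token ids.
assert needs_prompt_token_ids(no_penalties=True, num_reqs=2,
                              sampling_needs_token_ids=True)
# No penalties and no step pooler -> skip the copy.
assert not needs_prompt_token_ids(no_penalties=True, num_reqs=2,
                                  sampling_needs_token_ids=False)
```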
5 changes: 5 additions & 0 deletions vllm/v1/worker/gpu_model_runner.py
@@ -122,6 +122,9 @@ def __init__(

self.is_multimodal_model = model_config.is_multimodal_model
self.is_pooling_model = model_config.pooler_config is not None
self.is_step_pooler = (self.is_pooling_model
and model_config.pooler_config.pooling_type
== "STEP")
self.max_model_len = model_config.max_model_len
self.max_num_tokens = scheduler_config.max_num_batched_tokens
self.max_num_reqs = scheduler_config.max_num_seqs
@@ -202,6 +205,7 @@ def __init__(
pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(),
block_sizes=[self.cache_config.block_size],
sampling_needs_token_ids=self.is_step_pooler,
)

self.use_cuda_graph = (
@@ -2301,6 +2305,7 @@ def may_reinitialize_input_batch(self,
pin_memory=self.pin_memory,
vocab_size=self.model_config.get_vocab_size(),
block_sizes=block_sizes,
sampling_needs_token_ids=self.is_step_pooler,
)

def _allocate_kv_cache_tensors(
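The gpu_model_runner.py hunks derive `is_step_pooler` from the pooler config and thread it into both `InputBatch` constructions as `sampling_needs_token_ids`. A self-contained sketch of that derivation, using simplified stand-in config classes rather than the real vLLM `ModelConfig`/`PoolerConfig` objects:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class PoolerConfig:
    pooling_type: str


@dataclass
class ModelConfig:
    pooler_config: Optional[PoolerConfig] = None


def is_step_pooler(model_config: ModelConfig) -> bool:
    # A model is a step pooler when it is a pooling model whose pooler
    # config requests STEP pooling, matching the check added in __init__.
    is_pooling_model = model_config.pooler_config is not None
    return (is_pooling_model
            and model_config.pooler_config.pooling_type == "STEP")


# STEP pooling forces prompt token ids to be kept around for the pooler.
assert is_step_pooler(ModelConfig(PoolerConfig("STEP")))
assert not is_step_pooler(ModelConfig())
```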