diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 17dfcae59ed1..744334aff894 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -684,7 +684,7 @@ def _make_buffer( with_numpy=numpy, ) - def _init_model_kwargs(self, num_tokens: int): + def _init_model_kwargs(self): model_kwargs = dict[str, Any]() if not self.is_pooling_model: @@ -2579,7 +2579,7 @@ def _preprocess( input_ids, inputs_embeds = self._prepare_mm_inputs(num_input_tokens) model_kwargs = { - **self._init_model_kwargs(num_scheduled_tokens), + **self._init_model_kwargs(), **self._extract_mm_kwargs(scheduler_output), } elif self.enable_prompt_embeds and is_first_rank: @@ -2607,7 +2607,7 @@ def _preprocess( self.inputs_embeds.gpu[token_ids_idx] = tokens_to_embeds inputs_embeds = self.inputs_embeds.gpu[:num_input_tokens] - model_kwargs = self._init_model_kwargs(num_input_tokens) + model_kwargs = self._init_model_kwargs() input_ids = None else: # For text-only models, we use token ids as input. @@ -2616,7 +2616,7 @@ def _preprocess( # then the embedding layer is not included in the CUDA graph. input_ids = self.input_ids.gpu[:num_input_tokens] inputs_embeds = None - model_kwargs = self._init_model_kwargs(num_input_tokens) + model_kwargs = self._init_model_kwargs() if self.uses_mrope: positions = self.mrope_positions.gpu[:, :num_input_tokens] @@ -4293,7 +4293,7 @@ def _dummy_run( ): # Make sure padding doesn't exceed max_num_tokens assert num_tokens_padded <= self.max_num_tokens - model_kwargs = self._init_model_kwargs(num_tokens_padded) + model_kwargs = self._init_model_kwargs() if self.supports_mm_inputs and not self.model_config.is_encoder_decoder: input_ids, inputs_embeds = self._prepare_mm_inputs(num_tokens_padded) @@ -4304,7 +4304,7 @@ def _dummy_run( elif self.enable_prompt_embeds: input_ids = None inputs_embeds = self.inputs_embeds.gpu[:num_tokens_padded] - model_kwargs = self._init_model_kwargs(num_tokens_padded) + model_kwargs = self._init_model_kwargs() else: input_ids = self.input_ids.gpu[:num_tokens_padded] inputs_embeds = None