assert num_tokens_after_padding bounds

Varun Sundar Rabindranath · Varun Sundar Rabindranath · commit 5fe0c52f3631 · 2025-10-04T15:49:17.000Z
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
@@ -3200,6 +3200,9 @@ def _dummy_run(
 
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens, remove_lora):
+
+            # Make sure padding doesn't exceed max_num_tokens
+            assert num_tokens_after_padding <= self.max_num_tokens
             model_kwargs = self._init_model_kwargs(num_tokens_after_padding)
             if (self.supports_mm_inputs
                     and not self.model_config.is_encoder_decoder):