diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index 9bab138abe14..1db6149b055c 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -179,17 +179,17 @@ def warmup(self, chat_params: ChatParams) -> None:
         from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
 
         try:
-            logger.info("Warming up chat template processing...")
+            logger.debug("Warming up chat template processing...")
             start_time = time.perf_counter()
 
             self.render_chat([[{"role": "user", "content": "warmup"}]], chat_params)
 
             elapsed = time.perf_counter() - start_time
-            logger.info("Chat template warmup completed in %.3fs", elapsed)
+            logger.debug("Chat template warmup completed in %.3fs", elapsed)
         except ChatTemplateResolutionError:
-            logger.info("This model does not support chat template.")
+            logger.debug("This model does not support chat template.")
         except Exception:
-            logger.exception("Chat template warmup failed")
+            logger.warning("Chat template warmup failed", exc_info=True)
 
         if self.mm_processor:
             from vllm.multimodal.processing import TimingContext
@@ -200,7 +200,7 @@ def warmup(self, chat_params: ChatParams) -> None:
             mm_limits = processor.info.allowed_mm_limits
 
             try:
-                logger.info("Warming up multi-modal processing...")
+                logger.debug("Warming up multi-modal processing...")
                 start_time = time.perf_counter()
 
                 processor_inputs = processor.dummy_inputs.get_dummy_processor_inputs(
@@ -209,14 +209,13 @@ def warmup(self, chat_params: ChatParams) -> None:
                     mm_options=mm_config.limit_per_prompt,
                 )
                 _ = processor.apply(
-                    processor_inputs,
-                    timing_ctx=TimingContext(enabled=False),
+                    processor_inputs, timing_ctx=TimingContext(enabled=False)
                 )
 
                 elapsed = time.perf_counter() - start_time
                 logger.info("Multi-modal warmup completed in %.3fs", elapsed)
             except Exception:
-                logger.exception("Multi-modal warmup failed")
+                logger.warning("Multi-modal warmup failed")
             finally:
                 self.clear_mm_cache()
 