Commit dff4182

[serve.llm][Fix] retry with model_config arg (#52991)
Signed-off-by: Linkun <[email protected]>
1 parent: a5000cc · commit: dff4182

File tree: 1 file changed (+30, −4 lines)

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 30 additions & 4 deletions
@@ -234,7 +234,9 @@ async def start(self):
 
         If the engine is already running, do nothing.
         """
-        from vllm.entrypoints.chat_utils import resolve_chat_template_content_format
+        from vllm.entrypoints.chat_utils import (
+            resolve_chat_template_content_format as _resolve_chat_template_content_format,
+        )
 
         if self.running:
             # The engine is already running!
@@ -246,7 +248,21 @@ async def start(self):
         self.model_config = await self.engine.get_model_config()
 
         self._tokenizer = await self.engine.get_tokenizer()
+
+        def resolve_chat_template_content_format(model_config, **kwargs):
+            try:
+                return _resolve_chat_template_content_format(
+                    model_config=model_config, **kwargs
+                )
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported.
+                return _resolve_chat_template_content_format(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
         self._resolved_content_format = resolve_chat_template_content_format(
+            model_config=self.model_config,
             # Use HF to get the chat template so set it to None here.
             chat_template=None,
             # Default to None, change when it's needed.
@@ -255,7 +271,6 @@ async def start(self):
             # Let vLLM decide the content format.
             given_format="auto",
             tokenizer=self._tokenizer,
-            trust_remote_code=self.model_config.trust_remote_code,
         )
 
         logger.info("Started vLLM engine.")
@@ -505,7 +520,7 @@ async def prepare_request(
     ) -> GenerationRequest:
         from vllm.entrypoints.chat_utils import (
             parse_chat_messages_futures,
-            apply_hf_chat_template,
+            apply_hf_chat_template as _apply_hf_chat_template,
         )
 
         model_config = self.model_config
@@ -521,14 +536,25 @@ async def prepare_request(
         )
         mm_data = await mm_futures
 
+        def apply_hf_chat_template(model_config, **kwargs):
+            try:
+                return _apply_hf_chat_template(model_config=model_config, **kwargs)
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove above once vLLM <0.9.0 is no longer supported.
+                return _apply_hf_chat_template(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
         prompt_text = apply_hf_chat_template(
+            model_config=model_config,
             tokenizer=self._tokenizer,
             conversation=conversation,
             chat_template=None,
             tools=None,
-            trust_remote_code=model_config.trust_remote_code,
             tokenize=False,
             # **kwargs for tokenizer.apply_chat_template
+            trust_remote_code=model_config.trust_remote_code,
             add_generation_prompt=True,
             continue_final_message=False,
         )
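For context, here is a minimal, self-contained sketch of the retry pattern this commit applies: call the vLLM chat-utils helper with the newer model_config keyword first, and fall back to the legacy trust_remote_code keyword when the installed vLLM rejects it with a TypeError. The helper, config class, and function names below are hypothetical stand-ins for illustration only; they are not part of vLLM or Ray.

# Illustrative sketch of the "retry with model_config" compatibility shim.
from dataclasses import dataclass


@dataclass
class FakeModelConfig:
    # Hypothetical stand-in for vLLM's model config; only the field the
    # legacy signature needs is modeled here.
    trust_remote_code: bool = False


def legacy_resolver(*, trust_remote_code, chat_template, given_format):
    # Stand-in for a pre-0.9.0-style helper that only accepts trust_remote_code.
    return f"legacy(trust_remote_code={trust_remote_code}, format={given_format})"


def call_with_model_config_fallback(helper, model_config, **kwargs):
    """Call `helper` with the new model_config kwarg; retry with the legacy kwarg on TypeError."""
    try:
        return helper(model_config=model_config, **kwargs)
    except TypeError:
        # Older signatures reject the unknown model_config keyword, so retry
        # with only the field the legacy API understands.
        return helper(trust_remote_code=model_config.trust_remote_code, **kwargs)


if __name__ == "__main__":
    cfg = FakeModelConfig(trust_remote_code=True)
    print(
        call_with_model_config_fallback(
            legacy_resolver, cfg, chat_template=None, given_format="auto"
        )
    )
    # -> legacy(trust_remote_code=True, format=auto)

Catching TypeError keeps the wrapper independent of explicit vLLM version checks, at the cost of potentially masking an unrelated TypeError raised inside the helper itself, which is why the diff marks the shim for removal once vLLM <0.9.0 support is dropped.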
