From 9fd1cb033db5578179d86b5bc9d1585503eea91d Mon Sep 17 00:00:00 2001
From: Linkun
Date: Wed, 14 May 2025 00:29:05 -0700
Subject: [PATCH 1/2] retry with model_config arg

Signed-off-by: Linkun
---
 .../serve/deployments/llm/vllm/vllm_engine.py | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
index 5ef0fd2fd008..589c18f0781d 100644
--- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
+++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
@@ -246,7 +246,7 @@ async def start(self):
 
         self.model_config = await self.engine.get_model_config()
         self._tokenizer = await self.engine.get_tokenizer()
-        self._resolved_content_format = resolve_chat_template_content_format(
+        _resolve_chat_template_content_kwargs = dict(
             # Use HF to get the chat template so set it to None here.
             chat_template=None,
             # Default to None, change when it's needed.
@@ -257,6 +257,17 @@ async def start(self):
             tokenizer=self._tokenizer,
             trust_remote_code=self.model_config.trust_remote_code,
         )
+        try:
+            self._resolved_content_format = resolve_chat_template_content_format(
+                **_resolve_chat_template_content_kwargs
+            )
+        except TypeError:
+            # vLLM 0.9.0 changes API (#52975)
+            _resolve_chat_template_content_kwargs.pop("trust_remote_code")
+            _resolve_chat_template_content_kwargs["model_config"] = self.model_config
+            self._resolved_content_format = resolve_chat_template_content_format(
+                **_resolve_chat_template_content_kwargs
+            )
 
         logger.info("Started vLLM engine.")
 
@@ -504,7 +515,7 @@ async def prepare_request(
             )
             mm_data = await mm_futures
 
-            prompt_text = apply_hf_chat_template(
+            _apply_hf_chat_template_kwargs = dict(
                 tokenizer=self._tokenizer,
                 conversation=conversation,
                 chat_template=None,
@@ -515,6 +526,13 @@ async def prepare_request(
                 add_generation_prompt=True,
                 continue_final_message=False,
             )
+            try:
+                prompt_text = apply_hf_chat_template(**_apply_hf_chat_template_kwargs)
+            except TypeError:
+                # vLLM 0.9.0 changes API (#52975)
+                _apply_hf_chat_template_kwargs.pop("trust_remote_code")
+                _apply_hf_chat_template_kwargs["model_config"] = model_config
+                prompt_text = apply_hf_chat_template(**_apply_hf_chat_template_kwargs)
         else:
             prompt_text = prompt.prompt
 

From b68a36bfc06adc33c04da1dca1124916663517a7 Mon Sep 17 00:00:00 2001
From: Linkun
Date: Thu, 15 May 2025 15:59:30 -0700
Subject: [PATCH 2/2] use helper func

Signed-off-by: Linkun
---
 .../serve/deployments/llm/vllm/vllm_engine.py | 56 +++++++++++--------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
index 589c18f0781d..2f85d98e090d 100644
--- a/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
+++ b/python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py
@@ -234,7 +234,9 @@ async def start(self):
 
         If the engine is already running, do nothing.
         """
-        from vllm.entrypoints.chat_utils import resolve_chat_template_content_format
+        from vllm.entrypoints.chat_utils import (
+            resolve_chat_template_content_format as _resolve_chat_template_content_format,
+        )
 
         if self.running:
             # The engine is already running!
@@ -246,7 +248,21 @@ async def start(self):
 
         self.model_config = await self.engine.get_model_config()
         self._tokenizer = await self.engine.get_tokenizer()
-        _resolve_chat_template_content_kwargs = dict(
+
+        def resolve_chat_template_content_format(model_config, **kwargs):
+            try:
+                return _resolve_chat_template_content_format(
+                    model_config=model_config, **kwargs
+                )
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported.
+                return _resolve_chat_template_content_format(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
+        self._resolved_content_format = resolve_chat_template_content_format(
+            model_config=self.model_config,
             # Use HF to get the chat template so set it to None here.
             chat_template=None,
             # Default to None, change when it's needed.
@@ -255,19 +271,7 @@ async def start(self):
             # Let vLLM decide the content format.
             given_format="auto",
             tokenizer=self._tokenizer,
-            trust_remote_code=self.model_config.trust_remote_code,
         )
-        try:
-            self._resolved_content_format = resolve_chat_template_content_format(
-                **_resolve_chat_template_content_kwargs
-            )
-        except TypeError:
-            # vLLM 0.9.0 changes API (#52975)
-            _resolve_chat_template_content_kwargs.pop("trust_remote_code")
-            _resolve_chat_template_content_kwargs["model_config"] = self.model_config
-            self._resolved_content_format = resolve_chat_template_content_format(
-                **_resolve_chat_template_content_kwargs
-            )
 
         logger.info("Started vLLM engine.")
 
@@ -499,7 +503,7 @@ async def prepare_request(
     ) -> GenerationRequest:
         from vllm.entrypoints.chat_utils import (
             parse_chat_messages_futures,
-            apply_hf_chat_template,
+            apply_hf_chat_template as _apply_hf_chat_template,
         )
 
         model_config = self.model_config
@@ -515,24 +519,28 @@ async def prepare_request(
             )
             mm_data = await mm_futures
 
-            _apply_hf_chat_template_kwargs = dict(
+            def apply_hf_chat_template(model_config, **kwargs):
+                try:
+                    return _apply_hf_chat_template(model_config=model_config, **kwargs)
+                except TypeError:
+                    # Legacy API before vLLM 0.9.0.
+                    # TODO(#52975): Remove above once vLLM <0.9.0 is no longer supported.
+                    return _apply_hf_chat_template(
+                        trust_remote_code=model_config.trust_remote_code, **kwargs
+                    )
+
+            prompt_text = apply_hf_chat_template(
+                model_config=model_config,
                 tokenizer=self._tokenizer,
                 conversation=conversation,
                 chat_template=None,
                 tools=None,
-                trust_remote_code=model_config.trust_remote_code,
                 tokenize=False,
                 # **kwargs for tokenizer.apply_chat_template
+                trust_remote_code=model_config.trust_remote_code,
                 add_generation_prompt=True,
                 continue_final_message=False,
             )
-            try:
-                prompt_text = apply_hf_chat_template(**_apply_hf_chat_template_kwargs)
-            except TypeError:
-                # vLLM 0.9.0 changes API (#52975)
-                _apply_hf_chat_template_kwargs.pop("trust_remote_code")
-                _apply_hf_chat_template_kwargs["model_config"] = model_config
-                prompt_text = apply_hf_chat_template(**_apply_hf_chat_template_kwargs)
         else:
             prompt_text = prompt.prompt
 
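
Both commits converge on one pattern: call the vLLM helper with the newer model_config keyword first and, if that raises TypeError on an older install, retry with the legacy trust_remote_code keyword. The sketch below shows the same shim in isolation; _render_new and _render_old are hypothetical stand-ins for the two helper signatures, not real vLLM APIs.

from types import SimpleNamespace

# Hypothetical stand-ins for the post- and pre-0.9.0 helper signatures.
def _render_new(*, model_config, prompt):
    return f"{prompt} (trust_remote_code={model_config.trust_remote_code})"

def _render_old(*, trust_remote_code, prompt):
    return f"{prompt} (trust_remote_code={trust_remote_code})"

def render_compat(render_fn, model_config, **kwargs):
    """Prefer the new model_config keyword; fall back to trust_remote_code on TypeError."""
    try:
        return render_fn(model_config=model_config, **kwargs)
    except TypeError:
        # The installed version still uses the pre-0.9.0 signature.
        return render_fn(trust_remote_code=model_config.trust_remote_code, **kwargs)

cfg = SimpleNamespace(trust_remote_code=False)
assert render_compat(_render_new, cfg, prompt="hi") == render_compat(_render_old, cfg, prompt="hi")

One trade-off of the except TypeError fallback is that it also catches TypeErrors raised inside the helper itself, so an unrelated bug can silently take the legacy path; dispatching on the installed vLLM version would be stricter.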