Commit dff4182

[serve.llm][Fix] retry with model_config arg (#52991)
Signed-off-by: Linkun <[email protected]>
1 parent: a5000cc · commit: dff4182

File tree: 1 file changed (+30, −4 lines)

python/ray/llm/_internal/serve/deployments/llm/vllm/vllm_engine.py

Lines changed: 30 additions & 4 deletions
@@ -234,7 +234,9 @@ async def start(self):
 
         If the engine is already running, do nothing.
         """
-        from vllm.entrypoints.chat_utils import resolve_chat_template_content_format
+        from vllm.entrypoints.chat_utils import (
+            resolve_chat_template_content_format as _resolve_chat_template_content_format,
+        )
 
         if self.running:
             # The engine is already running!
@@ -246,7 +248,21 @@ async def start(self):
         self.model_config = await self.engine.get_model_config()
 
         self._tokenizer = await self.engine.get_tokenizer()
+
+        def resolve_chat_template_content_format(model_config, **kwargs):
+            try:
+                return _resolve_chat_template_content_format(
+                    model_config=model_config, **kwargs
+                )
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported.
+                return _resolve_chat_template_content_format(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
         self._resolved_content_format = resolve_chat_template_content_format(
+            model_config=self.model_config,
             # Use HF to get the chat template so set it to None here.
             chat_template=None,
             # Default to None, change when it's needed.
@@ -255,7 +271,6 @@ async def start(self):
             # Let vLLM decide the content format.
             given_format="auto",
             tokenizer=self._tokenizer,
-            trust_remote_code=self.model_config.trust_remote_code,
         )
 
         logger.info("Started vLLM engine.")
@@ -505,7 +520,7 @@ async def prepare_request(
     ) -> GenerationRequest:
         from vllm.entrypoints.chat_utils import (
             parse_chat_messages_futures,
-            apply_hf_chat_template,
+            apply_hf_chat_template as _apply_hf_chat_template,
         )
 
         model_config = self.model_config
@@ -521,14 +536,25 @@ async def prepare_request(
         )
         mm_data = await mm_futures
 
+        def apply_hf_chat_template(model_config, **kwargs):
+            try:
+                return _apply_hf_chat_template(model_config=model_config, **kwargs)
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove above once vLLM <0.9.0 is no longer supported.
+                return _apply_hf_chat_template(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
         prompt_text = apply_hf_chat_template(
+            model_config=model_config,
             tokenizer=self._tokenizer,
             conversation=conversation,
             chat_template=None,
             tools=None,
-            trust_remote_code=model_config.trust_remote_code,
             tokenize=False,
             # **kwargs for tokenizer.apply_chat_template
+            trust_remote_code=model_config.trust_remote_code,
             add_generation_prompt=True,
             continue_final_message=False,
         )
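For context, here is a minimal, self-contained sketch of the retry pattern this commit applies: call the vLLM chat-utils helper with the newer model_config keyword first, and fall back to the legacy trust_remote_code keyword when the installed vLLM rejects it with a TypeError. The helper, config class, and function names below are hypothetical stand-ins for illustration only; they are not part of vLLM or Ray.

# Illustrative sketch of the "retry with model_config" compatibility shim.
from dataclasses import dataclass


@dataclass
class FakeModelConfig:
    # Hypothetical stand-in for vLLM's model config; only the field the
    # legacy signature needs is modeled here.
    trust_remote_code: bool = False


def legacy_resolver(*, trust_remote_code, chat_template, given_format):
    # Stand-in for a pre-0.9.0-style helper that only accepts trust_remote_code.
    return f"legacy(trust_remote_code={trust_remote_code}, format={given_format})"


def call_with_model_config_fallback(helper, model_config, **kwargs):
    """Call `helper` with the new model_config kwarg; retry with the legacy kwarg on TypeError."""
    try:
        return helper(model_config=model_config, **kwargs)
    except TypeError:
        # Older signatures reject the unknown model_config keyword, so retry
        # with only the field the legacy API understands.
        return helper(trust_remote_code=model_config.trust_remote_code, **kwargs)


if __name__ == "__main__":
    cfg = FakeModelConfig(trust_remote_code=True)
    print(
        call_with_model_config_fallback(
            legacy_resolver, cfg, chat_template=None, given_format="auto"
        )
    )
    # -> legacy(trust_remote_code=True, format=auto)

Catching TypeError keeps the wrapper independent of explicit vLLM version checks, at the cost of potentially masking an unrelated TypeError raised inside the helper itself, which is why the diff marks the shim for removal once vLLM <0.9.0 support is dropped.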
