@@ -234,7 +234,9 @@ async def start(self):

         If the engine is already running, do nothing.
         """
-        from vllm.entrypoints.chat_utils import resolve_chat_template_content_format
+        from vllm.entrypoints.chat_utils import (
+            resolve_chat_template_content_format as _resolve_chat_template_content_format,
+        )

         if self.running:
             # The engine is already running!
@@ -246,7 +248,21 @@ async def start(self):
         self.model_config = await self.engine.get_model_config()

         self._tokenizer = await self.engine.get_tokenizer()
+
+        def resolve_chat_template_content_format(model_config, **kwargs):
+            try:
+                return _resolve_chat_template_content_format(
+                    model_config=model_config, **kwargs
+                )
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported.
+                return _resolve_chat_template_content_format(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
         self._resolved_content_format = resolve_chat_template_content_format(
+            model_config=self.model_config,
             # Use HF to get the chat template so set it to None here.
             chat_template=None,
             # Default to None, change when it's needed.
@@ -255,7 +271,6 @@ async def start(self):
             # Let vLLM decide the content format.
             given_format="auto",
             tokenizer=self._tokenizer,
-            trust_remote_code=self.model_config.trust_remote_code,
         )

         logger.info("Started vLLM engine.")
@@ -488,7 +503,7 @@ async def prepare_request(
     ) -> GenerationRequest:
         from vllm.entrypoints.chat_utils import (
             parse_chat_messages_futures,
-            apply_hf_chat_template,
+            apply_hf_chat_template as _apply_hf_chat_template,
         )

         model_config = self.model_config
@@ -504,14 +519,25 @@ async def prepare_request(
         )
         mm_data = await mm_futures

+        def apply_hf_chat_template(model_config, **kwargs):
+            try:
+                return _apply_hf_chat_template(model_config=model_config, **kwargs)
+            except TypeError:
+                # Legacy API before vLLM 0.9.0.
+                # TODO(#52975): Remove this try-except once vLLM <0.9.0 is no longer supported.
+                return _apply_hf_chat_template(
+                    trust_remote_code=model_config.trust_remote_code, **kwargs
+                )
+
         prompt_text = apply_hf_chat_template(
+            model_config=model_config,
             tokenizer=self._tokenizer,
             conversation=conversation,
             chat_template=None,
             tools=None,
-            trust_remote_code=model_config.trust_remote_code,
             tokenize=False,
+            # **kwargs for tokenizer.apply_chat_template
+            trust_remote_code=model_config.trust_remote_code,
             add_generation_prompt=True,
             continue_final_message=False,
         )
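The pattern both wrappers in this diff rely on is a keyword-argument compatibility shim: call the vLLM helper with the >=0.9.0 keyword (model_config) first, and if that raises TypeError, retry with the pre-0.9.0 keyword (trust_remote_code). A minimal standalone sketch of the same idea follows; legacy_only_helper and FakeModelConfig are hypothetical stand-ins, not vLLM or Ray APIs.

    from dataclasses import dataclass


    @dataclass
    class FakeModelConfig:
        # Hypothetical stand-in for vllm.config.ModelConfig.
        trust_remote_code: bool = False


    def legacy_only_helper(**kwargs):
        # Hypothetical helper that still has the pre-0.9.0 signature:
        # it accepts trust_remote_code but not model_config.
        if "model_config" in kwargs:
            raise TypeError("unexpected keyword argument 'model_config'")
        return "trust_remote_code=%s" % kwargs["trust_remote_code"]


    def call_with_fallback(model_config, **kwargs):
        # Try the new keyword first; on TypeError, assume the legacy
        # signature and derive the old keyword from the config object.
        try:
            return legacy_only_helper(model_config=model_config, **kwargs)
        except TypeError:
            return legacy_only_helper(
                trust_remote_code=model_config.trust_remote_code, **kwargs
            )


    print(call_with_fallback(FakeModelConfig(trust_remote_code=True)))
    # -> trust_remote_code=True

Catching TypeError keeps the shim free of version parsing (no inspection of vllm.__version__), though it retries the legacy path for any TypeError, not only the missing-keyword case; the TODO(#52975) comments mark both wrappers for removal once vLLM <0.9.0 is no longer supported.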