
Commit 1b6de73

vrdn-23 authored and Alvant committed
[Bugfix]: Make chat content text allow type content (vllm-project#9358)
Signed-off-by: Vinay Damodaran <[email protected]>
Signed-off-by: Alvant <[email protected]>
1 parent cb6364d commit 1b6de73

File tree

8 files changed: +107 -12 lines changed


docs/source/serving/openai_compatible_server.md

Lines changed: 17 additions & 0 deletions

@@ -103,6 +103,23 @@ vllm serve <model> --chat-template ./path-to-chat-template.jinja
 vLLM community provides a set of chat templates for popular models. You can find them in the examples
 directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 
+With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies
+both a `type` and a `text` field. An example is provided below:
+```python
+completion = client.chat.completions.create(
+    model="NousResearch/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
+    ]
+)
+```
+Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like
+`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which
+format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify
+between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match
+this, unless explicitly specified.
+
 ## Command line arguments for the server
 
 ```{argparse}

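For context (not part of the commit), a minimal client-side sketch of exercising the new behavior end to end. It assumes a server was started with `vllm serve meta-llama/Llama-Guard-3-1B --chat-template-text-format openai` and is listening on the default local endpoint; the base URL and placeholder API key are assumptions, not values taken from this diff:

```python
from openai import OpenAI

# Assumed local server, started with:
#   vllm serve meta-llama/Llama-Guard-3-1B --chat-template-text-format openai
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# With --chat-template-text-format openai, the typed content parts below are
# passed to the chat template as-is instead of being flattened to a string.
completion = client.chat.completions.create(
    model="meta-llama/Llama-Guard-3-1B",
    messages=[{
        "role": "user",
        "content": [{
            "type": "text",
            "text": "Classify this sentiment: vLLM is wonderful!"
        }],
    }],
)
print(completion.choices[0].message.content)
```
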
tests/entrypoints/openai/test_serving_chat.py

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ class MockModelConfig:
     tokenizer = MODEL_NAME
     trust_remote_code = False
     tokenizer_mode = "auto"
+    chat_template_text_format = "string"
     max_model_len = 100
     tokenizer_revision = None
     multimodal_config = MultiModalConfig()

tests/entrypoints/test_chat_utils.py

Lines changed: 47 additions & 1 deletion

@@ -17,7 +17,7 @@
 MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="function")
 def phi3v_model_config():
     return ModelConfig(PHI3V_MODEL_ID,
                        task="generate",
@@ -26,6 +26,7 @@ def phi3v_model_config():
                        trust_remote_code=True,
                        dtype="bfloat16",
                        seed=0,
+                       chat_template_text_format="string",
                        limit_mm_per_prompt={
                            "image": 2,
                        })
@@ -330,6 +331,51 @@ def test_parse_chat_messages_multiple_images_across_messages(
     _assert_mm_data_is_image_input(mm_data, 2)
 
 
+def test_parse_chat_messages_context_text_format(
+    phi3v_model_config,
+    phi3v_tokenizer,
+):
+    phi3v_model_config.chat_template_text_format = "openai"
+    conversation, mm_data = parse_chat_messages(
+        [{
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What's in this text?"
+            }]
+        }, {
+            "role": "assistant",
+            "content": "Some stuff."
+        }, {
+            "role": "user",
+            "content": "What about this one?"
+        }], phi3v_model_config, phi3v_tokenizer)
+
+    assert conversation == [
+        {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What's in this text?"
+            }]
+        },
+        {
+            "role": "assistant",
+            "content": [{
+                "type": "text",
+                "text": "Some stuff."
+            }]
+        },
+        {
+            "role": "user",
+            "content": [{
+                "type": "text",
+                "text": "What about this one?"
+            }]
+        },
+    ]
+
+
 def test_parse_chat_messages_rejects_too_many_images_in_one_message(
     phi3v_model_config,
     phi3v_tokenizer,

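A side note on the fixture change above: `phi3v_model_config` moves from `scope="module"` to `scope="function"`, presumably because the new test mutates `chat_template_text_format` on the shared config object. A small self-contained sketch (hypothetical names, not from the commit) of the leakage that function scope avoids:

```python
import pytest


class FakeModelConfig:
    """Stand-in for ModelConfig; only carries the field the tests mutate."""

    def __init__(self) -> None:
        self.chat_template_text_format = "string"


@pytest.fixture(scope="function")  # a fresh config per test, so mutations stay local
def model_config():
    return FakeModelConfig()


def test_openai_format(model_config):
    model_config.chat_template_text_format = "openai"  # visible to this test only
    assert model_config.chat_template_text_format == "openai"


def test_default_format(model_config):
    # With function scope this still sees the default, regardless of test order;
    # a module-scoped fixture would have kept the "openai" value set above.
    assert model_config.chat_template_text_format == "string"
```
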
vllm/config.py

Lines changed: 2 additions & 0 deletions

@@ -142,6 +142,7 @@ def __init__(self,
                  use_async_output_proc: bool = True,
                  override_neuron_config: Optional[Dict[str, Any]] = None,
                  config_format: ConfigFormat = ConfigFormat.AUTO,
+                 chat_template_text_format: str = "string",
                  mm_processor_kwargs: Optional[Dict[str, Any]] = None) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -176,6 +177,7 @@ def __init__(self,
             self.model, revision)
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.use_async_output_proc = use_async_output_proc
+        self.chat_template_text_format = chat_template_text_format
         self.mm_processor_kwargs = mm_processor_kwargs
 
         # Set enforce_eager to False if the value is unset.

vllm/engine/arg_utils.py

Lines changed: 10 additions & 0 deletions

@@ -89,6 +89,7 @@ class EngineArgs:
     task: TaskOption = "auto"
     skip_tokenizer_init: bool = False
     tokenizer_mode: str = 'auto'
+    chat_template_text_format: str = 'string'
     trust_remote_code: bool = False
     download_dir: Optional[str] = None
     load_format: str = 'auto'
@@ -250,6 +251,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'fast tokenizer if available.\n* "slow" will '
             'always use the slow tokenizer. \n* '
             '"mistral" will always use the `mistral_common` tokenizer.')
+        parser.add_argument(
+            '--chat-template-text-format',
+            type=str,
+            default=EngineArgs.chat_template_text_format,
+            choices=['string', 'openai'],
+            help='The format to render text content within a chat template. '
+            '"string" will keep the content field as a string whereas '
+            '"openai" will parse content in the current OpenAI format.')
         parser.add_argument('--trust-remote-code',
                             action='store_true',
                             help='Trust remote code from huggingface.')
@@ -858,6 +867,7 @@ def create_model_config(self) -> ModelConfig:
             # We know this is not None because we set it in __post_init__
             tokenizer=cast(str, self.tokenizer),
             tokenizer_mode=self.tokenizer_mode,
+            chat_template_text_format=self.chat_template_text_format,
            trust_remote_code=self.trust_remote_code,
            dtype=self.dtype,
            seed=self.seed,

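For illustration only, a sketch of how the new argument flows from `EngineArgs` into `ModelConfig`. Building the config fetches the model's Hugging Face config files, so this assumes network or cache access, and it has not been verified against this exact revision:

```python
from vllm.engine.arg_utils import EngineArgs

# Equivalent to passing --chat-template-text-format openai on the CLI.
engine_args = EngineArgs(
    model="NousResearch/Meta-Llama-3-8B-Instruct",
    chat_template_text_format="openai",
)

# create_model_config() copies the field onto the ModelConfig instance,
# which parse_chat_messages later reads to pick the content format.
model_config = engine_args.create_model_config()
print(model_config.chat_template_text_format)  # "openai"
```
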
vllm/engine/llm_engine.py

Lines changed: 2 additions & 1 deletion

@@ -254,7 +254,7 @@ def __init__(
             "num_scheduler_steps=%d, chunked_prefill_enabled=%s "
             "multi_step_stream_outputs=%s, enable_prefix_caching=%s, "
             "use_async_output_proc=%s, use_cached_outputs=%s, "
-            "mm_processor_kwargs=%s)",
+            "chat_template_text_format=%s, mm_processor_kwargs=%s)",
             VLLM_VERSION,
             model_config.model,
             speculative_config,
@@ -289,6 +289,7 @@ def __init__(
             cache_config.enable_prefix_caching,
             model_config.use_async_output_proc,
             use_cached_outputs,
+            model_config.chat_template_text_format,
             model_config.mm_processor_kwargs,
         )
         # TODO(woosuk): Print more configs in debug mode.

vllm/entrypoints/chat_utils.py

Lines changed: 23 additions & 8 deletions

@@ -121,7 +121,7 @@ class ConversationMessage(TypedDict, total=False):
     role: Required[str]
     """The role of the message's author."""
 
-    content: Optional[str]
+    content: Union[Optional[str], List[Dict[str, str]]]
     """The contents of the message"""
 
     tool_call_id: Optional[str]
@@ -431,7 +431,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 def _parse_chat_message_content_mm_part(
         part: ChatCompletionContentPartParam) -> Tuple[str, str]:
     """
-    Parses a given multi modal content part based on its type.
+    Parses a given multi-modal content part based on its type.
 
     Args:
         part: A dict containing the content part, with a potential 'type' field.
@@ -485,21 +485,26 @@ def _parse_chat_message_content_parts(
     role: str,
     parts: Iterable[ChatCompletionContentPartParam],
    mm_tracker: BaseMultiModalItemTracker,
+    chat_template_text_format: str,
 ) -> List[ConversationMessage]:
     content: List[Union[str, Dict[str, str]]] = []
 
     mm_parser = mm_tracker.create_parser()
-    keep_multimodal_content = \
+    wrap_dicts = \
         mm_tracker._model_config.hf_config.model_type in \
-            MODEL_KEEP_MULTI_MODAL_CONTENT
+            MODEL_KEEP_MULTI_MODAL_CONTENT or \
+        (chat_template_text_format == "openai")
 
     for part in parts:
         parse_res = _parse_chat_message_content_part(
-            part, mm_parser, wrap_dicts=keep_multimodal_content)
+            part,
+            mm_parser,
+            wrap_dicts=wrap_dicts,
+        )
         if parse_res:
             content.append(parse_res)
 
-    if keep_multimodal_content:
+    if wrap_dicts:
         # Parsing wraps images and texts as interleaved dictionaries
         return [ConversationMessage(role=role,
                                     content=content)]  # type: ignore
@@ -560,6 +565,7 @@ def _parse_chat_message_content_part(
 def _parse_chat_message_content(
     message: ChatCompletionMessageParam,
     mm_tracker: BaseMultiModalItemTracker,
+    chat_template_text_format: str,
 ) -> List[ConversationMessage]:
     role = message["role"]
     content = message.get("content")
@@ -575,6 +581,7 @@
             role,
             content,  # type: ignore
             mm_tracker,
+            chat_template_text_format,
         )
 
     for result_msg in result:
@@ -618,7 +625,11 @@ def parse_chat_messages(
     mm_tracker = MultiModalItemTracker(model_config, tokenizer)
 
     for msg in messages:
-        sub_messages = _parse_chat_message_content(msg, mm_tracker)
+        sub_messages = _parse_chat_message_content(
+            msg,
+            mm_tracker,
+            model_config.chat_template_text_format,
+        )
 
         conversation.extend(sub_messages)
 
@@ -636,7 +647,11 @@ def parse_chat_messages_futures(
     mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer)
 
     for msg in messages:
-        sub_messages = _parse_chat_message_content(msg, mm_tracker)
+        sub_messages = _parse_chat_message_content(
+            msg,
+            mm_tracker,
+            model_config.chat_template_text_format,
+        )
 
         conversation.extend(sub_messages)

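To make the `wrap_dicts` dispatch concrete, here is a simplified, self-contained sketch of the two output shapes. `render_text_parts` is a hypothetical stand-in for vLLM's internal helpers, not code from this commit:

```python
from typing import Dict, List, Union

ContentPart = Dict[str, str]


def render_text_parts(
        parts: List[ContentPart],
        chat_template_text_format: str) -> Union[str, List[ContentPart]]:
    """Hypothetical stand-in for the dispatch in _parse_chat_message_content_parts.

    "openai" keeps the typed dicts (wrap_dicts=True) so templates such as
    Llama Guard can consume them; "string" collapses the text fields into a
    single plain string, which is what most chat templates expect.
    """
    if chat_template_text_format == "openai":
        return [{"type": "text", "text": part["text"]} for part in parts]
    return "\n".join(part["text"] for part in parts)


parts = [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]
print(render_text_parts(parts, "string"))   # -> plain string
print(render_text_parts(parts, "openai"))   # -> list of typed dicts
```
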
vllm/entrypoints/openai/serving_chat.py

Lines changed: 5 additions & 2 deletions

@@ -384,7 +384,7 @@ async def chat_completion_stream_generator(
                 # Send response to echo the input portion of the
                 # last message
                 if request.echo or request.continue_final_message:
-                    last_msg_content: str = ""
+                    last_msg_content: Union[str, List[Dict[str, str]]] = ""
                     if conversation and "content" in conversation[
                             -1] and conversation[-1].get("role") == role:
                         last_msg_content = conversation[-1]["content"] or ""
@@ -724,10 +724,13 @@ async def chat_completion_full_generator(
            choices.append(choice_data)
 
         if request.echo or request.continue_final_message:
-            last_msg_content = ""
+            last_msg_content: Union[str, List[Dict[str, str]]] = ""
             if conversation and "content" in conversation[-1] and conversation[
                     -1].get("role") == role:
                 last_msg_content = conversation[-1]["content"] or ""
+            if isinstance(last_msg_content, list):
+                last_msg_content = "\n".join(msg['text']
+                                             for msg in last_msg_content)
 
             for choice in choices:
                 full_message = last_msg_content + (choice.message.content

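The echo/`continue_final_message` path now has to cope with both content shapes. A minimal sketch of the same normalization in isolation (`flatten_last_message` is a standalone helper written for this illustration, not the code above):

```python
from typing import Dict, List, Union


def flatten_last_message(content: Union[str, List[Dict[str, str]], None]) -> str:
    """Join list-form ("openai") content into a single string, leave plain
    strings untouched, and treat missing content as empty, mirroring the
    last_msg_content handling in the diff above."""
    content = content or ""
    if isinstance(content, list):
        return "\n".join(part["text"] for part in content)
    return content


print(flatten_last_message([{"type": "text", "text": "What about this one?"}]))
print(flatten_last_message("Some stuff."))
```
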