
Commit 5231f08

[Frontend][VLM] Add support for multiple multi-modal items (#8049)
1 parent 8423aef commit 5231f08

File tree

8 files changed: +524 −136 lines changed
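The feature itself is simple to state: a single prompt may now carry several multi-modal items, and the server enforces a per-modality cap configured with the new `--limit-mm-per-prompt` flag, rejecting over-limit requests with a 400 error (the updated test below exercises exactly this). A minimal sketch of that counting rule follows; the function and all names in it are hypothetical, not vLLM's internal API — only the flag and the reject-over-limit behavior are evidenced by the diffs below.

```python
# Hypothetical sketch of the per-prompt multi-modal cap this commit enables.
# Only --limit-mm-per-prompt and "reject with a 400 error" are shown by the
# diffs below; the names here are illustrative.
from collections import Counter
from typing import List, Mapping


def check_mm_limits(content: List[dict],
                    limit_per_prompt: Mapping[str, int]) -> None:
    """Raise if a message carries more items of any modality than allowed
    (the server surfaces this as an HTTP 400 / BadRequestError)."""
    counts = Counter(part["type"] for part in content
                     if part["type"] != "text")
    for modality, count in counts.items():
        allowed = limit_per_prompt.get(modality, 1)
        if count > allowed:
            raise ValueError(
                f"got {count} {modality} items, but the limit is {allowed}")


# With a limit of 2 images, two image parts pass and a third is rejected.
limit = {"image_url": 2}
two_images = [{"type": "image_url"}, {"type": "image_url"}]
check_mm_limits(two_images, limit)  # OK
try:
    check_mm_limits(two_images + [{"type": "image_url"}], limit)
except ValueError as e:
    print(e)  # got 3 image_url items, but the limit is 2
```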

.buildkite/test-pipeline.yaml (+1)

```diff
@@ -90,6 +90,7 @@ steps:
   - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py  # it needs a clean process
   - pytest -v -s entrypoints/openai
+  - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Distributed Tests (4 GPUs) # 10min
   working_dir: "/vllm-workspace/tests"
```

examples/openai_vision_api_client.py (+39)

```diff
@@ -1,7 +1,13 @@
 """An example showing how to use vLLM to serve VLMs.
 
 Launch the vLLM server with the following command:
+
+(single image inference with Llava)
 vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
+
+(multi-image inference with Phi-3.5-vision-instruct)
+vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
+    --trust-remote-code --limit-mm-per-prompt image=2
 """
 import base64
 
@@ -84,3 +90,36 @@ def encode_image_base64_from_url(image_url: str) -> str:
 
 result = chat_completion_from_base64.choices[0].message.content
 print(f"Chat completion output:{result}")
+
+# Multi-image input inference
+image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
+image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What are the animals in these images?"
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url_duck
+                },
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url_lion
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print(f"Chat completion output:{result}")
```

tests/entrypoints/openai/test_serving_chat.py (+2)

```diff
@@ -3,6 +3,7 @@
 from dataclasses import dataclass
 from unittest.mock import MagicMock
 
+from vllm.config import MultiModalConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -20,6 +21,7 @@ class MockModelConfig:
     max_model_len = 100
     tokenizer_revision = None
     embedding_mode = False
+    multimodal_config = MultiModalConfig()
 
 
 @dataclass
```
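The mock gains a `multimodal_config` because the chat frontend now consults the model's multi-modal settings while parsing requests. As a hedged aside, the per-prompt cap would plausibly be expressed through this config; the `limit_per_prompt` field name below mirrors the CLI flag and is an assumption on my part — the diff itself only attaches a default `MultiModalConfig()`.

```python
# Hypothetical sketch: expressing "at most 2 images per prompt" via
# MultiModalConfig. The limit_per_prompt field name is assumed from the
# --limit-mm-per-prompt flag; this diff only shows the default config
# being attached to the mock.
from vllm.config import MultiModalConfig

mm_config = MultiModalConfig(limit_per_prompt={"image": 2})
print(mm_config.limit_per_prompt.get("image", 1))  # -> 2
```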

tests/entrypoints/openai/test_vision.py (+36 −35)

```diff
@@ -6,11 +6,10 @@
 
 from vllm.multimodal.utils import encode_image_base64, fetch_image
 
-from ...utils import VLLM_PATH, RemoteOpenAIServer
+from ...utils import RemoteOpenAIServer
 
-MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-LLAVA_CHAT_TEMPLATE = VLLM_PATH / "examples/template_llava.jinja"
-assert LLAVA_CHAT_TEMPLATE.exists()
+MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
+MAXIMUM_IMAGES = 2
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
@@ -24,13 +23,9 @@
 @pytest.fixture(scope="module")
 def server():
     args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "4096",
-        "--enforce-eager",
-        "--chat-template",
-        str(LLAVA_CHAT_TEMPLATE),
+        "--dtype", "bfloat16", "--max-model-len", "4096", "--max-num-seqs",
+        "5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt",
+        f"image={MAXIMUM_IMAGES}"
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -84,7 +79,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+        completion_tokens=10, prompt_tokens=772, total_tokens=782)
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -139,7 +134,7 @@ async def test_single_chat_session_image_base64encoded(
     choice = chat_completion.choices[0]
     assert choice.finish_reason == "length"
     assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+        completion_tokens=10, prompt_tokens=772, total_tokens=782)
 
     message = choice.message
     message = chat_completion.choices[0].message
@@ -217,47 +212,53 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize(
+    "image_urls",
+    [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
 async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
-                                 image_url: str):
+                                 image_urls: List[str]):
 
     messages = [{
         "role":
         "user",
         "content": [
-            {
-                "type": "image_url",
-                "image_url": {
-                    "url": image_url
-                }
-            },
-            {
+            *({
                 "type": "image_url",
                 "image_url": {
                     "url": image_url
                 }
-            },
+            } for image_url in image_urls),
             {
                 "type": "text",
                 "text": "What's in this image?"
            },
         ],
     }]
 
-    with pytest.raises(openai.BadRequestError):  # test multi-image input
-        await client.chat.completions.create(
+    if len(image_urls) > MAXIMUM_IMAGES:
+        with pytest.raises(openai.BadRequestError):  # test multi-image input
+            await client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                max_tokens=10,
+                temperature=0.0,
+            )
+
+        # the server should still work afterwards
+        completion = await client.completions.create(
+            model=model_name,
+            prompt=[0, 0, 0, 0, 0],
+            max_tokens=5,
+            temperature=0.0,
+        )
+        completion = completion.choices[0].text
+        assert completion is not None and len(completion) >= 0
+    else:
+        chat_completion = await client.chat.completions.create(
             model=model_name,
             messages=messages,
             max_tokens=10,
            temperature=0.0,
        )
-
-    # the server should still work afterwards
-    completion = await client.completions.create(
-        model=model_name,
-        prompt=[0, 0, 0, 0, 0],
-        max_tokens=5,
-        temperature=0.0,
-    )
-    completion = completion.choices[0].text
-    assert completion is not None and len(completion) >= 0
+        message = chat_completion.choices[0].message
+        assert message.content is not None and len(message.content) >= 0
```
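A note on the `*( ... for ... )` construct in the rewritten test: unpacking a generator inside a list literal splices a variable number of image parts ahead of the fixed text part, which is what lets one parametrized test cover 2, 3, or more images. A standalone illustration (the example.com URLs are placeholders):

```python
# Standalone illustration of the generator-unpacking pattern used in the
# test above: a variable number of image parts followed by one text part.
image_urls = ["https://example.com/a.jpg", "https://example.com/b.jpg"]

content = [
    *({
        "type": "image_url",
        "image_url": {"url": url},
    } for url in image_urls),
    {"type": "text", "text": "What's in this image?"},
]

assert len(content) == len(image_urls) + 1  # N image parts + 1 text part
assert content[-1]["type"] == "text"
```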
