Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""An example showing how to use vLLM to serve multimodal models
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.

Launch the vLLM server with the following command:
Expand All @@ -12,12 +12,18 @@
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'

(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
Comment on lines 15 to 16
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to add --trust-remote-code otherwise the model wasn't downloaded.


run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add cmd to run so user doesn't have to figure it out

"""

import base64

import requests
from openai import OpenAI
from utils import get_first_model

from vllm.utils import FlexibleArgumentParser

Expand All @@ -31,9 +37,6 @@
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id


def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
Expand All @@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:


# Text-only inference
def run_text_only() -> None:
def run_text_only(model: str) -> None:
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
Expand All @@ -61,7 +64,7 @@ def run_text_only() -> None:


# Single-image input inference
def run_single_image() -> None:
def run_single_image(model: str) -> None:

## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
Expand Down Expand Up @@ -117,7 +120,7 @@ def run_single_image() -> None:


# Multi-image input inference
def run_multi_image() -> None:
def run_multi_image(model: str) -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
Expand Down Expand Up @@ -152,7 +155,7 @@ def run_multi_image() -> None:


# Video input inference
def run_video() -> None:
def run_video(model: str) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)

Expand Down Expand Up @@ -208,7 +211,7 @@ def run_video() -> None:


# Audio input inference
def run_audio() -> None:
def run_audio(model: str) -> None:
from vllm.assets.audio import AudioAsset

audio_url = AudioAsset("winning_call").url
Expand Down Expand Up @@ -318,7 +321,8 @@ def parse_args():

def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
model = get_first_model(client)
example_function_map[chat_type](model)


if __name__ == "__main__":
Expand Down
25 changes: 25 additions & 0 deletions examples/online_serving/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model


def get_first_model(client: OpenAI) -> str:
    """Return the ID of the first model served by the vLLM server.

    Args:
        client: An ``OpenAI`` client already configured with the server's
            base URL and API key.

    Returns:
        The ID of the first model in the server's model list.

    Raises:
        RuntimeError: If the server cannot be reached, or if it reports
            an empty model list.
    """
    try:
        models: SyncPage[Model] = client.models.list()
    except APIConnectionError as e:
        # NOTE: do not echo client.api_key here — interpolating the key
        # into the error message would leak a secret into logs/tracebacks.
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct") from e

    # An empty `data` list means the server is up but serves no models.
    if not models.data:
        raise RuntimeError(
            f"No models found on the vLLM server at {client.base_url}")

    return models.data[0].id