Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
"""An example showing how to use vLLM to serve multimodal models
"""An example showing how to use vLLM to serve multimodal models
and run online serving with OpenAI client.

Launch the vLLM server with the following command:
Expand All @@ -12,12 +12,18 @@
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}'

(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \
--max-model-len 4096 --trust-remote-code
Comment on lines 15 to 16
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to add --trust-remote-code otherwise the model wasn't downloaded.


run the script with
python openai_chat_completion_client_for_multimodal.py --chat-type audio
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add cmd to run so user doesn't have to figure it out

"""

import base64

import requests
from openai import OpenAI
from utils import get_first_model

from vllm.utils import FlexibleArgumentParser

Expand All @@ -31,9 +37,6 @@
base_url=openai_api_base,
)

models = client.models.list()
model = models.data[0].id


def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
Expand All @@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str:


# Text-only inference
def run_text_only() -> None:
def run_text_only(model: str) -> None:
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
Expand All @@ -61,7 +64,7 @@ def run_text_only() -> None:


# Single-image input inference
def run_single_image() -> None:
def run_single_image(model: str) -> None:

## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
Expand Down Expand Up @@ -117,7 +120,7 @@ def run_single_image() -> None:


# Multi-image input inference
def run_multi_image() -> None:
def run_multi_image(model: str) -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
Expand Down Expand Up @@ -152,7 +155,7 @@ def run_multi_image() -> None:


# Video input inference
def run_video() -> None:
def run_video(model: str) -> None:
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
video_base64 = encode_base64_content_from_url(video_url)

Expand Down Expand Up @@ -208,7 +211,7 @@ def run_video() -> None:


# Audio input inference
def run_audio() -> None:
def run_audio(model: str) -> None:
from vllm.assets.audio import AudioAsset

audio_url = AudioAsset("winning_call").url
Expand Down Expand Up @@ -318,7 +321,8 @@ def parse_args():

def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
model = get_first_model(client)
example_function_map[chat_type](model)


if __name__ == "__main__":
Expand Down
25 changes: 25 additions & 0 deletions examples/online_serving/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
from openai import APIConnectionError, OpenAI
from openai.pagination import SyncPage
from openai.types.model import Model


def get_first_model(client: OpenAI) -> str:
    """Return the ID of the first model served by the vLLM server.

    Args:
        client: An ``OpenAI`` client already configured with the server's
            base URL and API key.

    Returns:
        The ID of the first model in the server's model list.

    Raises:
        RuntimeError: If the server cannot be reached, or if it reports
            an empty model list.
    """
    try:
        models: SyncPage[Model] = client.models.list()
    except APIConnectionError as e:
        # NOTE: do not echo client.api_key here — interpolating the key
        # into the error message would leak a secret into logs/tracebacks.
        raise RuntimeError(
            "Failed to get the list of models from the vLLM server at "
            f"{client.base_url}. Check\n"
            "1. the server is running\n"
            "2. the server URL is correct\n"
            "3. the API key is correct") from e

    # An empty `data` list means the server is up but serves no models.
    if not models.data:
        raise RuntimeError(
            f"No models found on the vLLM server at {client.base_url}")

    return models.data[0].id