diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 24fa83df7d75..07ecec790609 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -89,8 +89,8 @@ Alongside each architecture, we include some popular models that use it. - ✅︎ * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - - :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc. - - + - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. + - * - :code:`MiniCPMForCausalLM` - MiniCPM - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 52afda747aab..da7269acede2 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -54,3 +54,44 @@ For now, we only support a single image per text prompt. To pass an image to the print(generated_text) A code example can be found in `examples/llava_example.py `_. + +OpenAI-Compatible Server +------------------------ + +We support image inputs to the OpenAI Chat API, as described in `GPT-4 with Vision `_. + +Here is a simple example using the :code:`openai` package: + +.. code-block:: python + + from openai import OpenAI + + openai_api_key = "EMPTY" + openai_api_base = "http://localhost:8000/v1" + + client = OpenAI( + api_key=openai_api_key, + base_url=openai_api_base, + ) + + # Note that this model expects the image to come before the main text + chat_response = client.chat.completions.create( + model="llava-hf/llava-1.5-7b-hf", + messages=[{ + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + }, + }, + {"type": "text", "text": "What's in this image?"}, + ], + }], + ) + print("Chat response:", chat_response) + +.. note:: + + For now, we only support a single image per API call. Also, the ``detail`` parameter is ignored since it may not be applicable to other models. 
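Images can also be passed inline as base64 ``data:`` URLs, which the server decodes locally through the new :code:`get_image` helper instead of fetching over HTTP (the unit tests in :code:`tests/test_utils.py` exercise exactly this path). A minimal client-side sketch, assuming a hypothetical local file :code:`example.jpg` and the same server and model as above:

.. code-block:: python

    import base64

    from openai import OpenAI

    client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

    # Encode a local image (hypothetical path) into a base64 data URL;
    # the server-side get_image() helper handles the "data:" scheme.
    with open("example.jpg", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode("utf-8")

    chat_response = client.chat.completions.create(
        model="llava-hf/llava-1.5-7b-hf",
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_b64}",
                    },
                },
                {"type": "text", "text": "What's in this image?"},
            ],
        }],
    )
    print("Chat response:", chat_response)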
diff --git a/examples/template_llava.jinja b/examples/template_llava.jinja new file mode 100644 index 000000000000..16b385465e69 --- /dev/null +++ b/examples/template_llava.jinja @@ -0,0 +1,11 @@ +{%- for message in messages -%} + {{ message['role'].upper() + ': ' + message['content'] }} + {%- if (loop.last and add_generation_prompt) or not loop.last -%} + {{- '\n' -}} + {%- endif -%} +{%- endfor -%} + + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- 'ASSISTANT:' -}} +{% endif %} \ No newline at end of file diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 972137030f46..f0d24926f75d 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -558,50 +558,52 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI, ) async def test_batch_completions(server, client: openai.AsyncOpenAI, model_name: str): - # test simple list - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - ) - assert len(batch.choices) == 2 - assert batch.choices[0].text == batch.choices[1].text - - # test n = 2 - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - n=2, - max_tokens=5, - temperature=0.0, - extra_body=dict( - # NOTE: this has to be true for n > 1 in vLLM, but not necessary - # for official client. - use_beam_search=True), - ) - assert len(batch.choices) == 4 - assert batch.choices[0].text != batch.choices[ - 1].text, "beam search should be different" - assert batch.choices[0].text == batch.choices[ - 2].text, "two copies of the same prompt should be the same" - assert batch.choices[1].text == batch.choices[ - 3].text, "two copies of the same prompt should be the same" - - # test streaming - batch = await client.completions.create( - model=model_name, - prompt=["Hello, my name is", "Hello, my name is"], - max_tokens=5, - temperature=0.0, - stream=True, - ) - texts = [""] * 2 - async for chunk in batch: - assert len(chunk.choices) == 1 - choice = chunk.choices[0] - texts[choice.index] += choice.text - assert texts[0] == texts[1] + # test using text and token IDs + for prompts in (["Hello, my name is"] * 2, [[0, 0, 0, 0, 0]] * 2): + # test simple list + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + ) + assert len(batch.choices) == 2 + assert batch.choices[0].text == batch.choices[1].text + + # test n = 2 + batch = await client.completions.create( + model=model_name, + prompt=prompts, + n=2, + max_tokens=5, + temperature=0.0, + extra_body=dict( + # NOTE: this has to be true for n > 1 in vLLM, but not necessary + # for official client. 
+ use_beam_search=True), + ) + assert len(batch.choices) == 4 + assert batch.choices[0].text != batch.choices[ + 1].text, "beam search should be different" + assert batch.choices[0].text == batch.choices[ + 2].text, "two copies of the same prompt should be the same" + assert batch.choices[1].text == batch.choices[ + 3].text, "two copies of the same prompt should be the same" + + # test streaming + batch = await client.completions.create( + model=model_name, + prompt=prompts, + max_tokens=5, + temperature=0.0, + stream=True, + ) + texts = [""] * 2 + async for chunk in batch: + assert len(chunk.choices) == 1 + choice = chunk.choices[0] + texts[choice.index] += choice.text + assert texts[0] == texts[1] @pytest.mark.asyncio @@ -1047,7 +1049,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI, prompt_text = tokenizer.decode(prompt) if isinstance(prompt, list) else prompt assert (completion.choices[0].text is not None - and re.search(r"^" + prompt_text, completion.choices[0].text)) + and completion.choices[0].text.startswith(prompt_text)) logprobs = completion.choices[0].logprobs assert logprobs is not None assert len(logprobs.text_offset) > 5 diff --git a/tests/entrypoints/test_openai_server_vision.py b/tests/entrypoints/test_openai_server_vision.py new file mode 100644 index 000000000000..81ea5e8de6c5 --- /dev/null +++ b/tests/entrypoints/test_openai_server_vision.py @@ -0,0 +1,176 @@ +from pathlib import Path + +import openai # use the official client for correctness check +import pytest +# using Ray for overall ease of process management, parallel requests, +# and debugging. +import ray + +from ..utils import ServerRunner + +MODEL_NAME = "llava-hf/llava-1.5-7b-hf" +CHAT_TEMPLATE = (Path(__file__).parent.parent.parent / + "examples/template_llava.jinja") +assert CHAT_TEMPLATE.exists() + +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + +pytestmark = pytest.mark.openai + + +@pytest.fixture(scope="module") +def server(): + ray.init() + server_runner = ServerRunner.remote([ + "--model", + MODEL_NAME, + # use half precision for speed and memory savings in CI environment + "--dtype", + "bfloat16", + "--max-model-len", + "4096", + "--enforce-eager", + # vision language config below + "--image-input-type", + "pixel_values", + "--image-token-id", + "32000", + "--image-input-shape", + "1,3,336,336", + "--image-feature-size", + "576", + # chat template required for LLaVA + "--chat-template", + str(CHAT_TEMPLATE), + ]) + ray.get(server_runner.ready.remote()) + yield server_runner + ray.shutdown() + + +@pytest.fixture(scope="session") +def client(): + client = openai.AsyncOpenAI( + base_url="http://localhost:8000/v1", + api_key="token-abc123", + ) + yield client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_single_chat_session_image(server, client: openai.AsyncOpenAI, + model_name: str, image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + 
"image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create(model=model_name, + messages=messages, + max_tokens=10, + logprobs=True, + top_logprobs=5) + assert chat_completion.id is not None + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=594, total_tokens=604) + + message = choice.message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +async def test_chat_streaming_image(server, client: openai.AsyncOpenAI, + model_name: str, image_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_tokens=10, + temperature=0.0, + stream=True, + ) + chunks = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/test_utils.py b/tests/test_utils.py index a6c3896fa43b..d8afeddfc89b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,16 @@ import asyncio +import base64 +import mimetypes import sys -from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol, - Tuple, TypeVar) +from tempfile import NamedTemporaryFile +from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Dict, + Protocol, Tuple, TypeVar) +import numpy as np import pytest +from PIL import Image -from vllm.utils import deprecate_kwargs, merge_async_iterators +from vllm.utils import deprecate_kwargs, get_image, merge_async_iterators from .utils import error_on_warning @@ -60,6 +65,67 @@ async def stream_output(generator: AsyncIterator[Tuple[int, str]]): raise AssertionError() from e +# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) +TEST_IMAGE_URLS = [ + 
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", + "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png", + "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png", + "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png", +] + + +@pytest.fixture(scope="session") +def url_images() -> Dict[str, Image.Image]: + return {image_url: get_image(image_url) for image_url in TEST_IMAGE_URLS} + + +def get_supported_suffixes() -> Tuple[str, ...]: + # We should at least test the file types mentioned in GPT-4 with Vision + OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif') + + # Additional file types that are supported by us + EXTRA_SUPPORTED_SUFFIXES = ('.bmp', '.tiff') + + return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES + + +def _image_equals(a: Image.Image, b: Image.Image) -> bool: + return (np.asarray(a) == np.asarray(b.convert(a.mode))).all() + + +@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) +@pytest.mark.parametrize("suffix", get_supported_suffixes()) +def test_get_image_base64(url_images: Dict[str, Image.Image], image_url: str, + suffix: str): + url_image = url_images[image_url] + + try: + mime_type = Image.MIME[Image.registered_extensions()[suffix]] + except KeyError: + try: + mime_type = mimetypes.types_map[suffix] + except KeyError: + pytest.skip('No MIME type') + + with NamedTemporaryFile(suffix=suffix) as f: + try: + url_image.save(f.name) + except Exception as e: + if e.args[0] == 'cannot write mode RGBA as JPEG': + pytest.skip('Conversion not supported') + + raise + + base64_image = base64.b64encode(f.read()).decode("utf-8") + data_url = f"data:{mime_type};base64,{base64_image}" + + with get_image(data_url) as data_image: + if _image_equals(url_image, Image.open(f)): + assert _image_equals(url_image, data_image) + else: + pass # Lossy format; only check that image can be opened + + def test_deprecate_kwargs_always(): @deprecate_kwargs("old_arg", is_deprecated=True) diff --git a/vllm/config.py b/vllm/config.py index eee62d268383..b67e6490ca3c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,10 +1,11 @@ import enum import json from dataclasses import dataclass, field, fields -from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Union +from typing import (TYPE_CHECKING, ClassVar, Dict, List, Optional, Protocol, + Tuple, Union) import torch -from transformers import PretrainedConfig +from transformers import PretrainedConfig, PreTrainedTokenizerBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS @@ -1065,6 +1066,44 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): "LoRA is enabled.") +class OpenAIVisionAdapter(Protocol): + + def get_image_token_text(self, config: "VisionLanguageConfig", + tokenizer: PreTrainedTokenizerBase, + image_idx: int) -> str: + """Defines how to represent an image in the text prompt.""" + ... 
+ + +class OpenAIVisionAdapterForNoImage(OpenAIVisionAdapter): + + def get_image_token_text(self, config: "VisionLanguageConfig", + tokenizer: PreTrainedTokenizerBase, + image_idx: int) -> str: + raise NotImplementedError("Image input not supported") + + +class OpenAIVisionAdapterForSingleImage(OpenAIVisionAdapter): + + def get_image_token_text(self, config: "VisionLanguageConfig", + tokenizer: PreTrainedTokenizerBase, + image_idx: int) -> str: + if image_idx > 0: + raise NotImplementedError("Multiple image input not supported") + + image_token_str = tokenizer.decode(config.image_token_id) + return image_token_str * config.image_feature_size + + +class OpenAIVisionAdapterForMultiImage(OpenAIVisionAdapter): + + def get_image_token_text(self, config: "VisionLanguageConfig", + tokenizer: PreTrainedTokenizerBase, + image_idx: int) -> str: + image_token_str = tokenizer.decode(config.image_token_id + image_idx) + return image_token_str * config.image_feature_size + + @dataclass class VisionLanguageConfig: """Configs the input data format and how models should run for @@ -1086,6 +1125,14 @@ class ImageInputType(enum.Enum): PIXEL_VALUES = enum.auto() IMAGE_FEATURES = enum.auto() + class ImageOpenAI(enum.Enum): + """Specifies how the model implements + `OpenAI's GPT-4 with Vision API `_. + """ + UNSUPPORTED = enum.auto() + SINGLE_IMAGE = enum.auto() + MULTI_IMAGE = enum.auto() + image_input_type: ImageInputType # The input id corresponding to image token. image_token_id: int @@ -1098,6 +1145,14 @@ class ImageInputType(enum.Enum): image_processor: Optional[str] image_processor_revision: Optional[str] + image_openai: ImageOpenAI = ImageOpenAI.SINGLE_IMAGE + _image_openai_processors: ClassVar[Dict[ + ImageOpenAI, OpenAIVisionAdapter]] = { + ImageOpenAI.UNSUPPORTED: OpenAIVisionAdapterForNoImage(), + ImageOpenAI.SINGLE_IMAGE: OpenAIVisionAdapterForSingleImage(), + ImageOpenAI.MULTI_IMAGE: OpenAIVisionAdapterForMultiImage(), + } + @classmethod def get_image_input_enum_type(cls, value: str) -> ImageInputType: """Get the image input type from a string.""" @@ -1108,6 +1163,21 @@ def get_image_input_enum_type(cls, value: str) -> ImageInputType: f"Expecting to choose from " f"{[x.name for x in cls.ImageInputType]}.") from e + @classmethod + def get_image_openai_enum_type(cls, value: str) -> ImageOpenAI: + """Get the GPT-4 with Vision API implementation from a string.""" + try: + return cls.ImageOpenAI[value.upper()] + except KeyError as e: + raise ValueError(f"{value} is not a valid choice. 
" + f"Expecting to choose from " + f"{[x.name for x in cls.ImageOpenAI]}.") from e + + def get_image_token_text(self, tokenizer: PreTrainedTokenizerBase, + image_idx: int) -> str: + return self._image_openai_processors[self.image_openai] \ + .get_image_token_text(self, tokenizer, image_idx) + _STR_DTYPE_TO_TORCH_DTYPE = { "half": torch.float16, diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b315d4d2ece2..cc1d82ffa9e2 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -84,6 +84,7 @@ class EngineArgs: image_processor: Optional[str] = None image_processor_revision: Optional[str] = None disable_image_processor: bool = False + image_openai: str = VisionLanguageConfig.ImageOpenAI.SINGLE_IMAGE.name scheduler_delay_factor: float = 0.0 enable_chunked_prefill: bool = False @@ -129,6 +130,12 @@ def add_cli_args_for_vlm( type=int, default=None, help=('The image feature size along the context dimension.')) + parser.add_argument( + '--image-openai', + type=str, + default=VisionLanguageConfig.ImageOpenAI.SINGLE_IMAGE.name.lower(), + choices=[t.name.lower() for t in VisionLanguageConfig.ImageOpenAI], + help=('Specifies how the model implements GPT-4 with Vision API.')) parser.add_argument( '--image-processor', type=str, @@ -718,6 +725,8 @@ def create_engine_config(self, ) -> EngineConfig: image_feature_size=self.image_feature_size, image_processor=self.image_processor, image_processor_revision=self.image_processor_revision, + image_openai=VisionLanguageConfig.get_image_openai_enum_type( + self.image_openai), ) else: vision_language_config = None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index cc5b896e0e56..723542828786 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,15 +1,16 @@ import codecs import time from dataclasses import dataclass -from typing import (AsyncGenerator, AsyncIterator, Dict, Iterable, List, - Optional) +from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, + List, Optional) from typing import Sequence as GenericSequence from typing import TypedDict, Union, cast, final from fastapi import Request -from openai.types.chat import ChatCompletionContentPartTextParam +from openai.types.chat import (ChatCompletionContentPartImageParam, + ChatCompletionContentPartTextParam) -from vllm.config import ModelConfig +from vllm.config import ModelConfig, VisionLanguageConfig from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import ( ChatCompletionContentPartParam, ChatCompletionLogProb, @@ -20,12 +21,14 @@ UsageInfo) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing) +from vllm.inputs import PromptInputs from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) +from vllm.multimodal.image import ImagePixelData from vllm.outputs import RequestOutput from vllm.sequence import Logprob -from vllm.utils import random_uuid +from vllm.utils import get_image_async, random_uuid logger = init_logger(__name__) @@ -39,6 +42,7 @@ class ConversationMessage(TypedDict): @dataclass(frozen=True) class ChatMessageParseResult: messages: List[ConversationMessage] + image_futures: List[Awaitable[ImagePixelData]] class OpenAIServingChat(OpenAIServing): @@ -58,6 +62,10 @@ def __init__(self, self.response_role = response_role self._load_chat_template(chat_template) + async def _get_and_parse_image(self, 
image_url: str) -> ImagePixelData: + with await get_image_async(image_url) as image: + return ImagePixelData(image) + def _load_chat_template(self, chat_template: Optional[str]): tokenizer = self.tokenizer @@ -92,20 +100,43 @@ def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], ) -> ChatMessageParseResult: + tokenizer = self.tokenizer + + vlm_config = getattr(self.engine.engine, "vision_language_config", + None) + texts: List[str] = [] + image_futures: List[Awaitable[ImagePixelData]] = [] - for _, part in enumerate(parts): + for i, part in enumerate(parts): part_type = part["type"] if part_type == "text": text = cast(ChatCompletionContentPartTextParam, part)["text"] texts.append(text) + elif part_type == "image_url": + if not isinstance(vlm_config, VisionLanguageConfig): + raise ValueError("GPT-4 with Vision API is only supported " + "for vision language models.") + + image_url = cast(ChatCompletionContentPartImageParam, + part)["image_url"] + if image_url.get("detail", "auto") != "auto": + logger.info("content[%s].image_url.detail is ignored", i) + + text = vlm_config.get_image_token_text( + tokenizer, image_idx=len(image_futures)) + image_future = self._get_and_parse_image(image_url["url"]) + + texts.append(text) + image_futures.append(image_future) else: raise NotImplementedError(f"Unknown part type: {part_type}") messages = [ConversationMessage(role=role, content="\n".join(texts))] - return ChatMessageParseResult(messages=messages) + return ChatMessageParseResult(messages=messages, + image_futures=image_futures) def _parse_chat_message_content( self, @@ -115,10 +146,10 @@ def _parse_chat_message_content( content = message.get("content") if content is None: - return ChatMessageParseResult(messages=[]) + return ChatMessageParseResult(messages=[], image_futures=[]) if isinstance(content, str): messages = [ConversationMessage(role=role, content=content)] - return ChatMessageParseResult(messages=messages) + return ChatMessageParseResult(messages=messages, image_futures=[]) return self._parse_chat_message_content_parts(role, content) @@ -143,11 +174,13 @@ async def create_chat_completion( try: conversation: List[ConversationMessage] = [] + multi_modal_futures: List[Awaitable[ImagePixelData]] = [] for msg in request.messages: parsed_msg = self._parse_chat_message_content(msg) conversation.extend(parsed_msg.messages) + multi_modal_futures.extend(parsed_msg.image_futures) prompt = self.tokenizer.apply_chat_template( conversation=conversation, @@ -158,6 +191,18 @@ async def create_chat_completion( logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) + try: + if len(multi_modal_futures) == 0: + multi_modal_data = None + elif len(multi_modal_futures) == 1: + multi_modal_data = await multi_modal_futures[0] + else: + # multi_modal_datas = await asyncio.gather(*multi_modal_futures) + raise NotImplementedError("Multiple image input not supported") + except Exception as e: + logger.error("Error in loading multi-modal data: %s", e) + return self.create_error_response(str(e)) + request_id = f"cmpl-{random_uuid()}" try: # Tokenize/detokenize depending on prompt format (string/token list) @@ -177,18 +222,24 @@ async def create_chat_completion( sampling_params.logits_processors = [] sampling_params.logits_processors.append( guided_decode_logits_processor) + + inputs: PromptInputs = { + "prompt": prompt_text, + "prompt_token_ids": prompt_ids, + } + if multi_modal_data is not None: + 
inputs["multi_modal_data"] = multi_modal_data + + result_generator = self.engine.generate( + inputs, + sampling_params, + request_id, + lora_request, + ) except ValueError as e: + # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) - result_generator = self.engine.generate( - { - "prompt": prompt_text, - "prompt_token_ids": prompt_ids - }, - sampling_params, - request_id, - lora_request, - ) # Streaming response if request.stream: return self.chat_completion_stream_generator( @@ -219,10 +270,11 @@ async def chat_completion_stream_generator( first_iteration = True # Send response for each token for each request.n (index) - assert request.n is not None - previous_texts = [""] * request.n - previous_num_tokens = [0] * request.n - finish_reason_sent = [False] * request.n + num_choices = 1 if request.n is None else request.n + previous_texts = [""] * num_choices + previous_num_tokens = [0] * num_choices + finish_reason_sent = [False] * num_choices + try: async for res in result_generator: # We need to do it here, because if there are exceptions in @@ -232,7 +284,7 @@ async def chat_completion_stream_generator( # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) - for i in range(request.n): + for i in range(num_choices): choice_data = ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage(role=role), @@ -257,19 +309,19 @@ async def chat_completion_stream_generator( last_msg_content = conversation[-1]["content"] if last_msg_content: - for i in range(request.n): + for i in range(num_choices): choice_data = ( ChatCompletionResponseStreamChoice( index=i, delta=DeltaMessage( content=last_msg_content), + logprobs=None, finish_reason=None)) chunk = ChatCompletionStreamResponse( id=request_id, object=chunk_object_type, created=created_time, choices=[choice_data], - logprobs=None, model=model_name) data = chunk.model_dump_json( exclude_unset=True) @@ -365,7 +417,7 @@ async def chat_completion_full_generator( final_res = res assert final_res is not None - choices = [] + choices: List[ChatCompletionResponseChoice] = [] role = self.get_chat_request_role(request) for output in final_res.outputs: diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2fb122edaf98..572878b5527d 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -312,7 +312,7 @@ def request_output_to_completion_response( elif request.echo and request.max_tokens > 0: token_ids = prompt_token_ids + output.token_ids top_logprobs = (prompt_logprobs + output.logprobs - if request.logprobs else None) + if request.logprobs is not None else None) output_text = prompt_text + output.text else: token_ids = output.token_ids diff --git a/vllm/utils.py b/vllm/utils.py index 2781eceb7ba9..fa638c715be8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -16,10 +16,14 @@ from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic, Hashable, List, Optional, OrderedDict, Tuple, TypeVar, Union) +from urllib.parse import urlparse +from urllib.request import urlopen import numpy as np import psutil +import requests import torch +from PIL import Image import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger @@ -280,6 +284,30 @@ def get_ip() -> str: return "0.0.0.0" +def get_image(url: str) -> Image.Image: + """ + Retrieve an image from a data URL or an online resource. 
+
+    The returned image should be used as a context manager to ensure
+    proper disposal of the underlying buffer.
+    """
+    # Avoid circular import
+    from vllm import __version__ as VLLM_VERSION
+
+    url_components = urlparse(url)
+    if url_components.scheme == 'data':
+        return Image.open(urlopen(url))
+
+    headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+    response = requests.get(url, headers=headers, stream=True)
+    response.raise_for_status()
+
+    return Image.open(response.raw)
+
+
+get_image_async = make_async(get_image)
+
+
 def get_distributed_init_method(ip: str, port: int) -> str:
     # Brackets are not permitted in ipv4 addresses,
     # see https://github.com/python/cpython/issues/103848
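For reference, here is a small usage sketch of the new helpers (the remote URL is one of the Wikimedia test images used above; the local file path is hypothetical). :code:`get_image` fetches HTTP(S) URLs with a vLLM ``User-Agent`` header and decodes ``data:`` URLs locally, while :code:`get_image_async` is the same function wrapped with :code:`make_async` for use on the async server path:

.. code-block:: python

    import base64

    from vllm.utils import get_image

    # Remote image: fetched with requests and opened lazily from the
    # response stream; the context manager releases the underlying buffer.
    url = ("https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/"
           "Gfp-wisconsin-madison-the-nature-boardwalk.jpg/"
           "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg")
    with get_image(url) as image:
        print(image.size, image.mode)

    # Base64 data URL: decoded locally, no network request is made.
    with open("example.png", "rb") as f:  # hypothetical local file
        data = base64.b64encode(f.read()).decode("utf-8")
    with get_image(f"data:image/png;base64,{data}") as image:
        print(image.size, image.mode)

    # In async code (e.g. the OpenAI server), use the wrapped variant:
    #     image = await get_image_async(url)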