diff --git a/docs/source/en/tasks/image_text_to_text.md b/docs/source/en/tasks/image_text_to_text.md
index 28bd98457ee0..a98b1e5509cc 100644
--- a/docs/source/en/tasks/image_text_to_text.md
+++ b/docs/source/en/tasks/image_text_to_text.md
@@ -160,7 +160,50 @@ outputs[0]["generated_text"]
 # with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
 ```
 
-## Streaming
+If you prefer, you can also load the images separately and pass them to the pipeline like so:
+
+```python
+pipe = pipeline("image-text-to-text", model="HuggingFaceTB/SmolVLM-256M-Instruct")
+
+img_urls = [
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
+]
+images = [
+    Image.open(requests.get(img_urls[0], stream=True).raw),
+    Image.open(requests.get(img_urls[1], stream=True).raw),
+]
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "image"},
+            {"type": "text", "text": "What do you see in these images?"},
+        ],
+    }
+]
+outputs = pipe(text=messages, images=images, max_new_tokens=50, return_full_text=False)
+outputs[0]["generated_text"]
+" In the first image, there are two cats sitting on a plant. In the second image, there are flowers with a pinkish hue."
+```
+
+The images will still be included in the `"input_text"` field of the output:
+
+```python
+outputs[0]['input_text']
+"""
+[{'role': 'user',
+  'content': [{'type': 'image',
+    'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=622x412>},
+   {'type': 'image',
+    'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=5184x3456>},
+   {'type': 'text', 'text': 'What do you see in these images?'}]}]
+"""
+```
+
+## Streaming
 
 We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.
 
diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py
index 6b743997f5ee..537d0a854370 100644
--- a/src/transformers/pipelines/image_text_to_text.py
+++ b/src/transformers/pipelines/image_text_to_text.py
@@ -58,13 +58,12 @@ def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image",
         for message in messages:
             if not ("role" in message and "content" in message):
                 raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
-        images = retrieve_images_in_messages(messages, images)
+        messages = add_images_to_messages(messages, images)
 
         self.messages = messages
-        self.images = images
 
 
-def retrieve_images_in_messages(
+def add_images_to_messages(
     messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]]
 ):
     """
@@ -72,38 +71,35 @@ def retrieve_images_in_messages(
     """
     if images is None:
         images = []
-    elif not isinstance(images, Iterable):
+    elif not isinstance(images, Iterable) or isinstance(images, str):
         images = [images]
     idx_images = 0
-    retrieved_images = []
     for message in messages:
         for content in message["content"]:
-            if isinstance(content, dict):
-                if content.get("type") == "image":
-                    for key in ["image", "url", "path", "base64"]:
-                        if key in content:
-                            retrieved_images.append(content[key])
-                            break
-                    else:
-                        if idx_images < len(images):
-                            retrieved_images.append(images[idx_images])
-                            idx_images += 1
-                        else:
-                            raise ValueError(
-                                "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
-                            )
-                # Add support for OpenAI/TGI chat format
-                elif content.get("type") == "image_url":
-                    if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
-                        retrieved_images.append(content["image_url"]["url"])
-                        # Rewrite content to be in the Transformers chat format
-                        content["type"] = "image"
-                        content["image"] = content["image_url"]["url"]
-                        del content["image_url"]
+            if not isinstance(content, dict):
+                continue
+            content_type = content.get("type")
+            if content_type == "image":
+                if not any(key in content for key in ["image", "url", "path", "base64"]):
+                    if idx_images < len(images):
+                        # Insert the image passed as argument in the chat message
+                        content["image"] = images[idx_images]
+                        idx_images += 1
                     else:
                         raise ValueError(
-                            "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
+                            "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
                         )
+            # Add support for OpenAI/TGI chat format
+            elif content_type == "image_url":
+                if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
+                    # Rewrite content to be in the Transformers chat format
+                    content["type"] = "image"
+                    content["image"] = content["image_url"]["url"]
+                    del content["image_url"]
+                else:
+                    raise ValueError(
+                        "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
+                    )
 
     # The number of images passed should be consistent with the number of images in the chat without an image key
     if idx_images != len(images):
@@ -111,7 +107,7 @@ def retrieve_images_in_messages(
             "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
         )
 
-    return retrieved_images
+    return messages
 
 
 @add_end_docstrings(build_pipeline_init_args(has_processor=True))
@@ -331,32 +327,30 @@ def __call__(
         return super().__call__({"images": images, "text": text}, **kwargs)
 
     def preprocess(self, inputs=None, timeout=None, continue_final_message=None, **processing_kwargs):
+        if isinstance(inputs, Chat):
+            # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+            # because very few models support multiple separate, consecutive assistant messages
+            if continue_final_message is None:
+                continue_final_message = inputs.messages[-1]["role"] == "assistant"
+            model_inputs = self.processor.apply_chat_template(
+                inputs.messages,
+                add_generation_prompt=not continue_final_message,
+                continue_final_message=continue_final_message,
+                return_tensors=self.framework,
+                tokenize=True,
+                return_dict=True,
+            )
+            model_inputs["text"] = inputs
+            return model_inputs
         # In case we only have text inputs
         if isinstance(inputs, (list, tuple, str)):
             images = None
             text = inputs
             inputs_text = inputs
         else:
-            if isinstance(inputs, Chat):
-                # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
-                # because very few models support multiple separate, consecutive assistant messages
-                if continue_final_message is None:
-                    continue_final_message = inputs.messages[-1]["role"] == "assistant"
-                text = self.processor.apply_chat_template(
-                    inputs.messages,
-                    add_generation_prompt=not continue_final_message,
-                    continue_final_message=continue_final_message,
-                    return_tensors=self.framework,
-                    **processing_kwargs,
-                )
-                inputs_text = inputs
-                images = inputs.images
-            else:
-                text = inputs["text"]
-                inputs_text = inputs["text"]
-                images = inputs["images"]
-
-        images = load_images(images, timeout=timeout)
+            images = load_images(inputs["images"], timeout=timeout)
+            text = inputs["text"]
+            inputs_text = inputs["text"]
 
         # if batched text inputs, we set padding to True unless specified otherwise
         if isinstance(text, (list, tuple)) and len(text) > 1:
diff --git a/tests/pipelines/test_pipelines_image_text_to_text.py b/tests/pipelines/test_pipelines_image_text_to_text.py
index 903e90919c2c..b32c6f608c74 100644
--- a/tests/pipelines/test_pipelines_image_text_to_text.py
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -66,6 +66,78 @@ def run_pipeline_test(self, pipe, examples):
             ],
         )
 
+    @require_torch
+    def test_small_model_pt_token_text_only(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        text = "What is the capital of France? Assistant:"
+
+        outputs = pipe(text=text)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "What is the capital of France? Assistant:",
+                    "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.",
+                }
+            ],
+        )
+
+        messages = [
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "Write a poem on Hugging Face, the company"},
+                    ],
+                },
+            ],
+            [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What is the capital of France?"},
+                    ],
+                },
+            ],
+        ]
+        outputs = pipe(text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "input_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            }
+                        ],
+                        "generated_text": [
+                            {
+                                "role": "user",
+                                "content": [{"type": "text", "text": "Write a poem on Hugging Face, the company"}],
+                            },
+                            {
+                                "role": "assistant",
+                                "content": "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom",
+                            },
+                        ],
+                    }
+                ],
+                [
+                    {
+                        "input_text": [
+                            {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}
+                        ],
+                        "generated_text": [
+                            {"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]},
+                            {"role": "assistant", "content": "Paris"},
+                        ],
+                    }
+                ],
+            ],
+        )
+
     @require_torch
     def test_small_model_pt_token(self):
         pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
@@ -124,7 +196,7 @@ def test_model_pt_chat_template(self):
                 ],
             }
         ]
-        outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=False, max_new_tokens=10)
+        outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=True, max_new_tokens=10)
         self.assertEqual(
             outputs,
             [
@@ -134,12 +206,37 @@
                             "role": "user",
                             "content": [
                                 {"type": "text", "text": "What’s the difference between these two images?"},
-                                {"type": "image"},
-                                {"type": "image"},
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
                             ],
                         }
                     ],
-                    "generated_text": "The first image shows a statue of Liberty in the",
+                    "generated_text": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "text", "text": "What’s the difference between these two images?"},
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
+                                },
+                                {
+                                    "type": "image",
+                                    "image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
+                                },
+                            ],
+                        },
+                        {
+                            "role": "assistant",
+                            "content": "The first image shows a statue of Liberty in the",
+                        },
+                    ],
                 }
             ],
         )