diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md index ab42c24d83e8..059312850876 100644 --- a/docs/source/en/model_doc/auto.md +++ b/docs/source/en/model_doc/auto.md @@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks. ### FlaxAutoModelForVision2Seq [[autodoc]] FlaxAutoModelForVision2Seq + +### AutoModelForImageTextToText + +[[autodoc]] AutoModelForImageTextToText diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md index d4baaf70e6fd..0497573f2b77 100644 --- a/docs/source/ja/model_doc/auto.md +++ b/docs/source/ja/model_doc/auto.md @@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel) ### FlaxAutoModelForVision2Seq [[autodoc]] FlaxAutoModelForVision2Seq + +### AutoModelForImageTextToText + +[[autodoc]] AutoModelForImageTextToText \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index da29d77972f4..0e414eee0136 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1497,6 +1497,7 @@ "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_KEYPOINT_DETECTION_MAPPING", @@ -1538,6 +1539,7 @@ "AutoModelForDocumentQuestionAnswering", "AutoModelForImageClassification", "AutoModelForImageSegmentation", + "AutoModelForImageTextToText", "AutoModelForImageToImage", "AutoModelForInstanceSegmentation", "AutoModelForKeypointDetection", @@ -6352,6 +6354,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -6393,6 +6396,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 96a159133cc0..aed2df0f7c40 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -74,6 +74,7 @@ "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_MAPPING", "MODEL_WITH_LM_HEAD_MAPPING", @@ -119,6 +120,7 @@ "AutoModelWithLMHead", "AutoModelForZeroShotImageClassification", "AutoModelForZeroShotObjectDetection", + "AutoModelForImageTextToText", ] try: @@ -238,6 +240,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -279,6 +282,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 150dea04f374..f1a5fa59acc6 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -680,6 +680,24 @@ ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), + ("udop", "UdopForConditionalGeneration"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("vision-encoder-decoder", "VisionEncoderDecoderModel"), + ] +) + +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( + [ + ("blip", "BlipForConditionalGeneration"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("fuyu", "FuyuForCausalLM"), + ("git", "GitForCausalLM"), + ("idefics", "IdeficsForVisionText2Text"), + ("instructblip", "InstructBlipForConditionalGeneration"), + ("kosmos-2", "Kosmos2ForConditionalGeneration"), + ("llava", "LlavaForConditionalGeneration"), + ("pix2struct", "Pix2StructForConditionalGeneration"), + ("udop", "UdopForConditionalGeneration"), ("vipllava", "VipLlavaForConditionalGeneration"), ("vision-encoder-decoder", "VisionEncoderDecoderModel"), ] @@ -1317,6 +1335,9 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES ) MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES +) MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES ) @@ -1611,6 +1632,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass): AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling") +class AutoModelForImageTextToText(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + +AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling") + + class AutoModelForAudioClassification(_BaseAutoModelClass): _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 2f0851c06274..556d834b554d 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -81,19 +81,15 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): if text is None and images is None: raise ValueError("You have to specify either text or images. Both cannot be none.") + text_features = {} if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + text_features = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + image_features = {} if images is not None: image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) - if text is not None and images is not None: - encoding["pixel_values"] = image_features.pixel_values - return encoding - elif text is not None: - return encoding - else: - return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + return BatchEncoding(data=dict(**text_features, **image_features), tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/llava/test.py b/src/transformers/models/llava/test.py new file mode 100644 index 000000000000..025482e2bc27 --- /dev/null +++ b/src/transformers/models/llava/test.py @@ -0,0 +1,25 @@ +from transformers import pipeline + + +# OK: +# model_id = "microsoft/git-base-coco" +model_id = "Salesforce/blip-image-captioning-base" +# model_id = "Salesforce/blip2-opt-2.7b" ok, although it doesn't include the text prompt in the output +# model_id = "Salesforce/instructblip-flan-t5-xl" ok, although it doesn't include the text prompt in the output +# model_id = "llava-hf/llava-1.5-7b-hf" +# model_id = "adept/fuyu-8b" +# model_id = "google/pix2struct-textcaps-base" +# model_id = "microsoft/udop-large" +# model_id = "naver-clova-ix/donut-base-finetuned-docvqa" +# model_id = "microsoft/kosmos-2-patch14-224" + +pipe = pipeline(task="image-text-to-text", model=model_id) + +outputs = pipe( + images=["http://images.cocodataset.org/val2017/000000039769.jpg"], + # text="USER: \nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud\nASSISTANT:", + text=["A photo of", "The cats are"], + max_new_tokens=200, +) + +print(outputs) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 8ee0137a20b3..02be38c6b112 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -28,6 +28,7 @@ from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage +from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer from ..tokenization_utils import PreTrainedTokenizer from ..utils import ( @@ -66,6 +67,7 @@ from .image_classification import ImageClassificationPipeline from .image_feature_extraction import ImageFeatureExtractionPipeline from .image_segmentation import ImageSegmentationPipeline +from .image_text_to_text import ImageTextToTextPipeline from .image_to_image import ImageToImagePipeline from .image_to_text import ImageToTextPipeline from .mask_generation import MaskGenerationPipeline @@ -118,6 +120,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForMaskedLM, AutoModelForMaskGeneration, AutoModelForObjectDetection, @@ -392,6 +395,17 @@ }, "type": "multimodal", }, + "image-text-to-text": { + "impl": ImageTextToTextPipeline, + "tf": (), + "pt": (AutoModelForImageTextToText,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("Salesforce/blip-image-captioning-base", "89b09ea"), + } + }, + "type": "multimodal", + }, "object-detection": { "impl": ObjectDetectionPipeline, "tf": (), @@ -566,6 +580,7 @@ def pipeline( tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, + processor: Optional = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, @@ -917,6 +932,7 @@ def pipeline( load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None + load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None # If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some @@ -1079,6 +1095,27 @@ def pipeline( if not is_pyctcdecode_available(): logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode") + if load_processor: + # Try to infer processor from model or config name (if provided as str) + if processor is None: + if isinstance(model_name, str): + processor = model_name + elif isinstance(config, str): + processor = config + elif load_image_processor or load_feature_extractor: + pass + else: + # Impossible to guess what is the right processor here + raise Exception( + "Impossible to guess which processor to use. " + "Please provide a ProcessorMixin class or a path/identifier " + "to a pretrained processor." + ) + + # Instantiate processor if needed + if isinstance(processor, (str, tuple)): + processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs) + if task == "translation" and model.config.task_specific_params: for key in model.config.task_specific_params: if key.startswith("translation"): @@ -1101,6 +1138,9 @@ def pipeline( if image_processor is not None: kwargs["image_processor"] = image_processor + if processor is not None: + kwargs["processor"] = processor + if device is not None: kwargs["device"] = device diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index fa1f2fcf5dfa..e69f7f42eaac 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -708,6 +708,7 @@ def build_pipeline_init_args( has_tokenizer: bool = False, has_feature_extractor: bool = False, has_image_processor: bool = False, + has_processor: bool = False, supports_binary_output: bool = True, ) -> str: docstring = r""" @@ -730,6 +731,11 @@ def build_pipeline_init_args( image_processor ([`BaseImageProcessor`]): The image processor that will be used by the pipeline to encode data for the model. This object inherits from [`BaseImageProcessor`].""" + if has_processor: + docstring += r""" + processor ([`ProcessorMixin`]): + The processor that will be used by the pipeline to encode data for the model. This object inherits from + [`ProcessorMixin`].""" docstring += r""" modelcard (`str` or [`ModelCard`], *optional*): Model card attributed to the model for this pipeline. @@ -766,7 +772,11 @@ def build_pipeline_init_args( PIPELINE_INIT_ARGS = build_pipeline_init_args( - has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True + has_tokenizer=True, + has_feature_extractor=True, + has_image_processor=True, + has_processor=True, + supports_binary_output=True, ) @@ -805,6 +815,7 @@ def __init__( tokenizer: Optional[PreTrainedTokenizer] = None, feature_extractor: Optional[PreTrainedFeatureExtractor] = None, image_processor: Optional[BaseImageProcessor] = None, + processor: Optional = None, modelcard: Optional[ModelCard] = None, framework: Optional[str] = None, task: str = "", @@ -822,6 +833,7 @@ def __init__( self.tokenizer = tokenizer self.feature_extractor = feature_extractor self.image_processor = image_processor + self.processor = processor self.modelcard = modelcard self.framework = framework @@ -952,6 +964,9 @@ def save_pretrained(self, save_directory: str, safe_serialization: bool = True): if self.image_processor is not None: self.image_processor.save_pretrained(save_directory) + if self.processor is not None: + self.processor.save_pretrained(save_directory) + if self.modelcard is not None: self.modelcard.save_pretrained(save_directory) diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py new file mode 100644 index 000000000000..6148b9f80427 --- /dev/null +++ b/src/transformers/pipelines/image_text_to_text.py @@ -0,0 +1,171 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +from ..utils import ( + add_end_docstrings, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_processor=True)) +class ImageTextToTextPipeline(Pipeline): + """ + Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text. + + Example: + + ```python + >>> from transformers import pipeline + + >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base") + >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of") + [{'generated_text': 'a photo of two birds'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image-text to text pipeline can currently be loaded from pipeline() using the following task identifier: + "image-text-to-text". + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES) + + def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, text=None, timeout=None): + forward_kwargs = {} + preprocess_params = {} + + if text is not None: + preprocess_params["text"] = text + if timeout is not None: + preprocess_params["timeout"] = timeout + + if generate_kwargs is not None: + forward_kwargs["generate_kwargs"] = generate_kwargs + if max_new_tokens is not None: + if "generate_kwargs" not in forward_kwargs: + forward_kwargs["generate_kwargs"] = {} + if "max_new_tokens" in forward_kwargs["generate_kwargs"]: + raise ValueError( + "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter," + " please use only one" + ) + forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens + + return preprocess_params, forward_kwargs, {} + + def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): + """ + Generate a text given text and the image(s) passed as inputs. + + Args: + images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a HTTP(s) link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. + + text (`str`): + The text to be used as a prompt for the generation. + + max_new_tokens (`int`, *optional*): + The amount of maximum tokens to generate. By default it will use `generate` default. + + generate_kwargs (`Dict`, *optional*): + Pass it to send all of these arguments directly to `generate` allowing full control of this function. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following key: + + - **generated_text** (`str`) -- The generated text. + """ + return super().__call__(images, **kwargs) + + def preprocess(self, image=None, text=None, timeout=None): + if image is not None: + image = load_image(image, timeout=timeout) + + model_type = self.model.config.model_type + + kwargs = {} + + if model_type == "pix2struct": + kwargs = {"add_special_tokens": False} + + if model_type == "idefics": + model_inputs = self.processor(text, return_tensors=self.framework, **kwargs) + else: + model_inputs = self.processor(images=image, text=text, return_tensors=self.framework, **kwargs) + + if model_type == "git": + # remove EOS token from input_ids and attention_mask + model_inputs["input_ids"] = model_inputs["input_ids"][:, :-1] + model_inputs["attention_mask"] = model_inputs["attention_mask"][:, :-1] + + if model_type == "vision-encoder-decoder" and self.processor.__class__.__name__ == "DonutProcessor": + model_inputs["decoder_input_ids"] = self.processor.tokenizer( + text, + add_special_tokens=False, + return_tensors=self.framework, + ).input_ids + + return model_inputs + + def _forward(self, model_inputs, generate_kwargs=None): + if generate_kwargs is None: + generate_kwargs = {} + + model_outputs = self.model.generate(**model_inputs, **generate_kwargs) + return model_outputs + + def postprocess(self, model_outputs): + records = [] + generated_texts = self.processor.batch_decode( + model_outputs, + skip_special_tokens=True, + ) + + records = [{"generated_text": text} for text in generated_texts] + + return records diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py index 4a9a3744d841..3b117ae311f0 100644 --- a/src/transformers/pipelines/image_to_text.py +++ b/src/transformers/pipelines/image_to_text.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from typing import List, Union from ..utils import ( @@ -96,7 +97,7 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs): """ - Assign labels to the image(s) passed as inputs. + Generate text based on the image(s) passed as inputs. Args: images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): @@ -128,6 +129,12 @@ def preprocess(self, image, prompt=None, timeout=None): image = load_image(image, timeout=timeout) if prompt is not None: + warnings.warn( + "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.45" + " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead", + FutureWarning, + ) + if not isinstance(prompt, str): raise ValueError( f"Received an invalid text input, got - {type(prompt)} - but expected a single string. " diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1bdab80a13f6..8c10b918732f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -600,6 +600,9 @@ def __init__(self, *args, **kwargs): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None + + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None @@ -767,6 +770,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForImageTextToText(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForImageToImage(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 6a891f17b06a..869a597fc4cc 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -398,9 +398,7 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else () all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( - {"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM} - if is_torch_available() - else {} + {"feature-extraction": GitModel, "image-to-text": GitForCausalLM} if is_torch_available() else {} ) fx_compatible = False test_torchscript = False diff --git a/tests/pipelines/test_pipelines_image_text_to_text.py b/tests/pipelines/test_pipelines_image_text_to_text.py new file mode 100644 index 000000000000..91a4571d3a4d --- /dev/null +++ b/tests/pipelines/test_pipelines_image_text_to_text.py @@ -0,0 +1,176 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import requests + +from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available +from transformers.pipelines import pipeline +from transformers.testing_utils import ( + is_pipeline_test, + require_torch, + require_vision, + slow, +) + +from .test_pipelines_common import ANY + + +if is_vision_available(): + from PIL import Image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + +@is_pipeline_test +@require_vision +class ImageToTextPipelineTests(unittest.TestCase): + model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + def get_test_pipeline(self, model, processor): + pipe = pipeline("image-text-to-text", model=model, processor=processor) + examples = [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ] + return pipe, examples + + def run_pipeline_test(self, pipe, examples): + outputs = pipe(examples) + self.assertEqual( + outputs, + [ + [{"generated_text": ANY(str)}], + [{"generated_text": ANY(str)}], + ], + ) + + @require_torch + def test_small_model_pt(self): + pipe = pipeline( + "image-text-to-text", + model="hf-internal-testing/tiny-random-BlipForConditionalGeneration", + processor="hf-internal-testing/tiny-random-BlipForConditionalGeneration", + ) + image = "./tests/fixtures/tests_samples/COCO/000000039769.png" + text = "hello world" + + outputs = pipe(image, text=text) + self.assertEqual( + outputs, + [{"generated_text": "hello world 陽ɔ 劇र ♯ɔง 藥 ਾ"}], + ) + + outputs = pipe([image, image], text=text) + self.assertEqual( + outputs, + [ + [{"generated_text": "hello world 陽ɔ 劇र ♯ɔง 藥 ਾ"}], + [{"generated_text": "hello world 陽ɔ 劇र ♯ɔง 藥 ਾ"}], + ], + ) + + @require_torch + def test_consistent_batching_behaviour(self): + pipe = pipeline("image-text-to-text", model="hf-internal-testing/tiny-random-BlipForConditionalGeneration") + image = "./tests/fixtures/tests_samples/COCO/000000039769.png" + text = "a photo of" + + outputs = pipe([image, image], text=text) + self.assertTrue(outputs[0][0]["generated_text"].startswith(text)) + self.assertTrue(outputs[1][0]["generated_text"].startswith(text)) + + outputs = pipe([image, image], text=text, batch_size=2) + self.assertTrue(outputs[0][0]["generated_text"].startswith(text)) + self.assertTrue(outputs[1][0]["generated_text"].startswith(text)) + + from torch.utils.data import Dataset + + class MyDataset(Dataset): + def __len__(self): + return 5 + + def __getitem__(self, i): + return "./tests/fixtures/tests_samples/COCO/000000039769.png" + + dataset = MyDataset() + for batch_size in (1, 2, 4): + outputs = pipe(dataset, text=text, batch_size=batch_size if batch_size > 1 else None) + self.assertTrue(list(outputs)[0][0]["generated_text"].startswith(text)) + self.assertTrue(list(outputs)[1][0]["generated_text"].startswith(text)) + + @slow + @require_torch + def test_blip_pt(self): + pipe = pipeline("image-text-to-text", model="Salesforce/blip-image-captioning-base") + url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png" + image = Image.open(requests.get(url, stream=True).raw) + + text = "a photo of a" + + outputs = pipe(image, text=text) + self.assertEqual(outputs, [{"generated_text": "a photo of a pink pokemon with a blue shirt"}]) + + @slow + @require_torch + def test_git_pt(self): + pipe = pipeline("image-text-to-text", model="microsoft/git-base-coco") + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + text = "a photo of a" + + outputs = pipe(image, text=text) + self.assertEqual(outputs, [{"generated_text": "a photo of a tent with a tent and a tent in the background."}]) + + @slow + @require_torch + def test_pix2struct_pt(self): + pipe = pipeline("image-text-to-text", model="google/pix2struct-textcaps-base") + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + text = "A photo of a" + + outputs = pipe(image, text=text) + self.assertEqual(outputs, [{"generated_text": "A photo of a clock with the numbers 1, 9, and 12 on"}]) + + @slow + @require_torch + def test_llava_pt(self): + pipe = pipeline("image-text-to-text", model="llava-hf/bakLlava-v1-hf") + + text = ( + "\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT:" + ) + + outputs = pipe( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + text=text, + generate_kwargs={"max_new_tokens": 200}, + ) + + self.assertEqual( + outputs, + [ + { + "generated_text": "\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava" + } + ], + ) diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py index e2d59968ebf4..c77353a261f9 100644 --- a/tests/pipelines/test_pipelines_image_to_text.py +++ b/tests/pipelines/test_pipelines_image_to_text.py @@ -290,7 +290,7 @@ def test_conditional_generation_llava(self): outputs, [ { - "generated_text": " \nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava" + "generated_text": "\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava" } ], ) diff --git a/utils/update_metadata.py b/utils/update_metadata.py index 0762c4c2aa73..eb9fc469651f 100755 --- a/utils/update_metadata.py +++ b/utils/update_metadata.py @@ -109,6 +109,7 @@ "AutoModelForVisualQuestionAnswering", ), ("image-to-text", "MODEL_FOR_FOR_VISION_2_SEQ_MAPPING_NAMES", "AutoModelForVision2Seq"), + ("image-text-to-text", "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES", "AutoModelForImageTextToText"), ( "zero-shot-image-classification", "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES",