diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md
index ab42c24d83e8..a98b10541737 100644
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
 ### FlaxAutoModelForVision2Seq
 
 [[autodoc]] FlaxAutoModelForVision2Seq
+
+### AutoModelForImageTextToText
+
+[[autodoc]] AutoModelForImageTextToText
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md
index d4baaf70e6fd..492c46c79ea9 100644
--- a/docs/source/ja/model_doc/auto.md
+++ b/docs/source/ja/model_doc/auto.md
@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
 ### FlaxAutoModelForVision2Seq
 
 [[autodoc]] FlaxAutoModelForVision2Seq
+
+### AutoModelForImageTextToText
+
+[[autodoc]] AutoModelForImageTextToText
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 9108367f35b3..eb3ac1174d9d 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1353,6 +1353,7 @@
             "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
             "MODEL_FOR_IMAGE_MAPPING",
             "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
+            "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
             "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
             "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
             "MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
@@ -1394,6 +1395,7 @@
             "AutoModelForDocumentQuestionAnswering",
             "AutoModelForImageClassification",
             "AutoModelForImageSegmentation",
+            "AutoModelForImageTextToText",
             "AutoModelForImageToImage",
             "AutoModelForInstanceSegmentation",
             "AutoModelForKeypointDetection",
@@ -6056,6 +6058,7 @@
         MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
         MODEL_FOR_IMAGE_MAPPING,
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
         MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
         MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
         MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -6097,6 +6100,7 @@
         AutoModelForDocumentQuestionAnswering,
         AutoModelForImageClassification,
         AutoModelForImageSegmentation,
+        AutoModelForImageTextToText,
         AutoModelForImageToImage,
         AutoModelForInstanceSegmentation,
         AutoModelForKeypointDetection,
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index 3bb2b8e9d4c1..2ee0541a1a71 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -74,6 +74,7 @@
         "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
         "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
         "MODEL_FOR_VISION_2_SEQ_MAPPING",
+        "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
         "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
         "MODEL_MAPPING",
         "MODEL_WITH_LM_HEAD_MAPPING",
@@ -119,6 +120,7 @@
         "AutoModelWithLMHead",
         "AutoModelForZeroShotImageClassification",
         "AutoModelForZeroShotObjectDetection",
+        "AutoModelForImageTextToText",
     ]
 
 try:
@@ -238,6 +240,7 @@
         MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
         MODEL_FOR_IMAGE_MAPPING,
         MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+        MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
         MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
         MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
         MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -279,6 +282,7 @@
         AutoModelForDocumentQuestionAnswering,
         AutoModelForImageClassification,
         AutoModelForImageSegmentation,
+        AutoModelForImageTextToText,
         AutoModelForImageToImage,
         AutoModelForInstanceSegmentation,
         AutoModelForKeypointDetection,
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index d096abf43426..9ca99b13a710 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -719,6 +719,26 @@
     ]
 )
 
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
+    [
+        ("blip", "BlipForConditionalGeneration"),
+        ("blip-2", "Blip2ForConditionalGeneration"),
+        ("fuyu", "FuyuForCausalLM"),
+        ("git", "GitForCausalLM"),
+        ("idefics", "IdeficsForVisionText2Text"),
+        ("idefics2", "Idefics2ForConditionalGeneration"),
+        ("instructblip", "InstructBlipForConditionalGeneration"),
+        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("llava", "LlavaForConditionalGeneration"),
+        ("llava_next", "LlavaNextForConditionalGeneration"),
+        ("paligemma", "PaliGemmaForConditionalGeneration"),
+        ("pix2struct", "Pix2StructForConditionalGeneration"),
+        ("udop", "UdopForConditionalGeneration"),
+        ("vipllava", "VipLlavaForConditionalGeneration"),
+        ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
+    ]
+)
+
 MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
     [
         # Model for Masked LM mapping
@@ -1371,6 +1391,9 @@
     CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
 )
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+)
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
 )
@@ -1665,6 +1688,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
 AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
 
 
+class AutoModelForImageTextToText(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+
+AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")
+
+
 class AutoModelForAudioClassification(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
 
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 1ab136a1e74c..fd2a0c7f01e0 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -93,11 +93,13 @@
         ("trocr", "TrOCRProcessor"),
         ("tvlt", "TvltProcessor"),
         ("tvp", "TvpProcessor"),
+        ("udop", "UdopProcessor"),
         ("unispeech", "Wav2Vec2Processor"),
         ("unispeech-sat", "Wav2Vec2Processor"),
         ("video_llava", "VideoLlavaProcessor"),
         ("vilt", "ViltProcessor"),
         ("vipllava", "LlavaProcessor"),
+        ("vision-encoder-decoder", "DonutProcessor"),
         ("vision-text-dual-encoder", "VisionTextDualEncoderProcessor"),
         ("wav2vec2", "Wav2Vec2Processor"),
         ("wav2vec2-bert", "Wav2Vec2Processor"),
diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py
index cd96b46ab1d2..babc11a43c21 100644
--- a/src/transformers/models/blip/processing_blip.py
+++ b/src/transformers/models/blip/processing_blip.py
@@ -144,6 +144,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 2d526a17ba68..8f0f25660412 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -148,6 +148,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index daf6e7d1dfe4..6e38414eb83f 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -71,6 +71,12 @@ def __call__(self, *args, **kwargs): [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. """ # For backward compatibility + legacy = kwargs.pop("legacy", True) + if legacy: + warnings.warn( + "The use of legacy will be deprecated in the future. Please use the new processing behavior by setting legacy=False." + ) + if self._in_target_context_manager: return self.current_processor(*args, **kwargs) @@ -85,7 +91,11 @@ def __call__(self, *args, **kwargs): if images is not None: inputs = self.image_processor(images, *args, **kwargs) - if text is not None: + if text is not None and images is None: + encodings = self.tokenizer(text, **kwargs) + elif text is not None: + if not legacy: + kwargs.update({"add_special_tokens": False}) encodings = self.tokenizer(text, **kwargs) if text is None: @@ -93,7 +103,10 @@ def __call__(self, *args, **kwargs): elif images is None: return encodings else: - inputs["labels"] = encodings["input_ids"] + if not legacy: + inputs["decoder_input_ids"] = encodings["input_ids"] + else: + inputs["labels"] = encodings["input_ids"] return inputs def batch_decode(self, *args, **kwargs): @@ -180,6 +193,20 @@ def token2json(self, tokens, is_inner_value=False, added_vocab=None): else: return [] if is_inner_value else {"text_sequence": tokens} + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def feature_extractor_class(self): warnings.warn( diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index 6b542ba3378e..109b9b3bb9e3 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -681,6 +681,30 @@ def tokens_to_points(tokens, original_size): return results + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-processes the output of `FuyuForConditionalGeneration` to only return the text output. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + containing the token ids of the generated sequences. + + Returns: + `List[str]`: The decoded text output. + """ + boa = self.tokenizer.vocab[BEGINNING_OF_ANSWER_STRING] + # get boa index for each outputted sequence tensor + # start all generated sequences from the beginning of the answer token, pad to have consistent length + unpadded_output_sequences = [seq[(seq == boa).nonzero(as_tuple=True)[0] + 1 :] for seq in generated_outputs] + max_len = max(len(seq) for seq in unpadded_output_sequences) + # convert to torch and pad sequences + padded_output_sequences = torch.full((len(unpadded_output_sequences), max_len), self.pad_token_id) + for i, seq in enumerate(unpadded_output_sequences): + padded_output_sequences[i, : len(seq)] = torch.tensor(seq) + + return self.batch_decode(padded_output_sequences, skip_special_tokens=True) + def batch_decode(self, *args, **kwargs): """ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 98649c644e72..97ac54185004 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -16,6 +16,8 @@ Image/Text processor class for GIT """ +import warnings + from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding @@ -76,6 +78,12 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + legacy = kwargs.pop("legacy", True) + if legacy: + warnings.warn( + "The use of legacy will be deprecated in the future. Please use the new processing behavior by setting legacy=False." + ) + tokenizer_kwargs, image_processor_kwargs = {}, {} if kwargs: tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys} @@ -94,6 +102,9 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values + if not legacy: + encoding["input_ids"] = encoding["input_ids"][:, :-1] + encoding["attention_mask"] = encoding["attention_mask"][:, :-1] return encoding elif text is not None: return encoding @@ -114,6 +125,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. 
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     def model_input_names(self):
         return ["input_ids", "attention_mask", "pixel_values"]
diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py
index 8e9e196764f9..b9ae341da7c0 100644
--- a/src/transformers/models/idefics/processing_idefics.py
+++ b/src/transformers/models/idefics/processing_idefics.py
@@ -16,13 +16,14 @@
 Processor class for IDEFICS.
 """
 
+import warnings
 from typing import Callable, List, Optional, Union
 from urllib.parse import urlparse
 
 from ...feature_extraction_utils import BatchFeature
 from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
-from ...utils import is_tf_available, is_torch_available
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType, is_tf_available, is_torch_available
 
 
 if is_torch_available():
@@ -201,15 +202,18 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u
 
     def __call__(
         self,
-        prompts: Union[List[TextInput], List[List[TextInput]]],
-        padding: Union[bool, str, PaddingStrategy] = "longest",
+        images=None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = "pt",
+        prompts: Optional[Union[List[TextInput], List[List[TextInput]]]] = None,
         transform: Callable = None,
        add_eos_token=False,
         add_end_of_utterance_token=None,
         debug=False,
-        return_tensors="pt",
+        **kwargs,
     ) -> BatchEncoding:
         """This method takes batched or non-batched prompts made of text and images and converts them into prompts
         that the model was trained on and prepares the image pixel values for the model to process.
@@ -317,12 +321,33 @@ def __call__(
 
         In order to help debug prompt generation enable `debug=True` which will show you what's happening.
 
         """
+        legacy = kwargs.pop("legacy", True)
+        if legacy:
+            warnings.warn(
+                "The use of legacy will be deprecated in the future. Please use the new processing behavior by setting legacy=False."
+            )
+        if prompts is None and text is None:
+            # the user didn't pass `text` or `prompts=...`, so we assume the old behavior with the prompts
+            # given as the first positional argument
+            prompts = images
+        elif prompts is None and images is not None and text is not None:
+            # image-text-to-text behavior: pair each prompt with its image(s)
+            # Check if batched images are provided
+            if not isinstance(images, (list, tuple)):
+                images = [images]
+            if not isinstance(text, (list, tuple)):
+                text = [text] * len(images)
+            # Check if batched text is provided
+            if isinstance(text, (list, tuple)) and len(text) != len(images):
+                raise ValueError(
+                    "When using the image-text-to-text behavior, the number of prompts should be the same as the number of images."
+                )
+            prompts = list(zip(images, text))
 
         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
         if add_end_of_utterance_token is None:
             add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
 
         # turn non-batched prompts into batched
-        if not any(isinstance(i, list) for i in prompts):
+        if not any(isinstance(i, (list, tuple)) for i in prompts):
             prompts = [prompts]
 
         fake_token = "<fake_token_around_image>"
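Reviewer note: a minimal sketch of the two calling conventions the branch above distinguishes — not part of the diff, and the checkpoint and prompt wording are illustrative:

```python
from PIL import Image

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")
image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")

# Old behavior: a single prompts list interleaving text and images,
# passed as the first positional argument.
legacy_inputs = processor(["User: Describe this image.", image], return_tensors="pt")

# New image-text-to-text behavior: separate `images` and `text`,
# paired element-wise by the `zip` above.
new_inputs = processor(
    images=[image], text=["User: Describe this image."], legacy=False, return_tensors="pt"
)
```
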
@@ -486,6 +513,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index 2e14118144ba..385cb8d3a1d8 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -246,6 +246,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+        """
+        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True)
+
     @property
     def model_input_names(self):
         tokenizer_input_names = self.tokenizer.model_input_names
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index adebd22178ef..9d23927c84d1 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -149,6 +149,20 @@ def decode(self, *args, **kwargs):
         """
         return self.tokenizer.decode(*args, **kwargs)
 
+    def post_process_image_text_to_text(self, generated_outputs):
+        """
+        Post-process the output of the model to decode the text.
+
+        Args:
+            generated_outputs (`torch.Tensor` or `np.ndarray`):
+                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
+                or `(sequence_length,)`.
+
+        Returns:
+            `List[str]`: The decoded text.
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py index 6d1cce14b186..d16d5f4eedb8 100644 --- a/src/transformers/models/kosmos2/processing_kosmos2.py +++ b/src/transformers/models/kosmos2/processing_kosmos2.py @@ -403,6 +403,21 @@ def post_process_generation(self, text, cleanup_and_extract=True): return clean_text_and_extract_entities_with_bboxes(caption) return caption + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + generated_texts = self.batch_decode(generated_outputs, skip_special_tokens=True) + return [self.post_process_generation(text, cleanup_and_extract=False) for text in generated_texts] + @property # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index a563b1cb82e7..bb9a44b13a9f 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -129,6 +129,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 7664b7954308..98984a82457f 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -133,6 +133,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names def model_input_names(self): diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index 3d0ece60c367..ae8c9fc80c01 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -17,6 +17,7 @@ """ import logging +import warnings from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -70,6 +71,11 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token): image_seq_len (`int`): The length of the image sequence. image_token (`str`): The image token. """ + if image_token in prompt: + warnings.warn( + f"The image token {image_token} is already present in the prompt. This may lead to unexpected behavior." + ) + prompt = prompt.replace(image_token, "") return f"{image_token * image_seq_len}{bos_token}{prompt}\n" @@ -300,6 +306,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma def model_input_names(self): diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py index 269fa8c62fb2..b610d9fd2454 100644 --- a/src/transformers/models/pix2struct/processing_pix2struct.py +++ b/src/transformers/models/pix2struct/processing_pix2struct.py @@ -16,6 +16,7 @@ Processor class for Pix2Struct. """ +import warnings from typing import List, Optional, Union from ...processing_utils import ProcessorMixin @@ -73,6 +74,13 @@ def __call__( Please refer to the docstring of the above two methods for more information. """ + legacy = kwargs.pop("legacy", True) + print("legacy: ", legacy) + if legacy: + warnings.warn( + "The use of legacy will be deprecated in the future. Please use the new processing behavior by setting legacy=False." + ) + if images is None and text is None: raise ValueError("You have to specify either images or text.") @@ -111,6 +119,8 @@ def __call__( ) if text is not None and not self.image_processor.is_vqa: + if not legacy: + add_special_tokens = False text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, @@ -156,6 +166,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. 
+ """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property def model_input_names(self): tokenizer_input_names = self.tokenizer.model_input_names diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py index 2902541d6f5b..745ff444603d 100644 --- a/src/transformers/models/udop/processing_udop.py +++ b/src/transformers/models/udop/processing_udop.py @@ -198,6 +198,20 @@ def decode(self, *args, **kwargs): """ return self.tokenizer.decode(*args, **kwargs) + def post_process_image_text_to_text(self, generated_outputs): + """ + Post-process the output of the model to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=True) + @property # Copied from transformers.models.layoutlmv3.processing_layoutlmv3.LayoutLMv3Processor.model_input_names def model_input_names(self): diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 9bc0a1cf8b46..9605e20f76e4 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -28,7 +28,9 @@ from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage +from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer +from ..processing_utils import ProcessorMixin from ..tokenization_utils import PreTrainedTokenizer from ..utils import ( CONFIG_NAME, @@ -65,6 +67,7 @@ from .image_classification import ImageClassificationPipeline from .image_feature_extraction import ImageFeatureExtractionPipeline from .image_segmentation import ImageSegmentationPipeline +from .image_text_to_text import ImageTextToTextPipeline from .image_to_image import ImageToImagePipeline from .image_to_text import ImageToTextPipeline from .mask_generation import MaskGenerationPipeline @@ -117,6 +120,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForMaskedLM, AutoModelForMaskGeneration, AutoModelForObjectDetection, @@ -382,6 +386,17 @@ }, "type": "multimodal", }, + "image-text-to-text": { + "impl": ImageTextToTextPipeline, + "tf": (), + "pt": (AutoModelForImageTextToText,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("Salesforce/blip-image-captioning-base", "89b09ea"), + } + }, + "type": "multimodal", + }, "object-detection": { "impl": ObjectDetectionPipeline, "tf": (), @@ -556,6 +571,7 @@ def pipeline( tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, image_processor: Optional[Union[str, BaseImageProcessor]] = None, + processor: Optional[ProcessorMixin] = None, framework: Optional[str] = None, revision: Optional[str] = None, use_fast: bool = True, @@ -906,7 +922,10 @@ def pipeline( hub_kwargs["_commit_hash"] = model.config._commit_hash load_tokenizer = type(model_config) in 
     load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
     load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
     load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
+    load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None
 
     # If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
     # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
@@ -1069,6 +1088,27 @@
         if not is_pyctcdecode_available():
             logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")
 
+    if load_processor:
+        # Try to infer processor from model or config name (if provided as str)
+        if processor is None:
+            if isinstance(model_name, str):
+                processor = model_name
+            elif isinstance(config, str):
+                processor = config
+            elif load_image_processor or load_feature_extractor:
+                pass
+            else:
+                # Impossible to guess what is the right processor here
+                raise Exception(
+                    "Impossible to guess which processor to use. "
+                    "Please provide a ProcessorMixin class or a path/identifier "
+                    "to a pretrained processor."
+                )
+
+        # Instantiate processor if needed
+        if isinstance(processor, (str, tuple)):
+            processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+
     if task == "translation" and model.config.task_specific_params:
         for key in model.config.task_specific_params:
             if key.startswith("translation"):
@@ -1091,6 +1131,9 @@
     if image_processor is not None:
         kwargs["image_processor"] = image_processor
 
+    if processor is not None:
+        kwargs["processor"] = processor
+
     if device is not None:
         kwargs["device"] = device
 
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 09f77402a143..ce176af04dc0 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -33,6 +33,7 @@
 from ..image_processing_utils import BaseImageProcessor
 from ..modelcard import ModelCard
 from ..models.auto.configuration_auto import AutoConfig
+from ..processing_utils import ProcessorMixin
 from ..tokenization_utils import PreTrainedTokenizer
 from ..utils import (
     ModelOutput,
@@ -711,6 +712,7 @@ def build_pipeline_init_args(
     has_tokenizer: bool = False,
     has_feature_extractor: bool = False,
     has_image_processor: bool = False,
+    has_processor: bool = False,
     supports_binary_output: bool = True,
 ) -> str:
     docstring = r"""
@@ -733,6 +735,11 @@ def build_pipeline_init_args(
         image_processor ([`BaseImageProcessor`]):
             The image processor that will be used by the pipeline to encode data for the model. This object inherits from
             [`BaseImageProcessor`]."""
+    if has_processor:
+        docstring += r"""
+        processor ([`ProcessorMixin`]):
+            The processor that will be used by the pipeline to encode data for the model. This object inherits from
+            [`ProcessorMixin`]."""
     docstring += r"""
         modelcard (`str` or [`ModelCard`], *optional*):
             Model card attributed to the model for this pipeline.
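Reviewer note: a sketch of what the new `processor` plumbing in `pipeline()` enables — the processor can be passed explicitly or inferred from the checkpoint; the names are taken from elsewhere in this PR, and the snippet itself is illustrative:

```python
from transformers import AutoProcessor, pipeline

checkpoint = "llava-hf/llava-interleave-qwen-0.5b-hf"

# Pass a ProcessorMixin instance explicitly...
processor = AutoProcessor.from_pretrained(checkpoint)
pipe = pipeline("image-text-to-text", model=checkpoint, processor=processor)

# ...or let `load_processor` kick in and infer it from the model name,
# since the model's config type is in PROCESSOR_MAPPING.
pipe = pipeline("image-text-to-text", model=checkpoint)
```
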
@@ -769,7 +776,11 @@ def build_pipeline_init_args(
 
 
 PIPELINE_INIT_ARGS = build_pipeline_init_args(
-    has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True
+    has_tokenizer=True,
+    has_feature_extractor=True,
+    has_image_processor=True,
+    has_processor=True,
+    supports_binary_output=True,
 )
@@ -808,6 +819,7 @@ def __init__(
         tokenizer: Optional[PreTrainedTokenizer] = None,
         feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
         image_processor: Optional[BaseImageProcessor] = None,
+        processor: Optional[ProcessorMixin] = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         task: str = "",
@@ -825,6 +837,7 @@
         self.tokenizer = tokenizer
         self.feature_extractor = feature_extractor
         self.image_processor = image_processor
+        self.processor = processor
         self.modelcard = modelcard
         self.framework = framework
@@ -990,6 +1003,9 @@ def save_pretrained(
         if self.image_processor is not None:
             self.image_processor.save_pretrained(save_directory, **kwargs)
 
+        if self.processor is not None:
+            self.processor.save_pretrained(save_directory, **kwargs)
+
         if self.modelcard is not None:
             self.modelcard.save_pretrained(save_directory)
 
@@ -1182,7 +1198,14 @@ def get_iterator(
             logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
             os.environ["TOKENIZERS_PARALLELISM"] = "false"
         # TODO hack by collating feature_extractor and image_processor
-        feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+        if self.feature_extractor is not None:
+            feature_extractor = self.feature_extractor
+        elif self.image_processor is not None:
+            feature_extractor = self.image_processor
+        elif self.processor is not None:
+            feature_extractor = self.processor
+        else:
+            feature_extractor = None
         collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
         dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
         model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py
new file mode 100644
index 000000000000..0ce5cd3276bb
--- /dev/null
+++ b/src/transformers/pipelines/image_text_to_text.py
@@ -0,0 +1,299 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Union
+
+from ..utils import (
+    add_end_docstrings,
+    is_torch_available,
+    is_vision_available,
+    logging,
+    requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from ..image_utils import load_image
+
+
+if is_torch_available():
+    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+    from .pt_utils import KeyDataset
+
+logger = logging.get_logger(__name__)
+
+IMAGE_TOKEN = "<image>"
+
+
+class Chat:
+    """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
+    to this format because the rest of the pipeline code tends to assume that lists of messages are
+    actually a batch of samples rather than messages in the same conversation."""
+
+    def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image", List["Image.Image"]]):
+        for message in messages:
+            if not ("role" in message and "content" in message):
+                raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
+        if count_images_in_chat(messages) != len(images):
+            raise ValueError("The number of images should be the same as the number of images in the chat.")
+
+        self.messages = messages
+        self.images = images
+
+
+class ImageText:
+    """This class is intended to just be used internally in this pipeline and not exposed to users. We use this class
+    as the base pipeline does not support multiple inputs, so we need to convert multiple inputs to a single input."""
+
+    def __init__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], text: Union[str, List[str]]):
+        self.images = images
+        self.text = text
+
+
+def count_images_in_chat(chat):
+    num_images = 0
+    for message in chat:
+        num_images += sum(1 for content in message["content"] if content.get("type") == "image")
+    return num_images
+
+
+@add_end_docstrings(build_pipeline_init_args(has_processor=True))
+class ImageTextToTextPipeline(Pipeline):
+    """
+    Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text.
+
+    Example:
+
+    ```python
+    >>> from transformers import pipeline
+
+    >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")
+    >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
+    [{'generated_text': 'a photo of two birds'}]
+    ```
+
+    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+    This image-text-to-text pipeline can currently be loaded from pipeline() using the following task identifier:
+    "image-text-to-text".
+
+    See the list of available models on
+    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text).
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES) + + def _sanitize_parameters( + self, + max_new_tokens=None, + generate_kwargs=None, + text=None, + truncation=None, + padding=None, + max_length=None, + timeout=None, + ): + forward_kwargs = {} + preprocess_params = {} + post_process_params = {} + + if timeout is not None: + preprocess_params["timeout"] = timeout + + if truncation is not None: + preprocess_params["truncation"] = truncation + + if padding is not None: + preprocess_params["padding"] = padding + + if max_length is not None: + preprocess_params["max_length"] = max_length + + if generate_kwargs is not None: + forward_kwargs["generate_kwargs"] = generate_kwargs + + if max_new_tokens is not None: + if "generate_kwargs" not in forward_kwargs: + forward_kwargs["generate_kwargs"] = {} + if "max_new_tokens" in forward_kwargs["generate_kwargs"]: + raise ValueError( + "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter," + " please use only one" + ) + forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens + + return preprocess_params, forward_kwargs, post_process_params + + def __call__( + self, + images: Union[str, List[str], List[List[str]], "Image.Image", List["Image.Image"], List[List["Image.Image"]]], + text: Union[str, List[str], List[dict]], + **kwargs, + ): + """ + Generate a text given text and the image(s) passed as inputs. + + Args: + images (`str`, `List[str]`, `PIL.Image or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a HTTP(s) link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. + + text (str, List[str], `List[Dict[str, Union[str, PIL.Image]]]`): + The text to be used for generation. If a list of strings is passed, the length of the list should be the + same as the number of images. Text can also follow the chat format: a list of dictionaries where each + dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and + 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a dictionary + containing the text of the message and the type of the message. The type of the message can be either + 'text' or 'image'. If the type is 'image', no text is needed. + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following key: + + - **generated_text** (`str`) -- The generated text. + - **input_text** (`str`) -- The input text. 
+ """ + batch_size = kwargs.get("batch_size", 1) + + if not isinstance(images, (list, tuple)): + images = [images] + + if isinstance(text, (list, tuple, KeyDataset) if is_torch_available() else (list, tuple)) and isinstance( + text[0], (list, tuple, dict) + ): + # We have one or more prompts in list-of-dicts format, so this is chat mode + if isinstance(text[0], dict): + return super().__call__(Chat(text, images), **kwargs) + else: + chats = [Chat(chat, image) for chat, image in zip(text, images)] # ๐Ÿˆ ๐Ÿˆ ๐Ÿˆ + return super().__call__(chats, **kwargs) + + if isinstance(text, str): + text = [text] * len(images) + if not isinstance(text[0], str): + raise ValueError("The pipeline does not support nested lists of prompts.") + + # Check number of IMAGE_TOKEN token in each text + num_images_in_text = [text_single.count(IMAGE_TOKEN) for text_single in text] + if sum(num_images_in_text) > 0: + if any(num > 1 for num in num_images_in_text) and batch_size > 1: + raise ValueError( + "The pipeline does not support multiple images for a single prompt with batch_size > 1." + ) + # Check if already nested images and consistency + if isinstance(images[0], (list, tuple)): + if len(images) != len(text): + raise ValueError("The number of nested image groups and prompts should be the same.") + num_images_in_images = [len(image) for image in images] + if num_images_in_text != num_images_in_images: + raise ValueError( + f"The number of images in each nested image group should be the same as the number of {IMAGE_TOKEN} tokens in the corresponding prompt." + ) + elif sum(num_images_in_text) != len(images): + raise ValueError( + f"The total number of {IMAGE_TOKEN} tokens in the prompts should be the same as the number of images passed." + ) + else: + # Reorganize the images to match the prompts + images_reorganized = [] + for num_images in num_images_in_text: + images_reorganized.append(images[:num_images]) + images = images[num_images:] + images = images_reorganized + # After reorganizing, these should be the same + if len(images) != len(text): + raise ValueError("The number of images and text should be the same.") + + return super().__call__([ImageText(image, text_single) for image, text_single in zip(images, text)], **kwargs) + + def preprocess(self, inputs=None, truncation=None, padding=False, max_length=None, timeout=None): + kwargs = { + "legacy": False, + "truncation": truncation, + "padding": padding, + "max_length": max_length, + } + images = inputs.images + + if isinstance(inputs, Chat): + # kwargs["chats"] = inputs.messages + text = self.processor.apply_chat_template( + inputs.messages, + add_generation_prompt=True, + return_tensors=self.framework, + **kwargs, + ) + else: + text = inputs.text + if not isinstance(images, (list, tuple)): + images = load_image(images, timeout=timeout) + else: + images = [load_image(image, timeout=timeout) for image in images] + + try: + model_inputs = self.processor(images=images, text=text, return_tensors=self.framework, **kwargs) + except TypeError: + kwargs.pop("legacy", None) + model_inputs = self.processor(images=images, text=text, return_tensors=self.framework, **kwargs) + + model_inputs["text"] = text + + return model_inputs + + def _forward(self, model_inputs, generate_kwargs=None): + if generate_kwargs is None: + generate_kwargs = {} + input_text = model_inputs.pop("text") + input_ids = ( + model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"] + ) # for decoder-only models + model_outputs = 
+
+    def _forward(self, model_inputs, generate_kwargs=None):
+        if generate_kwargs is None:
+            generate_kwargs = {}
+        input_text = model_inputs.pop("text")
+        input_ids = (
+            model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"]
+        )  # for decoder-only models
+        model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
+        return {"outputs": model_outputs, "input_text": input_text, "input_ids": input_ids}
+
+    def postprocess(self, model_outputs):
+        input_text = model_outputs["input_text"]
+        input_text = [input_text] if isinstance(input_text, str) else input_text
+        outputs = model_outputs["outputs"]
+        inputs_id = model_outputs["input_ids"]
+
+        # Decode inputs and outputs the same way to remove input text from generated text if present
+        generated_texts = self.processor.post_process_image_text_to_text(outputs)
+        decoded_inputs = self.processor.post_process_image_text_to_text(inputs_id)
+        generated_texts = [text.strip() for text in generated_texts]
+        decoded_inputs = [text.strip() for text in decoded_inputs]
+        # Remove the input text from the generated text if the generated text starts with the input text
+        generated_texts = [
+            text_generated[len(decoded_inputs[i]) :].strip()
+            if text_generated.startswith(decoded_inputs[i])
+            else text_generated
+            for i, text_generated in enumerate(generated_texts)
+        ]
+
+        records = [
+            {"input_text": input_text[i], "generated_text": generated_text}
+            for i, generated_text in enumerate(generated_texts)
+        ]
+
+        return records
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index 88dce8e591ae..a34e5e1a7e19 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from typing import List, Union
 
 from ..utils import (
@@ -96,7 +97,7 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt
 
     def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
         """
-        Assign labels to the image(s) passed as inputs.
+        Generate text based on the image(s) passed as inputs.
 
         Args:
             images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
@@ -128,6 +129,12 @@ def preprocess(self, image, prompt=None, timeout=None):
         image = load_image(image, timeout=timeout)
 
         if prompt is not None:
+            warnings.warn(
+                "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.45"
+                " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
+                FutureWarning,
+            )
+
             if not isinstance(prompt, str):
                 raise ValueError(
                     f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                 )
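Reviewer note: to make the deprecation path above concrete, a before/after sketch — the BLIP checkpoint is the task default registered earlier in this diff, and the snippet itself is illustrative:

```python
from transformers import pipeline

image = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"

# Before: `prompt` on the image-to-text pipeline (now emits a FutureWarning)
captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
old_output = captioner(image, prompt="A photo of")

# After: the dedicated image-text-to-text pipeline
pipe = pipeline("image-text-to-text", model="Salesforce/blip-image-captioning-base")
new_output = pipe(image, text="A photo of")
```
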
" diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index de739c6e7004..6b1244005db9 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -668,6 +668,9 @@ def __init__(self, *args, **kwargs): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None + + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None @@ -835,6 +838,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForImageTextToText(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForImageToImage(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/pipelines/test_pipelines_image_text_to_text.py b/tests/pipelines/test_pipelines_image_text_to_text.py new file mode 100644 index 000000000000..779248e7a236 --- /dev/null +++ b/tests/pipelines/test_pipelines_image_text_to_text.py @@ -0,0 +1,140 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available +from transformers.pipelines import ImageTextToTextPipeline, pipeline +from transformers.testing_utils import ( + is_pipeline_test, + require_torch, + require_vision, + slow, +) + +from .test_pipelines_common import ANY + + +if is_vision_available(): + from PIL import Image +else: + + class Image: + @staticmethod + def open(*args, **kwargs): + pass + + +@is_pipeline_test +@require_vision +class ImageTextToTextPipelineTests(unittest.TestCase): + model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"): + pipe = ImageTextToTextPipeline( + model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype + ) + examples = { + "images": [ + Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"), + "./tests/fixtures/tests_samples/COCO/000000039769.png", + ], + "text": [" This is a ", " Here I see a "], + } + return pipe, examples + + def run_pipeline_test(self, pipe, examples): + outputs = pipe(examples.get("images"), text=examples.get("text"), max_new_tokens=20) + self.assertEqual( + outputs, + [ + [{"input_text": ANY(str), "generated_text": ANY(str)}], + [{"input_text": ANY(str), "generated_text": ANY(str)}], + ], + ) + + @require_torch + def test_small_model_pt_token(self): + pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf") + image = "./tests/fixtures/tests_samples/COCO/000000039769.png" + text = " What this is? Assistant: This is" + + outputs = pipe(image, text=text, max_new_tokens=20) + self.assertEqual( + outputs, + [ + [ + { + "input_text": " What this is? Assistant: This is", + "generated_text": "a photo of two cats lying on a pink blanket. 
diff --git a/tests/pipelines/test_pipelines_image_text_to_text.py b/tests/pipelines/test_pipelines_image_text_to_text.py
new file mode 100644
index 000000000000..779248e7a236
--- /dev/null
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -0,0 +1,140 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
+from transformers.pipelines import ImageTextToTextPipeline, pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ImageTextToTextPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+    def get_test_pipeline(self, model, tokenizer, processor, torch_dtype="float32"):
+        pipe = ImageTextToTextPipeline(
+            model=model, tokenizer=tokenizer, image_processor=processor, torch_dtype=torch_dtype
+        )
+        examples = {
+            "images": [
+                Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+                "./tests/fixtures/tests_samples/COCO/000000039769.png",
+            ],
+            "text": ["<image> This is a ", "<image> Here I see a "],
+        }
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        outputs = pipe(examples.get("images"), text=examples.get("text"), max_new_tokens=20)
+        self.assertEqual(
+            outputs,
+            [
+                [{"input_text": ANY(str), "generated_text": ANY(str)}],
+                [{"input_text": ANY(str), "generated_text": ANY(str)}],
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt_token(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "<image> What this is? Assistant: This is"
+
+        outputs = pipe(image, text=text, max_new_tokens=20)
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "input_text": "<image> What this is? Assistant: This is",
+                        "generated_text": "a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                    }
+                ]
+            ],
+        )
+
+        outputs = pipe([image, image], text=[text, text], max_new_tokens=20)
+        self.assertEqual(
+            outputs,
+            [
+                [
+                    {
+                        "input_text": "<image> What this is? Assistant: This is",
+                        "generated_text": "a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                    }
+                ],
+                [
+                    {
+                        "input_text": "<image> What this is? Assistant: This is",
+                        "generated_text": "a photo of two cats lying on a pink blanket. The cats are sleeping and appear to be comfortable",
+                    }
+                ],
+            ],
+        )
+
+    @require_torch
+    def test_consistent_batching_behaviour(self):
+        pipe = pipeline("image-text-to-text", model="microsoft/kosmos-2-patch14-224")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        prompt = "a photo of"
+
+        outputs = pipe([image, image], text=[prompt, prompt], max_new_tokens=20)
+        outputs_batched = pipe([image, image], text=[prompt, prompt], max_new_tokens=20, batch_size=2)
+        self.assertEqual(outputs, outputs_batched)
+
+    @slow
+    @require_torch
+    def test_model_pt_chat_template(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+        image_ny = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+        image_chicago = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What’s the difference between these two images?"},
+                    {"type": "image"},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        outputs = pipe([image_ny, image_chicago], max_new_tokens=20, text=messages)
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "input_text": "<|im_start|>user\n\nWhat’s the difference between these two images?<|im_end|>\n<|im_start|>assistant\n",
+                    "generated_text": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
+                }
+            ],
+        )
diff --git a/tests/test_pipeline_mixin.py b/tests/test_pipeline_mixin.py
index 8a0ca08e8dab..08b0a2b3b3e5 100644
--- a/tests/test_pipeline_mixin.py
+++ b/tests/test_pipeline_mixin.py
@@ -40,6 +40,7 @@
 from .pipelines.test_pipelines_image_classification import ImageClassificationPipelineTests
 from .pipelines.test_pipelines_image_feature_extraction import ImageFeatureExtractionPipelineTests
 from .pipelines.test_pipelines_image_segmentation import ImageSegmentationPipelineTests
+from .pipelines.test_pipelines_image_text_to_text import ImageTextToTextPipelineTests
 from .pipelines.test_pipelines_image_to_image import ImageToImagePipelineTests
 from .pipelines.test_pipelines_image_to_text import ImageToTextPipelineTests
 from .pipelines.test_pipelines_mask_generation import MaskGenerationPipelineTests
@@ -73,6 +74,7 @@
     "image-segmentation": {"test": ImageSegmentationPipelineTests},
     "image-to-image": {"test": ImageToImagePipelineTests},
     "image-to-text": {"test": ImageToTextPipelineTests},
+    "image-text-to-text": {"test": ImageTextToTextPipelineTests},
     "mask-generation": {"test": MaskGenerationPipelineTests},
     "object-detection": {"test": ObjectDetectionPipelineTests},
     "question-answering": {"test": QAPipelineTests},
diff --git a/utils/update_metadata.py b/utils/update_metadata.py
index 1806eb3f03df..799a59a9a59a 100755
--- a/utils/update_metadata.py
+++ b/utils/update_metadata.py
@@ -110,6 +110,7 @@
         "AutoModelForVisualQuestionAnswering",
     ),
     ("image-to-text", "MODEL_FOR_FOR_VISION_2_SEQ_MAPPING_NAMES", "AutoModelForVision2Seq"),
"AutoModelForImageTextToText"), ( "zero-shot-image-classification", "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES",