diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md
index ab42c24d83e8..059312850876 100644
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
 ### FlaxAutoModelForVision2Seq
 
 [[autodoc]] FlaxAutoModelForVision2Seq
+
+### AutoModelForImageTextToText
+
+[[autodoc]] AutoModelForImageTextToText
diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md
index d4baaf70e6fd..0497573f2b77 100644
--- a/docs/source/ja/model_doc/auto.md
+++ b/docs/source/ja/model_doc/auto.md
@@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
 ### FlaxAutoModelForVision2Seq
 
 [[autodoc]] FlaxAutoModelForVision2Seq
+
+### AutoModelForImageTextToText
+
+[[autodoc]] AutoModelForImageTextToText
\ No newline at end of file
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index da29d77972f4..0e414eee0136 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1497,6 +1497,7 @@
             "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
             "MODEL_FOR_IMAGE_MAPPING",
             "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
+            "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
             "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
             "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
             "MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
@@ -1538,6 +1539,7 @@
             "AutoModelForDocumentQuestionAnswering",
             "AutoModelForImageClassification",
             "AutoModelForImageSegmentation",
+            "AutoModelForImageTextToText",
             "AutoModelForImageToImage",
             "AutoModelForInstanceSegmentation",
             "AutoModelForKeypointDetection",
@@ -6352,6 +6354,7 @@
             MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
             MODEL_FOR_IMAGE_MAPPING,
             MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+            MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
             MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
             MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
             MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -6393,6 +6396,7 @@
             AutoModelForDocumentQuestionAnswering,
             AutoModelForImageClassification,
             AutoModelForImageSegmentation,
+            AutoModelForImageTextToText,
             AutoModelForImageToImage,
             AutoModelForInstanceSegmentation,
             AutoModelForKeypointDetection,
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index 96a159133cc0..aed2df0f7c40 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -74,6 +74,7 @@
         "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
         "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
         "MODEL_FOR_VISION_2_SEQ_MAPPING",
+        "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
         "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
         "MODEL_MAPPING",
         "MODEL_WITH_LM_HEAD_MAPPING",
@@ -119,6 +120,7 @@
         "AutoModelWithLMHead",
         "AutoModelForZeroShotImageClassification",
         "AutoModelForZeroShotObjectDetection",
+        "AutoModelForImageTextToText",
     ]
 
 try:
@@ -238,6 +240,7 @@
             MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
             MODEL_FOR_IMAGE_MAPPING,
             MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
+            MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
             MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
             MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
             MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
@@ -279,6 +282,7 @@
             AutoModelForDocumentQuestionAnswering,
             AutoModelForImageClassification,
             AutoModelForImageSegmentation,
+            AutoModelForImageTextToText,
             AutoModelForImageToImage,
             AutoModelForInstanceSegmentation,
             AutoModelForKeypointDetection,
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index 150dea04f374..f1a5fa59acc6 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -680,6 +680,24 @@
         ("llava", "LlavaForConditionalGeneration"),
         ("llava_next", "LlavaNextForConditionalGeneration"),
         ("pix2struct", "Pix2StructForConditionalGeneration"),
+        ("udop", "UdopForConditionalGeneration"),
+        ("vipllava", "VipLlavaForConditionalGeneration"),
+        ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
+    ]
+)
+
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
+    [
+        ("blip", "BlipForConditionalGeneration"),
+        ("blip-2", "Blip2ForConditionalGeneration"),
+        ("fuyu", "FuyuForCausalLM"),
+        ("git", "GitForCausalLM"),
+        ("idefics", "IdeficsForVisionText2Text"),
+        ("instructblip", "InstructBlipForConditionalGeneration"),
+        ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("llava", "LlavaForConditionalGeneration"),
+        ("pix2struct", "Pix2StructForConditionalGeneration"),
+        ("udop", "UdopForConditionalGeneration"),
         ("vipllava", "VipLlavaForConditionalGeneration"),
         ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
     ]
@@ -1317,6 +1335,9 @@
     CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
 )
 MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+)
 MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
     CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
 )
@@ -1611,6 +1632,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
 AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")
 
 
+class AutoModelForImageTextToText(_BaseAutoModelClass):
+    _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+
+AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")
+
+
 class AutoModelForAudioClassification(_BaseAutoModelClass):
     _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING
 
diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py
index 2f0851c06274..556d834b554d 100644
--- a/src/transformers/models/git/processing_git.py
+++ b/src/transformers/models/git/processing_git.py
@@ -81,19 +81,15 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         if text is None and images is None:
             raise ValueError("You have to specify either text or images. Both cannot be none.")
 
+        text_features = {}
         if text is not None:
-            encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
+            text_features = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
 
+        image_features = {}
         if images is not None:
             image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
 
-        if text is not None and images is not None:
-            encoding["pixel_values"] = image_features.pixel_values
-            return encoding
-        elif text is not None:
-            return encoding
-        else:
-            return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
+        return BatchEncoding(data=dict(**text_features, **image_features), tensor_type=return_tensors)
 
     def batch_decode(self, *args, **kwargs):
         """
diff --git a/src/transformers/models/llava/test.py b/src/transformers/models/llava/test.py
new file mode 100644
index 000000000000..025482e2bc27
--- /dev/null
+++ b/src/transformers/models/llava/test.py
@@ -0,0 +1,25 @@
+from transformers import pipeline
+
+
+# OK:
+# model_id = "microsoft/git-base-coco"
+model_id = "Salesforce/blip-image-captioning-base"
+# model_id = "Salesforce/blip2-opt-2.7b" ok, although it doesn't include the text prompt in the output
+# model_id = "Salesforce/instructblip-flan-t5-xl" ok, although it doesn't include the text prompt in the output
+# model_id = "llava-hf/llava-1.5-7b-hf"
+# model_id = "adept/fuyu-8b"
+# model_id = "google/pix2struct-textcaps-base"
+# model_id = "microsoft/udop-large"
+# model_id = "naver-clova-ix/donut-base-finetuned-docvqa"
+# model_id = "microsoft/kosmos-2-patch14-224"
+
+pipe = pipeline(task="image-text-to-text", model=model_id)
+
+outputs = pipe(
+    images=["http://images.cocodataset.org/val2017/000000039769.jpg"],
+    # text="USER: <image>\nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud\nASSISTANT:",
+    text=["A photo of", "The cats are"],
+    max_new_tokens=200,
+)
+
+print(outputs)
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 8ee0137a20b3..02be38c6b112 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -28,6 +28,7 @@
 from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
 from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
 from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage
+from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..tokenization_utils import PreTrainedTokenizer
 from ..utils import (
@@ -66,6 +67,7 @@
 from .image_classification import ImageClassificationPipeline
 from .image_feature_extraction import ImageFeatureExtractionPipeline
 from .image_segmentation import ImageSegmentationPipeline
+from .image_text_to_text import ImageTextToTextPipeline
 from .image_to_image import ImageToImagePipeline
 from .image_to_text import ImageToTextPipeline
 from .mask_generation import MaskGenerationPipeline
@@ -118,6 +120,7 @@
         AutoModelForDocumentQuestionAnswering,
         AutoModelForImageClassification,
         AutoModelForImageSegmentation,
+        AutoModelForImageTextToText,
         AutoModelForMaskedLM,
         AutoModelForMaskGeneration,
         AutoModelForObjectDetection,
@@ -392,6 +395,17 @@
         },
         "type": "multimodal",
     },
+    "image-text-to-text": {
+        "impl": ImageTextToTextPipeline,
+        "tf": (),
+        "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
+        "default": {
+            "model": {
+                "pt": ("Salesforce/blip-image-captioning-base", "89b09ea"),
+            }
+        },
+        "type": "multimodal",
+    },
     "object-detection": {
         "impl": ObjectDetectionPipeline,
         "tf": (),
@@ -566,6 +580,7 @@ def pipeline(
     tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
     feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
     image_processor: Optional[Union[str, BaseImageProcessor]] = None,
+    processor: Optional = None,
     framework: Optional[str] = None,
     revision: Optional[str] = None,
     use_fast: bool = True,
@@ -917,6 +932,7 @@ def pipeline(
     load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
     load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
     load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
+    load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None
 
     # If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
     # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
@@ -1079,6 +1095,27 @@ def pipeline(
                     if not is_pyctcdecode_available():
                         logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")
 
+    if load_processor:
+        # Try to infer processor from model or config name (if provided as str)
+        if processor is None:
+            if isinstance(model_name, str):
+                processor = model_name
+            elif isinstance(config, str):
+                processor = config
+            elif load_image_processor or load_feature_extractor:
+                pass
+            else:
+                # Impossible to guess what is the right processor here
+                raise Exception(
+                    "Impossible to guess which processor to use. "
+                    "Please provide a ProcessorMixin class or a path/identifier "
+                    "to a pretrained processor."
+                )
+
+        # Instantiate processor if needed
+        if isinstance(processor, (str, tuple)):
+            processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+
     if task == "translation" and model.config.task_specific_params:
         for key in model.config.task_specific_params:
             if key.startswith("translation"):
@@ -1101,6 +1138,9 @@ def pipeline(
     if image_processor is not None:
         kwargs["image_processor"] = image_processor
 
+    if processor is not None:
+        kwargs["processor"] = processor
+
     if device is not None:
         kwargs["device"] = device
 
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index fa1f2fcf5dfa..e69f7f42eaac 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -708,6 +708,7 @@ def build_pipeline_init_args(
     has_tokenizer: bool = False,
     has_feature_extractor: bool = False,
     has_image_processor: bool = False,
+    has_processor: bool = False,
     supports_binary_output: bool = True,
 ) -> str:
     docstring = r"""
@@ -730,6 +731,11 @@ def build_pipeline_init_args(
         image_processor ([`BaseImageProcessor`]):
             The image processor that will be used by the pipeline to encode data for the model. This object inherits from
             [`BaseImageProcessor`]."""
+    if has_processor:
+        docstring += r"""
+        processor ([`ProcessorMixin`]):
+            The processor that will be used by the pipeline to encode data for the model. This object inherits from
+            [`ProcessorMixin`]."""
     docstring += r"""
         modelcard (`str` or [`ModelCard`], *optional*):
             Model card attributed to the model for this pipeline.
@@ -766,7 +772,11 @@ def build_pipeline_init_args(
 
 
 PIPELINE_INIT_ARGS = build_pipeline_init_args(
-    has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True
+    has_tokenizer=True,
+    has_feature_extractor=True,
+    has_image_processor=True,
+    has_processor=True,
+    supports_binary_output=True,
 )
 
 
@@ -805,6 +815,7 @@ def __init__(
         tokenizer: Optional[PreTrainedTokenizer] = None,
         feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
         image_processor: Optional[BaseImageProcessor] = None,
+        processor: Optional = None,
         modelcard: Optional[ModelCard] = None,
         framework: Optional[str] = None,
         task: str = "",
@@ -822,6 +833,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.feature_extractor = feature_extractor
         self.image_processor = image_processor
+        self.processor = processor
         self.modelcard = modelcard
         self.framework = framework
 
@@ -952,6 +964,9 @@ def save_pretrained(self, save_directory: str, safe_serialization: bool = True):
         if self.image_processor is not None:
             self.image_processor.save_pretrained(save_directory)
 
+        if self.processor is not None:
+            self.processor.save_pretrained(save_directory)
+
         if self.modelcard is not None:
             self.modelcard.save_pretrained(save_directory)
 
diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py
new file mode 100644
index 000000000000..6148b9f80427
--- /dev/null
+++ b/src/transformers/pipelines/image_text_to_text.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Union
+
+from ..utils import (
+    add_end_docstrings,
+    is_torch_available,
+    is_vision_available,
+    logging,
+    requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from ..image_utils import load_image
+
+
+if is_torch_available():
+    from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_processor=True))
+class ImageTextToTextPipeline(Pipeline):
+    """
+    Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text.
+
+    Example:
+
+    ```python
+    >>> from transformers import pipeline
+
+    >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")
+    >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
+    [{'generated_text': 'a photo of two birds'}]
+    ```
+
+    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+    This image-text to text pipeline can currently be loaded from pipeline() using the following task identifier:
+    "image-text-to-text".
+
+    See the list of available models on
+    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text).
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        requires_backends(self, "vision")
+        self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)
+
+    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, text=None, timeout=None):
+        forward_kwargs = {}
+        preprocess_params = {}
+
+        if text is not None:
+            preprocess_params["text"] = text
+        if timeout is not None:
+            preprocess_params["timeout"] = timeout
+
+        if generate_kwargs is not None:
+            forward_kwargs["generate_kwargs"] = generate_kwargs
+        if max_new_tokens is not None:
+            if "generate_kwargs" not in forward_kwargs:
+                forward_kwargs["generate_kwargs"] = {}
+            if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
+                raise ValueError(
+                    "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
+                    " please use only one"
+                )
+            forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens
+
+        return preprocess_params, forward_kwargs, {}
+
+    def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+        """
+        Generate a text given text and the image(s) passed as inputs.
+
+        Args:
+            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+                The pipeline handles three types of images:
+
+                - A string containing a HTTP(s) link pointing to an image
+                - A string containing a local path to an image
+                - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images.
+
+            text (`str`):
+                The text to be used as a prompt for the generation.
+
+            max_new_tokens (`int`, *optional*):
+                The amount of maximum tokens to generate. By default it will use `generate` default.
+
+            generate_kwargs (`Dict`, *optional*):
+                Pass it to send all of these arguments directly to `generate` allowing full control of this function.
+            timeout (`float`, *optional*, defaults to None):
+                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+                the call may block forever.
+
+        Return:
+            A list or a list of list of `dict`: Each result comes as a dictionary with the following key:
+
+            - **generated_text** (`str`) -- The generated text.
+        """
+        return super().__call__(images, **kwargs)
+
+    def preprocess(self, image=None, text=None, timeout=None):
+        if image is not None:
+            image = load_image(image, timeout=timeout)
+
+        model_type = self.model.config.model_type
+
+        kwargs = {}
+
+        if model_type == "pix2struct":
+            kwargs = {"add_special_tokens": False}
+
+        if model_type == "idefics":
+            model_inputs = self.processor(text, return_tensors=self.framework, **kwargs)
+        else:
+            model_inputs = self.processor(images=image, text=text, return_tensors=self.framework, **kwargs)
+
+        if model_type == "git":
+            # remove EOS token from input_ids and attention_mask
+            model_inputs["input_ids"] = model_inputs["input_ids"][:, :-1]
+            model_inputs["attention_mask"] = model_inputs["attention_mask"][:, :-1]
+
+        if model_type == "vision-encoder-decoder" and self.processor.__class__.__name__ == "DonutProcessor":
+            model_inputs["decoder_input_ids"] = self.processor.tokenizer(
+                text,
+                add_special_tokens=False,
+                return_tensors=self.framework,
+            ).input_ids
+
+        return model_inputs
+
+    def _forward(self, model_inputs, generate_kwargs=None):
+        if generate_kwargs is None:
+            generate_kwargs = {}
+
+        model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
+        return model_outputs
+
+    def postprocess(self, model_outputs):
+        records = []
+        generated_texts = self.processor.batch_decode(
+            model_outputs,
+            skip_special_tokens=True,
+        )
+
+        records = [{"generated_text": text} for text in generated_texts]
+
+        return records
diff --git a/src/transformers/pipelines/image_to_text.py b/src/transformers/pipelines/image_to_text.py
index 4a9a3744d841..3b117ae311f0 100644
--- a/src/transformers/pipelines/image_to_text.py
+++ b/src/transformers/pipelines/image_to_text.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 from typing import List, Union
 
 from ..utils import (
@@ -96,7 +97,7 @@ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt
 
     def __call__(self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs):
         """
-        Assign labels to the image(s) passed as inputs.
+        Generate text based on the image(s) passed as inputs.
 
         Args:
             images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
@@ -128,6 +129,12 @@ def preprocess(self, image, prompt=None, timeout=None):
         image = load_image(image, timeout=timeout)
 
         if prompt is not None:
+            warnings.warn(
+                "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.45"
+                " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
+                FutureWarning,
+            )
+
             if not isinstance(prompt, str):
                 raise ValueError(
                     f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 1bdab80a13f6..8c10b918732f 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -600,6 +600,9 @@ def __init__(self, *args, **kwargs):
 MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None
 
 
+MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None
+
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None
 
 
@@ -767,6 +770,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class AutoModelForImageTextToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class AutoModelForImageToImage(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py
index 6a891f17b06a..869a597fc4cc 100644
--- a/tests/models/git/test_modeling_git.py
+++ b/tests/models/git/test_modeling_git.py
@@ -398,9 +398,7 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
     all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else ()
     pipeline_model_mapping = (
-        {"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM}
-        if is_torch_available()
-        else {}
+        {"feature-extraction": GitModel, "image-to-text": GitForCausalLM} if is_torch_available() else {}
     )
     fx_compatible = False
     test_torchscript = False
diff --git a/tests/pipelines/test_pipelines_image_text_to_text.py b/tests/pipelines/test_pipelines_image_text_to_text.py
new file mode 100644
index 000000000000..91a4571d3a4d
--- /dev/null
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -0,0 +1,176 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import requests
+
+from transformers import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, is_vision_available
+from transformers.pipelines import pipeline
+from transformers.testing_utils import (
+    is_pipeline_test,
+    require_torch,
+    require_vision,
+    slow,
+)
+
+from .test_pipelines_common import ANY
+
+
+if is_vision_available():
+    from PIL import Image
+else:
+
+    class Image:
+        @staticmethod
+        def open(*args, **kwargs):
+            pass
+
+
+@is_pipeline_test
+@require_vision
+class ImageToTextPipelineTests(unittest.TestCase):
+    model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING
+
+    def get_test_pipeline(self, model, processor):
+        pipe = pipeline("image-text-to-text", model=model, processor=processor)
+        examples = [
+            Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            "./tests/fixtures/tests_samples/COCO/000000039769.png",
+        ]
+        return pipe, examples
+
+    def run_pipeline_test(self, pipe, examples):
+        outputs = pipe(examples)
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": ANY(str)}],
+                [{"generated_text": ANY(str)}],
+            ],
+        )
+
+    @require_torch
+    def test_small_model_pt(self):
+        pipe = pipeline(
+            "image-text-to-text",
+            model="hf-internal-testing/tiny-random-BlipForConditionalGeneration",
+            processor="hf-internal-testing/tiny-random-BlipForConditionalGeneration",
+        )
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "hello world"
+
+        outputs = pipe(image, text=text)
+        self.assertEqual(
+            outputs,
+            [{"generated_text": "hello world 陽ɔ 劇र ♯ɔง 藥 ਾ"}],
+        )
+
+        outputs = pipe([image, image], text=text)
+        self.assertEqual(
+            outputs,
+            [
+                [{"generated_text": "hello world 陽ɔ 劇र ♯ɔง 藥 ਾ"}],
+                [{"generated_text": "hello world 陽ɔ 劇र ♯ɔง 藥 ਾ"}],
+            ],
+        )
+
+    @require_torch
+    def test_consistent_batching_behaviour(self):
+        pipe = pipeline("image-text-to-text", model="hf-internal-testing/tiny-random-BlipForConditionalGeneration")
+        image = "./tests/fixtures/tests_samples/COCO/000000039769.png"
+        text = "a photo of"
+
+        outputs = pipe([image, image], text=text)
+        self.assertTrue(outputs[0][0]["generated_text"].startswith(text))
+        self.assertTrue(outputs[1][0]["generated_text"].startswith(text))
+
+        outputs = pipe([image, image], text=text, batch_size=2)
+        self.assertTrue(outputs[0][0]["generated_text"].startswith(text))
+        self.assertTrue(outputs[1][0]["generated_text"].startswith(text))
+
+        from torch.utils.data import Dataset
+
+        class MyDataset(Dataset):
+            def __len__(self):
+                return 5
+
+            def __getitem__(self, i):
+                return "./tests/fixtures/tests_samples/COCO/000000039769.png"
+
+        dataset = MyDataset()
+        for batch_size in (1, 2, 4):
+            outputs = pipe(dataset, text=text, batch_size=batch_size if batch_size > 1 else None)
+            self.assertTrue(list(outputs)[0][0]["generated_text"].startswith(text))
+            self.assertTrue(list(outputs)[1][0]["generated_text"].startswith(text))
+
+    @slow
+    @require_torch
+    def test_blip_pt(self):
+        pipe = pipeline("image-text-to-text", model="Salesforce/blip-image-captioning-base")
+        url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        text = "a photo of a"
+
+        outputs = pipe(image, text=text)
+        self.assertEqual(outputs, [{"generated_text": "a photo of a pink pokemon with a blue shirt"}])
+
+    @slow
+    @require_torch
+    def test_git_pt(self):
+        pipe = pipeline("image-text-to-text", model="microsoft/git-base-coco")
+        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        text = "a photo of a"
+
+        outputs = pipe(image, text=text)
+        self.assertEqual(outputs, [{"generated_text": "a photo of a tent with a tent and a tent in the background."}])
+
+    @slow
+    @require_torch
+    def test_pix2struct_pt(self):
+        pipe = pipeline("image-text-to-text", model="google/pix2struct-textcaps-base")
+        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        text = "A photo of a"
+
+        outputs = pipe(image, text=text)
+        self.assertEqual(outputs, [{"generated_text": "A photo of a clock with the numbers 1, 9, and 12 on"}])
+
+    @slow
+    @require_torch
+    def test_llava_pt(self):
+        pipe = pipeline("image-text-to-text", model="llava-hf/bakLlava-v1-hf")
+
+        text = (
+            "<image>\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT:"
+        )
+
+        outputs = pipe(
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
+            text=text,
+            generate_kwargs={"max_new_tokens": 200},
+        )
+
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "generated_text": "\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava"
+                }
+            ],
+        )
diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py
index e2d59968ebf4..c77353a261f9 100644
--- a/tests/pipelines/test_pipelines_image_to_text.py
+++ b/tests/pipelines/test_pipelines_image_to_text.py
@@ -290,7 +290,7 @@ def test_conditional_generation_llava(self):
             outputs,
             [
                 {
-                    "generated_text": "<image> \nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava"
+                    "generated_text": "\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava"
                 }
             ],
         )
diff --git a/utils/update_metadata.py b/utils/update_metadata.py
index 0762c4c2aa73..eb9fc469651f 100755
--- a/utils/update_metadata.py
+++ b/utils/update_metadata.py
@@ -109,6 +109,7 @@
         "AutoModelForVisualQuestionAnswering",
     ),
     ("image-to-text", "MODEL_FOR_FOR_VISION_2_SEQ_MAPPING_NAMES", "AutoModelForVision2Seq"),
+    ("image-text-to-text", "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES", "AutoModelForImageTextToText"),
     (
         "zero-shot-image-classification",
         "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES",