Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
cff2439
Add pipeline
NielsRogge Mar 9, 2024
d254f58
More improvements
NielsRogge Mar 9, 2024
40051db
More improvements
NielsRogge Mar 9, 2024
55a40ad
Add support for Donut
NielsRogge Mar 9, 2024
cd0f3ac
More improvements
NielsRogge Mar 9, 2024
21c6fb9
More improvements
NielsRogge Mar 9, 2024
056e363
More improvements
NielsRogge Mar 10, 2024
0d6d7df
Fix tests
NielsRogge Mar 10, 2024
b48add3
Fix tests
NielsRogge Mar 11, 2024
e4541aa
Fix git tests
NielsRogge Mar 11, 2024
7cbb644
Fix merge
NielsRogge Mar 19, 2024
cfc8a13
Fix merge
NielsRogge Mar 19, 2024
04fcbfe
Merge branch 'feature/use_processor' of github.com:NielsRogge/transfo…
NielsRogge Mar 20, 2024
021334d
Fix merge
NielsRogge Mar 20, 2024
9c384cc
Update metadata
NielsRogge Mar 20, 2024
f6ba64d
Add support for idefics
NielsRogge Mar 20, 2024
fc4363a
Add pipeline
NielsRogge Mar 9, 2024
dc2ca31
More improvements
NielsRogge Mar 9, 2024
d575921
More improvements
NielsRogge Mar 9, 2024
fd77e76
Add support for Donut
NielsRogge Mar 9, 2024
5f772f1
More improvements
NielsRogge Mar 9, 2024
59855ad
More improvements
NielsRogge Mar 9, 2024
c2067b9
More improvements
NielsRogge Mar 10, 2024
40fe2f8
Fix tests
NielsRogge Mar 10, 2024
8b06c67
Fix tests
NielsRogge Mar 11, 2024
81db879
Fix git tests
NielsRogge Mar 11, 2024
7382075
Update metadata
NielsRogge Mar 20, 2024
8acf164
Add support for idefics
NielsRogge Mar 20, 2024
89ac5c4
Fix documentation test
NielsRogge Mar 22, 2024
b64070c
Merge remote-tracking branch 'upstream/main' into feature/use_processor
NielsRogge Apr 1, 2024
40bd731
Remove script
NielsRogge Apr 2, 2024
743a967
Fix merge
NielsRogge Apr 12, 2024
22d3d70
Address comments
NielsRogge Apr 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/source/en/model_doc/auto.md
Original file line number Diff line number Diff line change
Expand Up @@ -381,3 +381,7 @@ The following auto classes are available for the following multimodal tasks.
### FlaxAutoModelForVision2Seq

[[autodoc]] FlaxAutoModelForVision2Seq

### AutoModelForImageTextToText

[[autodoc]] AutoModelForImageTextToText
4 changes: 4 additions & 0 deletions docs/source/ja/model_doc/auto.md
Original file line number Diff line number Diff line change
Expand Up @@ -368,3 +368,7 @@ AutoModel.register(NewModelConfig, NewModel)
### FlaxAutoModelForVision2Seq

[[autodoc]] FlaxAutoModelForVision2Seq

### AutoModelForImageTextToText

[[autodoc]] AutoModelForImageTextToText
4 changes: 4 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1497,6 +1497,7 @@
"MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING",
"MODEL_FOR_IMAGE_MAPPING",
"MODEL_FOR_IMAGE_SEGMENTATION_MAPPING",
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
"MODEL_FOR_IMAGE_TO_IMAGE_MAPPING",
"MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING",
"MODEL_FOR_KEYPOINT_DETECTION_MAPPING",
Expand Down Expand Up @@ -1538,6 +1539,7 @@
"AutoModelForDocumentQuestionAnswering",
"AutoModelForImageClassification",
"AutoModelForImageSegmentation",
"AutoModelForImageTextToText",
"AutoModelForImageToImage",
"AutoModelForInstanceSegmentation",
"AutoModelForKeypointDetection",
Expand Down Expand Up @@ -6352,6 +6354,7 @@
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
Expand Down Expand Up @@ -6393,6 +6396,7 @@
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForImageToImage,
AutoModelForInstanceSegmentation,
AutoModelForKeypointDetection,
Expand Down
4 changes: 4 additions & 0 deletions src/transformers/models/auto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
"MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
"MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
"MODEL_FOR_VISION_2_SEQ_MAPPING",
"MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING",
"MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING",
"MODEL_MAPPING",
"MODEL_WITH_LM_HEAD_MAPPING",
Expand Down Expand Up @@ -119,6 +120,7 @@
"AutoModelWithLMHead",
"AutoModelForZeroShotImageClassification",
"AutoModelForZeroShotObjectDetection",
"AutoModelForImageTextToText",
]

try:
Expand Down Expand Up @@ -238,6 +240,7 @@
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
MODEL_FOR_IMAGE_MAPPING,
MODEL_FOR_IMAGE_SEGMENTATION_MAPPING,
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING,
MODEL_FOR_IMAGE_TO_IMAGE_MAPPING,
MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING,
MODEL_FOR_KEYPOINT_DETECTION_MAPPING,
Expand Down Expand Up @@ -279,6 +282,7 @@
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForImageToImage,
AutoModelForInstanceSegmentation,
AutoModelForKeypointDetection,
Expand Down
28 changes: 28 additions & 0 deletions src/transformers/models/auto/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,24 @@
("llava", "LlavaForConditionalGeneration"),
("llava_next", "LlavaNextForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("udop", "UdopForConditionalGeneration"),
("vipllava", "VipLlavaForConditionalGeneration"),
("vision-encoder-decoder", "VisionEncoderDecoderModel"),
]
)

MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict(
[
("blip", "BlipForConditionalGeneration"),
("blip-2", "Blip2ForConditionalGeneration"),
("fuyu", "FuyuForCausalLM"),
("git", "GitForCausalLM"),
("idefics", "IdeficsForVisionText2Text"),
("instructblip", "InstructBlipForConditionalGeneration"),
("kosmos-2", "Kosmos2ForConditionalGeneration"),
("llava", "LlavaForConditionalGeneration"),
("pix2struct", "Pix2StructForConditionalGeneration"),
("udop", "UdopForConditionalGeneration"),
("vipllava", "VipLlavaForConditionalGeneration"),
("vision-encoder-decoder", "VisionEncoderDecoderModel"),
]
Expand Down Expand Up @@ -1317,6 +1335,9 @@
CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
)
MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES)
MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
)
MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
)
Expand Down Expand Up @@ -1611,6 +1632,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass):
AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling")


class AutoModelForImageTextToText(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING


AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling")


class AutoModelForAudioClassification(_BaseAutoModelClass):
_model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING

Expand Down
12 changes: 4 additions & 8 deletions src/transformers/models/git/processing_git.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,19 +81,15 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")

text_features = {}
if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
text_features = self.tokenizer(text, return_tensors=return_tensors, **kwargs)

image_features = {}
if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
return BatchEncoding(data=dict(**text_features, **image_features), tensor_type=return_tensors)

def batch_decode(self, *args, **kwargs):
"""
Expand Down
25 changes: 25 additions & 0 deletions src/transformers/models/llava/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
"""Manual smoke-test script for the `image-text-to-text` pipeline.

NOTE(review): this is a throwaway debugging script that lives under
`src/transformers/models/llava/` — it should be removed (or moved to
`tests/`) before merging, since anything under `src/` ships with the
package.

Run it directly to caption two COCO cat images with a chosen checkpoint:

    python test.py
"""

from transformers import pipeline


# Checkpoints verified to work with this pipeline. Swap the active
# `MODEL_ID` line to try another one.
# MODEL_ID = "microsoft/git-base-coco"
MODEL_ID = "Salesforce/blip-image-captioning-base"
# MODEL_ID = "Salesforce/blip2-opt-2.7b"  # ok, although it doesn't include the text prompt in the output
# MODEL_ID = "Salesforce/instructblip-flan-t5-xl"  # ok, although it doesn't include the text prompt in the output
# MODEL_ID = "llava-hf/llava-1.5-7b-hf"
# MODEL_ID = "adept/fuyu-8b"
# MODEL_ID = "google/pix2struct-textcaps-base"
# MODEL_ID = "microsoft/udop-large"
# MODEL_ID = "naver-clova-ix/donut-base-finetuned-docvqa"
# MODEL_ID = "microsoft/kosmos-2-patch14-224"


def main():
    """Instantiate the pipeline and print generated captions."""
    # Downloads the checkpoint on first use — network access required.
    pipe = pipeline(task="image-text-to-text", model=MODEL_ID)

    outputs = pipe(
        images=["http://images.cocodataset.org/val2017/000000039769.jpg"],
        # Example chat-style prompt for LLaVA-family models:
        # text="USER: <image>\nWhat does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud\nASSISTANT:",
        text=["A photo of", "The cats are"],
        max_new_tokens=200,
    )

    print(outputs)


# Guard the entry point so importing this module has no side effects
# (the original ran model download + inference at import time).
if __name__ == "__main__":
    main()
40 changes: 40 additions & 0 deletions src/transformers/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage
from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import (
Expand Down Expand Up @@ -66,6 +67,7 @@
from .image_classification import ImageClassificationPipeline
from .image_feature_extraction import ImageFeatureExtractionPipeline
from .image_segmentation import ImageSegmentationPipeline
from .image_text_to_text import ImageTextToTextPipeline
from .image_to_image import ImageToImagePipeline
from .image_to_text import ImageToTextPipeline
from .mask_generation import MaskGenerationPipeline
Expand Down Expand Up @@ -118,6 +120,7 @@
AutoModelForDocumentQuestionAnswering,
AutoModelForImageClassification,
AutoModelForImageSegmentation,
AutoModelForImageTextToText,
AutoModelForMaskedLM,
AutoModelForMaskGeneration,
AutoModelForObjectDetection,
Expand Down Expand Up @@ -392,6 +395,17 @@
},
"type": "multimodal",
},
"image-text-to-text": {
"impl": ImageTextToTextPipeline,
"tf": (),
"pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
"default": {
"model": {
"pt": ("Salesforce/blip-image-captioning-base", "89b09ea"),
}
},
"type": "multimodal",
},
"object-detection": {
"impl": ObjectDetectionPipeline,
"tf": (),
Expand Down Expand Up @@ -566,6 +580,7 @@ def pipeline(
tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
image_processor: Optional[Union[str, BaseImageProcessor]] = None,
processor: Optional = None,
framework: Optional[str] = None,
revision: Optional[str] = None,
use_fast: bool = True,
Expand Down Expand Up @@ -917,6 +932,7 @@ def pipeline(
load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None

# If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while
# `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
Expand Down Expand Up @@ -1079,6 +1095,27 @@ def pipeline(
if not is_pyctcdecode_available():
logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode")

if load_processor:
# Try to infer processor from model or config name (if provided as str)
if processor is None:
if isinstance(model_name, str):
processor = model_name
elif isinstance(config, str):
processor = config
elif load_image_processor or load_feature_extractor:
pass
else:
# Impossible to guess what is the right processor here
raise Exception(
"Impossible to guess which processor to use. "
"Please provide a ProcessorMixin class or a path/identifier "
"to a pretrained processor."
)

# Instantiate processor if needed
if isinstance(processor, (str, tuple)):
processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs)

if task == "translation" and model.config.task_specific_params:
for key in model.config.task_specific_params:
if key.startswith("translation"):
Expand All @@ -1101,6 +1138,9 @@ def pipeline(
if image_processor is not None:
kwargs["image_processor"] = image_processor

if processor is not None:
kwargs["processor"] = processor

if device is not None:
kwargs["device"] = device

Expand Down
17 changes: 16 additions & 1 deletion src/transformers/pipelines/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,6 +708,7 @@ def build_pipeline_init_args(
has_tokenizer: bool = False,
has_feature_extractor: bool = False,
has_image_processor: bool = False,
has_processor: bool = False,
supports_binary_output: bool = True,
) -> str:
docstring = r"""
Expand All @@ -730,6 +731,11 @@ def build_pipeline_init_args(
image_processor ([`BaseImageProcessor`]):
The image processor that will be used by the pipeline to encode data for the model. This object inherits from
[`BaseImageProcessor`]."""
if has_processor:
docstring += r"""
processor ([`ProcessorMixin`]):
The processor that will be used by the pipeline to encode data for the model. This object inherits from
[`ProcessorMixin`]."""
docstring += r"""
modelcard (`str` or [`ModelCard`], *optional*):
Model card attributed to the model for this pipeline.
Expand Down Expand Up @@ -766,7 +772,11 @@ def build_pipeline_init_args(


PIPELINE_INIT_ARGS = build_pipeline_init_args(
has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, supports_binary_output=True
has_tokenizer=True,
has_feature_extractor=True,
has_image_processor=True,
has_processor=True,
supports_binary_output=True,
)


Expand Down Expand Up @@ -805,6 +815,7 @@ def __init__(
tokenizer: Optional[PreTrainedTokenizer] = None,
feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
image_processor: Optional[BaseImageProcessor] = None,
processor: Optional = None,
modelcard: Optional[ModelCard] = None,
framework: Optional[str] = None,
task: str = "",
Expand All @@ -822,6 +833,7 @@ def __init__(
self.tokenizer = tokenizer
self.feature_extractor = feature_extractor
self.image_processor = image_processor
self.processor = processor
self.modelcard = modelcard
self.framework = framework

Expand Down Expand Up @@ -952,6 +964,9 @@ def save_pretrained(self, save_directory: str, safe_serialization: bool = True):
if self.image_processor is not None:
self.image_processor.save_pretrained(save_directory)

if self.processor is not None:
self.processor.save_pretrained(save_directory)

if self.modelcard is not None:
self.modelcard.save_pretrained(save_directory)

Expand Down
Loading