diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx index 7957f453a2fb..b39920151db4 100644 --- a/docs/source/en/model_doc/auto.mdx +++ b/docs/source/en/model_doc/auto.mdx @@ -254,6 +254,10 @@ The following auto classes are available for the following computer vision tasks [[autodoc]] AutoModelForInstanceSegmentation +### AutoModelForUniversalSegmentation + +[[autodoc]] AutoModelForUniversalSegmentation + ### AutoModelForZeroShotObjectDetection [[autodoc]] AutoModelForZeroShotObjectDetection diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e6758c7df38f..085528a51c55 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -943,6 +943,7 @@ "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", @@ -974,6 +975,7 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", + "AutoModelForUniversalSegmentation", "AutoModelForVideoClassification", "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", @@ -4112,6 +4114,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, @@ -4143,6 +4146,7 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForUniversalSegmentation, AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index a6ee30366b39..da8ceb8e7e62 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -67,6 +67,7 @@ "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", @@ -97,6 +98,7 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", + "AutoModelForUniversalSegmentation", "AutoModelForVideoClassification", "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", @@ -222,6 +224,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, @@ -253,6 +256,7 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForUniversalSegmentation, AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index c8dcc9aed1e7..ec25db48e233 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -434,6 +434,15 @@ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict( [ # Model for Instance Segmentation mapping + # MaskFormerForInstanceSegmentation can be removed from this mapping in v5 + ("maskformer", "MaskFormerForInstanceSegmentation"), + ] +) + +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Universal Segmentation mapping + ("detr", "DetrForSegmentation"), ("maskformer", "MaskFormerForInstanceSegmentation"), ] ) @@ -891,6 +900,9 @@ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES ) +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES +) MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES ) @@ -1082,6 +1094,15 @@ class AutoModelForSemanticSegmentation(_BaseAutoModelClass): ) +class AutoModelForUniversalSegmentation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING + + +AutoModelForUniversalSegmentation = auto_class_update( + AutoModelForUniversalSegmentation, head_doc="universal image segmentation" +) + + class AutoModelForInstanceSegmentation(_BaseAutoModelClass): _model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py index 55b2217ccde2..9fdb0dc3314d 100644 --- a/src/transformers/pipelines/image_segmentation.py +++ b/src/transformers/pipelines/image_segmentation.py @@ -16,6 +16,7 @@ MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING, ) @@ -75,6 +76,7 @@ def __init__(self, *args, **kwargs): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING.items() + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING.items() + MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING.items() + + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING.items() ) ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index e8fcfa496932..1176f94cec33 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -446,6 +446,9 @@ def __init__(self, *args, **kwargs): MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = None + + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None @@ -639,6 +642,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForUniversalSegmentation(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForVideoClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/utils/check_repo.py b/utils/check_repo.py index c72c089d7906..1c2fd4b45c41 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -239,7 +239,6 @@ "VisualBertForMultipleChoice", "TFWav2Vec2ForCTC", "TFHubertForCTC", - "MaskFormerForInstanceSegmentation", "XCLIPVisionModel", "XCLIPTextModel", ]