diff --git a/docs/source/index.rst b/docs/source/index.rst index fcbedc343ea6..4414d3443fd4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -125,134 +125,134 @@ conversion utilities for the following models: 14. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -14. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers +15. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers `__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -15. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain +16. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -16. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with +17. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -17. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT +18. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -18. :doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & +19. 
:doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & distillation through attention `__ by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -19. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale +20. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation `__ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -20. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a +21. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 `__, RoBERTa into `DistilRoBERTa `__, Multilingual BERT into `DistilmBERT `__ and a German version of DistilBERT. -21. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: +22. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators `__ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -22. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model +23. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -23. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: +24. 
:doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing `__ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -24. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative +25. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -25. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask +26. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -26. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo +27. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo `__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -27. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization +28. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization `__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer -28. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training +29. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -29. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer +30. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -30. 
:doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document +31. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -31. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity +32. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention `__ by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -32. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality +33. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering `__ by Hao Tan and Mohit Bansal. -33. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual +34. :doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual Machine Translation `__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -34. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by +35. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft Translator Team. -35. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for +36. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -36. 
:doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible +37. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -37. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training +38. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -38. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training +39. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -39. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted +40. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -40. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained +41. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -41. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted +42. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization `__> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -42. 
:doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting +43. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -43. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +44. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -44. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +45. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -45. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper +46. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -46. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP +47. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -47. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +48. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -48. 
:doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +49. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -49. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +50. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -50. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 +51. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -51. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for +52. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -52. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +53. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -53. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +54. 
:doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -54. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +55. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -55. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +56. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -56. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised +57. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. diff --git a/docs/source/model_doc/detr.rst b/docs/source/model_doc/detr.rst index 410df7d5d7c3..65c1aeffb237 100644 --- a/docs/source/model_doc/detr.rst +++ b/docs/source/model_doc/detr.rst @@ -55,12 +55,12 @@ than usual, but with a smaller :obj:`d_model` (which in NLP is typically 768 or Next, this is sent through the encoder, outputting :obj:`encoder_hidden_states` of the same shape (you can consider these as image features). Next, so-called **object queries** are sent through the decoder. This is a tensor of shape :obj:`(batch_size, num_queries, d_model)`, with :obj:`num_queries` typically set to 100 and initialized with zeros. 
-These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to the -encoder, they are added to the input of each attention layer. Each object query will look for a particular object in the -image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers to output -:obj:`decoder_hidden_states` of the same shape: :obj:`(batch_size, num_queries, d_model)`. Next, two heads are added on top -for object detection: a linear layer for classifying each object query into one of the objects or "no object", and a MLP to -predict bounding boxes for each query. +These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to +the encoder, they are added to the input of each attention layer. Each object query will look for a particular object +in the image. The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers +to output :obj:`decoder_hidden_states` of the same shape: :obj:`(batch_size, num_queries, d_model)`. Next, two heads +are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no +object", and a MLP to predict bounding boxes for each query. The model is trained using a **bipartite matching loss**: so what we actually do is compare the predicted classes + bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N @@ -89,15 +89,17 @@ Tips: `num_boxes` variable in the `SetCriterion` class of `modeling_detr.py`. When training on multiple nodes, this should be set to the average number of target boxes across all nodes, as can be seen in the original implementation `here `__. -- :class:`~transformers.DetrForObjectDetection` can be initialized with any convolutional backbone available in the `timm - library `__. 
Initializing with a MobileNet backbone for example can be - done by setting the :obj:`backbone` attribute of :class:`~transformers.DetrConfig` to :obj:`"tf_mobilenetv3_small_075"`, - and then initializing :class:`~transformers.DetrForObjectDetection` with that config. -- At inference time, DETR resizes the input images such that the shortest side is at least 800 pixels while the longest at most - 1333 pixels. One can use :class:`~transformers.DetrFeatureExtractor` to prepare images (and optional annotations in COCO format) - for the model. Due to this, images in a batch can have different sizes. DETR solves this by padding images up to the largest - size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding. Alternatively, one can also - define a custom :obj:`collate_fn` in order to batch images together, using :meth:`~transformers.DetrFeatureExtractor.pad_and_create_pixel_mask`. +- :class:`~transformers.DetrForObjectDetection` can be initialized with any convolutional backbone available in the + `timm library `__. Initializing with a MobileNet backbone for + example can be done by setting the :obj:`backbone` attribute of :class:`~transformers.DetrConfig` to + :obj:`"tf_mobilenetv3_small_075"`, and then initializing :class:`~transformers.DetrForObjectDetection` with that + config. +- At inference time, DETR resizes the input images such that the shortest side is at least 800 pixels while the longest + at most 1333 pixels. One can use :class:`~transformers.DetrFeatureExtractor` to prepare images (and optional + annotations in COCO format) for the model. Due to this, images in a batch can have different sizes. DETR solves this + by padding images up to the largest size in a batch, and by creating a pixel mask that indicates which pixels are + real/which are padding. 
Alternatively, one can also define a custom :obj:`collate_fn` in order to batch images + together, using :meth:`~transformers.DetrFeatureExtractor.pad_and_create_pixel_mask`. DetrConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 85b1214c773c..51f75bb8ffed 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1527,6 +1527,7 @@ is_sklearn_available, is_speech_available, is_tf_available, + is_timm_available, is_tokenizers_available, is_torch_available, is_torch_tpu_available, diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 8d714138e8c2..c50b27704697 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -71,6 +71,8 @@ class DetrConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). init_std (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + init_xavier_std (:obj:`float`, `optional`, defaults to 1.): + The scaling factor used for the Xavier initialization gain in the HM Attention map module. encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. 
@@ -142,6 +144,7 @@ def __init__( attention_dropout=0.0, activation_dropout=0.0, init_std=0.02, + init_xavier_std=1.0, classifier_dropout=0.0, scale_embedding=False, auxiliary_loss=False, @@ -176,6 +179,7 @@ def __init__( self.activation_dropout = activation_dropout self.activation_function = activation_function self.init_std = init_std + self.init_xavier_std = init_xavier_std self.encoder_layerdrop = encoder_layerdrop self.decoder_layerdrop = decoder_layerdrop self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py index cfae8710a4d3..a093e4d734b2 100644 --- a/src/transformers/models/detr/feature_extraction_detr.py +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -460,8 +460,8 @@ def __call__( :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields: - **pixel_values** -- Pixel values to be fed to a model. - - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if `"pixel_mask"` - is in :obj:`self.model_input_names`). + - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if + `"pixel_mask"` is in :obj:`self.model_input_names`). """ # Input type checking for clearer error @@ -623,9 +623,9 @@ def _max_by_axis(self, the_list): maxes[index] = max(maxes[index], item) return maxes - def pad_and_create_pixel_mask(self, - pixel_values_list: List[torch.Tensor], - return_tensors: Optional[Union[str, TensorType]] = None): + def pad_and_create_pixel_mask( + self, pixel_values_list: List[torch.Tensor], return_tensors: Optional[Union[str, TensorType]] = None + ): """ Pad images up to the largest image in a batch and create a corresponding :obj:`pixel_mask`. 
@@ -641,11 +641,11 @@ def pad_and_create_pixel_mask(self, :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields: - **pixel_values** -- Pixel values to be fed to a model. - - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if `"pixel_mask"` - is in :obj:`self.model_input_names`). + - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if + `"pixel_mask"` is in :obj:`self.model_input_names`). """ - + max_size = self._max_by_axis([list(image.shape) for image in pixel_values_list]) c, h, w = max_size padded_images = [] diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index 43a7320ffa11..653bd93f8a9e 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -100,6 +100,11 @@ class DetrObjectDetectionOutput(ModelOutput): Optional, only returned when auxilary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to `True`) and labels are provided. It is a list of dictionnaries containing the two above keys (:obj:`logits` and :obj:`pred_boxes`) for each decoder layer. + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the decoder at the output of @@ -129,6 +134,7 @@ class DetrObjectDetectionOutput(ModelOutput): logits: torch.FloatTensor = None pred_boxes: torch.FloatTensor = None auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None cross_attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -138,16 +144,72 @@ class DetrObjectDetectionOutput(ModelOutput): @dataclass -class DetrForSegmentationOutput(DetrObjectDetectionOutput): +class DetrForSegmentationOutput(ModelOutput): """ - This class adds one attribute to DetrObjectDetectionOutput, namely predicted masks. + Output type of :class:`~transformers.DetrForSegmentation`. Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` are provided): + Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a + bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized + scale-invariant IoU loss. + loss_dict (:obj:`Dict`, `optional`): + A dictionary containing the individual losses. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, num_classes + 1)`): + Classification logits (including no-object) for all queries. + pred_boxes (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, 4)`): + Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These + values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding + possible padding). You can use :class:`~transformers.DetrForObjectDetection.post_process` to retrieve the + unnormalized bounding boxes. pred_masks (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_queries, width, height)`): ... 
+ auxiliary_outputs (:obj:`list[Dict]`, `optional`): + Optional, only returned when auxiliary losses are activated (i.e. :obj:`config.auxiliary_loss` is set to + `True`) and labels are provided. It is a list of dictionaries containing the two above keys (:obj:`logits` + and :obj:`pred_boxes`) for each decoder layer. + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the decoder at the output of + each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the + attention softmax, used to compute the weighted average in the cross-attention heads. 
+ encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of + each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights of the encoder, after the attention softmax, used to + compute the weighted average in the self-attention heads. 
""" + loss: Optional[torch.FloatTensor] = None + loss_dict: Optional[Dict] = None + logits: torch.FloatTensor = None + pred_boxes: torch.FloatTensor = None pred_masks: torch.FloatTensor = None + auxiliary_outputs: Optional[List[Dict]] = None + last_hidden_state: Optional[torch.FloatTensor] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None # BELOW: utilities copied from @@ -676,7 +738,9 @@ class DetrPreTrainedModel(PreTrainedModel): def _init_weights(self, module): std = self.config.init_std - if isinstance(module, nn.Linear): + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=std) if module.bias is not None: module.bias.data.zero_() @@ -1412,6 +1476,7 @@ class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number logits=logits, pred_boxes=pred_boxes, auxiliary_outputs=auxiliary_outputs, + last_hidden_state=outputs.last_hidden_state, decoder_hidden_states=outputs.decoder_hidden_states, decoder_attentions=outputs.decoder_attentions, cross_attentions=outputs.cross_attentions, @@ -1424,7 +1489,7 @@ class labels themselves should be a :obj:`torch.LongTensor` of len :obj:`(number @add_start_docstrings( """ DETR Model (consisting of a backbone and encoder-decoder Transformer) with a segmentation head on top, for tasks - such as COCO panoptic. + such as COCO panoptic. 
""", DETR_START_DOCSTRING, @@ -1439,9 +1504,16 @@ def __init__(self, config: DetrConfig): # segmentation head hidden_size, number_of_heads = config.d_model, config.encoder_attention_heads - self.bbox_attention = DetrMHAttentionMap(hidden_size, hidden_size, number_of_heads, dropout=0.0) self.mask_head = DetrMaskHeadSmallConv(hidden_size + number_of_heads, [1024, 512, 256], hidden_size) + self.init_weights() + + # The DetrMHAttentionMap has a custom layer initialization scheme which must not get overwritten by the + # self.init_weights() + self.bbox_attention = DetrMHAttentionMap( + hidden_size, hidden_size, number_of_heads, dropout=0.0, std=config.init_xavier_std + ) + @add_start_docstrings_to_model_forward(DETR_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=DetrForSegmentationOutput, config_class=_CONFIG_FOR_DOC) def forward( @@ -1622,6 +1694,7 @@ def forward( pred_boxes=pred_boxes, pred_masks=pred_masks, auxiliary_outputs=auxiliary_outputs, + last_hidden_state=decoder_outputs.last_hidden_state, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, cross_attentions=decoder_outputs.cross_attentions, @@ -1717,7 +1790,7 @@ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): class DetrMHAttentionMap(nn.Module): """This is a 2D attention module, which only returns the attention softmax (no multiplication by value)""" - def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): + def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std=None): super().__init__() self.num_heads = num_heads self.hidden_dim = hidden_dim @@ -1728,8 +1801,8 @@ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True): nn.init.zeros_(self.k_linear.bias) nn.init.zeros_(self.q_linear.bias) - nn.init.xavier_uniform_(self.k_linear.weight) - nn.init.xavier_uniform_(self.q_linear.weight) + nn.init.xavier_uniform_(self.k_linear.weight, gain=std) + 
nn.init.xavier_uniform_(self.q_linear.weight, gain=std) self.normalize_fact = float(hidden_dim / self.num_heads) ** -0.5 def forward(self, q, k, mask: Optional[Tensor] = None): diff --git a/tests/test_feature_extraction_detr.py b/tests/test_feature_extraction_detr.py index 6b0bd1aaf527..bd1d3ebca80c 100644 --- a/tests/test_feature_extraction_detr.py +++ b/tests/test_feature_extraction_detr.py @@ -265,7 +265,7 @@ def test_equivalence_pad_and_create_pixel_mask(self): image_inputs = self.feature_extract_tester.prepare_inputs(equal_resolution=False, torchify=True) for image in image_inputs: self.assertIsInstance(image, torch.Tensor) - + # Test whether the method "pad_and_return_pixel_mask" and calling the feature extractor return the same tensors encoded_images_with_method = feature_extractor_1.pad_and_create_pixel_mask(image_inputs, return_tensors="pt") encoded_images = feature_extractor_2(image_inputs, return_tensors="pt") diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 19469075adca..636717ebe82d 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -20,7 +20,7 @@ import random import tempfile import unittest -from typing import List, Tuple +from typing import Dict, List, Tuple from huggingface_hub import HfApi from requests.exceptions import HTTPError @@ -856,7 +856,6 @@ def test_retain_grad_hidden_states_attentions(self): outputs = model(**inputs) - print(outputs) output = outputs[0] if config.is_encoder_decoder: @@ -1110,6 +1109,11 @@ def recursive_check(tuple_object, dict_object): if isinstance(tuple_object, (List, Tuple)): for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): recursive_check(tuple_iterable_value, dict_iterable_value) + elif isinstance(tuple_object, Dict): + for tuple_iterable_value, dict_iterable_value in zip( + tuple_object.values(), dict_object.values() + ): + recursive_check(tuple_iterable_value, dict_iterable_value) elif tuple_object is None: return else: 
@@ -1123,6 +1127,7 @@ def recursive_check(tuple_object, dict_object): recursive_check(tuple_output, dict_output) for model_class in self.all_model_classes: + print(model_class) model = model_class(config) model.to(torch_device) model.eval() diff --git a/tests/test_modeling_detr.py b/tests/test_modeling_detr.py index c87e10b262bd..b119407a452e 100644 --- a/tests/test_modeling_detr.py +++ b/tests/test_modeling_detr.py @@ -87,7 +87,7 @@ def __init__( def prepare_config_and_inputs(self): pixel_values = floats_tensor([self.batch_size, self.num_channels, self.min_size, self.max_size]) - pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size]) + pixel_mask = torch.ones([self.batch_size, self.min_size, self.max_size], device=torch_device) labels = None if self.use_labels: @@ -95,9 +95,11 @@ def prepare_config_and_inputs(self): labels = [] for i in range(self.batch_size): target = {} - target["class_labels"] = torch.randint(high=self.num_labels, size=(self.n_targets,)) - target["boxes"] = torch.rand(self.n_targets, 4) - target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size) + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.min_size, self.max_size, device=torch_device) labels.append(target) config = DetrConfig( @@ -176,12 +178,18 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): labels = [] for i in range(self.model_tester.batch_size): target = {} - target["class_labels"] = torch.randint( - high=self.model_tester.num_labels, size=(self.model_tester.n_targets,) + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float ) - target["boxes"] = 
torch.rand(self.model_tester.n_targets, 4) - target["masks"] = torch.rand( - self.model_tester.n_targets, self.model_tester.min_size, self.model_tester.max_size + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.min_size, + self.model_tester.max_size, + device=torch_device, + dtype=torch.float, ) labels.append(target) inputs_dict["labels"] = labels @@ -238,6 +246,7 @@ def test_attention_outputs(self): encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes for model_class in self.all_model_classes: + print(model_class) inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False config.return_dict = True @@ -278,12 +287,12 @@ def test_attention_outputs(self): # loss is at first position if "labels" in inputs_dict: correct_outlen += 1 # loss is added to beginning - # Object Detection model returns pred_logits and pred_boxes instead of last_hidden_state + # Object Detection model returns pred_logits and pred_boxes if model_class.__name__ == "DetrForObjectDetection": - correct_outlen += 1 + correct_outlen += 2 # Panoptic Segmentation model returns pred_logits, pred_boxes, pred_masks if model_class.__name__ == "DetrForSegmentation": - correct_outlen += 2 + correct_outlen += 3 if "past_key_values" in outputs: correct_outlen += 1 # past_key_values have been returned @@ -378,11 +387,15 @@ def test_different_timm_backbone(self): model.eval() with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - + if model_class.__name__ == "DetrForObjectDetection": - expected_shape = (self.model_tester.batch_size, self.model_tester.num_queries, self.model_tester.num_labels + 1) + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + self.model_tester.num_labels + 1, + ) self.assertEqual(outputs.logits.shape, expected_shape) - + self.assertTrue(outputs) diff --git a/utils/check_repo.py b/utils/check_repo.py index 9c8648a01c34..5f6866886726 100644 --- 
a/utils/check_repo.py +++ b/utils/check_repo.py @@ -94,6 +94,7 @@ # should **not** be the rule. IGNORE_NON_AUTO_CONFIGURED = [ # models to ignore for model xxx mapping + "DetrForSegmentation", "DPRReader", "DPRSpanPredictor", "FlaubertForQuestionAnswering",