Merged
12 changes: 5 additions & 7 deletions docs/source/model_doc/perceiver.mdx
@@ -81,9 +81,10 @@ Tips:

- The quickest way to get started with the Perceiver is by checking the [tutorial
notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver).
- Note that the models available in the library only showcase some examples of what you can do with the Perceiver.
There are many more use cases, including question answering,
named-entity recognition, object detection, audio classification, video classification, etc.
- Refer to the [blog post](https://huggingface.co/blog/perceiver) if you want to fully understand how the model works and
is implemented in the library. Note that the models available in the library only showcase some examples of what you can do
with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection,
audio classification, video classification, etc.

## Perceiver specific outputs

@@ -102,10 +103,7 @@ named-entity recognition, object detection, audio classification, video classifi
## PerceiverTokenizer

[[autodoc]] PerceiverTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
- __call__

## PerceiverFeatureExtractor

129 changes: 117 additions & 12 deletions src/transformers/models/perceiver/modeling_perceiver.py
@@ -757,12 +757,7 @@ class PreTrainedModel
self.encoder.layer[layer].attention.prune_heads(heads)

@add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=PerceiverModelOutput,
config_class=_CONFIG_FOR_DOC,
)
@replace_return_docstrings(output_type=PerceiverModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
inputs,
@@ -773,6 +768,85 @@ def forward(
output_hidden_states=None,
return_dict=None,
):
r"""
Returns:

Examples::

>>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
>>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
>>> import torch
>>> import requests
>>> from PIL import Image

>>> # EXAMPLE 1: using the Perceiver to classify texts
>>> # - we define a TextPreprocessor, which can be used to embed tokens
>>> # - we define a ClassificationDecoder, which can be used to decode the
>>> # final hidden states of the latents to classification logits
>>> # using trainable position embeddings
>>> config = PerceiverConfig()
>>> preprocessor = PerceiverTextPreprocessor(config)
>>> decoder = PerceiverClassificationDecoder(config,
... num_channels=config.d_latents,
... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
... use_query_residual=True)
>>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)

>>> # you can then do a forward pass as follows:
>>> tokenizer = PerceiverTokenizer()
>>> text = "hello world"
>>> inputs = tokenizer(text, return_tensors="pt").input_ids

>>> with torch.no_grad():
...     outputs = model(inputs=inputs)
>>> logits = outputs.logits

>>> # to train the model, one can use standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss()

>>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels)

>>> # EXAMPLE 2: using the Perceiver to classify images
>>> # - we define an ImagePreprocessor, which can be used to embed images
>>> preprocessor = PerceiverImagePreprocessor(
...     config,
...     prep_type="conv1x1",
...     spatial_downsample=1,
...     out_channels=256,
...     position_encoding_type="trainable",
...     concat_or_add_pos="concat",
...     project_pos_dim=256,
...     trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2),
... )

>>> model = PerceiverModel(
... config,
... input_preprocessor=preprocessor,
... decoder=PerceiverClassificationDecoder(
... config,
... num_channels=config.d_latents,
... trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
... use_query_residual=True,
... ),
... )

>>> # you can then do a forward pass as follows:
>>> feature_extractor = PerceiverFeatureExtractor()
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = feature_extractor(image, return_tensors="pt").pixel_values

>>> with torch.no_grad():
...     outputs = model(inputs=inputs)
>>> logits = outputs.logits

>>> # to train the model, one can use standard cross-entropy:
>>> criterion = torch.nn.CrossEntropyLoss()

>>> labels = torch.tensor([1])
>>> loss = criterion(logits, labels)
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -901,12 +975,7 @@ def __init__(self, config):
self.post_init()

@add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@add_code_sample_docstrings(
processor_class=_TOKENIZER_FOR_DOC,
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=PerceiverMaskedLMOutput,
config_class=_CONFIG_FOR_DOC,
)
@replace_return_docstrings(output_type=PerceiverMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
def forward(
self,
inputs=None,
@@ -923,6 +992,42 @@ def forward(
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
(masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.

Returns:

Examples::

>>> from transformers import PerceiverTokenizer, PerceiverForMaskedLM
>>> import torch

>>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
>>> model = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')

>>> # training
>>> text = "This is an incomplete sentence where some words are missing."
>>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
>>> # mask " missing."
>>> inputs['input_ids'][0, 52:61] = tokenizer.mask_token_id
>>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids

>>> outputs = model(**inputs, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits
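
>>> # illustrative sketch (names like `span_only_labels` are ours, not part of the API):
>>> # to compute the loss only on the masked span, one could set every other label
>>> # position to -100, which the cross-entropy loss ignores
>>> span_only_labels = labels.clone()
>>> span_only_labels[:, :52] = -100
>>> span_only_labels[:, 61:] = -100
>>> span_only_loss = model(**inputs, labels=span_only_labels).loss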

>>> # inference
>>> text = "This is an incomplete sentence where some words are missing."
>>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")

>>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
>>> encoding['input_ids'][0, 52:61] = tokenizer.mask_token_id

>>> # forward pass
>>> with torch.no_grad():
...     outputs = model(**encoding)
>>> logits = outputs.logits

>>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
>>> tokenizer.decode(masked_tokens_predictions)
' missing.'
"""
if inputs is not None and input_ids is not None:
raise ValueError("You cannot use both `inputs` and `input_ids`")