diff --git a/docs/source/model_doc/perceiver.mdx b/docs/source/model_doc/perceiver.mdx
index 21f12266b718..ff9bbaedb40d 100644
--- a/docs/source/model_doc/perceiver.mdx
+++ b/docs/source/model_doc/perceiver.mdx
@@ -81,9 +81,10 @@ Tips:
 
 - The quickest way to get started with the Perceiver is by checking the [tutorial
 notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Perceiver).
-- Note that the models available in the library only showcase some examples of what you can do with the Perceiver.
- There are many more use cases, including question answering,
-named-entity recognition, object detection, audio classification, video classification, etc.
+- Refer to the [blog post](https://huggingface.co/blog/perceiver) if you want to fully understand how the model works and
+is implemented in the library. Note that the models available in the library only showcase some examples of what you can do
+with the Perceiver. There are many more use cases, including question answering, named-entity recognition, object detection,
+audio classification, video classification, etc.
 
 ## Perceiver specific outputs
 
@@ -102,10 +103,7 @@ named-entity recognition, object detection, audio classification, video classifi
 ## PerceiverTokenizer
 
 [[autodoc]] PerceiverTokenizer
-    - build_inputs_with_special_tokens
-    - get_special_tokens_mask
-    - create_token_type_ids_from_sequences
-    - save_vocabulary
+    - __call__
 
 ## PerceiverFeatureExtractor
 
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index 68752e7b14b5..c365d8521741 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -757,12 +757,7 @@ class PreTrainedModel
         self.encoder.layer[layer].attention.prune_heads(heads)
 
     @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=PerceiverModelOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=PerceiverModelOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         inputs,
@@ -773,6 +768,85 @@ def forward(
         output_hidden_states=None,
         return_dict=None,
     ):
+        r"""
+        Returns:
+
+        Examples::
+
+            >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
+            >>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
+            >>> import torch
+            >>> import requests
+            >>> from PIL import Image
+
+            >>> # EXAMPLE 1: using the Perceiver to classify texts
+            >>> # - we define a TextPreprocessor, which can be used to embed tokens
+            >>> # - we define a ClassificationDecoder, which can be used to decode the
+            >>> # final hidden states of the latents to classification logits
+            >>> # using trainable position embeddings
+            >>> config = PerceiverConfig()
+            >>> preprocessor = PerceiverTextPreprocessor(config)
+            >>> decoder = PerceiverClassificationDecoder(config,
+            ...                                          num_channels=config.d_latents,
+            ...                                          trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+            ...                                          use_query_residual=True)
+            >>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
+
+            >>> # you can then do a forward pass as follows:
+            >>> tokenizer = PerceiverTokenizer()
+            >>> text = "hello world"
+            >>> inputs = tokenizer(text, return_tensors="pt").input_ids
+
+            >>> with torch.no_grad():
+            ...     outputs = model(inputs=inputs)
+            >>> logits = outputs.logits
+
+            >>> # to train, one can optimize the model with standard cross-entropy:
+            >>> criterion = torch.nn.CrossEntropyLoss()
+
+            >>> labels = torch.tensor([1])
+            >>> loss = criterion(logits, labels)
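+
+            >>> # a quick sanity check (illustrative sketch, assuming the default two-label config):
+            >>> # the decoder emits one logit per class, so `logits` has shape (batch_size, config.num_labels)
+            >>> # and the predicted class is the argmax over the last dimension
+            >>> predicted_class = logits.argmax(-1).item()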
+
+            >>> # EXAMPLE 2: using the Perceiver to classify images
+            >>> # - we define an ImagePreprocessor, which can be used to embed images
+            >>> # the config is re-created with image_size=224 so that the trainable position
+            >>> # encodings match the 224x224 crops produced by PerceiverFeatureExtractor
+            >>> config = PerceiverConfig(image_size=224)
+            >>> preprocessor = PerceiverImagePreprocessor(
+            ...     config,
+            ...     prep_type="conv1x1",
+            ...     spatial_downsample=1,
+            ...     out_channels=256,
+            ...     position_encoding_type="trainable",
+            ...     concat_or_add_pos="concat",
+            ...     project_pos_dim=256,
+            ...     trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2),
+            ... )
+
+            >>> model = PerceiverModel(
+            ...     config,
+            ...     input_preprocessor=preprocessor,
+            ...     decoder=PerceiverClassificationDecoder(
+            ...         config,
+            ...         num_channels=config.d_latents,
+            ...         trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+            ...         use_query_residual=True,
+            ...     ),
+            ... )
+
+            >>> # you can then do a forward pass as follows:
+            >>> feature_extractor = PerceiverFeatureExtractor()
+            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+            >>> image = Image.open(requests.get(url, stream=True).raw)
+            >>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
+
+            >>> with torch.no_grad():
+            ...     outputs = model(inputs=inputs)
+            >>> logits = outputs.logits
+
+            >>> # to train, one can optimize the model with standard cross-entropy:
+            >>> criterion = torch.nn.CrossEntropyLoss()
+
+            >>> labels = torch.tensor([1])
+            >>> loss = criterion(logits, labels)
+        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -901,12 +975,7 @@ def __init__(self, config):
         self.post_init()
 
     @add_start_docstrings_to_model_forward(PERCEIVER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @add_code_sample_docstrings(
-        processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=PerceiverMaskedLMOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
+    @replace_return_docstrings(output_type=PerceiverMaskedLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         inputs=None,
@@ -923,6 +992,42 @@
             Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
             config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored
             (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import PerceiverTokenizer, PerceiverForMaskedLM
+            >>> import torch
+
+            >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
+            >>> model = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')
+
+            >>> # training
+            >>> text = "This is an incomplete sentence where some words are missing."
+            >>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
+            >>> # mask " missing."
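+            >>> # (the tokenizer operates on raw UTF-8 bytes: " missing." is 9 one-byte
+            >>> # characters, and with the leading special token those bytes occupy
+            >>> # positions 52 through 60 of input_ids, hence the slice [52:61])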
+            >>> inputs['input_ids'][0, 52:61] = tokenizer.mask_token_id
+            >>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids
+
+            >>> outputs = model(**inputs, labels=labels)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+
+            >>> # inference
+            >>> text = "This is an incomplete sentence where some words are missing."
+            >>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")
+
+            >>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
+            >>> encoding['input_ids'][0, 52:61] = tokenizer.mask_token_id
+
+            >>> # forward pass
+            >>> with torch.no_grad():
+            ...     outputs = model(**encoding)
+            >>> logits = outputs.logits
+
+            >>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
+            >>> tokenizer.decode(masked_tokens_predictions)
+            ' missing.'
         """
         if inputs is not None and input_ids is not None:
             raise ValueError("You cannot use both `inputs` and `input_ids`")