From 697734031bdbaa912a4c7b36ebc9805de02e0a95 Mon Sep 17 00:00:00 2001
From: Johannes Kolbe
Date: Wed, 6 Apr 2022 20:45:09 +0200
Subject: [PATCH 1/2] add vit tf doctest with @add_code_sample_docstrings

---
 .../models/vit/modeling_tf_vit.py | 79 ++++++-------------
 utils/documentation_tests.txt     |  1 +
 2 files changed, 27 insertions(+), 53 deletions(-)

diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py
index cbf935f4f743..7e5381d37dc7 100644
--- a/src/transformers/models/vit/modeling_tf_vit.py
+++ b/src/transformers/models/vit/modeling_tf_vit.py
@@ -33,14 +33,23 @@
     unpack_inputs,
 )
 from ...tf_utils import shape_list
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_vit import ViTConfig
 
 
 logger = logging.get_logger(__name__)
 
+# General docstring
 _CONFIG_FOR_DOC = "ViTConfig"
-_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224"
+_FEAT_EXTRACTOR_FOR_DOC = "ViTFeatureExtractor"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "google/vit-base-patch16-224-in21k"
+_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "google/vit-base-patch16-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
 
 
 # Inspired by
@@ -645,7 +654,14 @@ def __init__(self, config: ViTConfig, *inputs, add_pooling_layer=True, **kwargs)
 
     @unpack_inputs
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+    @add_code_sample_docstrings(
+        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPooling,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
     def call(
         self,
         pixel_values: Optional[TFModelInputType] = None,
@@ -656,26 +672,6 @@ def call(
         return_dict: Optional[bool] = None,
         training: bool = False,
     ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
-        r"""
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from transformers import ViTFeatureExtractor, TFViTModel
-        >>> from PIL import Image
-        >>> import requests
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
-        >>> model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
-
-        >>> inputs = feature_extractor(images=image, return_tensors="tf")
-        >>> outputs = model(**inputs)
-        >>> last_hidden_states = outputs.last_hidden_state
-        ```"""
 
         outputs = self.vit(
             pixel_values=pixel_values,
@@ -744,7 +740,13 @@ def __init__(self, config: ViTConfig, *inputs, **kwargs):
 
     @unpack_inputs
     @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    @add_code_sample_docstrings(
+        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
     def call(
         self,
         pixel_values: Optional[TFModelInputType] = None,
@@ -756,35 +758,6 @@ def call(
         labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
         training: Optional[bool] = False,
     ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
-        r"""
-        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
-            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-
-        Returns:
-
-        Examples:
-
-        ```python
-        >>> from transformers import ViTFeatureExtractor, TFViTForImageClassification
-        >>> import tensorflow as tf
-        >>> from PIL import Image
-        >>> import requests
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
-        >>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
-
-        >>> inputs = feature_extractor(images=image, return_tensors="tf")
-        >>> outputs = model(**inputs)
-        >>> logits = outputs.logits
-        >>> # model predicts one of the 1000 ImageNet classes
-        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
-        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
-        ```"""
 
         outputs = self.vit(
             pixel_values=pixel_values,
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 115026b2da1f..fba8f7a5847d 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -35,6 +35,7 @@ src/transformers/models/van/modeling_van.py
 src/transformers/models/vilt/modeling_vilt.py
 src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
 src/transformers/models/vit/modeling_vit.py
+src/transformers/models/vit/modeling_tf_vit.py
 src/transformers/models/vit_mae/modeling_vit_mae.py
 src/transformers/models/wav2vec2/modeling_wav2vec2.py
 src/transformers/models/wav2vec2/tokenization_wav2vec2.py

From 25fd461c5f630458eaef41919b65baf20772e1b4 Mon Sep 17 00:00:00 2001
From: Johannes Kolbe
Date: Thu, 7 Apr 2022 21:20:41 +0200
Subject: [PATCH 2/2] add labels string back in

---
 src/transformers/models/vit/modeling_tf_vit.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py
index 7e5381d37dc7..e3c039ca83e1 100644
--- a/src/transformers/models/vit/modeling_tf_vit.py
+++ b/src/transformers/models/vit/modeling_tf_vit.py
@@ -758,6 +758,12 @@ def call(
         labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
         training: Optional[bool] = False,
     ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
 
         outputs = self.vit(
             pixel_values=pixel_values,
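
Note: with `@replace_return_docstrings` swapped for `@add_code_sample_docstrings`, the usage examples are no longer hand-written in each `call` docstring but rendered from the library's docstring templates using the checkpoint constants (`_CHECKPOINT_FOR_DOC`, `_IMAGE_CLASS_CHECKPOINT`) and expected outputs defined at the top of the module. The snippet below is a sketch assembled from the examples this patch removes, showing roughly what the generated doctests exercise for both models; it is not the literal text the decorator emits.

```python
import requests
import tensorflow as tf
from PIL import Image

from transformers import TFViTForImageClassification, TFViTModel, ViTFeatureExtractor

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Base model: the doctest checks the hidden-state shape against
# _EXPECTED_OUTPUT_SHAPE, i.e. [1, 197, 768].
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
inputs = feature_extractor(images=image, return_tensors="tf")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (1, 197, 768)

# Classification head: predicts one of the 1000 ImageNet classes; the doctest
# checks the label against _IMAGE_CLASS_EXPECTED_OUTPUT ("Egyptian cat").
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
inputs = feature_extractor(images=image, return_tensors="tf")
logits = model(**inputs).logits
predicted_class_idx = int(tf.math.argmax(logits, axis=-1)[0])
print("Predicted class:", model.config.id2label[predicted_class_idx])
```

Adding `src/transformers/models/vit/modeling_tf_vit.py` to `utils/documentation_tests.txt` is what actually enrolls these generated samples in the doctest run.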