Add Doc Test for BERT #16523
Changes from all commits: b59603a, e60a4bf, 2ea5ebe, 1193bff, e4a426f, 390a13f, 1b5d204, 5e8566f, 67a7e92, 3d9e661, 8524a1e, 84f0953, 4263019, f8ea70c
src/transformers/models/bert/modeling_bert.py

```diff
@@ -63,6 +63,26 @@
 _CONFIG_FOR_DOC = "BertConfig"
 _TOKENIZER_FOR_DOC = "BertTokenizer"
 
+# TokenClassification docstring
+_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english"
+_TOKEN_CLASS_EXPECTED_OUTPUT = (
+    "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] "
+)
+_TOKEN_CLASS_EXPECTED_LOSS = 0.01
+
+# QuestionAnswering docstring
+_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2"
+_QA_EXPECTED_OUTPUT = "'a nice puppet'"
+_QA_EXPECTED_LOSS = 7.41
+_QA_TARGET_START_INDEX = 14
+_QA_TARGET_END_INDEX = 15
+
+# SequenceClassification docstring
+_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity"
+_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'"
+_SEQ_CLASS_EXPECTED_LOSS = 0.01
+
 BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "bert-base-uncased",
     "bert-large-uncased",
```
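These module-level constants feed `add_code_sample_docstrings`, which renders them into the runnable examples that the doc tests execute and check against `expected_output`/`expected_loss`. As a rough illustration only (not the library's exact template text), a token-classification sample built from these values would look something like the sketch below; the input sentence is an assumption here, while the checkpoint and the expected label list come from the constants above.

```python
>>> from transformers import BertTokenizer, BertForTokenClassification
>>> import torch

>>> tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
>>> model = BertForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")

>>> # example sentence assumed for illustration; the shared template supplies its own text
>>> inputs = tokenizer(
...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
... )
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # one predicted entity tag per input token
>>> predicted_token_class_ids = logits.argmax(-1)
>>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
>>> predicted_tokens_classes
['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC']
```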
```diff
@@ -1156,7 +1176,12 @@ def set_output_embeddings(self, new_embeddings):
         self.cls.predictions.decoder = new_embeddings
 
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
-    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=CausalLMOutputWithCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
```
````diff
@@ -1175,49 +1200,28 @@ def forward(
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
         r"""
-        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-            if the model is configured as a decoder.
-        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
-            in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be
-            in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100`
-            are ignored (masked), the loss is only computed for the tokens with labels n `[0, ...,
-            config.vocab_size]`
-        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
-            decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-            that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-            all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-            (see `past_key_values`).
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
-        >>> import torch
-
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
-        >>> config = BertConfig.from_pretrained("bert-base-cased")
-        >>> config.is_decoder = True
-        >>> model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config)
-
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-        >>> outputs = model(**inputs)
-
-        >>> prediction_logits = outputs.logits
-        ```
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if labels is not None:
````

Collaborator: More than super! Thanks for the fix
```diff
@@ -1315,6 +1319,8 @@ def set_output_embeddings(self, new_embeddings):
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=MaskedLMOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output="'paris'",
+        expected_loss=0.88,
     )
     def forward(
         self,
```
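The new `expected_output="'paris'"` and `expected_loss=0.88` arguments pin down what the generated fill-mask sample must print. A minimal sketch of the kind of check this produces, assuming `_CHECKPOINT_FOR_DOC` resolves to `bert-base-uncased` and using a "capital of France" style prompt for illustration (the exact prompt comes from the shared code-sample template):

```python
>>> from transformers import BertTokenizer, BertForMaskedLM
>>> import torch

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> model = BertForMaskedLM.from_pretrained("bert-base-uncased")

>>> inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # take the highest-scoring token at the [MASK] position
>>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
>>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
>>> tokenizer.decode(predicted_token_id)
'paris'
```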
```diff
@@ -1517,9 +1523,11 @@ def __init__(self, config):
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
         output_type=SequenceClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
+        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
     )
     def forward(
         self,
```
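`BertForSequenceClassification` now documents itself against the fine-tuned yelp-polarity checkpoint instead of the bare base model, so the rendered example can assert a meaningful label. Roughly what the generated sample ends up checking (the input sentence is assumed for illustration; the checkpoint and expected label come from the constants above):

```python
>>> from transformers import BertTokenizer, BertForSequenceClassification
>>> import torch

>>> tokenizer = BertTokenizer.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
>>> model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity")

>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits

>>> # this checkpoint ships generic LABEL_0 / LABEL_1 names, hence the expected string
>>> predicted_class_id = logits.argmax().item()
>>> model.config.id2label[predicted_class_id]
'LABEL_1'
```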
```diff
@@ -1716,9 +1724,11 @@ def __init__(self, config):
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
         output_type=TokenClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
+        expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
     )
     def forward(
         self,
```
```diff
@@ -1797,9 +1807,13 @@ def __init__(self, config):
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_QA,
         output_type=QuestionAnsweringModelOutput,
         config_class=_CONFIG_FOR_DOC,
+        qa_target_start_index=_QA_TARGET_START_INDEX,
+        qa_target_end_index=_QA_TARGET_END_INDEX,
+        expected_output=_QA_EXPECTED_OUTPUT,
+        expected_loss=_QA_EXPECTED_LOSS,
     )
     def forward(
         self,
```
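Question answering needs two extra knobs, `qa_target_start_index` and `qa_target_end_index`, because the loss check requires ground-truth span positions in addition to the decoded answer string. A sketch of what the rendered sample effectively does with these values (the question/context pair is assumed for illustration; the answer string, span indices, and loss come from the constants above):

```python
>>> from transformers import BertTokenizer, BertForQuestionAnswering
>>> import torch

>>> tokenizer = BertTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
>>> model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

>>> question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"  # assumed example pair
>>> inputs = tokenizer(question, context, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # decode the span between the most likely start and end positions
>>> answer_start = outputs.start_logits.argmax()
>>> answer_end = outputs.end_logits.argmax()
>>> tokenizer.decode(inputs.input_ids[0, answer_start : answer_end + 1])
'a nice puppet'

>>> # the loss check feeds the target span (token indices 14 and 15) as labels
>>> target_start, target_end = torch.tensor([14]), torch.tensor([15])
>>> loss = model(**inputs, start_positions=target_start, end_positions=target_end).loss
>>> round(loss.item(), 2)
7.41
```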
src/transformers/models/bert/modeling_tf_bert.py

```diff
@@ -65,10 +65,29 @@
 logger = logging.get_logger(__name__)
 
-_CHECKPOINT_FOR_DOC = "bert-base-cased"
+_CHECKPOINT_FOR_DOC = "bert-base-uncased"
 _CONFIG_FOR_DOC = "BertConfig"
 _TOKENIZER_FOR_DOC = "BertTokenizer"
 
+# TokenClassification docstring
+_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english"
+_TOKEN_CLASS_EXPECTED_OUTPUT = (
+    "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', " "'I-LOC'] "
+)
+_TOKEN_CLASS_EXPECTED_LOSS = 0.01
+
+# QuestionAnswering docstring
+_CHECKPOINT_FOR_QA = "ydshieh/bert-base-cased-squad2"
+_QA_EXPECTED_OUTPUT = "'a nice puppet'"
+_QA_EXPECTED_LOSS = 7.41
+_QA_TARGET_START_INDEX = 14
+_QA_TARGET_END_INDEX = 15
+
+# SequenceClassification docstring
+_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ydshieh/bert-base-uncased-yelp-polarity"
+_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'"
+_SEQ_CLASS_EXPECTED_LOSS = 0.01
+
 TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "bert-base-uncased",
     "bert-large-uncased",
```
````diff
@@ -1197,11 +1216,11 @@ def call(
 
         >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
         >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased")
-        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[
-        ...     None, :
-        ... ]  # Batch size 1
+        >>> input_ids = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf")
+        >>> # Batch size 1
 
         >>> outputs = model(input_ids)
-        >>> prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> prediction_logits, seq_relationship_logits = outputs[:2]
         ```"""
         outputs = self.bert(
             input_ids=input_ids,
````

Comment on lines +1219 to +1223. Collaborator: Very nice :-) Author: thank you :D
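The rewritten `TFBertForPreTraining` snippet works because `return_tensors="tf"` already gives a batched `(1, seq_len)` tensor, so the old `tf.constant(...)[None, :]` expansion was redundant, and the outputs are now unpacked under the names the model actually returns (`prediction_logits`, `seq_relationship_logits`). A quick sanity sketch of the updated call, with shapes shown purely for illustration and `bert-base-uncased` assumed:

```python
>>> from transformers import BertTokenizer, TFBertForPreTraining

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
>>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased")

>>> # the tokenizer already returns a batched (1, seq_len) tf.Tensor
>>> inputs = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf")
>>> inputs["input_ids"].shape
TensorShape([1, 8])

>>> outputs = model(inputs)
>>> prediction_logits, seq_relationship_logits = outputs[:2]
>>> prediction_logits.shape, seq_relationship_logits.shape
(TensorShape([1, 8, 30522]), TensorShape([1, 2]))
```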
```diff
@@ -1285,6 +1304,8 @@ def get_prefix_bias_name(self) -> str:
         checkpoint=_CHECKPOINT_FOR_DOC,
         output_type=TFMaskedLMOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output="'paris'",
+        expected_loss=0.88,
     )
     def call(
         self,
```
```diff
@@ -1606,9 +1627,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs):
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
         output_type=TFSequenceClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output=_SEQ_CLASS_EXPECTED_OUTPUT,
+        expected_loss=_SEQ_CLASS_EXPECTED_LOSS,
     )
     def call(
         self,
```
```diff
@@ -1833,9 +1856,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs):
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION,
         output_type=TFTokenClassifierOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT,
+        expected_loss=_TOKEN_CLASS_EXPECTED_LOSS,
     )
     def call(
         self,
```
```diff
@@ -1923,9 +1948,11 @@ def __init__(self, config: BertConfig, *inputs, **kwargs):
     @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @add_code_sample_docstrings(
         processor_class=_TOKENIZER_FOR_DOC,
-        checkpoint=_CHECKPOINT_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_QA,
         output_type=TFQuestionAnsweringModelOutput,
         config_class=_CONFIG_FOR_DOC,
+        expected_output=_QA_EXPECTED_OUTPUT,
+        expected_loss=_QA_EXPECTED_LOSS,
     )
     def call(
         self,
```
Review comment: super!