diff --git a/docs/source/model_doc/bertgeneration.rst b/docs/source/model_doc/bertgeneration.rst index 34076cd8f30a..9ea904c590e2 100644 --- a/docs/source/model_doc/bertgeneration.rst +++ b/docs/source/model_doc/bertgeneration.rst @@ -40,7 +40,7 @@ Usage: labels = tokenizer('This is a short summary', return_tensors="pt").input_ids # train... - loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels, return_dict=True).loss + loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss loss.backward() diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst index 6803868b6c91..e065daf1b401 100644 --- a/docs/source/model_doc/t5.rst +++ b/docs/source/model_doc/t5.rst @@ -64,7 +64,7 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids # the forward function automatically creates the correct decoder_input_ids - loss = model(input_ids=input_ids, labels=labels, return_dict=True).loss + loss = model(input_ids=input_ids, labels=labels).loss - Supervised training @@ -77,7 +77,7 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids # the forward function automatically creates the correct decoder_input_ids - loss = model(input_ids=input_ids, labels=labels, return_dict=True).loss + loss = model(input_ids=input_ids, labels=labels).loss T5Config diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst index 8ce9d239cb6c..2f0f8336c39b 100644 --- a/docs/source/task_summary.rst +++ b/docs/source/task_summary.rst @@ -89,7 +89,7 @@ each other. The process is the following: >>> import torch >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") - >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=True) + >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") >>> classes = ["not paraphrase", "is paraphrase"] @@ -122,7 +122,7 @@ each other. The process is the following: >>> import tensorflow as tf >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") - >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=True) + >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") >>> classes = ["not paraphrase", "is paraphrase"] @@ -211,7 +211,7 @@ Here is an example of question answering using a model and a tokenizer. The proc >>> import torch >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") - >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True) + >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") >>> text = r""" ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose @@ -253,7 +253,7 @@ Here is an example of question answering using a model and a tokenizer.
The proc >>> import tensorflow as tf >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") - >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True) + >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") >>> text = r""" ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose @@ -373,7 +373,7 @@ Here is an example of doing masked language modeling using a model and a tokeniz >>> import torch >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") - >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased", return_dict=True) + >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased") >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint." @@ -389,7 +389,7 @@ Here is an example of doing masked language modeling using a model and a tokeniz >>> import tensorflow as tf >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased") - >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased", return_dict=True) + >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased") >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint." @@ -437,7 +437,7 @@ of tokens. >>> from torch.nn import functional as F >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelWithLMHead.from_pretrained("gpt2", return_dict=True) + >>> model = AutoModelWithLMHead.from_pretrained("gpt2") >>> sequence = f"Hugging Face is based in DUMBO, New York City, and " @@ -461,7 +461,7 @@ of tokens. >>> import tensorflow as tf >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2", return_dict=True) + >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2") >>> sequence = f"Hugging Face is based in DUMBO, New York City, and " @@ -520,7 +520,7 @@ Here is an example of text generation using ``XLNet`` and its tokenizer. >>> ## PYTORCH CODE >>> from transformers import AutoModelWithLMHead, AutoTokenizer - >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True) + >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased") >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology @@ -545,7 +545,7 @@ Here is an example of text generation using ``XLNet`` and its tokenizer. 
>>> ## TENSORFLOW CODE >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer - >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True) + >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased") >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased") >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology @@ -664,7 +664,7 @@ Here is an example of doing named entity recognition, using a model and a tokeni >>> from transformers import AutoModelForTokenClassification, AutoTokenizer >>> import torch - >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english", return_dict=True) + >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") >>> label_list = [ @@ -692,7 +692,7 @@ Here is an example of doing named entity recognition, using a model and a tokeni >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer >>> import tensorflow as tf - >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english", return_dict=True) + >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") >>> label_list = [ @@ -790,7 +790,7 @@ CNN / Daily Mail), it yields very good results. >>> ## PYTORCH CODE >>> from transformers import AutoModelWithLMHead, AutoTokenizer - >>> model = AutoModelWithLMHead.from_pretrained("t5-base", return_dict=True) + >>> model = AutoModelWithLMHead.from_pretrained("t5-base") >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens. @@ -799,7 +799,7 @@ CNN / Daily Mail), it yields very good results. >>> ## TENSORFLOW CODE >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer - >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base", return_dict=True) + >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base") >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens. @@ -843,7 +843,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce >>> ## PYTORCH CODE >>> from transformers import AutoModelWithLMHead, AutoTokenizer - >>> model = AutoModelWithLMHead.from_pretrained("t5-base", return_dict=True) + >>> model = AutoModelWithLMHead.from_pretrained("t5-base") >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt") @@ -851,7 +851,7 @@ Here is an example of doing translation using a model and a tokenizer. 
The proce >>> ## TENSORFLOW CODE >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer - >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base", return_dict=True) + >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base") >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf") diff --git a/docs/source/training.rst b/docs/source/training.rst index f7fb158e1c08..69773d71dd8c 100644 --- a/docs/source/training.rst +++ b/docs/source/training.rst @@ -39,7 +39,7 @@ head on top of the encoder with an output size of 2. Models are initialized in ` .. code-block:: python from transformers import BertForSequenceClassification - model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True) + model = BertForSequenceClassification.from_pretrained('bert-base-uncased') model.train() This is useful because it allows us to make use of the pre-trained BERT encoder and easily train it on whatever diff --git a/examples/lxmert/demo.ipynb b/examples/lxmert/demo.ipynb index fcdf568e3bf6..ee2c06cac342 100644 --- a/examples/lxmert/demo.ipynb +++ b/examples/lxmert/demo.ipynb @@ -210,7 +210,6 @@ " visual_feats=features,\n", " visual_pos=normalized_boxes,\n", " token_type_ids=inputs.token_type_ids,\n", - " return_dict=True,\n", " output_attentions=False,\n", " )\n", " output_vqa = lxmert_vqa(\n", @@ -219,7 +218,6 @@ " visual_feats=features,\n", " visual_pos=normalized_boxes,\n", " token_type_ids=inputs.token_type_ids,\n", - " return_dict=True,\n", " output_attentions=False,\n", " )\n", " # get prediction\n", @@ -266,4 +264,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/examples/question-answering/run_squad.py b/examples/question-answering/run_squad.py index 85298d4e56cc..d717c17e0101 100644 --- a/examples/question-answering/run_squad.py +++ b/examples/question-answering/run_squad.py @@ -321,7 +321,7 @@ def evaluate(args, model, tokenizer, prefix=""): eval_feature = features[feature_index.item()] unique_id = int(eval_feature.unique_id) - output = [to_list(output[i]) for output in outputs] + output = [to_list(output[i]) for output in outputs.to_tuple()] # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" # models only use two. 
diff --git a/examples/rag/eval_rag.py b/examples/rag/eval_rag.py index fd0c9711a6a9..6a63b9708fef 100644 --- a/examples/rag/eval_rag.py +++ b/examples/rag/eval_rag.py @@ -95,7 +95,7 @@ def strip_title(title): truncation=True, )["input_ids"].to(args.device) - question_enc_outputs = rag_model.rag.question_encoder(retriever_input_ids, return_dict=True) + question_enc_outputs = rag_model.rag.question_encoder(retriever_input_ids) question_enc_pool_output = question_enc_outputs.pooler_output result = rag_model.retriever( diff --git a/examples/rag/finetune.py b/examples/rag/finetune.py index 24489962a70d..9882b9e2dc12 100644 --- a/examples/rag/finetune.py +++ b/examples/rag/finetune.py @@ -204,7 +204,6 @@ def _step(self, batch: dict) -> Tuple: decoder_input_ids=decoder_input_ids, use_cache=False, labels=lm_labels, - return_dict=True, **rag_kwargs, ) diff --git a/examples/rag/use_own_knowledge_dataset.py b/examples/rag/use_own_knowledge_dataset.py index fd465e6900c7..081216f99fb1 100644 --- a/examples/rag/use_own_knowledge_dataset.py +++ b/examples/rag/use_own_knowledge_dataset.py @@ -47,7 +47,7 @@ def embed(documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRCon input_ids = ctx_tokenizer( documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt" )["input_ids"] - embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output + embeddings = ctx_encoder(input_ids.to(device=device)).pooler_output return {"embeddings": embeddings.detach().cpu().numpy()} diff --git a/examples/seq2seq/distillation.py b/examples/seq2seq/distillation.py index e76e13fafe43..58f23345ede9 100755 --- a/examples/seq2seq/distillation.py +++ b/examples/seq2seq/distillation.py @@ -153,7 +153,6 @@ def _step(self, batch: dict) -> tuple: output_hidden_states=self.do_calc_hidden_loss, output_attentions=False, use_cache=False, - return_dict=True, ) lm_logits = student_outputs.logits @@ -179,7 +178,6 @@ def zero_tensor(): input_ids, attention_mask=src_mask, output_hidden_states=self.do_calc_hidden_loss, - return_dict=True, ) if self.different_base_models: teacher_enc_outputs = all_teacher_encoder_outputs.last_hidden_state @@ -199,7 +197,6 @@ def zero_tensor(): decoder_input_ids=decoder_input_ids, output_hidden_states=self.do_calc_hidden_loss, use_cache=False, # since we are not passing labels, never let this default to True - return_dict=True, ) dec_mask = decoder_input_ids.ne(pad_token_id) loss_ce = self.calc_ce_loss(dec_mask, lm_logits, teacher_outputs.logits) diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py index 2fee837fe917..497c1942ed14 100644 --- a/examples/seq2seq/test_seq2seq_examples.py +++ b/examples/seq2seq/test_seq2seq_examples.py @@ -185,7 +185,7 @@ def test_distill_checkpointing_with_teacher(self): @require_torch_non_multi_gpu_but_fix_me def test_loss_fn(self): - model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True) + model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY) input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"] target_ids = torch.tensor([[0, 4, 8, 2], [0, 8, 2, 1]], dtype=torch.long, device=model.device) decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line? 
diff --git a/model_cards/microsoft/prophetnet-large-uncased/README.md b/model_cards/microsoft/prophetnet-large-uncased/README.md index 8c0345fb8a16..c449bc4ba83d 100644 --- a/model_cards/microsoft/prophetnet-large-uncased/README.md +++ b/model_cards/microsoft/prophetnet-large-uncased/README.md @@ -23,7 +23,7 @@ target_str = "us rejects charges against its ambassador in bolivia" input_ids = tokenizer(input_str, return_tensors="pt").input_ids labels = tokenizer(target_str, return_tensors="pt").input_ids -loss = model(input_ids, labels=labels, return_dict=True).loss +loss = model(input_ids, labels=labels).loss ``` ### Citation diff --git a/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md b/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md index 55a55342a445..a2f687ee0de1 100644 --- a/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md +++ b/model_cards/microsoft/xprophetnet-large-wiki100-cased/README.md @@ -26,7 +26,7 @@ target_str = "us rejects charges against its ambassador in bolivia" input_ids = tokenizer(input_str, return_tensors="pt").input_ids labels = tokenizer(target_str, return_tensors="pt").input_ids -loss = model(input_ids, labels=labels, return_dict=True).loss +loss = model(input_ids, labels=labels).loss ``` Note that since this model is a multi-lingual model it can be fine-tuned on all kinds of other languages. diff --git a/model_cards/mrm8488/codebert-base-finetuned-detect-insecure-code/README.md b/model_cards/mrm8488/codebert-base-finetuned-detect-insecure-code/README.md index 1d573a03b004..68906e3c67dd 100644 --- a/model_cards/mrm8488/codebert-base-finetuned-detect-insecure-code/README.md +++ b/model_cards/mrm8488/codebert-base-finetuned-detect-insecure-code/README.md @@ -45,7 +45,7 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification import torch import numpy as np tokenizer = AutoTokenizer.from_pretrained('mrm8488/codebert-base-finetuned-detect-insecure-code') -model = AutoModelForSequenceClassification.from_pretrained('mrm8488/codebert-base-finetuned-detect-insecure-code', return_dict=True) +model = AutoModelForSequenceClassification.from_pretrained('mrm8488/codebert-base-finetuned-detect-insecure-code') inputs = tokenizer("your code here", return_tensors="pt", truncation=True, padding='max_length') labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 diff --git a/model_cards/sentence-transformers/LaBSE/README.md b/model_cards/sentence-transformers/LaBSE/README.md index 2f10ed6d606a..51b05dd6245d 100644 --- a/model_cards/sentence-transformers/LaBSE/README.md +++ b/model_cards/sentence-transformers/LaBSE/README.md @@ -13,7 +13,7 @@ sentences = ["Hello World", "Hallo Welt"] encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors='pt') with torch.no_grad(): - model_output = model(**encoded_input, return_dict=True) + model_output = model(**encoded_input) embeddings = model_output.pooler_output embeddings = torch.nn.functional.normalize(embeddings) diff --git a/scripts/fsmt/fsmt-make-super-tiny-model.py b/scripts/fsmt/fsmt-make-super-tiny-model.py index 2521799b63d1..f1742a4dc4ff 100755 --- a/scripts/fsmt/fsmt-make-super-tiny-model.py +++ b/scripts/fsmt/fsmt-make-super-tiny-model.py @@ -59,7 +59,7 @@ # Test batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"]) -outputs = tiny_model(**batch, return_dict=True) +outputs = tiny_model(**batch) print("test output:", len(outputs.logits[0])) diff --git a/scripts/fsmt/fsmt-make-tiny-model.py 
b/scripts/fsmt/fsmt-make-tiny-model.py index ba8abe0139c4..924eb0b63c93 100755 --- a/scripts/fsmt/fsmt-make-tiny-model.py +++ b/scripts/fsmt/fsmt-make-tiny-model.py @@ -30,7 +30,7 @@ # Test batch = tokenizer.prepare_seq2seq_batch(["Making tiny model"]) -outputs = tiny_model(**batch, return_dict=True) +outputs = tiny_model(**batch) print("test output:", len(outputs.logits[0])) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 6934e02f512a..94e85b9629db 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -55,7 +55,7 @@ class PretrainedConfig(object): Whether or not the model should returns all attentions. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). - return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`): + return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): @@ -163,7 +163,7 @@ class PretrainedConfig(object): def __init__(self, **kwargs): # Attributes with defaults - self.return_dict = kwargs.pop("return_dict", False) + self.return_dict = kwargs.pop("return_dict", True) self.output_hidden_states = kwargs.pop("output_hidden_states", False) self.output_attentions = kwargs.pop("output_attentions", False) self.use_cache = kwargs.pop("use_cache", True) # Not used by all models diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 374b10dafabe..8315a781f9d9 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -559,7 +559,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 @@ -576,7 +576,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> inputs = tokenizer(question, text, return_tensors='pt') @@ -596,7 +596,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 @@ -612,7 +612,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") >>> labels = tokenizer("The capital of France is 
Paris.", return_tensors="pt")["input_ids"] @@ -629,7 +629,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -644,7 +644,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." @@ -666,7 +666,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> from transformers import {tokenizer_class}, {model_class} >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs, labels=inputs["input_ids"]) @@ -681,7 +681,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> input_ids = inputs["input_ids"] @@ -699,7 +699,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> input_dict = tokenizer(question, text, return_tensors='tf') @@ -718,7 +718,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1 @@ -735,7 +735,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf") >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"] @@ -752,7 +752,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", 
return_tensors="tf") >>> outputs = model(inputs) @@ -767,7 +767,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." @@ -788,7 +788,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import tensorflow as tf >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) + >>> model = {model_class}.from_pretrained('{checkpoint}') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf") >>> outputs = model(inputs) diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 6eb0a9561312..d3c3102d2a7b 100755 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -416,7 +416,7 @@ def forward( head_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): hidden_states = self.embedding_hidden_mapping_in(hidden_states) @@ -764,7 +764,7 @@ def forward( >>> import torch >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') - >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True) + >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2') >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> outputs = model(input_ids) diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py index 4bcef0041224..48ac20aa60b5 100644 --- a/src/transformers/modeling_bart.py +++ b/src/transformers/modeling_bart.py @@ -329,7 +329,7 @@ def __init__(self, config: BartConfig, embed_tokens): self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None def forward( - self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False + self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True ): """ Args: @@ -528,7 +528,7 @@ def forward( use_cache=False, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, **unused, ): """ diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 71d4a23dce6a..3787db98e92d 100755 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -446,7 +446,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -920,7 +920,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True) + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1036,7 +1036,7 @@ def forward( >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') >>> config = BertConfig.from_pretrained("bert-base-cased") 
>>> config.is_decoder = True - >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True) + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1250,7 +1250,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True) + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." diff --git a/src/transformers/modeling_bert_generation.py b/src/transformers/modeling_bert_generation.py index 8366f182bd74..68926e9c3871 100755 --- a/src/transformers/modeling_bert_generation.py +++ b/src/transformers/modeling_bert_generation.py @@ -463,7 +463,7 @@ def forward( >>> tokenizer = BertGenerationTokenizer.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder') >>> config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") >>> config.is_decoder = True - >>> model = BertGenerationDecoder.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder', config=config, return_dict=True) + >>> model = BertGenerationDecoder.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder', config=config) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) diff --git a/src/transformers/modeling_deberta.py b/src/transformers/modeling_deberta.py index c5ad2fd821d9..7e1b6837eb91 100644 --- a/src/transformers/modeling_deberta.py +++ b/src/transformers/modeling_deberta.py @@ -384,7 +384,7 @@ def forward( output_attentions=False, query_states=None, relative_pos=None, - return_dict=False, + return_dict=True, ): attention_mask = self.get_attention_mask(attention_mask) relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 20837a938dab..a75e3f32b481 100755 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -885,7 +885,7 @@ def forward( >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') - >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True) + >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." 
diff --git a/src/transformers/modeling_dpr.py b/src/transformers/modeling_dpr.py index 9f365304a481..b93908b93336 100644 --- a/src/transformers/modeling_dpr.py +++ b/src/transformers/modeling_dpr.py @@ -455,7 +455,7 @@ def forward( >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') - >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True) + >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] >>> embeddings = model(input_ids).pooler_output """ @@ -533,7 +533,7 @@ def forward( >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base') - >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True) + >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base') >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] >>> embeddings = model(input_ids).pooler_output """ @@ -609,7 +609,7 @@ def forward( >>> from transformers import DPRReader, DPRReaderTokenizer >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base') - >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True) + >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base') >>> encoded_inputs = tokenizer( ... questions=["What is love ?"], ... titles=["Haddaway"], diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index 69dffa59b353..2dca8b215b93 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -442,7 +442,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py index 5080b1cea59d..81d6c64661ff 100644 --- a/src/transformers/modeling_encoder_decoder.py +++ b/src/transformers/modeling_encoder_decoder.py @@ -370,7 +370,7 @@ def forward( >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids) >>> # training - >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids, return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids) >>> loss, logits = outputs.loss, outputs.logits >>> # save and load from pretrained diff --git a/src/transformers/modeling_fsmt.py b/src/transformers/modeling_fsmt.py index fba900b1370c..471181dd4fc9 100644 --- a/src/transformers/modeling_fsmt.py +++ b/src/transformers/modeling_fsmt.py @@ -434,7 +434,7 @@ def __init__(self, config: FSMTConfig, embed_tokens): ) # type: List[EncoderLayer] def forward( - self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False + self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True ): """ Args: @@ -617,7 +617,7 @@ def forward( use_cache=False, output_attentions=False, output_hidden_states=False, - 
return_dict=False, + return_dict=True, **unused, ): """ diff --git a/src/transformers/modeling_funnel.py b/src/transformers/modeling_funnel.py index 75c351945220..867f0e1cb85a 100644 --- a/src/transformers/modeling_funnel.py +++ b/src/transformers/modeling_funnel.py @@ -619,7 +619,7 @@ def forward( token_type_ids=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): # The pooling is not implemented on long tensors, so we convert this mask. attention_mask = attention_mask.type_as(inputs_embeds) @@ -698,7 +698,7 @@ def forward( token_type_ids=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): upsampled_hidden = upsample( final_hidden, @@ -1111,7 +1111,7 @@ def forward( >>> import torch >>> tokenizer = FunnelTokenizer.from_pretrained('funnel-transformer/small') - >>> model = FunnelForPreTraining.from_pretrained('funnel-transformer/small', return_dict=True) + >>> model = FunnelForPreTraining.from_pretrained('funnel-transformer/small') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "pt") >>> logits = model(**inputs).logits diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 45b4bebfd60c..e8cc8dcf3a78 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -911,7 +911,7 @@ def forward( >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2, return_dict=True) + >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2') >>> # Add a [CLS] to the vocabulary (we should train it also!) >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'}) diff --git a/src/transformers/modeling_layoutlm.py b/src/transformers/modeling_layoutlm.py index 24126c0c0074..073e25b6c4f2 100644 --- a/src/transformers/modeling_layoutlm.py +++ b/src/transformers/modeling_layoutlm.py @@ -376,7 +376,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index 950dd0da4448..665de8b543fe 100755 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -1050,7 +1050,7 @@ def forward( attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): is_index_masked = attention_mask < 0 @@ -1388,7 +1388,7 @@ def forward( >>> import torch >>> from transformers import LongformerModel, LongformerTokenizer - >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True) + >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096') >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> SAMPLE_TEXT = ' '.join(['Hello world! 
'] * 1000) # long input document @@ -1526,7 +1526,7 @@ def forward( >>> import torch >>> from transformers import LongformerForMaskedLM, LongformerTokenizer - >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True) + >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096') >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document @@ -1742,7 +1742,7 @@ def forward( >>> import torch >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") - >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True) + >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> encoding = tokenizer(question, text, return_tensors="pt") diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index e890c30eb22b..bd146e697fdb 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -558,7 +558,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1006,7 +1006,7 @@ def forward( >>> import torch >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased") - >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True) + >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased") >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> outputs = model(input_ids) @@ -1216,7 +1216,7 @@ def forward( >>> import torch >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased') - >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True) + >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased') >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 011fc5eb8cc8..25ed2d40bdf2 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -670,7 +670,7 @@ def forward( >>> import torch >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') - >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True) + >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) 
>>> model.resize_token_embeddings(len(tokenizer)) diff --git a/src/transformers/modeling_prophetnet.py b/src/transformers/modeling_prophetnet.py index d33504079159..227817e45bcc 100644 --- a/src/transformers/modeling_prophetnet.py +++ b/src/transformers/modeling_prophetnet.py @@ -1180,7 +1180,7 @@ def forward( >>> import torch >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased') - >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone', return_dict=True) + >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone') >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1319,7 +1319,7 @@ def forward( >>> import torch >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased') - >>> model = ProphetNetDecoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone', add_cross_attention=False, return_dict=True) + >>> model = ProphetNetDecoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone', add_cross_attention=False) >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -1621,7 +1621,7 @@ def forward( >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> last_hidden_states = outputs.last_hidden_state # main stream hidden states >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states @@ -1731,7 +1731,7 @@ def forward( >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> logits_next_token = outputs.logits # logits to predict next token as usual >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens @@ -1940,10 +1940,10 @@ def forward( >>> import torch >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased') - >>> model = ProphetNetForCausalLM.from_pretrained('patrickvonplaten/prophetnet-decoder-clm-large-uncased', return_dict=True) + >>> model = ProphetNetForCausalLM.from_pretrained('patrickvonplaten/prophetnet-decoder-clm-large-uncased') >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs, return_dict=True) + >>> outputs = model(**inputs) >>> logits = outputs.logits @@ -1962,7 +1962,7 @@ def forward( ... 
) >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids - >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:], return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:]) >>> loss = outputs.loss """ diff --git a/src/transformers/modeling_rag.py b/src/transformers/modeling_rag.py index a2d8ddcf26a7..fe134712887c 100644 --- a/src/transformers/modeling_rag.py +++ b/src/transformers/modeling_rag.py @@ -558,9 +558,7 @@ def forward( if encoder_outputs is None: if has_to_retrieve: - question_enc_outputs = self.question_encoder( - input_ids, attention_mask=attention_mask, return_dict=True - ) + question_enc_outputs = self.question_encoder(input_ids, attention_mask=attention_mask) question_encoder_last_hidden_state = question_enc_outputs[0] # hidden states of question encoder retriever_outputs = self.retriever( @@ -620,7 +618,6 @@ def forward( decoder_attention_mask=decoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, - return_dict=True, ) if not has_to_retrieve: @@ -1364,7 +1361,7 @@ def generate( batch_size = context_input_ids.shape[0] // n_docs encoder = self.rag.generator.get_encoder() - encoder_outputs = encoder(input_ids=context_input_ids, attention_mask=context_attention_mask, return_dict=True) + encoder_outputs = encoder(input_ids=context_input_ids, attention_mask=context_attention_mask) input_ids = torch.full( (batch_size * num_beams, 1), diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 3bb3a79a2326..dab19ed7d9ed 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -390,7 +390,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None @@ -770,7 +770,7 @@ def forward( >>> import torch >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - >>> config = RobertaConfig.from_pretrained("roberta-base", return_dict=True) + >>> config = RobertaConfig.from_pretrained("roberta-base") >>> config.is_decoder = True >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config) diff --git a/src/transformers/modeling_squeezebert.py b/src/transformers/modeling_squeezebert.py index 875f0fdd4fb8..54c5cb7b02f8 100644 --- a/src/transformers/modeling_squeezebert.py +++ b/src/transformers/modeling_squeezebert.py @@ -314,7 +314,7 @@ def forward( head_mask=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, ): if head_mask is None: diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index c77fa5f7fb58..21f185e24640 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -534,7 +534,7 @@ def forward( past_key_value=None, use_cache=False, output_attentions=False, - return_dict=False, + return_dict=True, ): if past_key_value is not None: @@ -1022,7 +1022,7 @@ def forward( >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids, 
decoder_input_ids=decoder_input_ids, return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> last_hidden_states = outputs.last_hidden_state """ @@ -1177,7 +1177,7 @@ def forward( >>> from transformers import T5Tokenizer, T5ForConditionalGeneration >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') - >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True) + >>> model = T5ForConditionalGeneration.from_pretrained('t5-small') >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids diff --git a/src/transformers/modeling_tf_bart.py b/src/transformers/modeling_tf_bart.py index 6a86aeab2ad5..e1bf4f76b70e 100644 --- a/src/transformers/modeling_tf_bart.py +++ b/src/transformers/modeling_tf_bart.py @@ -1063,7 +1063,7 @@ def call( TXT = "My friends are <mask> but they eat too many carbs." model = TFBartForConditionalGeneration.from_pretrained(mname) batch = tokenizer([TXT], return_tensors='tf') - logits = model(inputs=batch.input_ids, return_dict=True).logits + logits = model(inputs=batch.input_ids).logits probs = tf.nn.softmax(logits[0]) # probs[5] is associated with the mask token """ diff --git a/src/transformers/modeling_tf_dpr.py b/src/transformers/modeling_tf_dpr.py index 1b4b4f5bb5e0..45f29ded13b0 100644 --- a/src/transformers/modeling_tf_dpr.py +++ b/src/transformers/modeling_tf_dpr.py @@ -466,7 +466,7 @@ def call( >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') - >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True, from_pt=True) + >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', from_pt=True) >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"] >>> embeddings = model(input_ids).pooler_output """ @@ -565,7 +565,7 @@ def call( >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base') - >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True, from_pt=True) + >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', from_pt=True) >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"] >>> embeddings = model(input_ids).pooler_output """ @@ -663,7 +663,7 @@ def call( >>> from transformers import TFDPRReader, DPRReaderTokenizer >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base') - >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True, from_pt=True) + >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', from_pt=True) >>> encoded_inputs = tokenizer( ... questions=["What is love ?"], ...
titles=["Haddaway"], diff --git a/src/transformers/modeling_tf_funnel.py b/src/transformers/modeling_tf_funnel.py index ed475e8d0955..1b5fa323b0af 100644 --- a/src/transformers/modeling_tf_funnel.py +++ b/src/transformers/modeling_tf_funnel.py @@ -634,7 +634,7 @@ def call( token_type_ids=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, training=False, ): # The pooling is not implemented on long tensors, so we convert this mask. @@ -719,7 +719,7 @@ def call( token_type_ids=None, output_attentions=False, output_hidden_states=False, - return_dict=False, + return_dict=True, training=False, ): upsampled_hidden = upsample( diff --git a/src/transformers/modeling_tf_lxmert.py b/src/transformers/modeling_tf_lxmert.py index a9b189309d95..09c6ca0fdaa5 100644 --- a/src/transformers/modeling_tf_lxmert.py +++ b/src/transformers/modeling_tf_lxmert.py @@ -1275,6 +1275,7 @@ def call( obj_labels = inputs.pop("obj_labels", obj_labels) matched_label = inputs.pop("matched_label", matched_label) ans = inputs.pop("ans", ans) + return_dict = return_dict if return_dict is not None else self.lxmert.return_dict lxmert_output = self.lxmert( inputs, diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index dcdfc91e1f98..0b01002c8ccd 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -1022,7 +1022,7 @@ def call( >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="tf").input_ids # Batch size 1 - >>> outputs = model(input_ids, decoder_input_ids=decoder_input_ids, return_dict=True) + >>> outputs = model(input_ids, decoder_input_ids=decoder_input_ids) """ @@ -1219,7 +1219,7 @@ def call( >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration - >>> tokenizer = T5Tokenizer.from_pretrained('t5-small', return_dict=True) + >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small') >>> inputs = tokenizer('The walks in park', return_tensors='tf').input_ids diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index fda792f57061..7b423d7e1fa1 100755 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -1020,7 +1020,7 @@ def forward( >>> import torch >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') - >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True) + >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> start_positions = torch.tensor([1]) diff --git a/src/transformers/modeling_xlm_prophetnet.py b/src/transformers/modeling_xlm_prophetnet.py index ed9d1e3b2f69..903d3d6e08eb 100644 --- a/src/transformers/modeling_xlm_prophetnet.py +++ b/src/transformers/modeling_xlm_prophetnet.py @@ -46,7 +46,7 @@ class XLMProphetNetEncoder(ProphetNetEncoder): >>> import torch >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased') - >>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', return_dict=True) + >>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone') >>> assert model.config.is_decoder, 
f"{model.__class__} has to be configured as a decoder." >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -68,7 +68,7 @@ class XLMProphetNetDecoder(ProphetNetDecoder): >>> import torch >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased') - >>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False, return_dict=True) + >>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False) >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -93,7 +93,7 @@ class XLMProphetNetModel(ProphetNetModel): >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> last_hidden_states = outputs.last_hidden_state # main stream hidden states >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states @@ -116,7 +116,7 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration): >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids # Batch size 1 >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 - >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) >>> logits_next_token = outputs.logits # logits to predict next token as usual >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens @@ -136,7 +136,7 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM): >>> import torch >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased') - >>> model = XLMProphetNetForCausalLM.from_pretrained('patrickvonplaten/xprophetnet-decoder-clm-large-uncased', return_dict=True) + >>> model = XLMProphetNetForCausalLM.from_pretrained('patrickvonplaten/xprophetnet-decoder-clm-large-uncased') >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -158,7 +158,7 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM): ... 
) >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids - >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:], return_dict=True) + >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:]) >>> loss = outputs.loss """ diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 6405cd13532f..601b201635d7 100755 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -1381,7 +1381,7 @@ def forward( >>> import torch >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') - >>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True) + >>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') >>> # We show how to setup inputs to predict a next token using a bi-directional context. >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token @@ -1916,7 +1916,7 @@ def forward( >>> import torch >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') - >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True) + >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> start_positions = torch.tensor([1]) diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py index 15263c360580..0859b16b5cc3 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py @@ -118,7 +118,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, is_decoder=False, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_generation_utils.py b/tests/test_generation_utils.py index ab07987315a4..433dad34e680 100644 --- a/tests/test_generation_utils.py +++ b/tests/test_generation_utils.py @@ -118,7 +118,7 @@ def _get_beam_scorer_and_kwargs(batch_size, max_length, num_return_sequences=1): @staticmethod def _get_encoder_outputs(model, input_ids, attention_mask, num_interleave=1): encoder = model.get_encoder() - encoder_outputs = encoder(input_ids, attention_mask=attention_mask, return_dict=True) + encoder_outputs = encoder(input_ids, attention_mask=attention_mask) encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave( num_interleave, dim=0 ) @@ -344,6 +344,7 @@ def test_beam_search_generate(self): def test_beam_sample_generate(self): for model_class in self.all_generative_model_classes: config, input_ids, attention_mask, max_length = self._get_input_ids_and_config() + print("Return dict", config.return_dict) logits_warper_kwargs, logits_warper = self._get_warper_and_kwargs(num_beams=1) model = 
model_class(config).to(torch_device) diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py index f3f2459b16b1..a53fa069af74 100644 --- a/tests/test_modeling_albert.py +++ b/tests/test_modeling_albert.py @@ -102,7 +102,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, num_hidden_groups=self.num_hidden_groups, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index 552323893669..3b997b264939 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -259,7 +259,6 @@ def _get_config_and_data(self): eos_token_id=2, pad_token_id=1, bos_token_id=0, - return_dict=True, ) return config, input_ids, batch_size @@ -310,7 +309,6 @@ def test_lm_uneven_forward(self): encoder_ffn_dim=8, decoder_ffn_dim=8, max_position_embeddings=48, - return_dict=True, ) lm_model = BartForConditionalGeneration(config).to(torch_device) context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) @@ -713,6 +711,6 @@ def test_encoder_equiv(self): padding="longest", truncation=True, ) - features = self.xsum_1_1_model.get_encoder()(**batch, return_dict=True).last_hidden_state + features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state expected = [[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]] assert_tensors_close(features[0, :3, :3], torch.tensor(expected), atol=1e-3) diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index 1cc296714b5e..7e80465c072f 100755 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -124,7 +124,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, is_decoder=False, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_bert_generation.py b/tests/test_modeling_bert_generation.py index f5ce360a89f0..b71b02c77178 100755 --- a/tests/test_modeling_bert_generation.py +++ b/tests/test_modeling_bert_generation.py @@ -89,7 +89,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, is_decoder=False, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, input_mask, token_labels diff --git a/tests/test_modeling_camembert.py b/tests/test_modeling_camembert.py index 41b0626e5bdd..26888a1d70bb 100644 --- a/tests/test_modeling_camembert.py +++ b/tests/test_modeling_camembert.py @@ -31,7 +31,7 @@ class CamembertModelIntegrationTest(unittest.TestCase): @slow def test_output_embeds_base_model(self): - model = CamembertModel.from_pretrained("camembert-base", return_dict=True) + model = CamembertModel.from_pretrained("camembert-base") model.to(torch_device) input_ids = torch.tensor( diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 80e510fc35e4..b72031d2f501 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -657,7 +657,7 @@ def check_hidden_states_output(inputs_dict, config, model_class): model.eval() with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class), return_dict=True) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) hidden_states = outputs["hidden_states"] if "hidden_states" 
in outputs else outputs[-1] expected_num_layers = getattr( diff --git a/tests/test_modeling_ctrl.py b/tests/test_modeling_ctrl.py index d73c3f9c329a..030a7bf9fe34 100644 --- a/tests/test_modeling_ctrl.py +++ b/tests/test_modeling_ctrl.py @@ -94,7 +94,6 @@ def prepare_config_and_inputs(self): n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range, - return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) diff --git a/tests/test_modeling_deberta.py b/tests/test_modeling_deberta.py index 28ded3cf974c..96b08cdb62c6 100644 --- a/tests/test_modeling_deberta.py +++ b/tests/test_modeling_deberta.py @@ -148,7 +148,7 @@ def prepare_config_and_inputs(self): return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def check_loss_output(self, result): - self.parent.assertListEqual(list(result["loss"].size()), []) + self.parent.assertListEqual(list(result.loss.size()), []) def create_and_check_deberta_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels @@ -160,11 +160,8 @@ def create_and_check_deberta_model( sequence_output = model(input_ids, token_type_ids=token_type_ids)[0] sequence_output = model(input_ids)[0] - result = { - "sequence_output": sequence_output, - } self.parent.assertListEqual( - list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + list(sequence_output.size()), [self.batch_size, self.seq_length, self.hidden_size] ) def create_and_check_deberta_for_sequence_classification( @@ -174,14 +171,8 @@ def create_and_check_deberta_for_sequence_classification( model = DebertaForSequenceClassification(config) model.to(torch_device) model.eval() - loss, logits = model( - input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels - ) - result = { - "loss": loss, - "logits": logits, - } - self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels]) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertListEqual(list(result.logits.size()), [self.batch_size, self.num_labels]) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index fb4dc4e4ebdd..d1a014ab2938 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -110,7 +110,6 @@ def prepare_config_and_inputs(self): attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_dpr.py b/tests/test_modeling_dpr.py index 2c9ad7f25056..f9a04b1a9f8f 100644 --- a/tests/test_modeling_dpr.py +++ b/tests/test_modeling_dpr.py @@ -117,7 +117,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, is_decoder=False, initializer_range=self.initializer_range, - return_dict=True, ) config = DPRConfig(projection_dim=self.projection_dim, **config.to_dict()) diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py index 340fbcd18023..ee39fd1b1d87 100644 --- a/tests/test_modeling_electra.py +++ b/tests/test_modeling_electra.py @@ -101,7 +101,6 @@ def prepare_config_and_inputs(self): 
type_vocab_size=self.type_vocab_size, is_decoder=False, initializer_range=self.initializer_range, - return_dict=True, ) return ( diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py index 0dbab4634d04..d446eea76bb2 100644 --- a/tests/test_modeling_encoder_decoder.py +++ b/tests/test_modeling_encoder_decoder.py @@ -85,7 +85,6 @@ def check_encoder_decoder_model_from_pretrained_configs( decoder_input_ids=decoder_input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, - return_dict=True, ) self.assertEqual( @@ -117,7 +116,6 @@ def check_encoder_decoder_model( decoder_input_ids=decoder_input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, - return_dict=True, ) self.assertEqual( outputs_encoder_decoder["logits"].shape, (decoder_input_ids.shape + (decoder_config.vocab_size,)) @@ -132,7 +130,6 @@ def check_encoder_decoder_model( decoder_input_ids=decoder_input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, - return_dict=True, ) self.assertEqual( @@ -278,7 +275,6 @@ def check_encoder_decoder_model_labels( attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, labels=labels, - return_dict=True, ) loss = outputs_encoder_decoder["loss"] @@ -313,7 +309,6 @@ def check_encoder_decoder_model_output_attentions( attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, output_attentions=True, - return_dict=True, ) encoder_attentions = outputs_encoder_decoder["encoder_attentions"] diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py index b5617a059147..d07a8f5138f4 100644 --- a/tests/test_modeling_flaubert.py +++ b/tests/test_modeling_flaubert.py @@ -113,7 +113,6 @@ def prepare_config_and_inputs(self): initializer_range=self.initializer_range, summary_type=self.summary_type, use_proj=self.use_proj, - return_dict=True, ) return ( diff --git a/tests/test_modeling_flax_bert.py b/tests/test_modeling_flax_bert.py index 3bd67c35d4ee..f06d2559164a 100644 --- a/tests/test_modeling_flax_bert.py +++ b/tests/test_modeling_flax_bert.py @@ -29,7 +29,7 @@ def test_from_pytorch(self): # Check for simple input pt_inputs = tokenizer.encode_plus("This is a simple input", return_tensors=TensorType.PYTORCH) fx_inputs = tokenizer.encode_plus("This is a simple input", return_tensors=TensorType.JAX) - pt_outputs = pt_model(**pt_inputs) + pt_outputs = pt_model(**pt_inputs).to_tuple() fx_outputs = fx_model(**fx_inputs) self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") diff --git a/tests/test_modeling_flax_roberta.py b/tests/test_modeling_flax_roberta.py index 2db0cf9c8377..b8b89776b8d2 100644 --- a/tests/test_modeling_flax_roberta.py +++ b/tests/test_modeling_flax_roberta.py @@ -34,7 +34,7 @@ def test_from_pytorch(self): self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch") - for fx_output, pt_output in zip(fx_outputs, pt_outputs): + for fx_output, pt_output in zip(fx_outputs, pt_outputs.to_tuple()): self.assert_almost_equals(fx_output, pt_output.numpy(), 5e-4) def assert_almost_equals(self, a: ndarray, b: ndarray, tol: float): diff --git a/tests/test_modeling_fsmt.py b/tests/test_modeling_fsmt.py index 138a2a39154a..b5e3e8d1e349 100644 --- a/tests/test_modeling_fsmt.py +++ b/tests/test_modeling_fsmt.py @@ -259,7 +259,6 @@ def _get_config(self): eos_token_id=2, pad_token_id=1, bos_token_id=0, - return_dict=True, ) def 
_get_config_and_data(self): diff --git a/tests/test_modeling_funnel.py b/tests/test_modeling_funnel.py index f3fd12e9378c..0e3846cef147 100644 --- a/tests/test_modeling_funnel.py +++ b/tests/test_modeling_funnel.py @@ -140,7 +140,6 @@ def prepare_config_and_inputs(self): activation_dropout=self.activation_dropout, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - return_dict=True, ) return ( diff --git a/tests/test_modeling_gpt2.py b/tests/test_modeling_gpt2.py index 6b8fbbbc9f2f..3ed643f20598 100644 --- a/tests/test_modeling_gpt2.py +++ b/tests/test_modeling_gpt2.py @@ -131,7 +131,6 @@ def prepare_config_and_inputs(self, gradient_checkpointing=False): bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id, pad_token_id=self.pad_token_id, - return_dict=True, gradient_checkpointing=gradient_checkpointing, ) diff --git a/tests/test_modeling_layoutlm.py b/tests/test_modeling_layoutlm.py index 2b616e4df60d..cf5a10e3b9da 100644 --- a/tests/test_modeling_layoutlm.py +++ b/tests/test_modeling_layoutlm.py @@ -125,7 +125,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, bbox, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py index afbc812ae578..216afe4c91e3 100644 --- a/tests/test_modeling_longformer.py +++ b/tests/test_modeling_longformer.py @@ -113,7 +113,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, attention_window=self.attention_window, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_lxmert.py b/tests/test_modeling_lxmert.py index e335603d71ab..3222c21687cd 100644 --- a/tests/test_modeling_lxmert.py +++ b/tests/test_modeling_lxmert.py @@ -282,7 +282,6 @@ def create_and_check_lxmert_for_question_answering( attention_mask=input_mask, labels=ans, output_attentions=output_attentions, - return_dict=True, ) result = model(input_ids, visual_feats, bounding_boxes, labels=ans) result = model( @@ -302,7 +301,6 @@ def create_and_check_lxmert_for_question_answering( attention_mask=input_mask, labels=ans, output_attentions=not output_attentions, - return_dict=True, ) self.parent.assertEqual(result.question_answering_score.shape, (self.batch_size, self.num_qa_labels)) @@ -335,7 +333,6 @@ def create_and_check_lxmert_for_pretraining( matched_label=matched_label, ans=ans, output_attentions=output_attentions, - return_dict=True, ) result = model( input_ids, @@ -390,7 +387,6 @@ def create_and_check_lxmert_for_pretraining( matched_label=matched_label, ans=ans, output_attentions=not output_attentions, - return_dict=True, ) self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) @@ -427,7 +423,6 @@ def resize_lxmert_num_qa_labels( token_type_ids=token_type_ids, attention_mask=input_mask, ans=ans, - return_dict=True, ) result_qa = model_qa( @@ -437,7 +432,6 @@ def resize_lxmert_num_qa_labels( labels=ans, token_type_ids=token_type_ids, attention_mask=input_mask, - return_dict=True, ) model_pretrain.resize_num_qa_labels(num_small_labels) @@ -450,7 +444,6 @@ def resize_lxmert_num_qa_labels( token_type_ids=token_type_ids, attention_mask=input_mask, 
ans=less_labels_ans, - return_dict=True, ) result_qa_less = model_qa( @@ -460,7 +453,6 @@ def resize_lxmert_num_qa_labels( labels=less_labels_ans, token_type_ids=token_type_ids, attention_mask=input_mask, - return_dict=True, ) model_pretrain.resize_num_qa_labels(num_large_labels) @@ -473,7 +465,6 @@ def resize_lxmert_num_qa_labels( token_type_ids=token_type_ids, attention_mask=input_mask, ans=more_labels_ans, - return_dict=True, ) result_qa_more = model_qa( @@ -483,7 +474,6 @@ def resize_lxmert_num_qa_labels( labels=more_labels_ans, token_type_ids=token_type_ids, attention_mask=input_mask, - return_dict=True, ) model_qa_labels = model_qa.num_qa_labels diff --git a/tests/test_modeling_marian.py b/tests/test_modeling_marian.py index d387d3728f8f..a20c4a20d869 100644 --- a/tests/test_modeling_marian.py +++ b/tests/test_modeling_marian.py @@ -50,7 +50,6 @@ def __init__(self, parent): decoder_ffn_dim=32, max_position_embeddings=48, add_final_layer_norm=True, - return_dict=True, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_mbart.py b/tests/test_modeling_mbart.py index ced627907c83..2b8da23338a5 100644 --- a/tests/test_modeling_mbart.py +++ b/tests/test_modeling_mbart.py @@ -37,7 +37,6 @@ def __init__(self, parent): decoder_ffn_dim=32, max_position_embeddings=48, add_final_layer_norm=True, - return_dict=True, ) def prepare_config_and_inputs_for_common(self): @@ -132,7 +131,6 @@ def test_mbart_fast_forward(self): decoder_ffn_dim=32, max_position_embeddings=48, add_final_layer_norm=True, - return_dict=True, ) lm_model = MBartForConditionalGeneration(config).to(torch_device) context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2], [68, 34, 26, 58, 30, 2, 1]]).long().to(torch_device) diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py index e1e3ad82d078..24c636161dcd 100644 --- a/tests/test_modeling_mobilebert.py +++ b/tests/test_modeling_mobilebert.py @@ -124,7 +124,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, is_decoder=False, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py index 75858a05498b..34678cae9014 100644 --- a/tests/test_modeling_openai.py +++ b/tests/test_modeling_openai.py @@ -94,7 +94,6 @@ def prepare_config_and_inputs(self): # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range pad_token_id=self.pad_token_id, - return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) diff --git a/tests/test_modeling_pegasus.py b/tests/test_modeling_pegasus.py index 2cb0a1567b67..07f3326d13da 100644 --- a/tests/test_modeling_pegasus.py +++ b/tests/test_modeling_pegasus.py @@ -33,7 +33,6 @@ def __init__(self, parent): decoder_ffn_dim=32, max_position_embeddings=48, add_final_layer_norm=True, - return_dict=True, ) def prepare_config_and_inputs_for_common(self): diff --git a/tests/test_modeling_prophetnet.py b/tests/test_modeling_prophetnet.py index e8016c4282fb..8457e6f1467b 100644 --- a/tests/test_modeling_prophetnet.py +++ b/tests/test_modeling_prophetnet.py @@ -142,7 +142,6 @@ def prepare_config_and_inputs(self): disable_ngram_loss=self.disable_ngram_loss, max_position_embeddings=self.max_position_embeddings, is_encoder_decoder=self.is_encoder_decoder, - return_dict=True, ) return ( @@ -344,7 +343,6 @@ def 
create_and_check_encoder_decoder_shared_weights( decoder_input_ids=decoder_input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, - return_dict=True, ) tied_model_result = tied_model( @@ -352,7 +350,6 @@ def create_and_check_encoder_decoder_shared_weights( decoder_input_ids=decoder_input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, - return_dict=True, ) # check that models has less parameters @@ -419,7 +416,6 @@ def check_fast_integration( attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, labels=lm_labels, - return_dict=True, ) self.parent.assertTrue(torch.allclose(result.loss, torch.tensor(128.2925, device=torch_device), atol=1e-3)) @@ -433,9 +429,7 @@ def check_model_with_attn_mask(self, config, input_ids, decoder_input_ids, *args model.to(torch_device) model.eval() - outputs_no_mask = model( - input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5], return_dict=True - ) + outputs_no_mask = model(input_ids=input_ids[:, :5], decoder_input_ids=decoder_input_ids[:, :5]) attention_mask = torch.ones_like(input_ids) decoder_attention_mask = torch.ones_like(decoder_input_ids) @@ -446,7 +440,6 @@ def check_model_with_attn_mask(self, config, input_ids, decoder_input_ids, *args attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, - return_dict=True, ) # check encoder @@ -524,7 +517,6 @@ def __init__( bos_token_id=1, eos_token_id=2, ngram=2, - return_dict=True, num_buckets=32, relative_max_distance=128, disable_ngram_loss=False, @@ -562,7 +554,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.add_cross_attention = add_cross_attention self.is_encoder_decoder = is_encoder_decoder - self.return_dict = return_dict self.scope = None self.decoder_key_length = decoder_seq_length @@ -602,7 +593,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, add_cross_attention=self.add_cross_attention, is_encoder_decoder=self.is_encoder_decoder, - return_dict=self.return_dict, ) return ( @@ -757,7 +747,6 @@ def __init__( pad_token_id=0, bos_token_id=1, eos_token_id=2, - return_dict=True, num_buckets=32, relative_max_distance=128, disable_ngram_loss=False, @@ -794,7 +783,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.add_cross_attention = add_cross_attention self.is_encoder_decoder = is_encoder_decoder - self.return_dict = return_dict self.scope = None self.decoder_key_length = decoder_seq_length @@ -829,7 +817,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, add_cross_attention=self.add_cross_attention, is_encoder_decoder=self.is_encoder_decoder, - return_dict=self.return_dict, ) return ( @@ -919,7 +906,6 @@ def test_fp16_forward(self): # methods overwrite method in `test_modeling_common.py` def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True seq_len = getattr(self.model_tester, "seq_length", None) decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) @@ -933,7 +919,6 @@ def test_attention_outputs(self): for model_class in self.all_model_classes: inputs_dict["output_attentions"] = True inputs_dict["output_hidden_states"] = False - config.return_dict = True model = model_class(config) model.to(torch_device) model.eval() @@ -1121,7 +1106,6 @@ def 
test_pretrained_checkpoint_hidden_states(self): attention_mask=None, encoder_outputs=None, decoder_input_ids=decoder_prev_ids, - return_dict=True, ) output_predited_logits = output[0] expected_shape = torch.Size((1, 12, 30522)) @@ -1143,9 +1127,7 @@ def test_pretrained_checkpoint_hidden_states(self): assert torch.allclose(encoder_outputs[:, :3, :3], expected_encoder_outputs_slice, atol=1e-4) # decoder outputs - decoder_outputs = model.prophetnet.decoder( - decoder_prev_ids, encoder_hidden_states=encoder_outputs, return_dict=True - ) + decoder_outputs = model.prophetnet.decoder(decoder_prev_ids, encoder_hidden_states=encoder_outputs) predicting_streams = decoder_outputs[1].view(1, model.config.ngram, 12, -1) predicting_streams_logits = model.lm_head(predicting_streams) next_first_stream_logits = predicting_streams_logits[:, 0] diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py index 3a92e3b81ce6..92f8e01b36f8 100644 --- a/tests/test_modeling_reformer.py +++ b/tests/test_modeling_reformer.py @@ -174,7 +174,6 @@ def prepare_config_and_inputs(self): attn_layers=self.attn_layers, pad_token_id=self.pad_token_id, hash_seed=self.hash_seed, - return_dict=True, ) return ( diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py index de753266c387..20b7dfcb6cf0 100644 --- a/tests/test_modeling_roberta.py +++ b/tests/test_modeling_roberta.py @@ -103,7 +103,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_squeezebert.py b/tests/test_modeling_squeezebert.py index f257f1de8d30..18f41e8cf8cd 100644 --- a/tests/test_modeling_squeezebert.py +++ b/tests/test_modeling_squeezebert.py @@ -131,7 +131,6 @@ def prepare_config_and_inputs(self): post_attention_groups=self.post_attention_groups, intermediate_groups=self.intermediate_groups, output_groups=self.output_groups, - return_dict=True, ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index 11e3ab0ad872..7bf81df9df3e 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -115,7 +115,6 @@ def prepare_config_and_inputs(self): bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, decoder_start_token_id=self.decoder_start_token_id, - return_dict=True, ) return ( diff --git a/tests/test_modeling_tf_albert.py b/tests/test_modeling_tf_albert.py index 8ab6189d5a46..96cfdfb3c9fe 100644 --- a/tests/test_modeling_tf_albert.py +++ b/tests/test_modeling_tf_albert.py @@ -121,7 +121,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_tf_bart.py b/tests/test_modeling_tf_bart.py index 4efdd3b08b09..999692404d2f 100644 --- a/tests/test_modeling_tf_bart.py +++ b/tests/test_modeling_tf_bart.py @@ -182,7 +182,6 @@ def _get_config_and_data(self): eos_token_id=2, pad_token_id=1, bos_token_id=0, - return_dict=True, decoder_start_token_id=2, ) return config, input_ids, batch_size @@ -206,7 +205,6 @@ def test_lm_uneven_forward(self): encoder_ffn_dim=32, 
decoder_ffn_dim=32, max_position_embeddings=48, - return_dict=True, ) lm_model = TFBartForConditionalGeneration(config) context = tf.fill((7, 2), 4) @@ -356,7 +354,7 @@ def test_encoder_equiv(self): padding="longest", truncation=True, ) - features = self.xsum_1_1_model.get_encoder()(**batch, return_dict=True).last_hidden_state + features = self.xsum_1_1_model.get_encoder()(**batch).last_hidden_state import numpy as np expected = np.array([[-0.0828, -0.0251, -0.0674], [0.1277, 0.3311, -0.0255], [0.2613, -0.0840, -0.2763]]) diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py index 48a14f4a23d0..f6122b09ef4c 100644 --- a/tests/test_modeling_tf_bert.py +++ b/tests/test_modeling_tf_bert.py @@ -120,7 +120,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_tf_camembert.py b/tests/test_modeling_tf_camembert.py index 92caa29a6439..cfd96fe56e9d 100644 --- a/tests/test_modeling_tf_camembert.py +++ b/tests/test_modeling_tf_camembert.py @@ -39,7 +39,7 @@ def test_output_embeds_base_model(self): dtype=tf.int32, ) # J'aime le camembert !" - output = model(input_ids, return_dict=True)["last_hidden_state"] + output = model(input_ids)["last_hidden_state"] expected_shape = tf.TensorShape((1, 10, 768)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index e4c2ceb9f686..53fbdfc99dae 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -284,7 +284,7 @@ def assert_outputs_same(self, after_outputs, outputs): if isinstance(after_outputs, tf.Tensor): out_1 = after_outputs.numpy() elif isinstance(after_outputs, dict): - out_1 = after_outputs[list(after_outputs.keys())[0]] + out_1 = after_outputs[list(after_outputs.keys())[0]].numpy() else: out_1 = after_outputs[0].numpy() out_2 = outputs[0].numpy() diff --git a/tests/test_modeling_tf_ctrl.py b/tests/test_modeling_tf_ctrl.py index be9ba0111dad..4cae35634a70 100644 --- a/tests/test_modeling_tf_ctrl.py +++ b/tests/test_modeling_tf_ctrl.py @@ -94,7 +94,6 @@ def prepare_config_and_inputs(self): n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range, - return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py index 2e6633491646..73bcd7d00eb2 100644 --- a/tests/test_modeling_tf_distilbert.py +++ b/tests/test_modeling_tf_distilbert.py @@ -91,7 +91,6 @@ def prepare_config_and_inputs(self): attention_dropout=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_tf_electra.py b/tests/test_modeling_tf_electra.py index 95a570a6a526..b5d3c933bb3b 100644 --- a/tests/test_modeling_tf_electra.py +++ b/tests/test_modeling_tf_electra.py @@ -97,7 +97,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - 
return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py index 77f04266662a..56eddaea6947 100644 --- a/tests/test_modeling_tf_flaubert.py +++ b/tests/test_modeling_tf_flaubert.py @@ -114,7 +114,6 @@ def prepare_config_and_inputs(self): summary_type=self.summary_type, use_proj=self.use_proj, bos_token_id=self.bos_token_id, - return_dict=True, ) return ( diff --git a/tests/test_modeling_tf_funnel.py b/tests/test_modeling_tf_funnel.py index bb723c8d5b81..804b30752610 100644 --- a/tests/test_modeling_tf_funnel.py +++ b/tests/test_modeling_tf_funnel.py @@ -137,7 +137,6 @@ def prepare_config_and_inputs(self): activation_dropout=self.activation_dropout, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - return_dict=True, ) return ( diff --git a/tests/test_modeling_tf_gpt2.py b/tests/test_modeling_tf_gpt2.py index b8532b7ad392..b63a843dc984 100644 --- a/tests/test_modeling_tf_gpt2.py +++ b/tests/test_modeling_tf_gpt2.py @@ -104,7 +104,6 @@ def prepare_config_and_inputs(self): # initializer_range=self.initializer_range bos_token_id=self.bos_token_id, eos_token_id=self.eos_token_id, - return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) diff --git a/tests/test_modeling_tf_longformer.py b/tests/test_modeling_tf_longformer.py index 47c7e80afb7a..f402c41ae99e 100644 --- a/tests/test_modeling_tf_longformer.py +++ b/tests/test_modeling_tf_longformer.py @@ -594,7 +594,9 @@ def test_inference_masked_lm_long(self): # 'Hello world! ' repeated 1000 times input_ids = tf.convert_to_tensor([[0] + [20920, 232, 328, 1437] * 1000 + [2]], dtype=tf.dtypes.int32) - loss, prediction_scores = model(input_ids, labels=input_ids) + output = model(input_ids, labels=input_ids) + loss = output.loss + prediction_scores = output.logits expected_loss = tf.constant(0.0073798) expected_prediction_scores_sum = tf.constant(-610476600.0) diff --git a/tests/test_modeling_tf_lxmert.py b/tests/test_modeling_tf_lxmert.py index 42f6287b187c..5037208229cb 100644 --- a/tests/test_modeling_tf_lxmert.py +++ b/tests/test_modeling_tf_lxmert.py @@ -297,7 +297,6 @@ def create_and_check_lxmert_for_pretraining( matched_label=matched_label, ans=ans, output_attentions=output_attentions, - return_dict=True, ) result = model( input_ids, @@ -352,7 +351,6 @@ def create_and_check_lxmert_for_pretraining( matched_label=matched_label, ans=ans, output_attentions=not output_attentions, - return_dict=True, ) self.parent.assertEqual(result.prediction_logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) @@ -695,7 +693,8 @@ def test_saved_model_with_hidden_states_output(self): model = tf.keras.models.load_model(tmpdirname) outputs = model(class_inputs_dict) - language_hidden_states, vision_hidden_states = outputs[-2], outputs[-1] + language_hidden_states = outputs["language_hidden_states"] + vision_hidden_states = outputs["vision_hidden_states"] self.assertEqual(len(language_hidden_states), self.model_tester.num_hidden_layers["language"] + 1) self.assertEqual(len(vision_hidden_states), self.model_tester.num_hidden_layers["vision"] + 1) @@ -731,11 +730,9 @@ def test_saved_model_with_attentions_output(self): model = tf.keras.models.load_model(tmpdirname) outputs = model(class_inputs_dict) - language_attentions, vision_attentions, cross_encoder_attentions = ( - outputs[-3], - outputs[-2], - outputs[-1], - ) + 
language_attentions = outputs["language_attentions"] + vision_attentions = outputs["vision_attentions"] + cross_encoder_attentions = outputs["cross_encoder_attentions"] self.assertEqual(len(language_attentions), self.model_tester.num_hidden_layers["language"]) self.assertEqual(len(vision_attentions), self.model_tester.num_hidden_layers["vision"]) diff --git a/tests/test_modeling_tf_mobilebert.py b/tests/test_modeling_tf_mobilebert.py index d40803160c14..1ea2b663c58a 100644 --- a/tests/test_modeling_tf_mobilebert.py +++ b/tests/test_modeling_tf_mobilebert.py @@ -139,7 +139,6 @@ def prepare_config_and_inputs(self): type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, embedding_size=self.embedding_size, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_tf_openai.py b/tests/test_modeling_tf_openai.py index 6e57db2d39f4..1c9dab78c948 100644 --- a/tests/test_modeling_tf_openai.py +++ b/tests/test_modeling_tf_openai.py @@ -99,7 +99,6 @@ def prepare_config_and_inputs(self): n_ctx=self.max_position_embeddings, # type_vocab_size=self.type_vocab_size, # initializer_range=self.initializer_range, - return_dict=True, ) head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2) diff --git a/tests/test_modeling_tf_roberta.py b/tests/test_modeling_tf_roberta.py index c79e8799d7ac..b9614dd4d8a3 100644 --- a/tests/test_modeling_tf_roberta.py +++ b/tests/test_modeling_tf_roberta.py @@ -97,7 +97,6 @@ def prepare_config_and_inputs(self): max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range, - return_dict=True, ) return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index af6080de77da..d7e43d569cc3 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -78,7 +78,6 @@ def prepare_config_and_inputs(self): bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, decoder_start_token_id=self.pad_token_id, - return_dict=True, ) return (config, input_ids, input_mask, token_labels) diff --git a/tests/test_modeling_tf_transfo_xl.py b/tests/test_modeling_tf_transfo_xl.py index ecaad385765d..f8da32532cd5 100644 --- a/tests/test_modeling_tf_transfo_xl.py +++ b/tests/test_modeling_tf_transfo_xl.py @@ -77,7 +77,6 @@ def prepare_config_and_inputs(self): div_val=self.div_val, n_layer=self.num_hidden_layers, eos_token_id=self.eos_token_id, - return_dict=True, ) return (config, input_ids_1, input_ids_2, lm_labels) diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py index be7f7d1cf3de..da376ff531d8 100644 --- a/tests/test_modeling_tf_xlm.py +++ b/tests/test_modeling_tf_xlm.py @@ -114,7 +114,6 @@ def prepare_config_and_inputs(self): summary_type=self.summary_type, use_proj=self.use_proj, bos_token_id=self.bos_token_id, - return_dict=True, ) return ( diff --git a/tests/test_modeling_tf_xlm_roberta.py b/tests/test_modeling_tf_xlm_roberta.py index 10485abfe5cd..b67d42db4e5f 100644 --- a/tests/test_modeling_tf_xlm_roberta.py +++ b/tests/test_modeling_tf_xlm_roberta.py @@ -39,7 +39,7 @@ def test_output_embeds_base_model(self): "attention_mask": tf.convert_to_tensor([[1, 1, 1, 1, 1, 1]], dtype=tf.int32), } - output = model(features, return_dict=True)["last_hidden_state"] + output = model(features)["last_hidden_state"] expected_shape = 
tf.TensorShape((1, 6, 768)) self.assertEqual(output.shape, expected_shape) # compare the actual values for a slice. diff --git a/tests/test_modeling_tf_xlnet.py b/tests/test_modeling_tf_xlnet.py index 08e3ee1c6f0b..da5a66c8bc70 100644 --- a/tests/test_modeling_tf_xlnet.py +++ b/tests/test_modeling_tf_xlnet.py @@ -111,7 +111,6 @@ def prepare_config_and_inputs(self): bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, eos_token_id=self.eos_token_id, - return_dict=True, ) return ( diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py index ce199f2da8ed..2c9c893623b1 100644 --- a/tests/test_modeling_transfo_xl.py +++ b/tests/test_modeling_transfo_xl.py @@ -78,7 +78,6 @@ def prepare_config_and_inputs(self): div_val=self.div_val, n_layer=self.num_hidden_layers, eos_token_id=self.eos_token_id, - return_dict=True, ) return (config, input_ids_1, input_ids_2, lm_labels) diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py index 852a6a4e0544..9fd0de1bc384 100644 --- a/tests/test_modeling_xlm.py +++ b/tests/test_modeling_xlm.py @@ -116,7 +116,6 @@ def prepare_config_and_inputs(self): use_proj=self.use_proj, num_labels=self.num_labels, bos_token_id=self.bos_token_id, - return_dict=True, ) return ( diff --git a/tests/test_modeling_xlm_roberta.py b/tests/test_modeling_xlm_roberta.py index ef22e325c95a..f5e766bb7b02 100644 --- a/tests/test_modeling_xlm_roberta.py +++ b/tests/test_modeling_xlm_roberta.py @@ -32,7 +32,7 @@ class XLMRobertaModelIntegrationTest(unittest.TestCase): @slow def test_xlm_roberta_base(self): - model = XLMRobertaModel.from_pretrained("xlm-roberta-base", return_dict=True) + model = XLMRobertaModel.from_pretrained("xlm-roberta-base") input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) # The dog is cute and lives in the garden house @@ -51,7 +51,7 @@ def test_xlm_roberta_base(self): @slow def test_xlm_roberta_large(self): - model = XLMRobertaModel.from_pretrained("xlm-roberta-large", return_dict=True) + model = XLMRobertaModel.from_pretrained("xlm-roberta-large") input_ids = torch.tensor([[0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2]]) # The dog is cute and lives in the garden house diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py index 9bd81f9b9e2e..72497dbd55b2 100644 --- a/tests/test_modeling_xlnet.py +++ b/tests/test_modeling_xlnet.py @@ -148,7 +148,6 @@ def prepare_config_and_inputs(self): bos_token_id=self.bos_token_id, pad_token_id=self.pad_token_id, eos_token_id=self.eos_token_id, - return_dict=True, ) return (