diff --git a/tests/test_generation_logits_process.py b/tests/test_generation_logits_process.py
index 2e00be0fa4ae..e07fd3066e2e 100644
--- a/tests/test_generation_logits_process.py
+++ b/tests/test_generation_logits_process.py
@@ -24,7 +24,7 @@
 
 if is_torch_available():
     import torch
-    import torch.nn.functional as F
+    from torch import nn
 
     from transformers.generation_logits_process import (
         EncoderNoRepeatNGramLogitsProcessor,
@@ -80,13 +80,13 @@ def test_temperature_dist_warper(self):
         scores[1, 10] = (1 / length) - 0.4  # valley, 1st batch
 
         # compute softmax
-        probs = F.softmax(scores, dim=-1)
+        probs = nn.functional.softmax(scores, dim=-1)
 
         temp_dist_warper_sharper = TemperatureLogitsWarper(temperature=0.5)
         temp_dist_warper_smoother = TemperatureLogitsWarper(temperature=1.3)
 
-        warped_prob_sharp = F.softmax(temp_dist_warper_sharper(input_ids, scores.clone()), dim=-1)
-        warped_prob_smooth = F.softmax(temp_dist_warper_smoother(input_ids, scores.clone()), dim=-1)
+        warped_prob_sharp = nn.functional.softmax(temp_dist_warper_sharper(input_ids, scores.clone()), dim=-1)
+        warped_prob_smooth = nn.functional.softmax(temp_dist_warper_smoother(input_ids, scores.clone()), dim=-1)
 
         # uniform distribution stays uniform
         self.assertTrue(torch.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3))
diff --git a/tests/test_modeling_clip.py b/tests/test_modeling_clip.py
index 2a8f05d7a600..afcc5903c63d 100644
--- a/tests/test_modeling_clip.py
+++ b/tests/test_modeling_clip.py
@@ -30,6 +30,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel
     from transformers.models.clip.modeling_clip import CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -140,9 +141,9 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 272f25a0ecf5..56e5cddbc96c 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -44,6 +44,7 @@
 if is_torch_available():
     import numpy as np
     import torch
+    from torch import nn
 
     from transformers import (
         BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -1150,10 +1151,10 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding))
+            model.set_input_embeddings(nn.Embedding(10, 10))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_correct_missing_keys(self):
         if not self.test_missing_keys:
@@ -1337,7 +1338,7 @@ def test_multi_gpu_data_parallel_forward(self):
             model.eval()
 
             # Wrap model in nn.DataParallel
-            model = torch.nn.DataParallel(model)
+            model = nn.DataParallel(model)
             with torch.no_grad():
                 _ = model(**self._prepare_for_class(inputs_dict, model_class))
 
diff --git a/tests/test_modeling_deit.py b/tests/test_modeling_deit.py
index 5551da08903a..0eb24f84cf0e 100644
--- a/tests/test_modeling_deit.py
+++ b/tests/test_modeling_deit.py
@@ -27,6 +27,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import (
         MODEL_MAPPING,
@@ -176,9 +177,9 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_modeling_fsmt.py b/tests/test_modeling_fsmt.py
index 3c01360d0c08..7c3ba4a1e80e 100644
--- a/tests/test_modeling_fsmt.py
+++ b/tests/test_modeling_fsmt.py
@@ -30,6 +30,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer
     from transformers.models.fsmt.modeling_fsmt import (
@@ -160,10 +161,10 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding))
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding))
+            model.set_input_embeddings(nn.Embedding(10, 10))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.modules.sparse.Embedding))
+            self.assertTrue(x is None or isinstance(x, nn.modules.sparse.Embedding))
 
     def test_initialization_more(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs()
diff --git a/tests/test_modeling_ibert.py b/tests/test_modeling_ibert.py
index 8ef878b902e8..d0b672193cc1 100755
--- a/tests/test_modeling_ibert.py
+++ b/tests/test_modeling_ibert.py
@@ -26,7 +26,7 @@
 
 if is_torch_available():
     import torch
-    import torch.nn as nn
+    from torch import nn
 
     from transformers import (
         IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -304,9 +304,9 @@ def test_model_common_attributes(self):
         for model_class in self.all_model_classes:
             model = model_class(config)
             self.assertIsInstance(model.get_input_embeddings(), QuantEmbedding)
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            model.set_input_embeddings(nn.Embedding(10, 10))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     # Override
     def test_feed_forward_chunking(self):
@@ -350,7 +350,7 @@ def test_quant_embedding(self):
         weight_bit = 8
         embedding = QuantEmbedding(2, 4, quant_mode=True, weight_bit=weight_bit)
         embedding_weight = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]])
-        embedding.weight = torch.nn.Parameter(embedding_weight)
+        embedding.weight = nn.Parameter(embedding_weight)
 
         expected_scaling_factor = embedding_weight.abs().max() / (2 ** (weight_bit - 1) - 1)
         x, x_scaling_factor = embedding(torch.tensor(0))
@@ -447,8 +447,8 @@ def _test(per_channel):
             linear_q = QuantLinear(2, 4, quant_mode=True, per_channel=per_channel, weight_bit=weight_bit)
             linear_dq = QuantLinear(2, 4, quant_mode=False, per_channel=per_channel, weight_bit=weight_bit)
             linear_weight = torch.tensor([[-1.0, 2.0, 3.0, -4.0], [5.0, -6.0, -7.0, 8.0]]).T
-            linear_q.weight = torch.nn.Parameter(linear_weight)
-            linear_dq.weight = torch.nn.Parameter(linear_weight)
+            linear_q.weight = nn.Parameter(linear_weight)
+            linear_dq.weight = nn.Parameter(linear_weight)
 
             q, q_scaling_factor = linear_q(x, x_scaling_factor)
             q_int = q / q_scaling_factor
@@ -477,7 +477,7 @@ def _test(per_channel):
 
     def test_int_gelu(self):
         gelu_q = IntGELU(quant_mode=True)
-        gelu_dq = torch.nn.GELU()
+        gelu_dq = nn.GELU()
 
         x_int = torch.range(-10000, 10000, 1)
         x_scaling_factor = torch.tensor(0.001)
@@ -523,7 +523,7 @@ def test_force_dequant_gelu(self):
     def test_int_softmax(self):
         output_bit = 8
         softmax_q = IntSoftmax(output_bit, quant_mode=True)
-        softmax_dq = torch.nn.Softmax()
+        softmax_dq = nn.Softmax()
 
         # x_int = torch.range(-10000, 10000, 1)
         def _test(array):
@@ -590,12 +590,12 @@ def test_int_layernorm(self):
         x = x_int * x_scaling_factor
 
         ln_q = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit)
-        ln_dq = torch.nn.LayerNorm(x.shape[1:], 1e-5)
+        ln_dq = nn.LayerNorm(x.shape[1:], 1e-5)
 
-        ln_q.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_q.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_dq.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_dq.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
+        ln_q.weight = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_q.bias = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:]))
 
         q, q_scaling_factor = ln_q(x, x_scaling_factor)
         q_int = q / q_scaling_factor
@@ -627,13 +627,13 @@ def test_force_dequant_layernorm(self):
             ],
         }
 
-        ln_dq.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_dq.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:]))
         dq, dq_scaling_factor = ln_dq(x, x_scaling_factor)
         for label, ln_fdqs in ln_fdqs_dict.items():
             for ln_fdq in ln_fdqs:
-                ln_fdq.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-                ln_fdq.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
+                ln_fdq.weight = nn.Parameter(torch.ones(x.shape[1:]))
+                ln_fdq.bias = nn.Parameter(torch.ones(x.shape[1:]))
                 q, q_scaling_factor = ln_fdq(x, x_scaling_factor)
                 if label:
                     self.assertTrue(torch.allclose(q, dq, atol=1e-4))
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index 05db9599c517..e8e5129a10d8 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -32,6 +32,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import (
         REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -241,7 +242,7 @@ def create_and_check_reformer_model_with_attn_mask(
         # set all position encodings to zero so that postions don't matter
         with torch.no_grad():
             embedding = model.embeddings.position_embeddings.embedding
-            embedding.weight = torch.nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device))
+            embedding.weight = nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device))
             embedding.weight.requires_grad = False
 
         half_seq_len = self.seq_length // 2
diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py
index adbaf3642e8b..f9b01e638d97 100644
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -27,6 +27,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import TransfoXLConfig, TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel
     from transformers.models.transfo_xl.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -362,11 +363,11 @@ def _mock_init_weights(self, module):
         if hasattr(module, "emb_projs"):
             for i in range(len(module.emb_projs)):
                 if module.emb_projs[i] is not None:
-                    torch.nn.init.constant_(module.emb_projs[i], 0.0003)
+                    nn.init.constant_(module.emb_projs[i], 0.0003)
         if hasattr(module, "out_projs"):
             for i in range(len(module.out_projs)):
                 if module.out_projs[i] is not None:
-                    torch.nn.init.constant_(module.out_projs[i], 0.0003)
+                    nn.init.constant_(module.out_projs[i], 0.0003)
 
         for param in ["r_emb", "r_w_bias", "r_r_bias", "r_bias"]:
             if hasattr(module, param) and getattr(module, param) is not None:
diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py
index 09d4fa372a5d..b45c12c16d3d 100644
--- a/tests/test_modeling_vit.py
+++ b/tests/test_modeling_vit.py
@@ -27,6 +27,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import ViTConfig, ViTForImageClassification, ViTModel
     from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple
@@ -169,9 +170,9 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_optimization.py b/tests/test_optimization.py
index 4a1a0a785a58..c0c5a31a3a49 100644
--- a/tests/test_optimization.py
+++ b/tests/test_optimization.py
@@ -24,6 +24,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import (
         Adafactor,
@@ -70,7 +71,7 @@ def assertListAlmostEqual(self, list1, list2, tol):
     def test_adam_w(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
-        criterion = torch.nn.MSELoss()
+        criterion = nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
         optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
         for _ in range(100):
@@ -84,7 +85,7 @@ def test_adam_w(self):
     def test_adafactor(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
-        criterion = torch.nn.MSELoss()
+        criterion = nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
         optimizer = Adafactor(
             params=[w],
@@ -109,7 +110,7 @@ def test_adafactor(self):
 
 @require_torch
 class ScheduleInitTest(unittest.TestCase):
-    m = torch.nn.Linear(50, 50) if is_torch_available() else None
+    m = nn.Linear(50, 50) if is_torch_available() else None
     optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None
     num_steps = 10
 
diff --git a/tests/test_pipelines_conversational.py b/tests/test_pipelines_conversational.py
index 89524dd3fb20..cfc9512e85bf 100644
--- a/tests/test_pipelines_conversational.py
+++ b/tests/test_pipelines_conversational.py
@@ -32,6 +32,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel
 
@@ -59,8 +60,8 @@ def get_pipeline(self):
         bias[76] = 1
         weight = torch.zeros((V, D), requires_grad=True)
 
-        model.lm_head.bias = torch.nn.Parameter(bias)
-        model.lm_head.weight = torch.nn.Parameter(weight)
+        model.lm_head.bias = nn.Parameter(bias)
+        model.lm_head.weight = nn.Parameter(weight)
 
         # # Created with:
         # import tempfile
diff --git a/tests/test_pipelines_summarization.py b/tests/test_pipelines_summarization.py
index f4ae9d13ca42..6bc55e9915ac 100644
--- a/tests/test_pipelines_summarization.py
+++ b/tests/test_pipelines_summarization.py
@@ -23,6 +23,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers.models.bart import BartConfig, BartForConditionalGeneration
 
@@ -55,7 +56,7 @@ def test_input_too_long(self):
         bias = torch.zeros(V)
         bias[76] = 10
 
-        model.lm_head.bias = torch.nn.Parameter(bias)
+        model.lm_head.bias = nn.Parameter(bias)
 
         # # Generated with:
         # import tempfile
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index e5c2bf7b88bf..7bc507eb9338 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -53,6 +53,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
     from torch.utils.data import IterableDataset
 
     from transformers import (
@@ -154,11 +155,11 @@ def __iter__(self):
             for i in range(len(self.dataset)):
                 yield self.dataset[i]
 
-    class RegressionModel(torch.nn.Module):
+    class RegressionModel(nn.Module):
         def __init__(self, a=0, b=0, double_output=False):
             super().__init__()
-            self.a = torch.nn.Parameter(torch.tensor(a).float())
-            self.b = torch.nn.Parameter(torch.tensor(b).float())
+            self.a = nn.Parameter(torch.tensor(a).float())
+            self.b = nn.Parameter(torch.tensor(b).float())
             self.double_output = double_output
             self.config = None
 
@@ -166,21 +167,21 @@ def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
             if labels is None:
                 return (y, y) if self.double_output else (y,)
-            loss = torch.nn.functional.mse_loss(y, labels)
+            loss = nn.functional.mse_loss(y, labels)
             return (loss, y, y) if self.double_output else (loss, y)
 
-    class RegressionDictModel(torch.nn.Module):
+    class RegressionDictModel(nn.Module):
         def __init__(self, a=0, b=0):
             super().__init__()
-            self.a = torch.nn.Parameter(torch.tensor(a).float())
-            self.b = torch.nn.Parameter(torch.tensor(b).float())
+            self.a = nn.Parameter(torch.tensor(a).float())
+            self.b = nn.Parameter(torch.tensor(b).float())
             self.config = None
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
             result = {"output": y}
             if labels is not None:
-                result["loss"] = torch.nn.functional.mse_loss(y, labels)
+                result["loss"] = nn.functional.mse_loss(y, labels)
             return result
 
     class RegressionPreTrainedModel(PreTrainedModel):
@@ -189,15 +190,15 @@ class RegressionPreTrainedModel(PreTrainedModel):
 
         def __init__(self, config):
             super().__init__(config)
-            self.a = torch.nn.Parameter(torch.tensor(config.a).float())
-            self.b = torch.nn.Parameter(torch.tensor(config.b).float())
+            self.a = nn.Parameter(torch.tensor(config.a).float())
+            self.b = nn.Parameter(torch.tensor(config.b).float())
             self.double_output = config.double_output
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
             if labels is None:
                 return (y, y) if self.double_output else (y,)
-            loss = torch.nn.functional.mse_loss(y, labels)
+            loss = nn.functional.mse_loss(y, labels)
             return (loss, y, y) if self.double_output else (loss, y)
 
     class RegressionRandomPreTrainedModel(PreTrainedModel):
@@ -206,8 +207,8 @@ class RegressionRandomPreTrainedModel(PreTrainedModel):
 
         def __init__(self, config):
             super().__init__(config)
-            self.a = torch.nn.Parameter(torch.tensor(config.a).float())
-            self.b = torch.nn.Parameter(torch.tensor(config.b).float())
+            self.a = nn.Parameter(torch.tensor(config.a).float())
+            self.b = nn.Parameter(torch.tensor(config.b).float())
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
@@ -219,21 +220,21 @@ def forward(self, input_x, labels=None, **kwargs):
 
             if labels is None:
                 return (y,)
-            loss = torch.nn.functional.mse_loss(y, labels)
+            loss = nn.functional.mse_loss(y, labels)
             return (loss, y)
 
-    class TstLayer(torch.nn.Module):
+    class TstLayer(nn.Module):
         def __init__(self, hidden_size):
             super().__init__()
-            self.linear1 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln1 = torch.nn.LayerNorm(hidden_size)
-            self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln2 = torch.nn.LayerNorm(hidden_size)
-            self.bias = torch.nn.Parameter(torch.zeros(hidden_size))
+            self.linear1 = nn.Linear(hidden_size, hidden_size)
+            self.ln1 = nn.LayerNorm(hidden_size)
+            self.linear2 = nn.Linear(hidden_size, hidden_size)
+            self.ln2 = nn.LayerNorm(hidden_size)
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
 
         def forward(self, x):
-            h = self.ln1(torch.nn.functional.relu(self.linear1(x)))
-            h = torch.nn.functional.relu(self.linear2(x))
+            h = self.ln1(nn.functional.relu(self.linear1(x)))
+            h = nn.functional.relu(self.linear2(x))
             return self.ln2(x + h + self.bias)
 
     def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, **kwargs):
@@ -1065,7 +1066,7 @@ def assert_flos_extraction(trainer, wrapped_model_to_check):
         assert_flos_extraction(trainer, trainer.model)
 
         # with enforced DataParallel
-        assert_flos_extraction(trainer, torch.nn.DataParallel(trainer.model))
+        assert_flos_extraction(trainer, nn.DataParallel(trainer.model))
 
         trainer.train()
         self.assertTrue(isinstance(trainer.state.total_flos, float))
@@ -1186,7 +1187,7 @@ def test_fp16_full_eval(self):
         self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
 
     def test_no_wd_param_group(self):
-        model = torch.nn.Sequential(TstLayer(128), torch.nn.ModuleList([TstLayer(128), TstLayer(128)]))
+        model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)]))
         trainer = Trainer(model=model)
         trainer.create_optimizer_and_scheduler(10)
         # fmt: off
diff --git a/tests/test_trainer_utils.py b/tests/test_trainer_utils.py
index b543a1ebcafa..80096742868a 100644
--- a/tests/test_trainer_utils.py
+++ b/tests/test_trainer_utils.py
@@ -24,6 +24,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
     from torch.utils.data import IterableDataset
 
     from transformers.modeling_outputs import SequenceClassifierOutput
@@ -40,18 +41,18 @@
         get_parameter_names,
     )
 
-    class TstLayer(torch.nn.Module):
+    class TstLayer(nn.Module):
         def __init__(self, hidden_size):
             super().__init__()
-            self.linear1 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln1 = torch.nn.LayerNorm(hidden_size)
-            self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln2 = torch.nn.LayerNorm(hidden_size)
-            self.bias = torch.nn.Parameter(torch.zeros(hidden_size))
+            self.linear1 = nn.Linear(hidden_size, hidden_size)
+            self.ln1 = nn.LayerNorm(hidden_size)
+            self.linear2 = nn.Linear(hidden_size, hidden_size)
+            self.ln2 = nn.LayerNorm(hidden_size)
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
 
         def forward(self, x):
-            h = self.ln1(torch.nn.functional.relu(self.linear1(x)))
-            h = torch.nn.functional.relu(self.linear2(x))
+            h = self.ln1(nn.functional.relu(self.linear1(x)))
+            h = nn.functional.relu(self.linear2(x))
             return self.ln2(x + h + self.bias)
 
     class RandomIterableDataset(IterableDataset):
@@ -151,10 +152,10 @@ def test_label_smoothing(self):
         num_labels = 12
         random_logits = torch.randn(4, 5, num_labels)
         random_labels = torch.randint(0, num_labels, (4, 5))
-        loss = torch.nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
+        loss = nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
         model_output = SequenceClassifierOutput(logits=random_logits)
         label_smoothed_loss = LabelSmoother(0.1)(model_output, random_labels)
-        log_probs = -torch.nn.functional.log_softmax(random_logits, dim=-1)
+        log_probs = -nn.functional.log_softmax(random_logits, dim=-1)
         expected_loss = (1 - epsilon) * loss + epsilon * log_probs.mean()
         self.assertTrue(torch.allclose(label_smoothed_loss, expected_loss))
 
@@ -163,10 +164,10 @@ def test_label_smoothing(self):
         random_labels[2, 1] = -100
         random_labels[2, 3] = -100
 
-        loss = torch.nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
+        loss = nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
         model_output = SequenceClassifierOutput(logits=random_logits)
         label_smoothed_loss = LabelSmoother(0.1)(model_output, random_labels)
-        log_probs = -torch.nn.functional.log_softmax(random_logits, dim=-1)
+        log_probs = -nn.functional.log_softmax(random_logits, dim=-1)
         # Mask the log probs with the -100 labels
         log_probs[0, 1] = 0.0
         log_probs[2, 1] = 0.0
@@ -230,10 +231,10 @@ def test_distributed_length_grouped(self):
         self.assertEqual(list(sorted(indices_process_0 + indices_process_1)), list(range(100)))
 
     def test_get_parameter_names(self):
-        model = torch.nn.Sequential(TstLayer(128), torch.nn.ModuleList([TstLayer(128), TstLayer(128)]))
+        model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)]))
         # fmt: off
         self.assertEqual(
-            get_parameter_names(model, [torch.nn.LayerNorm]),
+            get_parameter_names(model, [nn.LayerNorm]),
             ['0.linear1.weight', '0.linear1.bias', '0.linear2.weight', '0.linear2.bias', '0.bias', '1.0.linear1.weight', '1.0.linear1.bias', '1.0.linear2.weight', '1.0.linear2.bias', '1.0.bias', '1.1.linear1.weight', '1.1.linear1.bias', '1.1.linear2.weight', '1.1.linear2.bias', '1.1.bias']
         )
         # fmt: on
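
The convention applied throughout the patch, shown as a minimal illustrative sketch (plain Python, not part of the diff; the layer sizes and tensors below are arbitrary):

    import torch
    from torch import nn  # single import replacing `import torch.nn as nn` and `import torch.nn.functional as F`

    layer = nn.Linear(50, 50)                                  # was: torch.nn.Linear(50, 50)
    probs = nn.functional.softmax(torch.randn(2, 10), dim=-1)  # was: F.softmax(scores, dim=-1)
    param = nn.Parameter(torch.zeros(10))                      # was: torch.nn.Parameter(torch.zeros(10))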