diff --git a/tests/test_generation_logits_process.py b/tests/test_generation_logits_process.py
index 2e00be0fa4ae..e07fd3066e2e 100644
--- a/tests/test_generation_logits_process.py
+++ b/tests/test_generation_logits_process.py
@@ -24,7 +24,7 @@
 
 if is_torch_available():
     import torch
-    import torch.nn.functional as F
+    from torch import nn
 
     from transformers.generation_logits_process import (
         EncoderNoRepeatNGramLogitsProcessor,
@@ -80,13 +80,13 @@ def test_temperature_dist_warper(self):
         scores[1, 10] = (1 / length) - 0.4  # valley, 1st batch
 
         # compute softmax
-        probs = F.softmax(scores, dim=-1)
+        probs = nn.functional.softmax(scores, dim=-1)
 
         temp_dist_warper_sharper = TemperatureLogitsWarper(temperature=0.5)
         temp_dist_warper_smoother = TemperatureLogitsWarper(temperature=1.3)
 
-        warped_prob_sharp = F.softmax(temp_dist_warper_sharper(input_ids, scores.clone()), dim=-1)
-        warped_prob_smooth = F.softmax(temp_dist_warper_smoother(input_ids, scores.clone()), dim=-1)
+        warped_prob_sharp = nn.functional.softmax(temp_dist_warper_sharper(input_ids, scores.clone()), dim=-1)
+        warped_prob_smooth = nn.functional.softmax(temp_dist_warper_smoother(input_ids, scores.clone()), dim=-1)
 
         # uniform distribution stays uniform
         self.assertTrue(torch.allclose(probs[0, :], warped_prob_sharp[0, :], atol=1e-3))
diff --git a/tests/test_modeling_clip.py b/tests/test_modeling_clip.py
index 2a8f05d7a600..afcc5903c63d 100644
--- a/tests/test_modeling_clip.py
+++ b/tests/test_modeling_clip.py
@@ -30,6 +30,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPVisionConfig, CLIPVisionModel
     from transformers.models.clip.modeling_clip import CLIP_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -140,9 +141,9 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 272f25a0ecf5..56e5cddbc96c 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -44,6 +44,7 @@
 if is_torch_available():
     import numpy as np
     import torch
+    from torch import nn
 
     from transformers import (
         BERT_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -1150,10 +1151,10 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding, AdaptiveEmbedding))
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding, AdaptiveEmbedding))
+            model.set_input_embeddings(nn.Embedding(10, 10))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_correct_missing_keys(self):
         if not self.test_missing_keys:
@@ -1337,7 +1338,7 @@ def test_multi_gpu_data_parallel_forward(self):
             model.eval()
 
             # Wrap model in nn.DataParallel
-            model = torch.nn.DataParallel(model)
+            model = nn.DataParallel(model)
             with torch.no_grad():
                 _ = model(**self._prepare_for_class(inputs_dict, model_class))
 
diff --git a/tests/test_modeling_deit.py b/tests/test_modeling_deit.py
index 5551da08903a..0eb24f84cf0e 100644
--- a/tests/test_modeling_deit.py
+++ b/tests/test_modeling_deit.py
@@ -27,6 +27,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import (
         MODEL_MAPPING,
@@ -176,9 +177,9 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_modeling_fsmt.py b/tests/test_modeling_fsmt.py
index 3c01360d0c08..7c3ba4a1e80e 100644
--- a/tests/test_modeling_fsmt.py
+++ b/tests/test_modeling_fsmt.py
@@ -30,6 +30,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTModel, FSMTTokenizer
     from transformers.models.fsmt.modeling_fsmt import (
@@ -160,10 +161,10 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Embedding))
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Embedding))
+            model.set_input_embeddings(nn.Embedding(10, 10))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.modules.sparse.Embedding))
+            self.assertTrue(x is None or isinstance(x, nn.modules.sparse.Embedding))
 
     def test_initialization_more(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs()
diff --git a/tests/test_modeling_ibert.py b/tests/test_modeling_ibert.py
index 8ef878b902e8..d0b672193cc1 100755
--- a/tests/test_modeling_ibert.py
+++ b/tests/test_modeling_ibert.py
@@ -26,7 +26,7 @@
 
 if is_torch_available():
     import torch
-    import torch.nn as nn
+    from torch import nn
 
     from transformers import (
         IBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -304,9 +304,9 @@ def test_model_common_attributes(self):
         for model_class in self.all_model_classes:
             model = model_class(config)
             self.assertIsInstance(model.get_input_embeddings(), QuantEmbedding)
-            model.set_input_embeddings(torch.nn.Embedding(10, 10))
+            model.set_input_embeddings(nn.Embedding(10, 10))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     # Override
     def test_feed_forward_chunking(self):
@@ -350,7 +350,7 @@ def test_quant_embedding(self):
         weight_bit = 8
         embedding = QuantEmbedding(2, 4, quant_mode=True, weight_bit=weight_bit)
         embedding_weight = torch.tensor([[-1.0, -2.0, -3.0, -4.0], [5.0, 6.0, 7.0, 8.0]])
-        embedding.weight = torch.nn.Parameter(embedding_weight)
+        embedding.weight = nn.Parameter(embedding_weight)
 
         expected_scaling_factor = embedding_weight.abs().max() / (2 ** (weight_bit - 1) - 1)
         x, x_scaling_factor = embedding(torch.tensor(0))
@@ -447,8 +447,8 @@ def _test(per_channel):
             linear_q = QuantLinear(2, 4, quant_mode=True, per_channel=per_channel, weight_bit=weight_bit)
             linear_dq = QuantLinear(2, 4, quant_mode=False, per_channel=per_channel, weight_bit=weight_bit)
             linear_weight = torch.tensor([[-1.0, 2.0, 3.0, -4.0], [5.0, -6.0, -7.0, 8.0]]).T
-            linear_q.weight = torch.nn.Parameter(linear_weight)
-            linear_dq.weight = torch.nn.Parameter(linear_weight)
+            linear_q.weight = nn.Parameter(linear_weight)
+            linear_dq.weight = nn.Parameter(linear_weight)
 
             q, q_scaling_factor = linear_q(x, x_scaling_factor)
             q_int = q / q_scaling_factor
@@ -477,7 +477,7 @@ def _test(per_channel):
 
     def test_int_gelu(self):
         gelu_q = IntGELU(quant_mode=True)
-        gelu_dq = torch.nn.GELU()
+        gelu_dq = nn.GELU()
 
         x_int = torch.range(-10000, 10000, 1)
         x_scaling_factor = torch.tensor(0.001)
@@ -523,7 +523,7 @@ def test_force_dequant_gelu(self):
     def test_int_softmax(self):
         output_bit = 8
         softmax_q = IntSoftmax(output_bit, quant_mode=True)
-        softmax_dq = torch.nn.Softmax()
+        softmax_dq = nn.Softmax()
 
         # x_int = torch.range(-10000, 10000, 1)
         def _test(array):
@@ -590,12 +590,12 @@ def test_int_layernorm(self):
         x = x_int * x_scaling_factor
 
         ln_q = IntLayerNorm(x.shape[1:], 1e-5, quant_mode=True, output_bit=output_bit)
-        ln_dq = torch.nn.LayerNorm(x.shape[1:], 1e-5)
+        ln_dq = nn.LayerNorm(x.shape[1:], 1e-5)
 
-        ln_q.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_q.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_dq.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_dq.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
+        ln_q.weight = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_q.bias = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:]))
 
         q, q_scaling_factor = ln_q(x, x_scaling_factor)
         q_int = q / q_scaling_factor
@@ -627,13 +627,13 @@ def test_force_dequant_layernorm(self):
             ],
         }
 
-        ln_dq.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-        ln_dq.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.weight = nn.Parameter(torch.ones(x.shape[1:]))
+        ln_dq.bias = nn.Parameter(torch.ones(x.shape[1:]))
         dq, dq_scaling_factor = ln_dq(x, x_scaling_factor)
         for label, ln_fdqs in ln_fdqs_dict.items():
             for ln_fdq in ln_fdqs:
-                ln_fdq.weight = torch.nn.Parameter(torch.ones(x.shape[1:]))
-                ln_fdq.bias = torch.nn.Parameter(torch.ones(x.shape[1:]))
+                ln_fdq.weight = nn.Parameter(torch.ones(x.shape[1:]))
+                ln_fdq.bias = nn.Parameter(torch.ones(x.shape[1:]))
                 q, q_scaling_factor = ln_fdq(x, x_scaling_factor)
                 if label:
                     self.assertTrue(torch.allclose(q, dq, atol=1e-4))
diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py
index 05db9599c517..e8e5129a10d8 100644
--- a/tests/test_modeling_reformer.py
+++ b/tests/test_modeling_reformer.py
@@ -32,6 +32,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import (
         REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -241,7 +242,7 @@ def create_and_check_reformer_model_with_attn_mask(
         # set all position encodings to zero so that postions don't matter
         with torch.no_grad():
             embedding = model.embeddings.position_embeddings.embedding
-            embedding.weight = torch.nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device))
+            embedding.weight = nn.Parameter(torch.zeros(embedding.weight.shape).to(torch_device))
             embedding.weight.requires_grad = False
 
         half_seq_len = self.seq_length // 2
diff --git a/tests/test_modeling_transfo_xl.py b/tests/test_modeling_transfo_xl.py
index adbaf3642e8b..f9b01e638d97 100644
--- a/tests/test_modeling_transfo_xl.py
+++ b/tests/test_modeling_transfo_xl.py
@@ -27,6 +27,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import TransfoXLConfig, TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel
     from transformers.models.transfo_xl.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST
@@ -362,11 +363,11 @@ def _mock_init_weights(self, module):
         if hasattr(module, "emb_projs"):
             for i in range(len(module.emb_projs)):
                 if module.emb_projs[i] is not None:
-                    torch.nn.init.constant_(module.emb_projs[i], 0.0003)
+                    nn.init.constant_(module.emb_projs[i], 0.0003)
         if hasattr(module, "out_projs"):
             for i in range(len(module.out_projs)):
                 if module.out_projs[i] is not None:
-                    torch.nn.init.constant_(module.out_projs[i], 0.0003)
+                    nn.init.constant_(module.out_projs[i], 0.0003)
 
         for param in ["r_emb", "r_w_bias", "r_r_bias", "r_bias"]:
             if hasattr(module, param) and getattr(module, param) is not None:
diff --git a/tests/test_modeling_vit.py b/tests/test_modeling_vit.py
index 09d4fa372a5d..b45c12c16d3d 100644
--- a/tests/test_modeling_vit.py
+++ b/tests/test_modeling_vit.py
@@ -27,6 +27,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import ViTConfig, ViTForImageClassification, ViTModel
     from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple
@@ -169,9 +170,9 @@ def test_model_common_attributes(self):
 
         for model_class in self.all_model_classes:
             model = model_class(config)
-            self.assertIsInstance(model.get_input_embeddings(), (torch.nn.Module))
+            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
             x = model.get_output_embeddings()
-            self.assertTrue(x is None or isinstance(x, torch.nn.Linear))
+            self.assertTrue(x is None or isinstance(x, nn.Linear))
 
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/test_optimization.py b/tests/test_optimization.py
index 4a1a0a785a58..c0c5a31a3a49 100644
--- a/tests/test_optimization.py
+++ b/tests/test_optimization.py
@@ -24,6 +24,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers import (
         Adafactor,
@@ -70,7 +71,7 @@ def assertListAlmostEqual(self, list1, list2, tol):
     def test_adam_w(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
-        criterion = torch.nn.MSELoss()
+        criterion = nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
         optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0)
         for _ in range(100):
@@ -84,7 +85,7 @@ def test_adam_w(self):
     def test_adafactor(self):
         w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True)
         target = torch.tensor([0.4, 0.2, -0.5])
-        criterion = torch.nn.MSELoss()
+        criterion = nn.MSELoss()
         # No warmup, constant schedule, no gradient clipping
         optimizer = Adafactor(
             params=[w],
@@ -109,7 +110,7 @@ def test_adafactor(self):
 
 @require_torch
 class ScheduleInitTest(unittest.TestCase):
-    m = torch.nn.Linear(50, 50) if is_torch_available() else None
+    m = nn.Linear(50, 50) if is_torch_available() else None
     optimizer = AdamW(m.parameters(), lr=10.0) if is_torch_available() else None
     num_steps = 10
 
diff --git a/tests/test_pipelines_conversational.py b/tests/test_pipelines_conversational.py
index 89524dd3fb20..cfc9512e85bf 100644
--- a/tests/test_pipelines_conversational.py
+++ b/tests/test_pipelines_conversational.py
@@ -32,6 +32,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers.models.gpt2 import GPT2Config, GPT2LMHeadModel
 
@@ -59,8 +60,8 @@ def get_pipeline(self):
         bias[76] = 1
         weight = torch.zeros((V, D), requires_grad=True)
 
-        model.lm_head.bias = torch.nn.Parameter(bias)
-        model.lm_head.weight = torch.nn.Parameter(weight)
+        model.lm_head.bias = nn.Parameter(bias)
+        model.lm_head.weight = nn.Parameter(weight)
 
         # # Created with:
         # import tempfile
diff --git a/tests/test_pipelines_summarization.py b/tests/test_pipelines_summarization.py
index f4ae9d13ca42..6bc55e9915ac 100644
--- a/tests/test_pipelines_summarization.py
+++ b/tests/test_pipelines_summarization.py
@@ -23,6 +23,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
 
     from transformers.models.bart import BartConfig, BartForConditionalGeneration
 
@@ -55,7 +56,7 @@ def test_input_too_long(self):
         bias = torch.zeros(V)
         bias[76] = 10
 
-        model.lm_head.bias = torch.nn.Parameter(bias)
+        model.lm_head.bias = nn.Parameter(bias)
 
         # # Generated with:
         # import tempfile
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index e5c2bf7b88bf..7bc507eb9338 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -53,6 +53,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
     from torch.utils.data import IterableDataset
 
     from transformers import (
@@ -154,11 +155,11 @@ def __iter__(self):
             for i in range(len(self.dataset)):
                 yield self.dataset[i]
 
-    class RegressionModel(torch.nn.Module):
+    class RegressionModel(nn.Module):
         def __init__(self, a=0, b=0, double_output=False):
             super().__init__()
-            self.a = torch.nn.Parameter(torch.tensor(a).float())
-            self.b = torch.nn.Parameter(torch.tensor(b).float())
+            self.a = nn.Parameter(torch.tensor(a).float())
+            self.b = nn.Parameter(torch.tensor(b).float())
             self.double_output = double_output
             self.config = None
 
@@ -166,21 +167,21 @@ def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
             if labels is None:
                 return (y, y) if self.double_output else (y,)
-            loss = torch.nn.functional.mse_loss(y, labels)
+            loss = nn.functional.mse_loss(y, labels)
             return (loss, y, y) if self.double_output else (loss, y)
 
-    class RegressionDictModel(torch.nn.Module):
+    class RegressionDictModel(nn.Module):
         def __init__(self, a=0, b=0):
             super().__init__()
-            self.a = torch.nn.Parameter(torch.tensor(a).float())
-            self.b = torch.nn.Parameter(torch.tensor(b).float())
+            self.a = nn.Parameter(torch.tensor(a).float())
+            self.b = nn.Parameter(torch.tensor(b).float())
             self.config = None
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
             result = {"output": y}
             if labels is not None:
-                result["loss"] = torch.nn.functional.mse_loss(y, labels)
+                result["loss"] = nn.functional.mse_loss(y, labels)
             return result
 
     class RegressionPreTrainedModel(PreTrainedModel):
@@ -189,15 +190,15 @@ class RegressionPreTrainedModel(PreTrainedModel):
 
         def __init__(self, config):
             super().__init__(config)
-            self.a = torch.nn.Parameter(torch.tensor(config.a).float())
-            self.b = torch.nn.Parameter(torch.tensor(config.b).float())
+            self.a = nn.Parameter(torch.tensor(config.a).float())
+            self.b = nn.Parameter(torch.tensor(config.b).float())
             self.double_output = config.double_output
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
             if labels is None:
                 return (y, y) if self.double_output else (y,)
-            loss = torch.nn.functional.mse_loss(y, labels)
+            loss = nn.functional.mse_loss(y, labels)
             return (loss, y, y) if self.double_output else (loss, y)
 
     class RegressionRandomPreTrainedModel(PreTrainedModel):
@@ -206,8 +207,8 @@ class RegressionRandomPreTrainedModel(PreTrainedModel):
 
         def __init__(self, config):
             super().__init__(config)
-            self.a = torch.nn.Parameter(torch.tensor(config.a).float())
-            self.b = torch.nn.Parameter(torch.tensor(config.b).float())
+            self.a = nn.Parameter(torch.tensor(config.a).float())
+            self.b = nn.Parameter(torch.tensor(config.b).float())
 
         def forward(self, input_x, labels=None, **kwargs):
             y = input_x * self.a + self.b
@@ -219,21 +220,21 @@ def forward(self, input_x, labels=None, **kwargs):
 
             if labels is None:
                 return (y,)
-            loss = torch.nn.functional.mse_loss(y, labels)
+            loss = nn.functional.mse_loss(y, labels)
             return (loss, y)
 
-    class TstLayer(torch.nn.Module):
+    class TstLayer(nn.Module):
         def __init__(self, hidden_size):
             super().__init__()
-            self.linear1 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln1 = torch.nn.LayerNorm(hidden_size)
-            self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln2 = torch.nn.LayerNorm(hidden_size)
-            self.bias = torch.nn.Parameter(torch.zeros(hidden_size))
+            self.linear1 = nn.Linear(hidden_size, hidden_size)
+            self.ln1 = nn.LayerNorm(hidden_size)
+            self.linear2 = nn.Linear(hidden_size, hidden_size)
+            self.ln2 = nn.LayerNorm(hidden_size)
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
 
         def forward(self, x):
-            h = self.ln1(torch.nn.functional.relu(self.linear1(x)))
-            h = torch.nn.functional.relu(self.linear2(x))
+            h = self.ln1(nn.functional.relu(self.linear1(x)))
+            h = nn.functional.relu(self.linear2(x))
             return self.ln2(x + h + self.bias)
 
     def get_regression_trainer(a=0, b=0, double_output=False, train_len=64, eval_len=64, pretrained=True, **kwargs):
@@ -1065,7 +1066,7 @@ def assert_flos_extraction(trainer, wrapped_model_to_check):
         assert_flos_extraction(trainer, trainer.model)
 
         # with enforced DataParallel
-        assert_flos_extraction(trainer, torch.nn.DataParallel(trainer.model))
+        assert_flos_extraction(trainer, nn.DataParallel(trainer.model))
 
         trainer.train()
         self.assertTrue(isinstance(trainer.state.total_flos, float))
@@ -1186,7 +1187,7 @@ def test_fp16_full_eval(self):
         self.assertAlmostEqual(fp16_eval, fp32_init / 2, delta=5_000)
 
     def test_no_wd_param_group(self):
-        model = torch.nn.Sequential(TstLayer(128), torch.nn.ModuleList([TstLayer(128), TstLayer(128)]))
+        model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)]))
         trainer = Trainer(model=model)
         trainer.create_optimizer_and_scheduler(10)
         # fmt: off
diff --git a/tests/test_trainer_utils.py b/tests/test_trainer_utils.py
index b543a1ebcafa..80096742868a 100644
--- a/tests/test_trainer_utils.py
+++ b/tests/test_trainer_utils.py
@@ -24,6 +24,7 @@
 
 if is_torch_available():
     import torch
+    from torch import nn
     from torch.utils.data import IterableDataset
 
     from transformers.modeling_outputs import SequenceClassifierOutput
@@ -40,18 +41,18 @@
         get_parameter_names,
     )
 
-    class TstLayer(torch.nn.Module):
+    class TstLayer(nn.Module):
         def __init__(self, hidden_size):
             super().__init__()
-            self.linear1 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln1 = torch.nn.LayerNorm(hidden_size)
-            self.linear2 = torch.nn.Linear(hidden_size, hidden_size)
-            self.ln2 = torch.nn.LayerNorm(hidden_size)
-            self.bias = torch.nn.Parameter(torch.zeros(hidden_size))
+            self.linear1 = nn.Linear(hidden_size, hidden_size)
+            self.ln1 = nn.LayerNorm(hidden_size)
+            self.linear2 = nn.Linear(hidden_size, hidden_size)
+            self.ln2 = nn.LayerNorm(hidden_size)
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
 
         def forward(self, x):
-            h = self.ln1(torch.nn.functional.relu(self.linear1(x)))
-            h = torch.nn.functional.relu(self.linear2(x))
+            h = self.ln1(nn.functional.relu(self.linear1(x)))
+            h = nn.functional.relu(self.linear2(x))
             return self.ln2(x + h + self.bias)
 
     class RandomIterableDataset(IterableDataset):
@@ -151,10 +152,10 @@ def test_label_smoothing(self):
         num_labels = 12
         random_logits = torch.randn(4, 5, num_labels)
         random_labels = torch.randint(0, num_labels, (4, 5))
-        loss = torch.nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
+        loss = nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
         model_output = SequenceClassifierOutput(logits=random_logits)
         label_smoothed_loss = LabelSmoother(0.1)(model_output, random_labels)
-        log_probs = -torch.nn.functional.log_softmax(random_logits, dim=-1)
+        log_probs = -nn.functional.log_softmax(random_logits, dim=-1)
         expected_loss = (1 - epsilon) * loss + epsilon * log_probs.mean()
         self.assertTrue(torch.allclose(label_smoothed_loss, expected_loss))
 
@@ -163,10 +164,10 @@ def test_label_smoothing(self):
         random_labels[2, 1] = -100
         random_labels[2, 3] = -100
 
-        loss = torch.nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
+        loss = nn.functional.cross_entropy(random_logits.view(-1, num_labels), random_labels.view(-1))
         model_output = SequenceClassifierOutput(logits=random_logits)
         label_smoothed_loss = LabelSmoother(0.1)(model_output, random_labels)
-        log_probs = -torch.nn.functional.log_softmax(random_logits, dim=-1)
+        log_probs = -nn.functional.log_softmax(random_logits, dim=-1)
         # Mask the log probs with the -100 labels
         log_probs[0, 1] = 0.0
         log_probs[2, 1] = 0.0
@@ -230,10 +231,10 @@ def test_distributed_length_grouped(self):
         self.assertEqual(list(sorted(indices_process_0 + indices_process_1)), list(range(100)))
 
     def test_get_parameter_names(self):
-        model = torch.nn.Sequential(TstLayer(128), torch.nn.ModuleList([TstLayer(128), TstLayer(128)]))
+        model = nn.Sequential(TstLayer(128), nn.ModuleList([TstLayer(128), TstLayer(128)]))
         # fmt: off
         self.assertEqual(
-            get_parameter_names(model, [torch.nn.LayerNorm]),
+            get_parameter_names(model, [nn.LayerNorm]),
             ['0.linear1.weight', '0.linear1.bias', '0.linear2.weight', '0.linear2.bias', '0.bias', '1.0.linear1.weight', '1.0.linear1.bias', '1.0.linear2.weight', '1.0.linear2.bias', '1.0.bias', '1.1.linear1.weight', '1.1.linear1.bias', '1.1.linear2.weight', '1.1.linear2.bias', '1.1.bias']
         )
         # fmt: on
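
The convention applied throughout the patch, shown as a minimal illustrative sketch (plain Python, not part of the diff; the layer sizes and tensors below are arbitrary):

    import torch
    from torch import nn  # single import replacing `import torch.nn as nn` and `import torch.nn.functional as F`

    layer = nn.Linear(50, 50)                                  # was: torch.nn.Linear(50, 50)
    probs = nn.functional.softmax(torch.randn(2, 10), dim=-1)  # was: F.softmax(scores, dim=-1)
    param = nn.Parameter(torch.zeros(10))                      # was: torch.nn.Parameter(torch.zeros(10))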