diff --git a/src/transformers/activations.py b/src/transformers/activations.py
index f60c64206266..6ef44348086a 100644
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -15,8 +15,8 @@
 import math
 
 import torch
-import torch.nn.functional as F
 from packaging import version
+from torch import nn
 
 from .utils import logging
@@ -28,8 +28,8 @@
 def _gelu_python(x):
     """
     Original Implementation of the GELU activation function in Google BERT repo when initially created. For
     information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
-    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
-    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
+    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
@@ -45,7 +45,7 @@ def gelu_new(x):
 if version.parse(torch.__version__) < version.parse("1.4"):
     gelu = _gelu_python
 else:
-    gelu = F.gelu
+    gelu = nn.functional.gelu
 
 
 def gelu_fast(x):
@@ -70,11 +70,11 @@ def _silu_python(x):
 if version.parse(torch.__version__) < version.parse("1.7"):
     silu = _silu_python
 else:
-    silu = F.silu
+    silu = nn.functional.silu
 
 
 def mish(x):
-    return x * torch.tanh(torch.nn.functional.softplus(x))
+    return x * torch.tanh(nn.functional.softplus(x))
 
 
 def linear_act(x):
@@ -82,7 +82,7 @@ def linear_act(x):
 
 
 ACT2FN = {
-    "relu": F.relu,
+    "relu": nn.functional.relu,
     "silu": silu,
     "swish": silu,
     "gelu": gelu,
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 02fb3ebb7e1c..3a70090eff1d 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -20,7 +20,7 @@
 
 import torch
 import torch.distributed as dist
-from torch.nn import functional as F
+from torch import nn
 
 from .file_utils import ModelOutput
 from .generation_beam_search import BeamScorer, BeamSearchScorer
@@ -1564,7 +1564,7 @@ def sample(
             )
 
             # sample
-            probs = F.softmax(next_token_scores, dim=-1)
+            probs = nn.functional.softmax(next_token_scores, dim=-1)
             next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
 
             # finished sentences should have their next token be a padding token
@@ -1801,9 +1801,11 @@ def beam_search(
             next_token_logits = outputs.logits[:, -1, :]
 
             # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
-            # cannot be generated both before and after the `F.log_softmax` operation.
+            # cannot be generated both before and after the `nn.functional.log_softmax` operation.
             next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len)
-            next_token_scores = F.log_softmax(next_token_logits, dim=-1)  # (batch_size * num_beams, vocab_size)
+            next_token_scores = nn.functional.log_softmax(
+                next_token_logits, dim=-1
+            )  # (batch_size * num_beams, vocab_size)
 
             next_token_scores = logits_processor(input_ids, next_token_scores)
             next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores)
@@ -2098,9 +2100,11 @@ def beam_sample(
             next_token_logits = outputs.logits[:, -1, :]
 
             # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
-            # cannot be generated both before and after the `F.log_softmax` operation.
+ # cannot be generated both before and after the `nn.functional.log_softmax` operation. next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) - next_token_scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) next_token_scores = logits_processor(input_ids, next_token_scores) next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) @@ -2128,7 +2132,7 @@ def beam_sample( vocab_size = next_token_scores.shape[-1] next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) - probs = F.softmax(next_token_scores, dim=-1) + probs = nn.functional.softmax(next_token_scores, dim=-1) next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) next_token_scores = torch.gather(next_token_scores, -1, next_tokens) @@ -2426,9 +2430,11 @@ def group_beam_search( next_token_logits = outputs.logits[batch_group_indices, -1, :] # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` - # cannot be generated both before and after the `F.log_softmax` operation. + # cannot be generated both before and after the `nn.functional.log_softmax` operation. next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) - next_token_scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * group_size, vocab_size) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * group_size, vocab_size) vocab_size = next_token_scores.shape[-1] next_token_scores = logits_processor( diff --git a/src/transformers/modeling_fx_utils.py b/src/transformers/modeling_fx_utils.py index 6c43a56bfb24..ff7763955ce7 100644 --- a/src/transformers/modeling_fx_utils.py +++ b/src/transformers/modeling_fx_utils.py @@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Union import torch +from torch import nn from torch.fx import Graph, GraphModule, Node, Proxy, Tracer from torch.fx.node import Argument @@ -277,7 +278,7 @@ def _insert_module_as_submodule(self, mod): return path - def path_of_module(self, mod: torch.nn.Module) -> str: + def path_of_module(self, mod: nn.Module) -> str: """ Helper method to find the qualified name of ``mod`` in the Module hierarchy of ``root``. 
For example, if ``root`` has a submodule named ``foo``, which has a submodule named ``bar``, passing ``bar`` into this function diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 109561e26de8..a95c729f896c 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -25,7 +25,6 @@ import torch from torch import Tensor, device, dtype, nn from torch.nn import CrossEntropyLoss -from torch.nn import functional as F from .activations import get_activation from .configuration_utils import PretrainedConfig @@ -355,9 +354,7 @@ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool """ def parameter_filter(x): - return (x.requires_grad or not only_trainable) and not ( - isinstance(x, torch.nn.Embedding) and exclude_embeddings - ) + return (x.requires_grad or not only_trainable) and not (isinstance(x, nn.Embedding) and exclude_embeddings) params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters() return sum(p.numel() for p in params) @@ -549,7 +546,7 @@ def tie_encoder_to_decoder_recursively( ): assert isinstance(decoder_pointer, nn.Module) and isinstance( encoder_pointer, nn.Module - ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module" + ), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module" if hasattr(decoder_pointer, "weight"): assert hasattr(encoder_pointer, "weight") encoder_pointer.weight = decoder_pointer.weight @@ -613,7 +610,7 @@ def _tie_or_clone_weights(self, output_embeddings, input_embeddings): output_embeddings.weight = input_embeddings.weight if getattr(output_embeddings, "bias", None) is not None: - output_embeddings.bias.data = torch.nn.functional.pad( + output_embeddings.bias.data = nn.functional.pad( output_embeddings.bias.data, ( 0, @@ -625,7 +622,7 @@ def _tie_or_clone_weights(self, output_embeddings, input_embeddings): if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): output_embeddings.out_features = input_embeddings.num_embeddings - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding: + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: """ Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`. @@ -668,8 +665,8 @@ def _resize_token_embeddings(self, new_num_tokens): return self.get_input_embeddings() def _get_resized_embeddings( - self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None - ) -> torch.nn.Embedding: + self, old_embeddings: nn.Embedding, new_num_tokens: Optional[int] = None + ) -> nn.Embedding: """ Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end @@ -732,8 +729,8 @@ def _get_resized_embeddings( return new_embeddings def _get_resized_lm_head( - self, old_lm_head: torch.nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False - ) -> torch.nn.Linear: + self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False + ) -> nn.Linear: """ Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized vectors at the end. 
Reducing the size will remove vectors from the end @@ -1681,7 +1678,7 @@ def forward( else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, self.start_n_top, dim=-1 @@ -1695,7 +1692,7 @@ def forward( ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, self.end_n_top, dim=1 @@ -1820,7 +1817,7 @@ def forward( return output -def unwrap_model(model: torch.nn.Module) -> torch.nn.Module: +def unwrap_model(model: nn.Module) -> nn.Module: """ Recursively unwraps a model from potential containers (as used in distributed training). @@ -1834,7 +1831,7 @@ def unwrap_model(model: torch.nn.Module) -> torch.nn.Module: return model -def prune_linear_layer(layer: torch.nn.Linear, index: torch.LongTensor, dim: int = 0) -> torch.nn.Linear: +def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear: """ Prune a linear layer to keep only entries in index. @@ -1902,8 +1899,8 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> def prune_layer( - layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None -) -> Union[torch.nn.Linear, Conv1D]: + layer: Union[nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None +) -> Union[nn.Linear, Conv1D]: """ Prune a Conv1D or linear layer to keep only entries in index. 
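The model files that follow repeat the same mechanical substitution established above: `F.<op>` becomes `nn.functional.<op>` and `torch.nn.<Module>` becomes `nn.<Module>`. As a minimal sanity sketch (not taken from this diff; it assumes only a stock PyTorch install), the two spellings resolve to the very same objects, so the rename cannot change numerics:

```python
import torch
import torch.nn.functional as F  # the alias this diff removes
from torch import nn             # the import style this diff standardizes on

# `from torch import nn` binds the torch.nn module itself, so every attribute
# reached through `nn` is identical to the one reached through `torch.nn` / `F`.
assert nn is torch.nn
assert nn.functional.softmax is F.softmax
assert nn.functional.dropout is F.dropout

x = torch.randn(4, 8)
# Same function object, therefore identical results.
assert torch.equal(F.softmax(x, dim=-1), nn.functional.softmax(x, dim=-1))
assert isinstance(nn.Linear(8, 2), torch.nn.Linear)
```

Because only the import surface changes, the hunks below are essentially a find-and-replace with no behavioral impact, apart from some line re-wrapping introduced by the formatter where the longer `nn.functional.` spelling exceeds the line length.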
diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index afd2d9d9b6fa..81ca97ab7bee 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -20,7 +20,7 @@ from typing import Optional, Tuple import torch -import torch.nn as nn +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index f0909decbdd5..a466be30a688 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -20,7 +20,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss, MSELoss @@ -223,7 +222,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -243,7 +242,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -303,15 +302,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -398,7 +397,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -418,7 +417,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) @@ -428,9 +427,9 @@ def forward( # Fully Connected residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, 
p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -661,7 +660,7 @@ class BartEncoder(BartPretrainedModel): Args: config: BartConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -760,7 +759,7 @@ def forward( hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -826,7 +825,7 @@ class BartDecoder(BartPretrainedModel): Args: config: BartConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -997,7 +996,7 @@ def forward( hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index dad2d1ceceb7..120a48c098cf 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -139,7 +139,7 @@ def __init__(self, config): self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 53b8f2e853b2..67b9bd182c5e 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -22,7 +22,6 @@ import numpy as np import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -379,7 +378,7 @@ def forward( attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. 
- attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -608,7 +607,9 @@ def bigbird_block_sparse_attention( first_product = first_product * rsqrt_d first_product += (1.0 - to_mask) * attn_mask_penalty - first_attn_weights = F.softmax(first_product, dim=-1) # [bsz, n_heads, from_block_size, to_seq_len] + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) @@ -660,7 +661,7 @@ def bigbird_block_sparse_attention( ) second_product = second_product * rsqrt_d second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty - second_attn_weights = F.softmax( + second_attn_weights = nn.functional.softmax( second_product, dim=-1 ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] @@ -721,7 +722,7 @@ def bigbird_block_sparse_attention( ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] # safely doing softmax since attention matrix is completed - attn_weights = F.softmax( + attn_weights = nn.functional.softmax( band_product, dim=-1 ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] @@ -794,7 +795,7 @@ def bigbird_block_sparse_attention( ) second_last_product = second_last_product * rsqrt_d second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty - second_last_attn_weights = F.softmax( + second_last_attn_weights = nn.functional.softmax( second_last_product, dim=-1 ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] @@ -810,7 +811,7 @@ def bigbird_block_sparse_attention( last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) last_product = last_product * rsqrt_d last_product += (1.0 - to_mask) * attn_mask_penalty - last_attn_weights = F.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) @@ -2210,10 +2211,10 @@ def _pad_to_block_size( f"`config.block_size`: {block_size}" ) if input_ids is not None: - input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) + input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) if position_ids is not None: # pad with position_id = pad_token_id as in modeling_bigbird.BigBirdEmbeddings - position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id) + position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) if inputs_embeds is not None: input_ids_padding = inputs_embeds.new_full( (batch_size, padding_len), @@ -2223,8 +2224,10 @@ def _pad_to_block_size( inputs_embeds_padding = self.embeddings(input_ids_padding) inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) - attention_mask = F.pad(attention_mask, (0, padding_len), value=False) # no attention 
on the padding tokens - token_type_ids = F.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 3f548ecfc20e..4fc668348c27 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -22,7 +22,6 @@ import numpy as np import torch -import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss, MSELoss @@ -206,7 +205,7 @@ def forward( attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = F.softmax(attention_scores, dim=-1) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. @@ -436,7 +435,9 @@ def bigbird_block_sparse_attention( first_product = first_product * rsqrt_d first_product += (1.0 - to_mask) * attn_mask_penalty - first_attn_weights = F.softmax(first_product, dim=-1) # [bsz, n_heads, from_block_size, to_seq_len] + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) @@ -488,7 +489,7 @@ def bigbird_block_sparse_attention( ) second_product = second_product * rsqrt_d second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty - second_attn_weights = F.softmax( + second_attn_weights = nn.functional.softmax( second_product, dim=-1 ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] @@ -549,7 +550,7 @@ def bigbird_block_sparse_attention( ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] # safely doing softmax since attention matrix is completed - attn_weights = F.softmax( + attn_weights = nn.functional.softmax( band_product, dim=-1 ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] @@ -622,7 +623,7 @@ def bigbird_block_sparse_attention( ) second_last_product = second_last_product * rsqrt_d second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty - second_last_attn_weights = F.softmax( + second_last_attn_weights = nn.functional.softmax( second_last_product, dim=-1 ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] @@ -638,7 +639,7 @@ def bigbird_block_sparse_attention( last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) last_product = last_product * rsqrt_d last_product += (1.0 - to_mask) * attn_mask_penalty - last_attn_weights = F.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] # [bsz, n_heads, 
from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) @@ -1295,7 +1296,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -1315,7 +1316,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -1384,7 +1385,7 @@ def forward( ) hidden_states = self_attention_outputs[0] - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states residual = hidden_states @@ -1392,7 +1393,7 @@ def forward( hidden_states = self.activation_fn(self.fc1(hidden_states)) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states if hidden_states.dtype == torch.float16 and ( @@ -1492,7 +1493,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block @@ -1512,7 +1513,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # add cross-attn to positions 3,4 of present_key_value tuple @@ -1522,9 +1523,9 @@ def forward( residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -1733,7 +1734,7 @@ class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel): Args: config: BigBirdPegasusConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -1829,7 +1830,7 @@ def forward( embed_pos = self.embed_positions(input_shape) hidden_states = inputs_embeds + embed_pos - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = 
nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) if attention_mask is None: attention_mask = torch.ones(input_shape, device=hidden_states.device) @@ -2015,7 +2016,9 @@ def _pad_to_block_size(self, hidden_states: torch.Tensor, attention_mask: torch. inputs_embeds_padding = self.embed_tokens(input_ids_padding) hidden_states = torch.cat([hidden_states, inputs_embeds_padding], dim=-2) - attention_mask = F.pad(attention_mask, (0, padding_len), value=0) # no attention on the padding tokens + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=0 + ) # no attention on the padding tokens return padding_len, hidden_states, attention_mask @@ -2027,7 +2030,7 @@ class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): Args: config: BigBirdPegasusConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -2198,7 +2201,7 @@ def forward( hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index a0d3a90c10eb..2857da5e359d 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -23,7 +23,6 @@ from typing import Optional, Tuple, Union import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss @@ -224,7 +223,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -244,7 +243,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -306,15 +305,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states if hidden_states.dtype == torch.float16 and ( @@ -402,7 +401,7 @@ def forward( layer_head_mask=layer_head_mask, 
output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block @@ -422,7 +421,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # add cross-attn to positions 3,4 of present_key_value tuple @@ -432,9 +431,9 @@ def forward( residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -617,7 +616,7 @@ class BlenderbotEncoder(BlenderbotPreTrainedModel): Args: config: BlenderbotConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -715,7 +714,7 @@ def forward( embed_pos = self.embed_positions(input_shape) hidden_states = inputs_embeds + embed_pos - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -784,7 +783,7 @@ class BlenderbotDecoder(BlenderbotPreTrainedModel): Args: config: BlenderbotConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -956,7 +955,7 @@ def forward( hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 58f9ad9c101f..f2c5208cfff1 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -21,7 +21,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss @@ -222,7 +221,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -242,7 +241,7 @@ 
def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -303,15 +302,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -399,7 +398,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -419,7 +418,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) @@ -429,9 +428,9 @@ def forward( # Fully Connected residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -618,7 +617,7 @@ class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel): Args: config: BlenderbotSmallConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -717,7 +716,7 @@ def forward( hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -784,7 +783,7 @@ class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): Args: config: BlenderbotSmallConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens 
(nn.Embedding): output embedding """ def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -957,7 +956,7 @@ def forward( inputs_embeds = self.layernorm_embedding(inputs_embeds) hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py b/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py index acc6981d2bee..933c88795ab9 100644 --- a/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py +++ b/src/transformers/models/bort/convert_bort_original_gluonnlp_checkpoint_to_pytorch.py @@ -21,6 +21,7 @@ import numpy as np import torch from packaging import version +from torch import nn import gluonnlp as nlp import mxnet as mx @@ -170,8 +171,8 @@ def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_f # | `encoder.transformer_cells.*.proj.weight` | `bert.encoder.layer.*.output.dense.weight` # Helper function to convert MXNET Arrays to PyTorch - def to_torch(mx_array) -> torch.nn.Parameter: - return torch.nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy())) + def to_torch(mx_array) -> nn.Parameter: + return nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy())) # Check param shapes and map new HF param back def check_and_map_params(hf_param, gluon_param): diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index 6a2c0f426329..9fb65dbafab9 100755 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -18,7 +18,6 @@ from typing import Any, Optional, Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn @@ -62,7 +61,7 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] # contrastive loss function, adapted from # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html def contrastive_loss(logits: torch.Tensor, dim: int) -> torch.Tensor: - neg_ce = torch.diag(F.log_softmax(logits, dim=dim)) + neg_ce = torch.diag(nn.functional.log_softmax(logits, dim=dim)) return -neg_ce.mean() @@ -235,7 +234,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: # this operation is a bit akward, but it's required to @@ -247,7 +246,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -493,7 +492,7 @@ class CLIPEncoder(nn.Module): Args: config: CLIPConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: CLIPConfig): diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index d3d8085d3fb4..09d3d0db8faa 
100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -383,7 +383,7 @@ def forward( attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = torch.nn.functional.softmax(attention_scores, dim=-1) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index ce9bd80c5929..cdf32828b9f9 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -19,7 +19,7 @@ import numpy as np import torch -import torch.nn as nn +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -87,7 +87,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N return output, attention_weights -class MultiHeadAttention(torch.nn.Module): +class MultiHeadAttention(nn.Module): def __init__(self, d_model_size, num_heads): super().__init__() self.num_heads = num_heads @@ -95,11 +95,11 @@ def __init__(self, d_model_size, num_heads): self.depth = int(d_model_size / self.num_heads) - self.Wq = torch.nn.Linear(d_model_size, d_model_size) - self.Wk = torch.nn.Linear(d_model_size, d_model_size) - self.Wv = torch.nn.Linear(d_model_size, d_model_size) + self.Wq = nn.Linear(d_model_size, d_model_size) + self.Wk = nn.Linear(d_model_size, d_model_size) + self.Wv = nn.Linear(d_model_size, d_model_size) - self.dense = torch.nn.Linear(d_model_size, d_model_size) + self.dense = nn.Linear(d_model_size, d_model_size) self.pruned_heads = set() def prune_heads(self, heads): @@ -167,21 +167,21 @@ def forward( def point_wise_feed_forward_network(d_model_size, dff): - return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff), torch.nn.ReLU(), torch.nn.Linear(dff, d_model_size)) + return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size)) -class EncoderLayer(torch.nn.Module): +class EncoderLayer(nn.Module): def __init__(self, d_model_size, num_heads, dff, rate=0.1): super().__init__() self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads) self.ffn = point_wise_feed_forward_network(d_model_size, dff) - self.layernorm1 = torch.nn.LayerNorm(d_model_size, eps=1e-6) - self.layernorm2 = torch.nn.LayerNorm(d_model_size, eps=1e-6) + self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6) + self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6) - self.dropout1 = torch.nn.Dropout(rate) - self.dropout2 = torch.nn.Dropout(rate) + self.dropout1 = nn.Dropout(rate) + self.dropout2 = nn.Dropout(rate) def forward( self, x, mask, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 24692cc6576d..7bdc00ebd7ce 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -163,7 +163,7 @@ def backward(ctx, grad_output): return grad_output, None -class StableDropout(torch.nn.Module): +class StableDropout(nn.Module): """ Optimized dropout module for stabilizing the training @@ -477,7 
+477,7 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer): return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))) -class DisentangledSelfAttention(torch.nn.Module): +class DisentangledSelfAttention(nn.Module): """ Disentangled self-attention module @@ -498,19 +498,17 @@ def __init__(self, config): self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.in_proj = torch.nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False) - self.q_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) - self.v_bias = torch.nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + self.in_proj = nn.Linear(config.hidden_size, self.all_head_size * 3, bias=False) + self.q_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) + self.v_bias = nn.Parameter(torch.zeros((self.all_head_size), dtype=torch.float)) self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] self.relative_attention = getattr(config, "relative_attention", False) self.talking_head = getattr(config, "talking_head", False) if self.talking_head: - self.head_logits_proj = torch.nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) - self.head_weights_proj = torch.nn.Linear( - config.num_attention_heads, config.num_attention_heads, bias=False - ) + self.head_logits_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) + self.head_weights_proj = nn.Linear(config.num_attention_heads, config.num_attention_heads, bias=False) if self.relative_attention: self.max_relative_positions = getattr(config, "max_relative_positions", -1) @@ -519,9 +517,9 @@ def __init__(self, config): self.pos_dropout = StableDropout(config.hidden_dropout_prob) if "c2p" in self.pos_att_type or "p2p" in self.pos_att_type: - self.pos_proj = torch.nn.Linear(config.hidden_size, self.all_head_size, bias=False) + self.pos_proj = nn.Linear(config.hidden_size, self.all_head_size, bias=False) if "p2c" in self.pos_att_type or "p2p" in self.pos_att_type: - self.pos_q_proj = torch.nn.Linear(config.hidden_size, self.all_head_size) + self.pos_q_proj = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = StableDropout(config.attention_probs_dropout_prob) @@ -1122,7 +1120,7 @@ def __init__(self, config): self.pooler = ContextPooler(config) output_dim = self.pooler.output_dim - self.classifier = torch.nn.Linear(output_dim, num_labels) + self.classifier = nn.Linear(output_dim, num_labels) drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = StableDropout(drop_out) @@ -1182,7 +1180,7 @@ def forward( if labels is not None: if self.num_labels == 1: # regression task - loss_fn = torch.nn.MSELoss() + loss_fn = nn.MSELoss() logits = logits.view(-1).to(labels.dtype) loss = loss_fn(logits, labels.view(-1)) elif labels.dim() == 1 or labels.size(-1) == 1: @@ -1196,7 +1194,7 @@ def forward( else: loss = torch.tensor(0).to(logits) else: - log_softmax = torch.nn.LogSoftmax(-1) + log_softmax = nn.LogSoftmax(-1) loss = -((log_softmax(logits) * labels).sum(-1)).mean() if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 
321922d877db..f186274380bf 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -168,7 +168,7 @@ def backward(ctx, grad_output): # Copied from transformers.models.deberta.modeling_deberta.StableDropout -class StableDropout(torch.nn.Module): +class StableDropout(nn.Module): """ Optimized dropout module for stabilizing the training @@ -342,7 +342,7 @@ def __init__(self, config): kernel_size = getattr(config, "conv_kernel_size", 3) groups = getattr(config, "conv_groups", 1) self.conv_act = getattr(config, "conv_act", "tanh") - self.conv = torch.nn.Conv1d( + self.conv = nn.Conv1d( config.hidden_size, config.hidden_size, kernel_size, padding=(kernel_size - 1) // 2, groups=groups ) self.LayerNorm = LayerNorm(config.hidden_size, config.layer_norm_eps) @@ -546,7 +546,7 @@ def pos_dynamic_expand(pos_index, p2c_att, key_layer): return pos_index.expand(p2c_att.size()[:2] + (pos_index.size(-2), key_layer.size(-2))) -class DisentangledSelfAttention(torch.nn.Module): +class DisentangledSelfAttention(nn.Module): """ Disentangled self-attention module @@ -1244,7 +1244,7 @@ def __init__(self, config): self.pooler = ContextPooler(config) output_dim = self.pooler.output_dim - self.classifier = torch.nn.Linear(output_dim, num_labels) + self.classifier = nn.Linear(output_dim, num_labels) drop_out = getattr(config, "cls_dropout", None) drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out self.dropout = StableDropout(drop_out) @@ -1304,7 +1304,7 @@ def forward( if labels is not None: if self.num_labels == 1: # regression task - loss_fn = torch.nn.MSELoss() + loss_fn = nn.MSELoss() logits = logits.view(-1).to(labels.dtype) loss = loss_fn(logits, labels.view(-1)) elif labels.dim() == 1 or labels.size(-1) == 1: @@ -1318,7 +1318,7 @@ def forward( else: loss = torch.tensor(0).to(logits) else: - log_softmax = torch.nn.LogSoftmax(-1) + log_softmax = nn.LogSoftmax(-1) loss = -((log_softmax(logits) * labels).sum(-1)).mean() if not return_dict: output = (logits,) + outputs[1:] diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py index 7c9b5526dc81..014ba278e92c 100644 --- a/src/transformers/models/detr/feature_extraction_detr.py +++ b/src/transformers/models/detr/feature_extraction_detr.py @@ -30,7 +30,7 @@ if is_torch_available(): import torch - import torch.nn.functional as F + from torch import nn logger = logging.get_logger(__name__) @@ -374,7 +374,7 @@ def get_size(image_size, size, max_size=None): # use PyTorch as current workaround # TODO replace by self.resize masks = torch.from_numpy(target["masks"][:, None]).float() - interpolated_masks = F.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 + interpolated_masks = nn.functional.interpolate(masks, size=(h, w), mode="nearest")[:, 0] > 0.5 target["masks"] = interpolated_masks.numpy() return rescaled_image, target @@ -697,7 +697,7 @@ def post_process(self, outputs, target_sizes): target_sizes.shape[1] == 2 ), "Each element of target_sizes must contain the size (h, w) of each image of the batch" - prob = F.softmax(out_logits, -1) + prob = nn.functional.softmax(out_logits, -1) scores, labels = prob[..., :-1].max(-1) # convert to [x0, y0, x1, y1] format @@ -742,13 +742,15 @@ def post_process_segmentation(self, results, outputs, orig_target_sizes, max_tar ), "Make sure to pass in as many orig_target_sizes as max_target_sizes" max_h, max_w = max_target_sizes.max(0)[0].tolist() 
outputs_masks = outputs.pred_masks.squeeze(2) - outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) + outputs_masks = nn.functional.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) outputs_masks = (outputs_masks.sigmoid() > threshold).cpu() for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): img_h, img_w = t[0], t[1] results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) - results[i]["masks"] = F.interpolate( + results[i]["masks"] = nn.functional.interpolate( results[i]["masks"].float(), size=tuple(tt.tolist()), mode="nearest" ).byte() @@ -810,7 +812,7 @@ def to_tuple(tup): cur_scores = cur_scores[keep] cur_classes = cur_classes[keep] cur_masks = cur_masks[keep] - cur_masks = F.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) + cur_masks = nn.functional.interpolate(cur_masks[:, None], to_tuple(size), mode="bilinear").squeeze(1) cur_boxes = center_to_corners_format(cur_boxes[keep]) h, w = cur_masks.shape[-2:] diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py index b876e1be8fe0..43d1edb94f3e 100644 --- a/src/transformers/models/detr/modeling_detr.py +++ b/src/transformers/models/detr/modeling_detr.py @@ -21,7 +21,6 @@ from typing import Dict, List, Optional, Tuple import torch -import torch.nn.functional as F from torch import Tensor, nn from ...activations import ACT2FN @@ -314,7 +313,7 @@ def forward(self, x): def replace_batch_norm(m, name=""): for attr_str in dir(m): target_attr = getattr(m, attr_str) - if isinstance(target_attr, torch.nn.BatchNorm2d): + if isinstance(target_attr, nn.BatchNorm2d): frozen = DetrFrozenBatchNorm2d(target_attr.num_features) bn = getattr(m, attr_str) frozen.weight.data.copy_(bn.weight) @@ -362,7 +361,7 @@ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): out = [] for feature_map in features: # downsample pixel_mask to match shape of corresponding feature_map - mask = F.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] out.append((feature_map, mask)) return out @@ -570,7 +569,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: # this operation is a bit awkward, but it's required to @@ -582,7 +581,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -642,16 +641,16 @@ def forward( output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = 
nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -731,7 +730,7 @@ def forward( output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -749,16 +748,16 @@ def forward( output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) # Fully Connected residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -885,7 +884,7 @@ class DetrEncoder(DetrPreTrainedModel): Args: config: DetrConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: DetrConfig): @@ -946,7 +945,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict hidden_states = inputs_embeds - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -999,7 +998,7 @@ class DetrDecoder(DetrPreTrainedModel): Args: config: DetrConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: DetrConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -1717,23 +1716,23 @@ def __init__(self, dim, fpn_dims, context_dim): inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] - self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) - self.gn1 = torch.nn.GroupNorm(8, dim) - self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) - self.gn2 = torch.nn.GroupNorm(8, inter_dims[1]) - self.lay3 = torch.nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) - self.gn3 = torch.nn.GroupNorm(8, inter_dims[2]) - self.lay4 = torch.nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) - self.gn4 = torch.nn.GroupNorm(8, inter_dims[3]) - self.lay5 = torch.nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) - self.gn5 = torch.nn.GroupNorm(8, inter_dims[4]) - self.out_lay = torch.nn.Conv2d(inter_dims[4], 1, 3, padding=1) + self.lay1 = nn.Conv2d(dim, dim, 3, padding=1) + self.gn1 = nn.GroupNorm(8, dim) + self.lay2 = nn.Conv2d(dim, 
inter_dims[1], 3, padding=1) + self.gn2 = nn.GroupNorm(8, inter_dims[1]) + self.lay3 = nn.Conv2d(inter_dims[1], inter_dims[2], 3, padding=1) + self.gn3 = nn.GroupNorm(8, inter_dims[2]) + self.lay4 = nn.Conv2d(inter_dims[2], inter_dims[3], 3, padding=1) + self.gn4 = nn.GroupNorm(8, inter_dims[3]) + self.lay5 = nn.Conv2d(inter_dims[3], inter_dims[4], 3, padding=1) + self.gn5 = nn.GroupNorm(8, inter_dims[4]) + self.out_lay = nn.Conv2d(inter_dims[4], 1, 3, padding=1) self.dim = dim - self.adapter1 = torch.nn.Conv2d(fpn_dims[0], inter_dims[1], 1) - self.adapter2 = torch.nn.Conv2d(fpn_dims[1], inter_dims[2], 1) - self.adapter3 = torch.nn.Conv2d(fpn_dims[2], inter_dims[3], 1) + self.adapter1 = nn.Conv2d(fpn_dims[0], inter_dims[1], 1) + self.adapter2 = nn.Conv2d(fpn_dims[1], inter_dims[2], 1) + self.adapter3 = nn.Conv2d(fpn_dims[2], inter_dims[3], 1) for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -1748,34 +1747,34 @@ def forward(self, x: Tensor, bbox_mask: Tensor, fpns: List[Tensor]): x = self.lay1(x) x = self.gn1(x) - x = F.relu(x) + x = nn.functional.relu(x) x = self.lay2(x) x = self.gn2(x) - x = F.relu(x) + x = nn.functional.relu(x) cur_fpn = self.adapter1(fpns[0]) if cur_fpn.size(0) != x.size(0): cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) - x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") x = self.lay3(x) x = self.gn3(x) - x = F.relu(x) + x = nn.functional.relu(x) cur_fpn = self.adapter2(fpns[1]) if cur_fpn.size(0) != x.size(0): cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) - x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") x = self.lay4(x) x = self.gn4(x) - x = F.relu(x) + x = nn.functional.relu(x) cur_fpn = self.adapter3(fpns[2]) if cur_fpn.size(0) != x.size(0): cur_fpn = _expand(cur_fpn, x.size(0) // cur_fpn.size(0)) - x = cur_fpn + F.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") + x = cur_fpn + nn.functional.interpolate(x, size=cur_fpn.shape[-2:], mode="nearest") x = self.lay5(x) x = self.gn5(x) - x = F.relu(x) + x = nn.functional.relu(x) x = self.out_lay(x) return x @@ -1797,14 +1796,14 @@ def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, bias=True, std def forward(self, q, k, mask: Optional[Tensor] = None): q = self.q_linear(q) - k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) + k = nn.functional.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) queries_per_head = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) keys_per_head = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) weights = torch.einsum("bqnc,bnchw->bqnhw", queries_per_head * self.normalize_fact, keys_per_head) if mask is not None: weights.masked_fill_(mask.unsqueeze(1).unsqueeze(1), float("-inf")) - weights = F.softmax(weights.flatten(2), dim=-1).view(weights.size()) + weights = nn.functional.softmax(weights.flatten(2), dim=-1).view(weights.size()) weights = self.dropout(weights) return weights @@ -1847,7 +1846,7 @@ def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: f Loss tensor """ prob = inputs.sigmoid() - ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, 
reduction="none") p_t = prob * targets + (1 - prob) * (1 - targets) loss = ce_loss * ((1 - p_t) ** gamma) @@ -1909,7 +1908,7 @@ def loss_labels(self, outputs, targets, indices, num_boxes): ) target_classes[idx] = target_classes_o - loss_ce = F.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) + loss_ce = nn.functional.cross_entropy(src_logits.transpose(1, 2), target_classes, self.empty_weight) losses = {"loss_ce": loss_ce} return losses @@ -1926,7 +1925,7 @@ def loss_cardinality(self, outputs, targets, indices, num_boxes): tgt_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) # Count the number of predictions that are NOT "no-object" (which is the last class) card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) - card_err = F.l1_loss(card_pred.float(), tgt_lengths.float()) + card_err = nn.functional.l1_loss(card_pred.float(), tgt_lengths.float()) losses = {"cardinality_error": card_err} return losses @@ -1942,7 +1941,7 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): src_boxes = outputs["pred_boxes"][idx] target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) - loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction="none") + loss_bbox = nn.functional.l1_loss(src_boxes, target_boxes, reduction="none") losses = {} losses["loss_bbox"] = loss_bbox.sum() / num_boxes @@ -1972,7 +1971,7 @@ def loss_masks(self, outputs, targets, indices, num_boxes): target_masks = target_masks[tgt_idx] # upsample predictions to the target size - src_masks = F.interpolate( + src_masks = nn.functional.interpolate( src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False ) src_masks = src_masks[:, 0].flatten(1) @@ -2068,7 +2067,7 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): def forward(self, x): for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 911cd6cd55f4..1c232cd7e144 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -23,7 +23,7 @@ import numpy as np import torch -import torch.nn as nn +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import gelu diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 329faaff0cee..84084d26b751 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -20,8 +20,8 @@ from typing import Optional, Tuple import torch -import torch.nn as nn import torch.utils.checkpoint +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN, get_activation diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 1603ce1f4b5f..5c0826f01409 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -18,7 +18,7 @@ import random import torch -from torch.nn import functional as F +from torch import nn from ...file_utils import add_code_sample_docstrings, add_start_docstrings, 
add_start_docstrings_to_model_forward from ...modeling_outputs import BaseModelOutput @@ -234,7 +234,7 @@ def forward( if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) - tensor = F.dropout(tensor, p=self.dropout, training=self.training) + tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers @@ -261,7 +261,7 @@ def forward( attn = attn_outputs[0] if output_attentions: attentions = attentions + (attn_outputs[1],) - attn = F.dropout(attn, p=self.dropout, training=self.training) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) else: @@ -270,13 +270,13 @@ def forward( attn = attn_outputs[0] if output_attentions: attentions = attentions + (attn_outputs[1],) - attn = F.dropout(attn, p=self.dropout, training=self.training) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) - # attn = F.dropout(attn, p=self.dropout, training=self.training) + # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) diff --git a/src/transformers/models/flaubert/modeling_tf_flaubert.py b/src/transformers/models/flaubert/modeling_tf_flaubert.py index c6f43a4ced08..4ba4c4d099f6 100644 --- a/src/transformers/models/flaubert/modeling_tf_flaubert.py +++ b/src/transformers/models/flaubert/modeling_tf_flaubert.py @@ -675,7 +675,7 @@ def call( # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) - # attn = F.dropout(attn, p=self.dropout, training=self.training) + # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index ce0807b1a7d0..1f352a1cc6be 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -32,7 +32,6 @@ from typing import Any, Dict, List, Optional, Tuple import torch -import torch.nn.functional as F from torch import Tensor, nn from torch.nn import CrossEntropyLoss, LayerNorm @@ -430,15 +429,15 @@ def forward(self, x, encoder_padding_mask, layer_head_mask, output_attentions=Fa layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.self_attn_layer_norm(x) residual = x x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training) x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.final_layer_norm(x) return x, attn_weights @@ -504,7 +503,7 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale embed_pos = self.embed_positions(input_ids) x = inputs_embeds + embed_pos - x = F.dropout(x, 
p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) @@ -600,7 +599,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.self_attn_layer_norm(x) @@ -615,16 +614,16 @@ def forward( layer_head_mask=cross_attn_layer_head_mask, output_attentions=output_attentions, ) - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.encoder_attn_layer_norm(x) # Fully Connected residual = x x = self.activation_fn(self.fc1(x)) - x = F.dropout(x, p=self.activation_dropout, training=self.training) + x = nn.functional.dropout(x, p=self.activation_dropout, training=self.training) x = self.fc2(x) - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.final_layer_norm(x) return ( @@ -641,7 +640,7 @@ class FSMTDecoder(nn.Module): Args: config: FSMTConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: FSMTConfig, embed_tokens: nn.Embedding): @@ -726,7 +725,7 @@ def forward( x = self.embed_tokens(input_ids) * self.embed_scale x += positions - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) # Convert to FSMT output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) x = x.transpose(0, 1) @@ -913,7 +912,7 @@ def forward( attn_weights = attn_weights.masked_fill(reshaped, float("-inf")) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: assert layer_head_mask.size() == ( @@ -929,7 +928,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout( + attn_probs = nn.functional.dropout( attn_weights, p=self.dropout, training=self.training, diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 8f75aa2f5742..09a90815f51d 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -22,7 +22,6 @@ import torch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from torch.nn import functional as F from ...activations import ACT2FN from ...file_utils import ( @@ -196,7 +195,7 @@ def init_attention_inputs(self, inputs_embeds, attention_mask=None, token_type_i position_embeds = self.get_position_embeds(seq_len, inputs_embeds.dtype, inputs_embeds.device) token_type_mat = self.token_type_ids_to_mat(token_type_ids) if token_type_ids is not None else None cls_mask = ( - F.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0)) + nn.functional.pad(inputs_embeds.new_ones([seq_len - 1, seq_len - 1]), (1, 0, 1, 0)) if self.config.separate_cls else None ) @@ -368,11 +367,11 @@ def pool_tensor(self, tensor, mode="mean", stride=2): stride = (stride, 1) if mode == "mean": - tensor = F.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True) + tensor = nn.functional.avg_pool2d(tensor, stride, stride=stride, ceil_mode=True) elif 
mode == "max": - tensor = F.max_pool2d(tensor, stride, stride=stride, ceil_mode=True) + tensor = nn.functional.max_pool2d(tensor, stride, stride=stride, ceil_mode=True) elif mode == "min": - tensor = -F.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True) + tensor = -nn.functional.max_pool2d(-tensor, stride, stride=stride, ceil_mode=True) else: raise NotImplementedError("The supported modes are 'mean', 'max' and 'min'.") diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index aff8d18e108e..ad12b91d1817 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -20,8 +20,8 @@ from typing import Optional, Tuple import torch -import torch.nn as nn import torch.utils.checkpoint +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...activations import ACT2FN diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 5dfd8151e62c..44c6dd6583a9 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -19,7 +19,6 @@ from typing import Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss, MSELoss @@ -186,7 +185,7 @@ def _look_back(tensor, block_length, window_size, pad_value=0, is_key_value=True else: raise ValueError(f"Input tensor rank should be one of [2, 3], but is: {len(tensor.shape)}") - padded_tensor = F.pad(tensor, padding_side, value=pad_value) + padded_tensor = nn.functional.pad(tensor, padding_side, value=pad_value) padded_tensor = padded_tensor.unfold(dimension=1, size=window_size + block_length, step=block_length) if is_key_value: diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index ec547aae7cc1..48344f0a254a 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -20,8 +20,8 @@ import math import torch -import torch.nn as nn import torch.utils.checkpoint +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...activations import gelu diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py index d1da18686abd..281bc96df888 100644 --- a/src/transformers/models/ibert/quant_modules.py +++ b/src/transformers/models/ibert/quant_modules.py @@ -19,8 +19,7 @@ import numpy as np import torch -import torch.nn as nn -import torch.nn.functional as F +from torch import nn from torch.autograd import Function from ...utils import logging @@ -79,7 +78,7 @@ def __init__( def forward(self, x, positions=None, incremental_state=None): if not self.quant_mode: return ( - F.embedding( + nn.functional.embedding( x, self.weight, self.padding_idx, @@ -101,7 +100,7 @@ def forward(self, x, positions=None, incremental_state=None): self.weight, self.weight_bit, self.percentile_mode, self.weight_scaling_factor ) - emb_int = F.embedding( + emb_int = nn.functional.embedding( x, self.weight_integer, self.padding_idx, @@ -264,7 +263,7 @@ def __repr__(self): def forward(self, x, prev_act_scaling_factor=None): if not self.quant_mode: - return F.linear(x, weight=self.weight, bias=self.bias), None + return nn.functional.linear(x, weight=self.weight, bias=self.bias), None # assert that prev_act_scaling_factor is a scalar tensor assert prev_act_scaling_factor is not None and 
prev_act_scaling_factor.shape == (1,), ( @@ -295,7 +294,7 @@ def forward(self, x, prev_act_scaling_factor=None): x_int = x / prev_act_scaling_factor return ( - F.linear(x_int, weight=self.weight_integer, bias=self.bias_integer) * bias_scaling_factor, + nn.functional.linear(x_int, weight=self.weight_integer, bias=self.bias_integer) * bias_scaling_factor, bias_scaling_factor, ) diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index c8c395557977..b8aa62662073 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -52,7 +52,7 @@ ] -LayoutLMLayerNorm = torch.nn.LayerNorm +LayoutLMLayerNorm = nn.LayerNorm class LayoutLMEmbeddings(nn.Module): diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 93eefc27f45f..9d3f80c02ad2 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -21,7 +21,6 @@ from typing import List, Optional, Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss @@ -250,7 +249,9 @@ def forward( # free memory del global_key_attn_scores - attn_probs = F.softmax(attn_scores, dim=-1, dtype=torch.float32) # use fp32 for numerical stability + attn_probs = nn.functional.softmax( + attn_scores, dim=-1, dtype=torch.float32 + ) # use fp32 for numerical stability if layer_head_mask is not None: assert layer_head_mask.size() == ( @@ -266,7 +267,7 @@ def forward( del attn_scores # apply dropout - attn_probs = F.dropout(attn_probs, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training) value_vectors = value_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) @@ -326,7 +327,7 @@ def forward( @staticmethod def _pad_and_transpose_last_two_dims(hidden_states_padded, padding): """pads rows and then flips rows and columns""" - hidden_states_padded = F.pad( + hidden_states_padded = nn.functional.pad( hidden_states_padded, padding ) # padding value is not important because it will be overwritten hidden_states_padded = hidden_states_padded.view( @@ -353,7 +354,7 @@ def _pad_and_diagonalize(chunked_hidden_states): 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] """ total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size() - chunked_hidden_states = F.pad( + chunked_hidden_states = nn.functional.pad( chunked_hidden_states, (0, window_overlap + 1) ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). 
Padding value is not important because it'll be overwritten chunked_hidden_states = chunked_hidden_states.view( @@ -489,7 +490,7 @@ def _sliding_chunks_matmul_attn_probs_value( value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) # pad seq_len with w at the beginning of the sequence and another window overlap at the end - padded_value = F.pad(value, (0, 0, window_overlap, window_overlap), value=-1) + padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1) # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim) @@ -661,7 +662,7 @@ def _compute_global_attn_output_from_hidden( global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len) # compute global attn probs - global_attn_probs_float = F.softmax( + global_attn_probs_float = nn.functional.softmax( global_attn_scores, dim=-1, dtype=torch.float32 ) # use fp32 for numerical stability @@ -677,7 +678,7 @@ def _compute_global_attn_output_from_hidden( batch_size * self.num_heads, max_num_global_attn_indices, seq_len ) - global_attn_probs = F.dropout( + global_attn_probs = nn.functional.dropout( global_attn_probs_float.type_as(global_attn_scores), p=self.dropout, training=self.training ) @@ -833,7 +834,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: assert layer_head_mask.size() == ( self.num_heads, @@ -851,7 +852,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -914,15 +915,15 @@ def forward( output_attentions=output_attentions, ) hidden_states = attn_outputs[0] - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -1002,7 +1003,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -1022,7 +1023,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - 
hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) @@ -1032,9 +1033,9 @@ def forward( # Fully Connected residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -1562,7 +1563,7 @@ class LEDEncoder(LEDPreTrainedModel): Args: config: LEDConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -1637,7 +1638,7 @@ def _pad_to_window_size( f"`config.attention_window`: {attention_window}" ) if input_ids is not None: - input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) + input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) if inputs_embeds is not None: input_ids_padding = inputs_embeds.new_full( (batch_size, padding_len), @@ -1647,7 +1648,9 @@ def _pad_to_window_size( inputs_embeds_padding = self.embed_tokens(input_ids_padding) inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) - attention_mask = F.pad(attention_mask, (0, padding_len), value=False) # no attention on the padding tokens + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens return padding_len, input_ids, attention_mask, inputs_embeds @@ -1760,7 +1763,7 @@ def forward( hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1842,7 +1845,7 @@ class LEDDecoder(LEDPreTrainedModel): Args: config: LEDConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: LEDConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -2008,7 +2011,7 @@ def forward( hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py index 40b2f864c853..4d9ebe017a1d 100644 --- a/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py +++ 
b/src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py @@ -19,6 +19,7 @@ import pytorch_lightning as pl import torch +from torch import nn from transformers import LongformerForQuestionAnswering, LongformerModel @@ -28,7 +29,7 @@ def __init__(self, model): super().__init__() self.model = model self.num_labels = 2 - self.qa_outputs = torch.nn.Linear(self.model.config.hidden_size, self.num_labels) + self.qa_outputs = nn.Linear(self.model.config.hidden_size, self.num_labels) # implement only because lightning requires to do so def forward(self): diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 6128f4811494..9ef414fb13e8 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -19,10 +19,9 @@ from typing import Optional, Tuple import torch -import torch.nn as nn import torch.utils.checkpoint +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from torch.nn import functional as F from ...activations import ACT2FN, gelu from ...file_utils import ( @@ -640,7 +639,9 @@ def forward( # free memory del global_key_attn_scores - attn_probs = F.softmax(attn_scores, dim=-1, dtype=torch.float32) # use fp32 for numerical stability + attn_probs = nn.functional.softmax( + attn_scores, dim=-1, dtype=torch.float32 + ) # use fp32 for numerical stability if layer_head_mask is not None: assert layer_head_mask.size() == ( @@ -656,7 +657,7 @@ def forward( del attn_scores # apply dropout - attn_probs = F.dropout(attn_probs, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_probs, p=self.dropout, training=self.training) value_vectors = value_vectors.view(seq_len, batch_size, self.num_heads, self.head_dim).transpose(0, 1) @@ -716,7 +717,7 @@ def forward( @staticmethod def _pad_and_transpose_last_two_dims(hidden_states_padded, padding): """pads rows and then flips rows and columns""" - hidden_states_padded = F.pad( + hidden_states_padded = nn.functional.pad( hidden_states_padded, padding ) # padding value is not important because it will be overwritten hidden_states_padded = hidden_states_padded.view( @@ -743,7 +744,7 @@ def _pad_and_diagonalize(chunked_hidden_states): 0.0000, 0.0000, 0.0000, 2.0514, -1.1600, 0.5372, 0.2629 ] """ total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size() - chunked_hidden_states = F.pad( + chunked_hidden_states = nn.functional.pad( chunked_hidden_states, (0, window_overlap + 1) ) # total_num_heads x num_chunks x window_overlap x (hidden_dim+window_overlap+1). 
Padding value is not important because it'll be overwritten chunked_hidden_states = chunked_hidden_states.view( @@ -879,7 +880,7 @@ def _sliding_chunks_matmul_attn_probs_value( value = value.transpose(1, 2).reshape(batch_size * num_heads, seq_len, head_dim) # pad seq_len with w at the beginning of the sequence and another window overlap at the end - padded_value = F.pad(value, (0, 0, window_overlap, window_overlap), value=-1) + padded_value = nn.functional.pad(value, (0, 0, window_overlap, window_overlap), value=-1) # chunk padded_value into chunks of size 3 window overlap and an overlap of size window overlap chunked_value_size = (batch_size * num_heads, chunks_count + 1, 3 * window_overlap, head_dim) @@ -1051,7 +1052,7 @@ def _compute_global_attn_output_from_hidden( global_attn_scores = global_attn_scores.view(batch_size * self.num_heads, max_num_global_attn_indices, seq_len) # compute global attn probs - global_attn_probs_float = F.softmax( + global_attn_probs_float = nn.functional.softmax( global_attn_scores, dim=-1, dtype=torch.float32 ) # use fp32 for numerical stability @@ -1067,7 +1068,7 @@ def _compute_global_attn_output_from_hidden( batch_size * self.num_heads, max_num_global_attn_indices, seq_len ) - global_attn_probs = F.dropout( + global_attn_probs = nn.functional.dropout( global_attn_probs_float.type_as(global_attn_scores), p=self.dropout, training=self.training ) @@ -1546,10 +1547,10 @@ def _pad_to_window_size( f"`config.attention_window`: {attention_window}" ) if input_ids is not None: - input_ids = F.pad(input_ids, (0, padding_len), value=pad_token_id) + input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) if position_ids is not None: # pad with position_id = pad_token_id as in modeling_roberta.RobertaEmbeddings - position_ids = F.pad(position_ids, (0, padding_len), value=pad_token_id) + position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) if inputs_embeds is not None: input_ids_padding = inputs_embeds.new_full( (batch_size, padding_len), @@ -1559,8 +1560,10 @@ def _pad_to_window_size( inputs_embeds_padding = self.embeddings(input_ids_padding) inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) - attention_mask = F.pad(attention_mask, (0, padding_len), value=False) # no attention on the padding tokens - token_type_ids = F.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index dc69198344cc..9e1bf746ecce 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -19,9 +19,8 @@ from typing import Optional, Tuple import torch -import torch.nn as nn -import torch.nn.functional as F import torch.utils.checkpoint +from torch import nn from ...activations import ACT2FN from ...file_utils import ( @@ -1098,9 +1097,9 @@ def forward( # When the number of dimension of `labels` is 1, cross entropy is used as the loss function. The binary # cross entropy is used otherwise. 
if labels.ndim == 1: - loss = F.cross_entropy(logits, labels) + loss = nn.functional.cross_entropy(logits, labels) else: - loss = F.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits)) + loss = nn.functional.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits)) if not return_dict: output = ( @@ -1213,9 +1212,9 @@ def forward( # When the number of dimension of `labels` is 1, cross entropy is used as the loss function. The binary # cross entropy is used otherwise. if labels.ndim == 1: - loss = F.cross_entropy(logits, labels) + loss = nn.functional.cross_entropy(logits, labels) else: - loss = F.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits)) + loss = nn.functional.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits)) if not return_dict: output = ( @@ -1351,9 +1350,9 @@ def forward( # When the number of dimension of `labels` is 2, cross entropy is used as the loss function. The binary # cross entropy is used otherwise. if labels.ndim == 2: - loss = F.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) + loss = nn.functional.cross_entropy(logits.view(-1, self.num_labels), labels.view(-1)) else: - loss = F.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits)) + loss = nn.functional.binary_cross_entropy_with_logits(logits.view(-1), labels.view(-1).type_as(logits)) if not return_dict: output = ( diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index 47d614acaa60..25b86d4a04c9 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -20,7 +20,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss @@ -293,7 +292,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -313,7 +312,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -375,15 +374,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states if hidden_states.dtype == torch.float16 and ( @@ 
-471,7 +470,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block @@ -491,7 +490,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # add cross-attn to positions 3,4 of present_key_value tuple @@ -501,9 +500,9 @@ def forward( residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -665,7 +664,7 @@ class M2M100Encoder(M2M100PreTrainedModel): Args: config: M2M100Config - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None): @@ -764,7 +763,7 @@ def forward( embed_pos = self.embed_positions(input_ids, inputs_embeds) hidden_states = inputs_embeds + embed_pos - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -832,7 +831,7 @@ class M2M100Decoder(M2M100PreTrainedModel): Args: config: M2M100Config - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: M2M100Config, embed_tokens: Optional[nn.Embedding] = None): @@ -989,7 +988,7 @@ def forward( hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/marian/convert_marian_to_pytorch.py b/src/transformers/models/marian/convert_marian_to_pytorch.py index a7faef942e97..86a8d0158de7 100644 --- a/src/transformers/models/marian/convert_marian_to_pytorch.py +++ b/src/transformers/models/marian/convert_marian_to_pytorch.py @@ -24,6 +24,7 @@ import numpy as np import torch +from torch import nn from tqdm import tqdm from transformers import MarianConfig, MarianMTModel, MarianTokenizer @@ -53,7 +54,7 @@ def convert_encoder_layer(opus_dict, layer_prefix: str, converter: dict): return sd -def load_layers_(layer_lst: torch.nn.ModuleList, opus_state: dict, converter, is_decoder=False): +def load_layers_(layer_lst: nn.ModuleList, opus_state: dict, converter, is_decoder=False): for i, layer in enumerate(layer_lst): layer_tag = f"decoder_l{i + 1}_" if is_decoder else f"encoder_l{i + 1}_" sd = 
convert_encoder_layer(opus_state, layer_tag, converter) @@ -543,8 +544,8 @@ def load_marian_model(self) -> MarianMTModel: load_layers_(model.model.decoder.layers, state_dict, BART_CONVERTER, is_decoder=True) # handle tensors not associated with layers - wemb_tensor = torch.nn.Parameter(torch.FloatTensor(self.wemb)) - bias_tensor = torch.nn.Parameter(torch.FloatTensor(self.final_bias)) + wemb_tensor = nn.Parameter(torch.FloatTensor(self.wemb)) + bias_tensor = nn.Parameter(torch.FloatTensor(self.final_bias)) model.model.shared.weight = wemb_tensor model.model.encoder.embed_tokens = model.model.decoder.embed_tokens = model.model.shared diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 6408562b5bf8..6fb635737a8f 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -22,7 +22,6 @@ import numpy as np import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss @@ -239,7 +238,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -259,7 +258,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -320,15 +319,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -416,7 +415,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.self_attn_layer_norm(hidden_states) @@ -436,7 +435,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.encoder_attn_layer_norm(hidden_states) @@ -446,9 +445,9 @@ def forward( # Fully 
Connected residual = hidden_states hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states hidden_states = self.final_layer_norm(hidden_states) @@ -630,7 +629,7 @@ class MarianEncoder(MarianPreTrainedModel): Args: config: MarianConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -727,7 +726,7 @@ def forward( embed_pos = self.embed_positions(input_shape) hidden_states = inputs_embeds + embed_pos - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -793,7 +792,7 @@ class MarianDecoder(MarianPreTrainedModel): Args: config: MarianConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: MarianConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -963,7 +962,7 @@ def forward( hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 7252d646eb49..0412eccaaab7 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -19,7 +19,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss, MSELoss @@ -230,7 +229,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -250,7 +249,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -311,15 +310,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + 
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states if hidden_states.dtype == torch.float16 and ( @@ -406,7 +405,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block @@ -426,7 +425,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # add cross-attn to positions 3,4 of present_key_value tuple @@ -436,9 +435,9 @@ def forward( residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -658,7 +657,7 @@ class MBartEncoder(MBartPreTrainedModel): Args: config: MBartConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -758,7 +757,7 @@ def forward( hidden_states = inputs_embeds + embed_pos hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -826,7 +825,7 @@ class MBartDecoder(MBartPreTrainedModel): Args: config: MBartConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -999,7 +998,7 @@ def forward( hidden_states = inputs_embeds + positions hidden_states = self.layernorm_embedding(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py index 8588cb815f51..4abce490741f 100644 --- a/src/transformers/models/mmbt/modeling_mmbt.py +++ b/src/transformers/models/mmbt/modeling_mmbt.py @@ -17,7 +17,7 @@ import torch -import torch.nn as nn +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...file_utils import add_start_docstrings, 
add_start_docstrings_to_model_forward, replace_return_docstrings diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 7f604f981417..3a855ba4fb75 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -27,7 +27,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -155,7 +154,7 @@ def forward(self, input_tensor): return input_tensor * self.weight + self.bias -NORM2FN = {"layer_norm": torch.nn.LayerNorm, "no_norm": NoNorm} +NORM2FN = {"layer_norm": nn.LayerNorm, "no_norm": NoNorm} class MobileBertEmbeddings(nn.Module): @@ -207,9 +206,9 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs # dimensional output. inputs_embeds = torch.cat( [ - F.pad(inputs_embeds[:, 1:], [0, 0, 0, 1, 0, 0], value=0), + nn.functional.pad(inputs_embeds[:, 1:], [0, 0, 0, 1, 0, 0], value=0), inputs_embeds, - F.pad(inputs_embeds[:, :-1], [0, 0, 1, 0, 0, 0], value=0), + nn.functional.pad(inputs_embeds[:, :-1], [0, 0, 1, 0, 0, 0], value=0), ], dim=2, ) @@ -920,7 +919,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddigs): self.cls.predictions.decoder = new_embeddigs - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding: + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: # resize dense output embedings at first self.cls.predictions.dense = self._get_resized_lm_head( self.cls.predictions.dense, new_num_tokens=new_num_tokens, transposed=True @@ -1028,7 +1027,7 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddigs): self.cls.predictions.decoder = new_embeddigs - def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding: + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: # resize dense output embedings at first self.cls.predictions.dense = self._get_resized_lm_head( self.cls.predictions.dense, new_num_tokens=new_num_tokens, transposed=True diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 27d5ef697d97..6bf03a2f9260 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -23,7 +23,7 @@ from typing import Optional, Tuple import torch -import torch.nn as nn +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...activations import gelu_new, silu diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 36ae820e3b1d..6f10133f0e07 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -21,7 +21,6 @@ import numpy as np import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn from torch.nn import CrossEntropyLoss @@ -239,7 +238,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != 
(self.num_heads,): @@ -259,7 +258,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -321,15 +320,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states if hidden_states.dtype == torch.float16 and ( @@ -417,7 +416,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block @@ -437,7 +436,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # add cross-attn to positions 3,4 of present_key_value tuple @@ -447,9 +446,9 @@ def forward( residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -629,7 +628,7 @@ class PegasusEncoder(PegasusPreTrainedModel): Args: config: PegasusConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None): @@ -729,7 +728,7 @@ def forward( hidden_states = inputs_embeds + embed_pos - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -797,7 +796,7 @@ class PegasusDecoder(PegasusPreTrainedModel): Args: config: PegasusConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = 
None): @@ -969,7 +968,7 @@ def forward( hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py index cbd8c49956e8..638a71ef2fa4 100644 --- a/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/prophetnet/convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py @@ -17,7 +17,7 @@ import argparse -import torch +from torch import nn from transformers import ProphetNetForConditionalGeneration, XLMProphetNetForConditionalGeneration, logging @@ -107,15 +107,15 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py param.weight.shape == old_model.in_proj_weight[:embed_dim, :].shape, "Shapes have to match" param.bias.shape == old_model.in_proj_bias[:embed_dim].shape, "Shapes have to match" if attribute == "query_proj": - model.query_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[:embed_dim, :]) - model.query_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[:embed_dim]) + model.query_proj.weight = nn.Parameter(old_model.in_proj_weight[:embed_dim, :]) + model.query_proj.bias = nn.Parameter(old_model.in_proj_bias[:embed_dim]) elif attribute == "key_proj": - model.key_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[embed_dim : 2 * embed_dim, :]) - model.key_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[embed_dim : 2 * embed_dim]) + model.key_proj.weight = nn.Parameter(old_model.in_proj_weight[embed_dim : 2 * embed_dim, :]) + model.key_proj.bias = nn.Parameter(old_model.in_proj_bias[embed_dim : 2 * embed_dim]) elif attribute == "value_proj": - model.value_proj.weight = torch.nn.Parameter(old_model.in_proj_weight[2 * embed_dim :, :]) - model.value_proj.bias = torch.nn.Parameter(old_model.in_proj_bias[2 * embed_dim :]) + model.value_proj.weight = nn.Parameter(old_model.in_proj_weight[2 * embed_dim :, :]) + model.value_proj.bias = nn.Parameter(old_model.in_proj_bias[2 * embed_dim :]) is_key_init = True break elif attribute == "position_embeddings": @@ -123,7 +123,7 @@ def convert_prophetnet_checkpoint_to_pytorch(prophetnet_checkpoint_path: str, py model.position_embeddings.weight.shape[-1] == old_model.embed_positions.weight.shape[-1] ), "Hidden size has to match" assert model.position_embeddings.weight.shape[0] == 512, "We want 512 position_embeddings." 
- model.position_embeddings.weight = torch.nn.Parameter(old_model.embed_positions.weight[:512, :]) + model.position_embeddings.weight = nn.Parameter(old_model.embed_positions.weight[:512, :]) is_key_init = True break diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index c2f642b99531..d707705ea19b 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -21,7 +21,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import Tensor, nn from torch.nn import LayerNorm @@ -183,9 +182,9 @@ def softmax(hidden_state, dim, onnx_trace=False): if onnx_trace: - return F.softmax(hidden_state.float(), dim=dim) + return nn.functional.softmax(hidden_state.float(), dim=dim) else: - return F.softmax(hidden_state, dim=dim, dtype=torch.float32) + return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32) def ngram_attention_bias(sequence_length, ngram, device, dtype): @@ -732,7 +731,7 @@ def forward( else: attn_weights_reshaped = None - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: assert layer_head_mask.size() == ( @@ -746,7 +745,7 @@ def forward( # apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model attn_weights_reshaped = layer_head_mask.view(1, -1, 1, 1) * attn_weights_reshaped - attn_probs = F.dropout( + attn_probs = nn.functional.dropout( attn_weights, p=self.attention_dropout, training=self.training, @@ -767,7 +766,7 @@ def forward( attn_output = self.out_proj(attn_output) - attn_output = F.dropout(attn_output, p=self.dropout, training=self.training) + attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training) return attn_output, attn_weights_reshaped, past_key_value @@ -788,9 +787,9 @@ def forward(self, hidden_states): hidden_states = self.intermediate(hidden_states) hidden_states = self.activation_fn(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.output(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) return hidden_states @@ -924,7 +923,7 @@ def forward( ) main_attn_probs = main_attn_probs.view(batch_size * self.num_attn_heads, -1, sequence_length) - main_attn_probs = F.dropout(main_attn_probs, p=self.attention_dropout, training=self.training) + main_attn_probs = nn.functional.dropout(main_attn_probs, p=self.attention_dropout, training=self.training) # project to attn_output main_attn_output = torch.bmm(main_attn_probs, main_value_states) @@ -989,7 +988,9 @@ def forward( self.ngram, batch_size * self.num_attn_heads, sequence_length, 2 * sequence_length ) - predict_attn_probs = F.dropout(predict_attn_probs, p=self.attention_dropout, training=self.training) + predict_attn_probs = nn.functional.dropout( + predict_attn_probs, p=self.attention_dropout, training=self.training + ) # project to attention output # [ngram, B*head, T, c] predict_attn_output = torch.einsum("nbts,nbsc->nbtc", (predict_attn_probs, predict_value_states)) @@ -1012,7 +1013,7 @@ def forward( 
self.ngram, batch_size, self.num_attn_heads, sequence_length, -1 ).transpose(0, 1) - attn_output = F.dropout(attn_output, p=self.dropout, training=self.training) + attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training) return attn_output, main_attn_probs, predict_attn_probs, past_key_value @@ -1321,7 +1322,7 @@ def forward( hidden_states = inputs_embeds + position_embeddings hidden_states = self.embeddings_layer_norm(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.config.dropout, training=self.training) encoder_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -1538,7 +1539,7 @@ def forward( if self.embeddings_layer_norm: hidden_states = self.embeddings_layer_norm(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # init attentions, hidden_states and cache with empty tuples all_main_stream_hidden_states = () if output_hidden_states else None @@ -1995,13 +1996,13 @@ def _compute_loss(self, logits, labels, ignore_index=-100): break expend_targets[i, :, :] = labels - lprobs = F.log_softmax( + lprobs = nn.functional.log_softmax( logits.view(-1, logits.size(-1)), dim=-1, dtype=torch.float32, ) - loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="mean") + loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean") if self.config.eps > 0.0: smooth_loss = -lprobs.sum(dim=-1, keepdim=True) @@ -2239,13 +2240,13 @@ def _compute_loss(self, logits, labels, ignore_index=-100): break expend_targets[i, :, :] = labels - lprobs = F.log_softmax( + lprobs = nn.functional.log_softmax( logits.view(-1, logits.size(-1)), dim=-1, dtype=torch.float32, ) - loss = F.nll_loss(lprobs, expend_targets.view(-1), reduction="mean") + loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean") if self.config.eps > 0.0: smooth_loss = -lprobs.sum(dim=-1, keepdim=True) diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 02c4a2a28f61..183a45437309 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -18,6 +18,7 @@ from typing import Callable, List, Optional, Tuple import torch +from torch import nn from ...configuration_utils import PretrainedConfig from ...file_utils import add_start_docstrings_to_model_forward, replace_return_docstrings @@ -1065,10 +1066,10 @@ def _mask_pads(ll, smooth_obj): return ll.squeeze(-1), smooth_obj.squeeze(-1) # seq_logits dim = (batch*n_docs, tgt_len , #vocabs) - seq_logprobs = torch.nn.functional.log_softmax(seq_logits, dim=-1).view( + seq_logprobs = nn.functional.log_softmax(seq_logits, dim=-1).view( seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.size(-1) ) # batch_size x n_docs x tgt_len x #vocab_size - doc_logprobs = torch.nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1) + doc_logprobs = nn.functional.log_softmax(doc_scores, dim=1).unsqueeze(-1).unsqueeze(-1) # RAG-sequence marginalization first_token_scores = seq_logprobs[:, :, :1, :] @@ -1212,7 +1213,7 @@ def marginalize(self, seq_logits, doc_scores, n_docs=None): n_docs = n_docs if n_docs is not None else self.config.n_docs # RAG-token marginalization - seq_logprobs = 
torch.nn.functional.log_softmax(seq_logits, dim=-1).view( + seq_logprobs = nn.functional.log_softmax(seq_logits, dim=-1).view( seq_logits.shape[0] // n_docs, n_docs, -1, seq_logits.size(-1) ) doc_logprobs = torch.log_softmax(doc_scores, dim=1) diff --git a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py index 32902fa8e7b7..2e2e3f3a60dd 100755 --- a/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py +++ b/src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py @@ -20,6 +20,7 @@ import numpy as np import torch +from torch import nn from transformers import ReformerConfig, ReformerModelWithLMHead from transformers.utils import logging @@ -31,10 +32,10 @@ def set_param(torch_layer, weight, bias=None): # set parameter of one layer assert torch_layer.weight.shape == weight.shape, f"{torch_layer} layer.weight does not match" - torch_layer.weight = torch.nn.Parameter(weight) + torch_layer.weight = nn.Parameter(weight) if bias is not None: assert torch_layer.bias.shape == bias.shape, f"{torch_layer} layer.bias does not match" - torch_layer.bias = torch.nn.Parameter(bias) + torch_layer.bias = nn.Parameter(bias) def set_layer_weights_in_torch_lsh(weights, torch_layer, hidden_size): @@ -153,7 +154,7 @@ def set_model_weights_in_torch(weights, torch_model, hidden_size): assert ( position_embeddings.weights[emb_idx].shape == emb_weights.shape ), f"{position_embeddings[emb_idx]} emb does not match" - position_embeddings.weights[emb_idx] = torch.nn.Parameter(torch.tensor(emb_weights)) + position_embeddings.weights[emb_idx] = nn.Parameter(torch.tensor(emb_weights)) trax_layer_weights = weights[5] assert len(torch_model_reformer.encoder.layers) * 4 == len( diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 634c005c4065..8521a9542b6d 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -1782,7 +1782,7 @@ def _init_weights(self, module): """Initialize the weights""" if isinstance(module, AxialPositionEmbeddings): for weight in module.weights: - torch.nn.init.normal_(weight, std=self.config.axial_norm_std) + nn.init.normal_(weight, std=self.config.axial_norm_std) elif isinstance(module, nn.Embedding): module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) if module.padding_idx is not None: diff --git a/src/transformers/models/retribert/modeling_retribert.py b/src/transformers/models/retribert/modeling_retribert.py index 2507688209e7..08f56e13ee0f 100644 --- a/src/transformers/models/retribert/modeling_retribert.py +++ b/src/transformers/models/retribert/modeling_retribert.py @@ -20,8 +20,8 @@ import math import torch -import torch.nn as nn import torch.utils.checkpoint as checkpoint +from torch import nn from ...file_utils import add_start_docstrings from ...modeling_utils import PreTrainedModel diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 4939ba7e2927..c1a22259ad4c 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -18,8 +18,8 @@ import math import torch -import torch.nn as nn import torch.utils.checkpoint +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import 
ACT2FN, gelu diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index dde154ab46d4..c3332512ac40 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -20,7 +20,6 @@ from typing import Optional, Tuple import torch -import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss @@ -306,7 +305,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -326,7 +325,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -387,15 +386,15 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states if hidden_states.dtype == torch.float16 and ( @@ -482,7 +481,7 @@ def forward( layer_head_mask=layer_head_mask, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # Cross-Attention Block @@ -502,7 +501,7 @@ def forward( past_key_value=cross_attn_past_key_value, output_attentions=output_attentions, ) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states # add cross-attn to positions 3,4 of present_key_value tuple @@ -512,9 +511,9 @@ def forward( residual = hidden_states hidden_states = self.final_layer_norm(hidden_states) hidden_states = self.activation_fn(self.fc1(hidden_states)) - hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) hidden_states = self.fc2(hidden_states) - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) hidden_states = residual + hidden_states outputs = (hidden_states,) @@ -686,7 
+685,7 @@ class Speech2TextEncoder(Speech2TextPreTrainedModel): Args: config: Speech2TextConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: Speech2TextConfig): @@ -772,7 +771,7 @@ def forward( embed_pos = self.embed_positions(padding_mask) hidden_states = inputs_embeds + embed_pos - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # expand attention_mask if attention_mask is not None: @@ -840,7 +839,7 @@ class Speech2TextDecoder(Speech2TextPreTrainedModel): Args: config: Speech2TextConfig - embed_tokens (torch.nn.Embedding): output embedding + embed_tokens (nn.Embedding): output embedding """ def __init__(self, config: Speech2TextConfig): @@ -1008,7 +1007,7 @@ def forward( positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length) hidden_states = inputs_embeds + positions - hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) # decoder layers all_hidden_states = () if output_hidden_states else None diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 4aa4b547b37e..38c54bf9d955 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -92,7 +92,7 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs return embeddings -class MatMulWrapper(torch.nn.Module): +class MatMulWrapper(nn.Module): """ Wrapper for torch.matmul(). This makes flop-counting easier to implement. Note that if you directly call torch.matmul() in your code, the flop counter will typically ignore the flops of the matmul. diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index 1460cfcc706b..4822a2ed214f 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -21,7 +21,6 @@ import warnings import torch -import torch.nn.functional as F from torch import nn from torch.nn import CrossEntropyLoss from torch.utils.checkpoint import checkpoint @@ -179,7 +178,7 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): #################################################### # PyTorch Models are constructed by sub-classing # - torch.nn.Module for the layers and -# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) +# - PreTrainedModel for the models (it-self a sub-class of nn.Module) #################################################### PARALLELIZE_DOCSTRING = r""" This is an experimental feature and is a subject to change at a moment's notice. 
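Illustrative sketch, not part of the patch: the comment block just above describes models as `nn.Module` subclasses, and this is the import convention the whole diff converges on, a single `from torch import nn`, `nn.Module`/`nn.Linear` for stateful layers, and `nn.functional` for stateless ops. The module name and sizes below are made up for illustration.

import torch
from torch import nn


class TinyFeedForward(nn.Module):
    # Hypothetical example module, not taken from the repository.
    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = dropout

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Stateless ops go through nn.functional, matching the converted models above.
        hidden_states = nn.functional.relu(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        return self.fc2(hidden_states)


# Usage: TinyFeedForward(8, 32)(torch.randn(2, 5, 8)) returns a (2, 5, 8) tensor.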
@@ -257,7 +256,7 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.wi(hidden_states) - hidden_states = F.relu(hidden_states) + hidden_states = nn.functional.relu(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.wo(hidden_states) return hidden_states @@ -502,10 +501,10 @@ def project(hidden_states, proj_layer, key_value_states, past_key_value): position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) scores += position_bias - attn_weights = F.softmax(scores.float(), dim=-1).type_as( + attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as( scores ) # (batch_size, n_heads, seq_length, key_length) - attn_weights = F.dropout( + attn_weights = nn.functional.dropout( attn_weights, p=self.dropout, training=self.training ) # (batch_size, n_heads, seq_length, key_length) diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 11d9c07d9f9e..aaf96fcb1e7c 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -22,8 +22,8 @@ from typing import Optional, Tuple import torch -import torch.nn as nn import torch.utils.checkpoint +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...activations import ACT2FN @@ -2096,10 +2096,8 @@ def _calculate_aggregation_loss_known( # Use aggregation supervision as the target. target_aggregation = aggregation_labels - one_hot_labels = torch.nn.functional.one_hot(target_aggregation, num_classes=num_aggregation_labels).type( - torch.float32 - ) - log_probs = torch.nn.functional.log_softmax(logits_aggregation, dim=-1) + one_hot_labels = nn.functional.one_hot(target_aggregation, num_classes=num_aggregation_labels).type(torch.float32) + log_probs = nn.functional.log_softmax(logits_aggregation, dim=-1) # torch.FloatTensor[batch_size] per_example_aggregation_intermediate = -torch.sum(one_hot_labels * log_probs, dim=-1) @@ -2243,7 +2241,7 @@ def _calculate_expected_result( aggregation_op_only_probs = gumbel_dist.sample() else: # [batch_size, num_aggregation_labels - 1] - aggregation_op_only_probs = torch.nn.functional.softmax( + aggregation_op_only_probs = nn.functional.softmax( logits_aggregation[:, 1:] / config.aggregation_temperature, dim=-1 ) diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index 8d0fa11e59eb..aa445726a873 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -21,8 +21,7 @@ from typing import List, Optional, Tuple import torch -import torch.nn as nn -import torch.nn.functional as F +from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from ...file_utils import ( @@ -344,7 +343,7 @@ def forward(self, w, r, attn_mask=None, mems=None, head_mask=None, output_attent attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], -1e30).type_as(attn_score) # [qlen x klen x bsz x n_head] - attn_prob = F.softmax(attn_score, dim=1) + attn_prob = nn.functional.softmax(attn_score, dim=1) attn_prob = self.dropatt(attn_prob) # Mask heads if we want to @@ -434,7 +433,7 @@ def forward(self, inp): if self.div_val == 1: embed = self.emb_layers[0](inp) if self.d_proj != self.d_embed: - embed = F.linear(embed, self.emb_projs[0]) + embed = nn.functional.linear(embed, self.emb_projs[0]) else: param = next(self.parameters()) inp_flat = 
inp.view(-1) @@ -450,7 +449,7 @@ def forward(self, inp): inp_i = inp_flat.index_select(0, indices_i) - l_idx emb_i = self.emb_layers[i](inp_i) - emb_i = F.linear(emb_i, self.emb_projs[i]) + emb_i = nn.functional.linear(emb_i, self.emb_projs[i]) emb_flat.index_copy_(0, indices_i, emb_i) diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py index 98692746e76a..1f804a278fd9 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py @@ -19,8 +19,7 @@ import torch -import torch.nn as nn -import torch.nn.functional as F +from torch import nn # CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) @@ -71,11 +70,11 @@ def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=Fals def _compute_logit(self, hidden, weight, bias, proj): if proj is None: - logit = F.linear(hidden, weight, bias=bias) + logit = nn.functional.linear(hidden, weight, bias=bias) else: # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: - proj_hid = F.linear(hidden, proj.t().contiguous()) - logit = F.linear(proj_hid, weight, bias=bias) + proj_hid = nn.functional.linear(hidden, proj.t().contiguous()) + logit = nn.functional.linear(proj_hid, weight, bias=bias) # else: # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t())) # if bias is not None: @@ -110,9 +109,9 @@ def forward(self, hidden, labels=None, keep_order=False): if self.n_clusters == 0: logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) if labels is not None: - out = -F.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1) + out = -nn.functional.log_softmax(logit, dim=-1).gather(1, labels.unsqueeze(1)).squeeze(1) else: - out = F.log_softmax(logit, dim=-1) + out = nn.functional.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] @@ -135,7 +134,7 @@ def forward(self, hidden, labels=None, keep_order=False): head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0] head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) - head_logprob = F.log_softmax(head_logit, dim=1) + head_logprob = nn.functional.log_softmax(head_logit, dim=1) if labels is None: out = hidden.new_empty((head_logit.size(0), self.n_token)) @@ -169,7 +168,7 @@ def forward(self, hidden, labels=None, keep_order=False): weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i) - tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1) cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster if labels is not None: logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather( @@ -205,7 +204,7 @@ def log_prob(self, hidden): """ if self.n_clusters == 0: logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0]) - return F.log_softmax(logit, dim=-1) + return nn.functional.log_softmax(logit, dim=-1) else: # construct weights and biases weights, biases = [], [] @@ -229,7 +228,7 @@ def log_prob(self, hidden): head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj) out = hidden.new_empty((head_logit.size(0), self.n_token)) - head_logprob = F.log_softmax(head_logit, dim=1) + head_logprob = nn.functional.log_softmax(head_logit, 
dim=1) cutoff_values = [0] + self.cutoffs for i in range(len(cutoff_values) - 1): @@ -241,7 +240,7 @@ def log_prob(self, hidden): weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i] tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i) - tail_logprob_i = F.log_softmax(tail_logit_i, dim=1) + tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1) logprob_i = head_logprob[:, -i] + tail_logprob_i out[:, start_idx, stop_idx] = logprob_i diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 994e9ff9c6f6..35c3600eabe5 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -89,10 +89,10 @@ def __init__(self, config): self.visual_position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) if config.special_visual_initialize: - self.visual_token_type_embeddings.weight.data = torch.nn.Parameter( + self.visual_token_type_embeddings.weight.data = nn.Parameter( self.token_type_embeddings.weight.data.clone(), requires_grad=True ) - self.visual_position_embeddings.weight.data = torch.nn.Parameter( + self.visual_position_embeddings.weight.data = nn.Parameter( self.position_embeddings.weight.data.clone(), requires_grad=True ) @@ -1253,8 +1253,8 @@ def forward( loss = None if labels is not None: - loss_fct = torch.nn.KLDivLoss(reduction="batchmean") - log_softmax = torch.nn.LogSoftmax(dim=-1) + loss_fct = nn.KLDivLoss(reduction="batchmean") + log_softmax = nn.LogSoftmax(dim=-1) reshaped_logits = log_softmax(reshaped_logits) loss = loss_fct(reshaped_logits, labels.contiguous()) if not return_dict: diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 5039595a29ca..4f8d65fee69f 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -20,7 +20,6 @@ import numpy as np import torch -import torch.nn.functional as F import torch.utils.checkpoint from torch import nn @@ -449,7 +448,7 @@ def forward( attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) - attn_weights = F.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) if layer_head_mask is not None: if layer_head_mask.size() != (self.num_heads,): @@ -469,7 +468,7 @@ def forward( else: attn_weights_reshaped = None - attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) attn_output = torch.bmm(attn_probs, value_states) @@ -805,9 +804,9 @@ def forward(self, hidden_states, mask_time_indices=None): if self.training: # sample code vector probs via gumbel in differentiateable way - codevector_probs = F.gumbel_softmax(hidden_states.float(), tau=self.temperature, hard=True).type_as( - hidden_states - ) + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) # compute perplexity codevector_soft_dist = torch.softmax( @@ -867,12 +866,12 @@ def _init_weights(self, module): if hasattr(module, "weight_v") and hasattr(module, "weight_g"): with deepspeed.zero.GatheredParameters([module.weight_v, module.weight_g], modifier_rank=0): - 
torch.nn.init.kaiming_normal_(module.weight.data) + nn.init.kaiming_normal_(module.weight.data) else: with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0): - torch.nn.init.kaiming_normal_(module.weight.data) + nn.init.kaiming_normal_(module.weight.data) else: - torch.nn.init.kaiming_normal_(module.weight.data) + nn.init.kaiming_normal_(module.weight.data) if isinstance(module, (nn.Linear, nn.Conv1d)) and module.bias is not None: module.bias.data.zero_() @@ -1296,7 +1295,7 @@ def forward( # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa)) preds = logits.transpose(0, 2).reshape(-1, logits.size(0)) target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten() - contrastive_loss = F.cross_entropy(preds.float(), target, reduction="sum") + contrastive_loss = nn.functional.cross_entropy(preds.float(), target, reduction="sum") # 7. compute diversity loss: \mathbf{L}_d num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups @@ -1502,10 +1501,10 @@ def forward( flattened_targets = labels.masked_select(labels_mask) # ctc_loss doesn't support fp16 - log_probs = F.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) + log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1) with torch.backends.cudnn.flags(enabled=False): - loss = F.ctc_loss( + loss = nn.functional.ctc_loss( log_probs, flattened_targets, input_lengths, diff --git a/src/transformers/models/xlm/modeling_tf_xlm.py b/src/transformers/models/xlm/modeling_tf_xlm.py index 0ae3ac2a2472..b7cdc1ad74fa 100644 --- a/src/transformers/models/xlm/modeling_tf_xlm.py +++ b/src/transformers/models/xlm/modeling_tf_xlm.py @@ -503,7 +503,7 @@ def call( # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) - # attn = F.dropout(attn, p=self.dropout, training=self.training) + # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index 38a99d233454..ebf88d2b12ef 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -25,7 +25,6 @@ import torch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from torch.nn import functional as F from ...activations import gelu from ...file_utils import ( @@ -190,8 +189,8 @@ def unshape(x): mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) scores.masked_fill_(mask, -float("inf")) # (bs, n_heads, qlen, klen) - weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) - weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) + weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = nn.functional.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) # Mask heads if we want to if head_mask is not None: @@ -212,7 +211,7 @@ def __init__(self, in_dim, dim_hidden, out_dim, config): self.dropout = config.dropout self.lin1 = nn.Linear(in_dim, dim_hidden) self.lin2 = nn.Linear(dim_hidden, out_dim) - self.act = gelu if config.gelu_activation else F.relu + self.act = gelu if config.gelu_activation else nn.functional.relu 
self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 @@ -223,7 +222,7 @@ def ff_chunk(self, input): x = self.lin1(input) x = self.act(x) x = self.lin2(x) - x = F.dropout(x, p=self.dropout, training=self.training) + x = nn.functional.dropout(x, p=self.dropout, training=self.training) return x @@ -578,7 +577,7 @@ def forward( if token_type_ids is not None: tensor = tensor + self.embeddings(token_type_ids) tensor = self.layer_norm_emb(tensor) - tensor = F.dropout(tensor, p=self.dropout, training=self.training) + tensor = nn.functional.dropout(tensor, p=self.dropout, training=self.training) tensor *= mask.unsqueeze(-1).to(tensor.dtype) # transformer layers @@ -599,14 +598,14 @@ def forward( attn = attn_outputs[0] if output_attentions: attentions = attentions + (attn_outputs[1],) - attn = F.dropout(attn, p=self.dropout, training=self.training) + attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) # encoder attention (for decoder only) # if self.is_decoder and src_enc is not None: # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) - # attn = F.dropout(attn, p=self.dropout, training=self.training) + # attn = nn.functional.dropout(attn, p=self.dropout, training=self.training) # tensor = tensor + attn # tensor = self.layer_norm15[i](tensor) @@ -661,7 +660,9 @@ def forward(self, x, y=None): scores = self.proj(x) outputs = (scores,) + outputs if y is not None: - loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean") + loss = nn.functional.cross_entropy( + scores.view(-1, self.n_words), y.view(-1), reduction="elementwise_mean" + ) outputs = (loss,) + outputs else: scores = self.proj.log_prob(x) diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index 97264da73793..cd0cb73ac183 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ b/src/transformers/models/xlnet/modeling_xlnet.py @@ -23,7 +23,6 @@ import torch from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from torch.nn import functional as F from ...activations import ACT2FN from ...file_utils import ( @@ -305,7 +304,7 @@ def rel_attn_core( attn_score = attn_score - 1e30 * torch.einsum("ijbn->bnij", attn_mask) # attention probability - attn_prob = F.softmax(attn_score, dim=3) + attn_prob = nn.functional.softmax(attn_score, dim=3) attn_prob = self.dropout(attn_prob) # Mask heads if we want to @@ -1208,7 +1207,7 @@ def forward( # `1` indicates not in the same segment [qlen x klen x bsz] seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long() - seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float) + seg_mat = nn.functional.one_hot(seg_mat, num_classes=2).to(dtype_float) else: seg_mat = None @@ -2034,7 +2033,7 @@ def forward( else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() - start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, self.start_n_top, dim=-1 @@ -2048,7 +2047,7 @@ def forward( ) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) - end_log_probs = F.softmax(end_logits, dim=1) # shape 
(bsz, slen, start_n_top) + end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, self.end_n_top, dim=1 diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index 4a92b18a3031..07098326838e 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -18,6 +18,7 @@ from typing import Callable, Iterable, Optional, Tuple, Union import torch +from torch import nn from torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR @@ -272,7 +273,7 @@ class AdamW(Optimizer): `__. Parameters: - params (:obj:`Iterable[torch.nn.parameter.Parameter]`): + params (:obj:`Iterable[nn.parameter.Parameter]`): Iterable of parameters to optimize or dictionaries defining parameter groups. lr (:obj:`float`, `optional`, defaults to 1e-3): The learning rate to use. @@ -288,7 +289,7 @@ class AdamW(Optimizer): def __init__( self, - params: Iterable[torch.nn.parameter.Parameter], + params: Iterable[nn.parameter.Parameter], lr: float = 1e-3, betas: Tuple[float, float] = (0.9, 0.999), eps: float = 1e-6, @@ -379,7 +380,7 @@ class Adafactor(Optimizer): `relative_step=False`. Arguments: - params (:obj:`Iterable[torch.nn.parameter.Parameter]`): + params (:obj:`Iterable[nn.parameter.Parameter]`): Iterable of parameters to optimize or dictionaries defining parameter groups. lr (:obj:`float`, `optional`): The external learning rate. diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 8303fef2d2ae..9f882e56abf4 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -264,7 +264,7 @@ class Trainer: def __init__( self, - model: Union[PreTrainedModel, torch.nn.Module] = None, + model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, @@ -772,7 +772,7 @@ def create_optimizer(self): Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass. """ if self.optimizer is None: - decay_parameters = get_parameter_names(self.model, [torch.nn.LayerNorm]) + decay_parameters = get_parameter_names(self.model, [nn.LayerNorm]) decay_parameters = [name for name in decay_parameters if "bias" not in name] optimizer_grouped_parameters = [ { @@ -933,7 +933,7 @@ def _wrap_model(self, model, training=True): # Multi-gpu training (should be after apex fp16 initialization) if self.args.n_gpu > 1: - model = torch.nn.DataParallel(model) + model = nn.DataParallel(model) # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. 
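Illustrative sketch, not part of the patch: how the multi-GPU branch above reads once exercised with the new spelling; the distributed branch in the next hunk is converted the same way. The model below is a placeholder.

import torch
from torch import nn

model = nn.Linear(16, 4)  # placeholder for the wrapped model
if torch.cuda.device_count() > 1:
    # Same wrapper as in the hunk above, only the spelling changes
    # (torch.nn.DataParallel -> nn.DataParallel).
    model = nn.DataParallel(model)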
@@ -970,7 +970,7 @@ def _wrap_model(self, model, training=True): find_unused_parameters = not getattr(model.config, "gradient_checkpointing", False) else: find_unused_parameters = True - model = torch.nn.parallel.DistributedDataParallel( + model = nn.parallel.DistributedDataParallel( model, device_ids=[self.args.local_rank], output_device=self.args.local_rank, @@ -1288,7 +1288,7 @@ def train( model.clip_grad_norm_(args.max_grad_norm) else: # Revert to normal clipping otherwise, handling Apex or full precision - torch.nn.utils.clip_grad_norm_( + nn.utils.clip_grad_norm_( amp.master_params(self.optimizer) if self.use_apex else model.parameters(), args.max_grad_norm, ) diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 70da2a48c9be..ec055e647e37 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -28,6 +28,7 @@ import numpy as np import torch from packaging import version +from torch import nn from torch.utils.data.dataset import Dataset, IterableDataset from torch.utils.data.distributed import DistributedSampler from torch.utils.data.sampler import RandomSampler, Sampler @@ -441,7 +442,7 @@ class LabelSmoother: def __call__(self, model_output, labels): logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0] - log_probs = -torch.nn.functional.log_softmax(logits, dim=-1) + log_probs = -nn.functional.log_softmax(logits, dim=-1) if labels.dim() == log_probs.dim() - 1: labels = labels.unsqueeze(-1)
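Illustrative sketch, not part of the patch: the label-smoothing pattern that `LabelSmoother.__call__` in the last hunk builds on, written with `nn.functional.log_softmax`. The epsilon value and the -100 ignore index are assumptions following the usual Hugging Face defaults, and the masking details are an approximation of the Trainer's implementation, not a copy of it.

import torch
from torch import nn


def label_smoothed_loss(logits: torch.Tensor, labels: torch.Tensor, epsilon: float = 0.1) -> torch.Tensor:
    # Positive per-class negative log-likelihoods, as in the hunk above.
    log_probs = -nn.functional.log_softmax(logits, dim=-1)
    if labels.dim() == log_probs.dim() - 1:
        labels = labels.unsqueeze(-1)
    padding_mask = labels.eq(-100)  # assumed ignore index
    safe_labels = labels.clamp(min=0)  # keep gather indices valid on padded positions
    nll_loss = log_probs.gather(dim=-1, index=safe_labels)
    smoothed_loss = log_probs.sum(dim=-1, keepdim=True) / log_probs.size(-1)
    nll_loss.masked_fill_(padding_mask, 0.0)
    smoothed_loss.masked_fill_(padding_mask, 0.0)
    num_active = (~padding_mask).sum()
    return ((1.0 - epsilon) * nll_loss.sum() + epsilon * smoothed_loss.sum()) / num_active


# Usage: label_smoothed_loss(torch.randn(2, 5, 11), torch.randint(0, 11, (2, 5)))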