14 changes: 7 additions & 7 deletions src/transformers/activations.py
@@ -15,8 +15,8 @@
import math

import torch
-import torch.nn.functional as F
from packaging import version
+from torch import nn

from .utils import logging

@@ -28,8 +28,8 @@ def _gelu_python(x):
"""
Original Implementation of the GELU activation function in Google BERT repo when initially created. For
information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
-torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
-torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
+Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
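
For reference only, and not part of this diff: a minimal sketch contrasting the exact erf-based GELU above with the tanh approximation the docstring quotes. Input values are arbitrary.

```python
import math

import torch


def gelu_exact(x):
    # Exact GELU: x * Phi(x), with Phi the standard normal CDF expressed via erf.
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_tanh_approx(x):
    # Tanh approximation quoted in the docstring (the OpenAI GPT / gelu_new style).
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


x = torch.linspace(-3.0, 3.0, steps=7)
print(torch.max(torch.abs(gelu_exact(x) - gelu_tanh_approx(x))))  # small but nonzero difference
```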

@@ -45,7 +45,7 @@ def gelu_new(x):
if version.parse(torch.__version__) < version.parse("1.4"):
gelu = _gelu_python
else:
-gelu = F.gelu
+gelu = nn.functional.gelu


def gelu_fast(x):
@@ -70,19 +70,19 @@ def _silu_python(x):
if version.parse(torch.__version__) < version.parse("1.7"):
silu = _silu_python
else:
-silu = F.silu
+silu = nn.functional.silu


def mish(x):
-return x * torch.tanh(torch.nn.functional.softplus(x))
+return x * torch.tanh(nn.functional.softplus(x))


def linear_act(x):
return x


ACT2FN = {
"relu": F.relu,
"relu": nn.functional.relu,
"silu": silu,
"swish": silu,
"gelu": gelu,
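A hedged usage note rather than anything this PR changes: model code typically resolves an activation by its config name through this mapping. The "gelu" key below is one of the entries shown above; the rest is illustrative and assumes the transformers package is importable.

```python
import torch

from transformers.activations import ACT2FN

act = ACT2FN["gelu"]        # look up the callable by its config name
y = act(torch.randn(2, 4))  # apply it like any other elementwise function
print(y.shape)              # torch.Size([2, 4])
```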
24 changes: 15 additions & 9 deletions src/transformers/generation_utils.py
@@ -20,7 +20,7 @@

import torch
import torch.distributed as dist
-from torch.nn import functional as F
+from torch import nn

from .file_utils import ModelOutput
from .generation_beam_search import BeamScorer, BeamSearchScorer
@@ -1564,7 +1564,7 @@ def sample(
)

# sample
-probs = F.softmax(next_token_scores, dim=-1)
+probs = nn.functional.softmax(next_token_scores, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)

# finished sentences should have their next token be a padding token
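
For orientation, a sketch with made-up sizes (not the generation code itself): the sampling step in `sample` boils down to a softmax over the processed scores followed by `torch.multinomial`.

```python
import torch
from torch import nn

torch.manual_seed(0)
batch_size, vocab_size = 2, 10
next_token_scores = torch.randn(batch_size, vocab_size)   # stand-in for processed logits

probs = nn.functional.softmax(next_token_scores, dim=-1)  # normalize to a distribution per sequence
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
print(next_tokens.shape)  # torch.Size([2]) -- one sampled token id per sequence
```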
@@ -1801,9 +1801,11 @@ def beam_search(
next_token_logits = outputs.logits[:, -1, :]

# hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
-# cannot be generated both before and after the `F.log_softmax` operation.
+# cannot be generated both before and after the `nn.functional.log_softmax` operation.
next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len)
-next_token_scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size)
+next_token_scores = nn.functional.log_softmax(
+next_token_logits, dim=-1
+) # (batch_size * num_beams, vocab_size)

next_token_scores = logits_processor(input_ids, next_token_scores)
next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores)
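
A rough sketch of the scoring step in `beam_search`, with toy sizes and no logits processors. The candidate-selection lines after the `view` are an assumption about what the truncated hunk goes on to do, not code copied from this diff.

```python
import torch
from torch import nn

batch_size, num_beams, vocab_size = 2, 3, 10
logits = torch.randn(batch_size * num_beams, vocab_size)  # stand-in for next_token_logits
beam_scores = torch.zeros(batch_size * num_beams)         # running log-prob of each beam

scores = nn.functional.log_softmax(logits, dim=-1)        # (batch_size * num_beams, vocab_size)
scores = scores + beam_scores[:, None].expand_as(scores)  # add each beam's accumulated score

scores = scores.view(batch_size, num_beams * vocab_size)  # rank all candidates per batch item
topk_scores, topk_ids = torch.topk(scores, 2 * num_beams, dim=1)
beam_index = torch.div(topk_ids, vocab_size, rounding_mode="floor")  # which beam a candidate extends
token_id = topk_ids % vocab_size                                     # which token it appends
print(topk_scores.shape, beam_index.shape, token_id.shape)
```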
@@ -2098,9 +2100,11 @@ def beam_sample(
next_token_logits = outputs.logits[:, -1, :]

# hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
-# cannot be generated both before and after the `F.log_softmax` operation.
+# cannot be generated both before and after the `nn.functional.log_softmax` operation.
next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len)
-next_token_scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size)
+next_token_scores = nn.functional.log_softmax(
+next_token_logits, dim=-1
+) # (batch_size * num_beams, vocab_size)

next_token_scores = logits_processor(input_ids, next_token_scores)
next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores)
@@ -2128,7 +2132,7 @@ def beam_sample(
vocab_size = next_token_scores.shape[-1]
next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)

-probs = F.softmax(next_token_scores, dim=-1)
+probs = nn.functional.softmax(next_token_scores, dim=-1)

next_tokens = torch.multinomial(probs, num_samples=2 * num_beams)
next_token_scores = torch.gather(next_token_scores, -1, next_tokens)
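
The `multinomial` plus `gather` pairing above is the generic "sample k candidate indices, then read back their scores" pattern; a toy illustration with invented values.

```python
import torch

torch.manual_seed(0)
probs = torch.tensor([[0.1, 0.2, 0.7], [0.5, 0.3, 0.2]])  # (batch, candidates), rows sum to 1
scores = probs.log()

picked = torch.multinomial(probs, num_samples=2)   # sample 2 candidate indices per row
picked_scores = torch.gather(scores, -1, picked)   # read the log-scores of exactly those picks
print(picked, picked_scores)
```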
@@ -2426,9 +2430,11 @@ def group_beam_search(
next_token_logits = outputs.logits[batch_group_indices, -1, :]

# hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id`
-# cannot be generated both before and after the `F.log_softmax` operation.
+# cannot be generated both before and after the `nn.functional.log_softmax` operation.
next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len)
-next_token_scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * group_size, vocab_size)
+next_token_scores = nn.functional.log_softmax(
+next_token_logits, dim=-1
+) # (batch_size * group_size, vocab_size)
vocab_size = next_token_scores.shape[-1]

next_token_scores = logits_processor(
3 changes: 2 additions & 1 deletion src/transformers/modeling_fx_utils.py
@@ -4,6 +4,7 @@
from typing import Any, Dict, List, Optional, Union

import torch
+from torch import nn
from torch.fx import Graph, GraphModule, Node, Proxy, Tracer
from torch.fx.node import Argument

@@ -277,7 +278,7 @@ def _insert_module_as_submodule(self, mod):

return path

-def path_of_module(self, mod: torch.nn.Module) -> str:
+def path_of_module(self, mod: nn.Module) -> str:
"""
Helper method to find the qualified name of ``mod`` in the Module hierarchy of ``root``. For example, if
``root`` has a submodule named ``foo``, which has a submodule named ``bar``, passing ``bar`` into this function
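Outside this diff, the qualified-path lookup that `path_of_module` performs can be approximated with `nn.Module.named_modules()`; the `Root` and `Inner` classes below are made up for illustration.

```python
from torch import nn


class Inner(nn.Module):
    def __init__(self):
        super().__init__()
        self.bar = nn.Linear(4, 4)


class Root(nn.Module):
    def __init__(self):
        super().__init__()
        self.foo = Inner()


root = Root()
target = root.foo.bar

# named_modules() yields (qualified_name, module) pairs such as ("foo.bar", Linear(...)).
path = next(name for name, mod in root.named_modules() if mod is target)
print(path)  # "foo.bar"
```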
31 changes: 14 additions & 17 deletions src/transformers/modeling_utils.py
@@ -25,7 +25,6 @@
import torch
from torch import Tensor, device, dtype, nn
from torch.nn import CrossEntropyLoss
-from torch.nn import functional as F

from .activations import get_activation
from .configuration_utils import PretrainedConfig
@@ -355,9 +354,7 @@ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool
"""

def parameter_filter(x):
-return (x.requires_grad or not only_trainable) and not (
-isinstance(x, torch.nn.Embedding) and exclude_embeddings
-)
+return (x.requires_grad or not only_trainable) and not (isinstance(x, nn.Embedding) and exclude_embeddings)

params = filter(parameter_filter, self.parameters()) if only_trainable else self.parameters()
return sum(p.numel() for p in params)
@@ -549,7 +546,7 @@ def tie_encoder_to_decoder_recursively(
):
assert isinstance(decoder_pointer, nn.Module) and isinstance(
encoder_pointer, nn.Module
), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module"
), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module"
if hasattr(decoder_pointer, "weight"):
assert hasattr(encoder_pointer, "weight")
encoder_pointer.weight = decoder_pointer.weight
@@ -613,7 +610,7 @@ def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
output_embeddings.weight = input_embeddings.weight

if getattr(output_embeddings, "bias", None) is not None:
-output_embeddings.bias.data = torch.nn.functional.pad(
+output_embeddings.bias.data = nn.functional.pad(
output_embeddings.bias.data,
(
0,
@@ -625,7 +622,7 @@ def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
output_embeddings.out_features = input_embeddings.num_embeddings
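
As a side note that is not from this PR: the `nn.functional.pad` call above grows a tied bias on the right so it matches a larger vocabulary. A toy version with assumed sizes:

```python
import torch
from torch import nn

old_bias = torch.arange(3, dtype=torch.float)  # bias for a 3-token vocab
new_vocab_size = 5

# Pad only on the right: the (left, right) tuple applies to the last dimension.
new_bias = nn.functional.pad(old_bias, (0, new_vocab_size - old_bias.shape[0]), value=0.0)
print(new_bias)  # tensor([0., 1., 2., 0., 0.])
```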

-def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> torch.nn.Embedding:
+def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
"""
Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.

@@ -668,8 +665,8 @@ def _resize_token_embeddings(self, new_num_tokens):
return self.get_input_embeddings()

def _get_resized_embeddings(
-self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None
-) -> torch.nn.Embedding:
+self, old_embeddings: nn.Embedding, new_num_tokens: Optional[int] = None
+) -> nn.Embedding:
"""
Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly
initialized vectors at the end. Reducing the size will remove vectors from the end
@@ -732,8 +729,8 @@ def _get_resized_embeddings(
return new_embeddings
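
Not part of the diff, but the resize recipe `_get_resized_embeddings` documents amounts to: build a fresh `nn.Embedding`, copy the overlapping rows, and leave any extra rows at their fresh initialization. A sketch with assumed sizes:

```python
import torch
from torch import nn

old = nn.Embedding(5, 8)  # 5 tokens, hidden size 8
new_num_tokens = 7

new = nn.Embedding(new_num_tokens, old.embedding_dim)
num_to_copy = min(old.num_embeddings, new_num_tokens)
with torch.no_grad():
    new.weight[:num_to_copy, :] = old.weight[:num_to_copy, :]  # reuse the trained rows
print(new.weight.shape)  # torch.Size([7, 8])
```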

def _get_resized_lm_head(
-self, old_lm_head: torch.nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False
-) -> torch.nn.Linear:
+self, old_lm_head: nn.Linear, new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False
+) -> nn.Linear:
"""
Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized
vectors at the end. Reducing the size will remove vectors from the end
@@ -1681,7 +1678,7 @@ def forward(
else:
# during inference, compute the end logits based on beam search
bsz, slen, hsz = hidden_states.size()
-start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
+start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen)

start_top_log_probs, start_top_index = torch.topk(
start_log_probs, self.start_n_top, dim=-1
@@ -1695,7 +1692,7 @@
) # shape (bsz, slen, start_n_top, hsz)
p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
-end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
+end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)

end_top_log_probs, end_top_index = torch.topk(
end_log_probs, self.end_n_top, dim=1
@@ -1820,7 +1817,7 @@ def forward(
return output


-def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
+def unwrap_model(model: nn.Module) -> nn.Module:
"""
Recursively unwraps a model from potential containers (as used in distributed training).

@@ -1834,7 +1831,7 @@ def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
return model


-def prune_linear_layer(layer: torch.nn.Linear, index: torch.LongTensor, dim: int = 0) -> torch.nn.Linear:
+def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear:
"""
Prune a linear layer to keep only entries in index.
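
For context, a hedged sketch of what keeping only selected output rows of a linear layer can look like with `index_select`; the dimensions are invented and this is not the library's `prune_linear_layer` implementation.

```python
import torch
from torch import nn

layer = nn.Linear(4, 6)          # 6 output features
index = torch.tensor([0, 2, 5])  # keep only these output rows

pruned = nn.Linear(layer.in_features, len(index))
with torch.no_grad():
    pruned.weight.copy_(layer.weight.index_select(0, index))  # dim 0 indexes output features
    pruned.bias.copy_(layer.bias.index_select(0, index))

x = torch.randn(2, 4)
print(pruned(x).shape)  # torch.Size([2, 3])
```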

@@ -1902,8 +1899,8 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) ->


def prune_layer(
-layer: Union[torch.nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None
-) -> Union[torch.nn.Linear, Conv1D]:
+layer: Union[nn.Linear, Conv1D], index: torch.LongTensor, dim: Optional[int] = None
+) -> Union[nn.Linear, Conv1D]:
"""
Prune a Conv1D or linear layer to keep only entries in index.

2 changes: 1 addition & 1 deletion src/transformers/models/albert/modeling_albert.py
@@ -20,7 +20,7 @@
from typing import Optional, Tuple

import torch
-import torch.nn as nn
+from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
27 changes: 13 additions & 14 deletions src/transformers/models/bart/modeling_bart.py
@@ -20,7 +20,6 @@
from typing import Optional, Tuple

import torch
-import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
@@ -223,7 +222,7 @@ def forward(
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

-attn_weights = F.softmax(attn_weights, dim=-1)
+attn_weights = nn.functional.softmax(attn_weights, dim=-1)

if layer_head_mask is not None:
if layer_head_mask.size() != (self.num_heads,):
@@ -243,7 +242,7 @@ def forward(
else:
attn_weights_reshaped = None

-attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

attn_output = torch.bmm(attn_probs, value_states)

@@ -303,15 +302,15 @@ def forward(
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)

residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
-hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)

@@ -398,7 +397,7 @@ def forward(
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)

@@ -418,7 +417,7 @@
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
)
-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)

@@ -428,9 +427,9 @@ def forward(
# Fully Connected
residual = hidden_states
hidden_states = self.activation_fn(self.fc1(hidden_states))
-hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
hidden_states = self.final_layer_norm(hidden_states)

@@ -661,7 +660,7 @@ class BartEncoder(BartPretrainedModel):

Args:
config: BartConfig
-embed_tokens (torch.nn.Embedding): output embedding
+embed_tokens (nn.Embedding): output embedding
"""

def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
@@ -760,7 +759,7 @@ def forward(

hidden_states = inputs_embeds + embed_pos
hidden_states = self.layernorm_embedding(hidden_states)
-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

# expand attention_mask
if attention_mask is not None:
@@ -826,7 +825,7 @@ class BartDecoder(BartPretrainedModel):

Args:
config: BartConfig
-embed_tokens (torch.nn.Embedding): output embedding
+embed_tokens (nn.Embedding): output embedding
"""

def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
@@ -997,7 +996,7 @@ def forward(
hidden_states = inputs_embeds + positions
hidden_states = self.layernorm_embedding(hidden_states)

-hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

# decoder layers
all_hidden_states = () if output_hidden_states else None
@@ -139,7 +139,7 @@ def __init__(self, config):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
-self.LayerNorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

# position_ids (1, len position emb) is contiguous in memory and exported when serialized