diff --git a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
index bdbedf8630ac..784314b56da3 100644
--- a/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
+++ b/templates/adding_a_new_model/ADD_NEW_MODEL_PROPOSAL_TEMPLATE.md
@@ -711,7 +711,7 @@ defined by the name of the class attribute you give the layer. Let's define a
 dummy model in PyTorch, called `SimpleModel` as follows:
 
 ```python
-import torch.nn as nn
+from torch import nn
 
 class SimpleModel(nn.Module):
     def __init__(self):
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
index e69340c17961..c4e6278459f0 100755
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -1542,7 +1542,6 @@ def forward(
 from typing import Optional, Tuple
 
 import torch
-import torch.nn.functional as F
 from torch import nn
 from torch.nn import CrossEntropyLoss
 
@@ -1743,7 +1742,7 @@ def forward(
             attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
             attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
-        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
@@ -1763,7 +1762,7 @@ def forward(
         else:
             attn_weights_reshaped = None
 
-        attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
         attn_output = torch.bmm(attn_probs, value_states)
 
@@ -1823,15 +1822,15 @@ def forward(
             layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
 
         residual = hidden_states
         hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
 
@@ -1916,7 +1915,7 @@ def forward(
             layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
 
@@ -1936,7 +1935,7 @@ def forward(
                 past_key_value=cross_attn_past_key_value,
                 output_attentions=output_attentions,
             )
-            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
             hidden_states = residual + hidden_states
             hidden_states = self.encoder_attn_layer_norm(hidden_states)
 
@@ -1946,9 +1945,9 @@ def forward(
         # Fully Connected
         residual = hidden_states
         hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
         hidden_states = self.fc2(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
         hidden_states = residual + hidden_states
         hidden_states = self.final_layer_norm(hidden_states)
 
@@ -2171,7 +2170,7 @@ class {{cookiecutter.camelcase_modelname}}Encoder({{cookiecutter.camelcase_model
 
     Args:
         config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
     """
 
     def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
@@ -2270,7 +2269,7 @@ def forward(
 
         hidden_states = inputs_embeds + embed_pos
         hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         # expand attention_mask
         if attention_mask is not None:
@@ -2337,7 +2336,7 @@ class {{cookiecutter.camelcase_modelname}}Decoder({{cookiecutter.camelcase_model
 
     Args:
         config: {{cookiecutter.camelcase_modelname}}Config
-        embed_tokens (torch.nn.Embedding): output embedding
+        embed_tokens (nn.Embedding): output embedding
     """
 
     def __init__(self, config: {{cookiecutter.camelcase_modelname}}Config, embed_tokens: Optional[nn.Embedding] = None):
@@ -2506,7 +2505,7 @@ def forward(
 
         hidden_states = inputs_embeds + positions
         hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
diff --git a/templates/adding_a_new_model/open_model_proposals/ADD_BIG_BIRD.md b/templates/adding_a_new_model/open_model_proposals/ADD_BIG_BIRD.md
index 22450344743e..fea8376a80f9 100644
--- a/templates/adding_a_new_model/open_model_proposals/ADD_BIG_BIRD.md
+++ b/templates/adding_a_new_model/open_model_proposals/ADD_BIG_BIRD.md
@@ -725,7 +725,7 @@ defined by the name of the class attribute you give the layer. Let's define a
 dummy model in PyTorch, called `SimpleModel` as follows:
 
 ```python
-import torch.nn as nn
+from torch import nn
 
 class SimpleModel(nn.Module):
     def __init__(self):