huggingface · sgugger · Jul 6, 2022 · Jul 6, 2022 · Jul 6, 2022 · sgugger
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -526,7 +526,7 @@ def postprocess_text(preds, labels):
 
     # Optimizer
     # Split weights in two groups, one with weight decay and the other not.
-    no_decay = ["bias", "LayerNorm.weight"]
+    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
     optimizer_grouped_parameters = [
         {
             "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],

diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py
@@ -32,7 +32,8 @@
     Seq2SeqLMOutput,
     Seq2SeqModelOutput,
 )
-from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
@@ -247,6 +248,8 @@ def forward(self, hidden_states):
         return self.weight * hidden_states
 
 
+ALL_LAYERNORM_LAYERS.append(LongT5LayerNorm)
+
 try:
     from apex.normalization import FusedRMSNorm
 

diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
@@ -34,7 +34,7 @@
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
 from ...utils import (
     DUMMY_INPUTS,
     DUMMY_MASK,
@@ -262,6 +262,8 @@ def forward(self, hidden_states):
         return self.weight * hidden_states
 
 
+ALL_LAYERNORM_LAYERS.append(T5LayerNorm)
+
 try:
     from apex.normalization import FusedRMSNorm
 

diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py
@@ -21,6 +21,8 @@
 from .utils import logging
 
 
+ALL_LAYERNORM_LAYERS = [nn.LayerNorm]
+
 logger = logging.get_logger(__name__)
 
 is_torch_less_than_1_8 = version.parse(version.parse(torch.__version__).base_version) < version.parse("1.8.0")

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
@@ -71,6 +71,7 @@
 from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
 from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 from .optimization import Adafactor, get_scheduler
+from .pytorch_utils import ALL_LAYERNORM_LAYERS
 from .tokenization_utils_base import PreTrainedTokenizerBase
 from .trainer_callback import (
     CallbackHandler,
@@ -967,7 +968,7 @@ def create_optimizer(self):
         opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
 
         if self.optimizer is None:
-            decay_parameters = get_parameter_names(opt_model, [nn.LayerNorm])
+            decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
             decay_parameters = [name for name in decay_parameters if "bias" not in name]
             optimizer_grouped_parameters = [
                 {