Refactor bias act fusion (#4376)
* Refactor bias act fusion

Signed-off-by: MaximumEntropy <[email protected]>

* Update NMT config

Signed-off-by: MaximumEntropy <[email protected]>

* Update ci tests

Signed-off-by: MaximumEntropy <[email protected]>

* Empty

Signed-off-by: MaximumEntropy <[email protected]>
MaximumEntropy authored Jun 30, 2022
1 parent bb5c59f commit 7e53b32
Showing 9 changed files with 29 additions and 30 deletions.
20 changes: 10 additions & 10 deletions Jenkinsfile
@@ -2115,7 +2115,7 @@ pipeline {
model.num_attention_heads=8 \
model.activation='swiglu' \
model.masked_softmax_fusion=False \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.micro_batch_size=2 \
@@ -2147,7 +2147,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.masked_softmax_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
@@ -2879,7 +2879,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='pre_ln' \
@@ -2904,7 +2904,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='pre_ln' \
@@ -3001,7 +3001,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='normformer' \
@@ -3026,7 +3026,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='normformer' \
@@ -3080,7 +3080,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='reglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
@@ -3102,7 +3102,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='reglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
@@ -3136,7 +3136,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='geglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
@@ -3159,7 +3159,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='geglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
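The CI changes above only rename the override that the tests flip off (model.bias_gelu_fusion becomes model.bias_activation_fusion). As a hedged illustration of how such a dotlist-style override resolves against a config, using OmegaConf, which underlies these Hydra-driven configs; the values below are toy ones, not a full NeMo config:

from omegaconf import OmegaConf

# Toy config fragment with the renamed flag (illustrative values only).
cfg = OmegaConf.create({'model': {'activation': 'swiglu', 'bias_activation_fusion': True}})

# Equivalent of passing `model.bias_activation_fusion=False` on the command line.
override = OmegaConf.from_dotlist(['model.bias_activation_fusion=False'])
cfg = OmegaConf.merge(cfg, override)

assert not cfg.model.bias_activation_fusion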
(changed file: model config YAML; filename not shown in this view)
@@ -71,7 +71,7 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
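For context on what this flag toggles: the bias add from the preceding linear layer and the activation are applied in a single element-wise pass so the JIT can fuse them into one kernel. A minimal sketch, assuming a TorchScript-style fusion in the spirit of the Megatron kernels; the function names are illustrative, not NeMo's exact API:

import torch

@torch.jit.script
def bias_gelu_fused(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Bias add plus tanh-approximated GELU in one scripted function, so the
    # pointwise ops can be fused into a single kernel.
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))

def bias_gelu_unfused(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Reference path: two separate element-wise passes over the tensor.
    return torch.nn.functional.gelu(y + bias)

The rename reflects that the same fusion idea now covers activations other than GELU (e.g. geglu), which is why the comment no longer says "gelu activation".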
(changed file: model config YAML; filename not shown in this view)
@@ -59,7 +59,7 @@ model:
layernorm_epsilon: 1e-5
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
persist_layer_norm: False
bias_gelu_fusion: True
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
bias_dropout_add_fusion: True
masked_softmax_fusion: True
activation: 'gelu'
(changed file: model config YAML; filename not shown in this view)
@@ -72,7 +72,6 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
(changed file: model config YAML; filename not shown in this view)
@@ -53,7 +53,7 @@ model:
megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.

# JIT fusion params.
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

(changed file: model config YAML; filename not shown in this view)
@@ -70,7 +70,7 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
(changed file: model config YAML; filename not shown in this view)
@@ -81,7 +81,7 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
(changed file: Python encoder-decoder model module; filename not shown in this view)
@@ -121,7 +121,11 @@ def setup_optimizer_param_groups(self):

def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
# TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)

if hasattr(self.cfg, 'bias_gelu_fusion'):
logging.warning('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
activation_fusion = self.cfg.bias_gelu_fusion
else:
activation_fusion = self.cfg.get('bias_activation_fusion', True)
model = MegatronTokenLevelEncoderDecoderModule(
encoder_arch=self.cfg.encoder_arch,
decoder_arch=self.cfg.decoder_arch,
@@ -151,10 +155,7 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
activations_checkpoint_num_layers=self.cfg.get('activations_checkpoint_num_layers', 1),
layernorm_epsilon=self.cfg.get('layernorm_epsilon', 1e-5),
persist_layer_norm=self.cfg.get('persist_layer_norm', False),
bias_activation_fusion=(
(self.cfg.get('bias_gelu_fusion', True) and self.cfg.get('activation', 'gelu') == 'gelu')
or (self.cfg.get('bias_activation_fusion', True) and self.cfg.get('activation', 'gelu') == 'geglu')
),
bias_activation_fusion=activation_fusion,
bias_dropout_add_fusion=self.cfg.get('bias_dropout_add_fusion', True),
masked_softmax_fusion=self.cfg.get('masked_softmax_fusion', True),
onnx_safe=self.cfg.get('onnx_safe', False),
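The hasattr fallback in the hunk above keeps configs that still set bias_gelu_fusion working while steering users to the new key. A standalone sketch of the same pattern, with a plain dict standing in for the OmegaConf object and an illustrative helper name:

import warnings

def resolve_bias_activation_fusion(cfg: dict) -> bool:
    """Prefer the new `bias_activation_fusion` key; fall back to the deprecated
    `bias_gelu_fusion` with a warning; default to True when neither is set."""
    if 'bias_gelu_fusion' in cfg:
        warnings.warn('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
        return cfg['bias_gelu_fusion']
    return cfg.get('bias_activation_fusion', True)

print(resolve_bias_activation_fusion({'bias_gelu_fusion': False}))        # False, plus a deprecation warning
print(resolve_bias_activation_fusion({'bias_activation_fusion': False}))  # False
print(resolve_bias_activation_fusion({}))                                 # True (the default)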
17 changes: 8 additions & 9 deletions nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -159,19 +159,20 @@ def __init__(
bias=bias,
)

glu_activation_family = activation in ['reglu', 'swiglu']
self.glu_activation_family = activation in ['geglu', 'reglu', 'swiglu']
bias_activation_fusion_unavailable = activation in ['reglu', 'swiglu']

if glu_activation_family and bias_activation_fusion:
if bias_activation_fusion_unavailable and bias_activation_fusion:
raise ValueError(
f"Cannot use bias_activation_fusion with {activation} activation. Please turn bias gelu fusion off."
)

if glu_activation_family and openai_gelu:
if self.glu_activation_family and openai_gelu:
raise ValueError(
f"Cannot use openai_gelu with specificed activation function : {activation} Please turn openai gelu off."
)

if glu_activation_family and onnx_safe:
if self.glu_activation_family and onnx_safe:
raise ValueError(
f"Cannot use onnx_safe with specificed activation function : {activation} Please turn onnx safe off."
)
@@ -180,8 +181,6 @@ def __init__(
raise ValueError(
f"Cannot use bias_activation_fusion without bias terms. Please set bias=True or bias_activation_fusion=False."
)
else:
glu_activation_family = False

self.bias_activation_fusion = bias_activation_fusion

@@ -224,18 +223,18 @@ def forward(self, hidden_states):
# [s, b, 4hp]
intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)

if self.activation in ['geglu', 'reglu', 'swiglu']:
if self.glu_activation_family:
intermediate_parallel_2, bias_parallel_2 = self.dense_h_to_4h_2(hidden_states)

if self.bias_activation_fusion:
if self.activation == 'gelu':
intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel)
else:
elif self.activation == 'geglu':
intermediate_parallel = fused_bias_geglu(
intermediate_parallel, bias_parallel, intermediate_parallel_2, bias_parallel_2
)

elif self.activation in ['geglu', 'reglu', 'swiglu']:
elif self.activation in ['reglu', 'swiglu']:
if bias_parallel is not None:
intermediate_parallel = self.activation_func(intermediate_parallel + bias_parallel) * (
intermediate_parallel_2 + bias_parallel_2

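The forward branches above are easier to follow with the GLU-family definitions spelled out: the first h-to-4h projection goes through the nonlinearity and the second one acts as a multiplicative gate, which is why dense_h_to_4h_2 exists. A short PyTorch sketch under the standard definitions; tensor names and shapes are illustrative, not NeMo's API:

import torch
import torch.nn.functional as F

def glu_variant(x1: torch.Tensor, b1: torch.Tensor,
                x2: torch.Tensor, b2: torch.Tensor, activation: str) -> torch.Tensor:
    """x1/x2 stand in for the two dense_h_to_4h projection outputs, b1/b2 for their biases.
    Only the first branch is passed through the activation; the second one gates it."""
    gate = x2 + b2
    if activation == 'geglu':
        return F.gelu(x1 + b1) * gate
    if activation == 'reglu':
        return F.relu(x1 + b1) * gate
    if activation == 'swiglu':
        return F.silu(x1 + b1) * gate
    raise ValueError(f'unsupported GLU variant: {activation}')

Of these, only gelu and geglu get a fused bias-plus-activation path in this refactor, which matches the constructor check that rejects bias_activation_fusion for reglu and swiglu.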