Refactor bias act fusion (#4376)
* Refactor bias act fusion

Signed-off-by: MaximumEntropy <[email protected]>

* Update NMT config

Signed-off-by: MaximumEntropy <[email protected]>

* Update ci tests

Signed-off-by: MaximumEntropy <[email protected]>

* Empty

Signed-off-by: MaximumEntropy <[email protected]>
MaximumEntropy authored Jun 30, 2022
1 parent bb5c59f commit 7e53b32
Showing 9 changed files with 29 additions and 30 deletions.
20 changes: 10 additions & 10 deletions Jenkinsfile
@@ -2115,7 +2115,7 @@ pipeline {
model.num_attention_heads=8 \
model.activation='swiglu' \
model.masked_softmax_fusion=False \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.micro_batch_size=2 \
@@ -2147,7 +2147,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.masked_softmax_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
@@ -2879,7 +2879,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='pre_ln' \
@@ -2904,7 +2904,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='pre_ln' \
@@ -3001,7 +3001,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='normformer' \
@@ -3026,7 +3026,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='swiglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.transformer_block_type='normformer' \
@@ -3080,7 +3080,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='reglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
@@ -3102,7 +3102,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='reglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
@@ -3136,7 +3136,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='geglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
@@ -3159,7 +3159,7 @@ pipeline {
model.hidden_size=64 \
model.num_attention_heads=8 \
model.activation='geglu' \
model.bias_gelu_fusion=False \
model.bias_activation_fusion=False \
model.activations_checkpoint_method='block' \
model.activations_checkpoint_num_layers=1 \
model.data.data_prefix=[.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document,.5,/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document]"
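The CI changes above only rename the override that the tests flip off (model.bias_gelu_fusion becomes model.bias_activation_fusion). As a hedged illustration of how such a dotlist-style override resolves against a config, using OmegaConf, which underlies these Hydra-driven configs; the values below are toy ones, not a full NeMo config:

from omegaconf import OmegaConf

# Toy config fragment with the renamed flag (illustrative values only).
cfg = OmegaConf.create({'model': {'activation': 'swiglu', 'bias_activation_fusion': True}})

# Equivalent of passing `model.bias_activation_fusion=False` on the command line.
override = OmegaConf.from_dotlist(['model.bias_activation_fusion=False'])
cfg = OmegaConf.merge(cfg, override)

assert not cfg.model.bias_activation_fusion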
(changed file: model config YAML; filename not shown in this view)
@@ -71,7 +71,7 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
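For context on what this flag toggles: the bias add from the preceding linear layer and the activation are applied in a single element-wise pass so the JIT can fuse them into one kernel. A minimal sketch, assuming a TorchScript-style fusion in the spirit of the Megatron kernels; the function names are illustrative, not NeMo's exact API:

import torch

@torch.jit.script
def bias_gelu_fused(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Bias add plus tanh-approximated GELU in one scripted function, so the
    # pointwise ops can be fused into a single kernel.
    x = bias + y
    return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1.0 + 0.044715 * x * x)))

def bias_gelu_unfused(bias: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    # Reference path: two separate element-wise passes over the tensor.
    return torch.nn.functional.gelu(y + bias)

The rename reflects that the same fusion idea now covers activations other than GELU (e.g. geglu), which is why the comment no longer says "gelu activation".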
(changed file: model config YAML; filename not shown in this view)
@@ -59,7 +59,7 @@ model:
layernorm_epsilon: 1e-5
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
persist_layer_norm: False
bias_gelu_fusion: True
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
bias_dropout_add_fusion: True
masked_softmax_fusion: True
activation: 'gelu'
(changed file: model config YAML; filename not shown in this view)
@@ -72,7 +72,6 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
(changed file: model config YAML; filename not shown in this view)
@@ -53,7 +53,7 @@ model:
megatron_amp_O2: False # use AMP with O2 style mixed precision instead of native amp on-the-fly weight autocasting.

# JIT fusion params.
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.

(changed file: model config YAML; filename not shown in this view)
@@ -70,7 +70,7 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
(changed file: model config YAML; filename not shown in this view)
@@ -81,7 +81,7 @@ model:
layernorm_epsilon: 1e-5
persist_layer_norm: True # Use of persistent fused layer norm kernel.
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
bias_gelu_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent gelu activation.
bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function.
masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with its mask.
bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition.
bias: True # Whether to use bias terms in all weight matrices.
(changed file: Python encoder-decoder model module; filename not shown in this view)
@@ -121,7 +121,11 @@ def setup_optimizer_param_groups(self):

def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
# TODO: create get_encoder_decoder_model() here for different losses (e.g., nll, vae, mim)

if hasattr(self.cfg, 'bias_gelu_fusion'):
logging.warning('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
activation_fusion = self.cfg.bias_gelu_fusion
else:
activation_fusion = self.cfg.get('bias_activation_fusion', True)
model = MegatronTokenLevelEncoderDecoderModule(
encoder_arch=self.cfg.encoder_arch,
decoder_arch=self.cfg.decoder_arch,
@@ -151,10 +155,7 @@ def model_provider_func(self, pre_process, post_process, add_encoder, add_decoder):
activations_checkpoint_num_layers=self.cfg.get('activations_checkpoint_num_layers', 1),
layernorm_epsilon=self.cfg.get('layernorm_epsilon', 1e-5),
persist_layer_norm=self.cfg.get('persist_layer_norm', False),
bias_activation_fusion=(
(self.cfg.get('bias_gelu_fusion', True) and self.cfg.get('activation', 'gelu') == 'gelu')
or (self.cfg.get('bias_activation_fusion', True) and self.cfg.get('activation', 'gelu') == 'geglu')
),
bias_activation_fusion=activation_fusion,
bias_dropout_add_fusion=self.cfg.get('bias_dropout_add_fusion', True),
masked_softmax_fusion=self.cfg.get('masked_softmax_fusion', True),
onnx_safe=self.cfg.get('onnx_safe', False),
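The hasattr fallback in the hunk above keeps configs that still set bias_gelu_fusion working while steering users to the new key. A standalone sketch of the same pattern, with a plain dict standing in for the OmegaConf object and an illustrative helper name:

import warnings

def resolve_bias_activation_fusion(cfg: dict) -> bool:
    """Prefer the new `bias_activation_fusion` key; fall back to the deprecated
    `bias_gelu_fusion` with a warning; default to True when neither is set."""
    if 'bias_gelu_fusion' in cfg:
        warnings.warn('bias_gelu_fusion is deprecated. Please use bias_activation_fusion instead.')
        return cfg['bias_gelu_fusion']
    return cfg.get('bias_activation_fusion', True)

print(resolve_bias_activation_fusion({'bias_gelu_fusion': False}))        # False, plus a deprecation warning
print(resolve_bias_activation_fusion({'bias_activation_fusion': False}))  # False
print(resolve_bias_activation_fusion({}))                                 # True (the default)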
17 changes: 8 additions & 9 deletions nemo/collections/nlp/modules/common/megatron/transformer.py
@@ -159,19 +159,20 @@ def __init__(
bias=bias,
)

glu_activation_family = activation in ['reglu', 'swiglu']
self.glu_activation_family = activation in ['geglu', 'reglu', 'swiglu']
bias_activation_fusion_unavailable = activation in ['reglu', 'swiglu']

if glu_activation_family and bias_activation_fusion:
if bias_activation_fusion_unavailable and bias_activation_fusion:
raise ValueError(
f"Cannot use bias_activation_fusion with {activation} activation. Please turn bias gelu fusion off."
)

if glu_activation_family and openai_gelu:
if self.glu_activation_family and openai_gelu:
raise ValueError(
f"Cannot use openai_gelu with specificed activation function : {activation} Please turn openai gelu off."
)

if glu_activation_family and onnx_safe:
if self.glu_activation_family and onnx_safe:
raise ValueError(
f"Cannot use onnx_safe with specificed activation function : {activation} Please turn onnx safe off."
)
@@ -180,8 +181,6 @@ def __init__(
raise ValueError(
f"Cannot use bias_activation_fusion without bias terms. Please set bias=True or bias_activation_fusion=False."
)
else:
glu_activation_family = False

self.bias_activation_fusion = bias_activation_fusion

@@ -224,18 +223,18 @@ def forward(self, hidden_states):
# [s, b, 4hp]
intermediate_parallel, bias_parallel = self.dense_h_to_4h(hidden_states)

if self.activation in ['geglu', 'reglu', 'swiglu']:
if self.glu_activation_family:
intermediate_parallel_2, bias_parallel_2 = self.dense_h_to_4h_2(hidden_states)

if self.bias_activation_fusion:
if self.activation == 'gelu':
intermediate_parallel = fused_bias_gelu(intermediate_parallel, bias_parallel)
else:
elif self.activation == 'geglu':
intermediate_parallel = fused_bias_geglu(
intermediate_parallel, bias_parallel, intermediate_parallel_2, bias_parallel_2
)

elif self.activation in ['geglu', 'reglu', 'swiglu']:
elif self.activation in ['reglu', 'swiglu']:
if bias_parallel is not None:
intermediate_parallel = self.activation_func(intermediate_parallel + bias_parallel) * (
intermediate_parallel_2 + bias_parallel_2

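The forward branches above are easier to follow with the GLU-family definitions spelled out: the first h-to-4h projection goes through the nonlinearity and the second one acts as a multiplicative gate, which is why dense_h_to_4h_2 exists. A short PyTorch sketch under the standard definitions; tensor names and shapes are illustrative, not NeMo's API:

import torch
import torch.nn.functional as F

def glu_variant(x1: torch.Tensor, b1: torch.Tensor,
                x2: torch.Tensor, b2: torch.Tensor, activation: str) -> torch.Tensor:
    """x1/x2 stand in for the two dense_h_to_4h projection outputs, b1/b2 for their biases.
    Only the first branch is passed through the activation; the second one gates it."""
    gate = x2 + b2
    if activation == 'geglu':
        return F.gelu(x1 + b1) * gate
    if activation == 'reglu':
        return F.relu(x1 + b1) * gate
    if activation == 'swiglu':
        return F.silu(x1 + b1) * gate
    raise ValueError(f'unsupported GLU variant: {activation}')

Of these, only gelu and geglu get a fused bias-plus-activation path in this refactor, which matches the constructor check that rejects bias_activation_fusion for reglu and swiglu.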