bigscience-workshop · thomasw21 · Sep 16, 2021 · Mar 10, 2021 · Mar 11, 2021 · Mar 19, 2021
diff --git a/megatron/arguments.py b/megatron/arguments.py
@@ -689,9 +689,11 @@ def _add_data_args(parser):
                        help='Reset posistion ids after end-of-document token.')
     group.add_argument('--reset-attention-mask', action='store_true',
                        help='Reset self attention maske after '
-                       'end-of-document token.')
+                       'end-of-document token. Attention between tokens from different documents is null.')
     group.add_argument('--eod-mask-loss', action='store_true',
                        help='Mask loss for the end of document tokens.')
+    group.add_argument('--loss-on-targets-only', action='store_true',
+                       help='Mask loss on input sequence.')
 
     return parser
 

diff --git a/megatron/enums.py b/megatron/enums.py
@@ -26,6 +26,7 @@ class AttnType(enum.Enum):
 class AttnMaskType(enum.Enum):
     padding = 1
     causal = 2
+    prefix = 3
 
 class PositionEmbeddingType(enum.Enum):
     rotary = 1

diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
@@ -92,6 +92,7 @@ class FusedScaleMaskSoftmax(torch.nn.Module):
         scale: scaling factor used in input tensor scaling.
 
     """
+    custom_kernel_friendly_attn_mask_type = [AttnMaskType.causal, AttnMaskType.padding]
 
     def __init__(
         self,
@@ -134,7 +135,8 @@ def forward(self, input, mask):
 
         # invoke custom kernel
         if self.input_in_float16 and mask is not None and \
-            custom_kernel_constraint and self.scaled_masked_softmax_fusion:
+            custom_kernel_constraint and self.scaled_masked_softmax_fusion and \
+            self.attn_mask_type in self.custom_kernel_friendly_attn_mask_type:
             scale = self.scale if self.scale is not None else 1.0
 
             if self.attn_mask_type == AttnMaskType.causal:

diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py
@@ -67,11 +67,14 @@ def post_language_model_processing(lm_output, labels, logit_weights,
 class GPTModel(MegatronModule):
     """GPT-2 Language model."""
 
-    def __init__(self,
-                 num_tokentypes=0,
-                 parallel_output=True,
-                 pre_process=True,
-                 post_process=True):
+    def __init__(
+        self,
+        num_tokentypes=0,
+        parallel_output=True,
+        pre_process=True,
+        post_process=True,
+        prefix_lm=False,
+    ):
         super(GPTModel, self).__init__()
         args = get_args()
 
@@ -83,7 +86,8 @@ def __init__(self,
         self.language_model, self._language_model_key = get_language_model(
             num_tokentypes=num_tokentypes,
             add_pooler=False,
-            encoder_attn_mask_type=AttnMaskType.causal,
+            # TODO: Change naming of class from GPT to something that encapsulates prefix lm.
+            encoder_attn_mask_type=AttnMaskType.prefix if prefix_lm else AttnMaskType.causal,
             init_method=init_method_normal(args.init_method_std),
             scaled_init_method=scaled_init_method_normal(args.init_method_std,
                                                          args.num_layers),
@@ -157,9 +161,12 @@ def CrossEntropy(output, labels):
 class GPTModelPipe(PipelineModule,MegatronModule):
     """GPT-2 Language model."""
 
-    def __init__(self,
-                 num_tokentypes=0,
-                 parallel_output=True):
+    def __init__(
+        self,
+        num_tokentypes=0,
+        parallel_output=True,
+        prefix_lm=False
+    ):
         args = get_args()
         self.parallel_output = parallel_output
 
@@ -199,7 +206,8 @@ def _to_float16(inputs):
                     output_layer_init_method=scaled_init_method_normal(args.init_method_std,
                                                                        args.num_layers),
                     layer_number=layer_idx,
-                    self_attn_mask_type=AttnMaskType.causal))
+                    # TODO: Change naming of class from GPT to something that encapsulates prefix lm.
+                    self_attn_mask_type=AttnMaskType.prefix if prefix_lm else AttnMaskType.causal))
 
 
         # Undo data format change

diff --git a/megatron/text_generation_utils.py b/megatron/text_generation_utils.py
@@ -41,13 +41,16 @@ def get_batch(context_tokens):
 
     # Move to GPU.
     tokens = context_tokens.view(args.micro_batch_size, -1).contiguous().cuda()
-    # Get the attention mask and postition ids.
+    # Get the attention mask and position ids.
     attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
+        args.eod_mask_loss,
+        prefix_indices=None,
+        loss_on_targets_only=args.loss_on_targets_only
+    )
 
     return tokens, attention_mask, position_ids
 

diff --git a/megatron/utils.py b/megatron/utils.py
@@ -16,6 +16,7 @@
 """General utilities."""
 
 import sys
+from random import randint
 
 import torch
 from torch.nn.parallel import DistributedDataParallel as torchDDP
@@ -144,12 +145,23 @@ def check_adlr_autoresume_termination(iteration, model,
         sys.exit(0)
 
 
-def get_ltor_masks_and_position_ids(data,
-                                    eod_token,
-                                    reset_position_ids,
-                                    reset_attention_mask,
-                                    eod_mask_loss):
-    """Build masks and position id for left to right model."""
+def get_ltor_masks_and_position_ids(
+        data,
+        eod_token,
+        reset_position_ids,
+        reset_attention_mask,
+        eod_mask_loss,
+        prefix_indices,
+        loss_on_targets_only,
+    ):
+    """
+    Build masks and position id for left to right model.
+    :param prefix_indices: argument can have multiple types:
+        - None signifies that the model is fully autoregressive.
+        - List[int] the argument holds all prefix indices that split a row into an input and a target
+        - List[List[int]] the argument holds all prefix indices that split documents between input and target.
+    :param loss_on_targets_only: bool to determine if we should mask loss on prefix.
+    """
 
     # Extract batch size and sequence length.
     micro_batch_size, seq_length = data.size()
@@ -182,6 +194,14 @@ def get_ltor_masks_and_position_ids(data,
 
             # Find indecies where EOD token is.
             eod_index = position_ids[b, data[b] == eod_token]
+
+            # If the last eod token is not the last token of the sequence, we suppose that there is a partial document
+            # We treat this case as if we add an eod token at the end of the sequence.
+            if data[b][-1] != eod_token:
+                eod_index = torch.cat(
+                    (eod_index, torch.tensor([len(data[b])], dtype=eod_index.dtype, device=eod_index.device))
+                )
+
             # Detach indecies from positions if going to modify positions.
             if reset_position_ids:
                 eod_index = eod_index.clone()
@@ -190,13 +210,31 @@ def get_ltor_masks_and_position_ids(data,
             prev_index = 0
             for j in range(eod_index.size()[0]):
                 i = eod_index[j]
-                # Mask attention loss.
+
                 if reset_attention_mask:
+                    # Prevent cross document interactions.
                     attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
+
+                    # Prefix lm per document.
+                    if prefix_indices:
+                        assert isinstance(prefix_indices[b], list), f"prefix for a row has to be document specific, and consequently return a list, got {prefix_indices[b]}"
+                        attention_mask[b, 0, prev_index: prefix_indices[b][j], prev_index: prefix_indices[b][j]] = 1
+                        if loss_on_targets_only:
+                            loss_mask[b, prev_index: prefix_indices[b][j]] = 0.0
+
                 # Reset positions.
                 if reset_position_ids:
                     position_ids[b, (i + 1):] -= (i + 1 - prev_index)
-                    prev_index = i + 1
+
+                prev_index = i + 1
+
+            # Prefix lm per row.
+            if prefix_indices is not None and (reset_attention_mask is False):
+                assert isinstance(prefix_indices[b], int), \
+                    f"prefix for a row has to be row specific, and consequently return an int, got {prefix_indices[b]}"
+                attention_mask[b, 0, :prefix_indices[b], :prefix_indices[b]] = 1
+                if loss_on_targets_only:
+                    loss_mask[b, :prefix_indices[b]] = 0.0
 
     # Convert attention mask to binary:
     attention_mask = (attention_mask < 0.5)
@@ -226,3 +264,76 @@ def flops_calculator(model, args, iteration_time):
     effective_tera_flops_per_gpu = giga_flops_per_model_per_train_step / (iteration_time * 1000.0 * gpus_per_model)
 
     print_rank_0(f"Effective Tera Flops per GPU: {round(effective_tera_flops_per_gpu, 2)} and total parameters {round(approx_parameters_in_billions, 3)} B")
+
+def get_prefix_indices(data, eod_token, partial_prefix_indices, reset_attention_mask):
+    """
+    Choose the index splitting input (prefix) from target for every row, or for every document of every row.
+    Entries left unconstrained are sampled randomly, provided entries are validated.
+
+    :param data: torch.Tensor, unpacked as (micro_batch_size, seq_length)
+    :param eod_token: int, token_id used to signal end of document
+    :param partial_prefix_indices: this argument can have multiple types:
+        - None, it signals that all prefix indices are randomly sampled.
+        - List[Optional[int]], its length has to be equal to micro batch size. It stores all the indices for per row prefix.
+            Optional means that if set to None, we allow ourselves to sample one randomly.
+        - List[List[Optional[int]]], it follows the following rules:
+            - The first dimension refers to that sample, ie len(partial_prefix_indices) == len(data)
+            - The second dimension refers to the number of document of that sample, ie
+                len(partial_prefix_indices[b]) == (data[b] == eod_token).sum() (+1 for the last partial document).
+            - Provided values have to satisfy prev_index <= prefix_index < eod_index, prev_index being the first
+                position of the document and eod_index the position of its eod token.
+            - Optional means that if set to None, we allow ourselves to sample one randomly.
+    :param reset_attention_mask: bool, determines if prefixes are to be per document or per row.
+    :return Depending if prefix is per document or per row, the method returns:
+        - List[List[int]]: prefix indices for each document in case of per document prefix
+        - List[int]: prefix indices for rows else.
+    :raises AssertionError: if partial_prefix_indices has a wrong length, a wrong type or an out of range value.
+    """
+    micro_batch_size, seq_length = data.size()
+    prefix_indices = []
+
+    assert partial_prefix_indices is None or len(partial_prefix_indices) == micro_batch_size, f"partial_prefix_indices has to be None or its length equal to {micro_batch_size}, got {len(partial_prefix_indices)}"
+    for batch_id in range(micro_batch_size):
+        # None (for the whole argument or for this row) means that every index of the row is sampled.
+        row_constraints = None if partial_prefix_indices is None else partial_prefix_indices[batch_id]
+
+        # Prefix lm per document.
+        if reset_attention_mask:
+            # Positions of all eod tokens of the row, converted once to python ints.
+            eod_indices = (data[batch_id] == eod_token).nonzero().squeeze(-1).tolist()
+
+            # If the last eod token is not the last token of the sequence, we suppose that there is a partial document
+            # We treat this case as if we add an eod token at the end of the sequence.
+            if not eod_indices or eod_indices[-1] != seq_length - 1:
+                eod_indices.append(seq_length)
+
+            # Check the type before calling len() so that a wrong type fails with the matching message.
+            assert row_constraints is None or isinstance(row_constraints, list), f"Per document prefix has to store a list on indices for each row, got {row_constraints}"
+            assert row_constraints is None or len(row_constraints) == len(eod_indices), f"The number of prefixes has to match the number of documents, complete or partial. Got {len(row_constraints)} prefixes and {len(eod_indices)} documents"
+
+            row_prefix_indices = []
+            prev_index = 0
+            for doc_id, eod_index in enumerate(eod_indices):
+                if row_constraints is None or row_constraints[doc_id] is None:
+                    # NOTE(review): randint bounds are inclusive, a sampled index may equal eod_index; confirm intent.
+                    prefix_index = randint(prev_index, eod_index)
+                else:
+                    # We get value from partial_prefix_indices, and run validation on that value
+                    prefix_index = row_constraints[doc_id]
+                    assert prev_index <= prefix_index < eod_index, f"Prefix index needs to be between documents indices, {prev_index} <= {prefix_index} < {eod_index} should be True."
+
+                row_prefix_indices.append(prefix_index)
+                prev_index = eod_index + 1
+            prefix_indices.append(row_prefix_indices)
+
+        # Prefix lm per row: a single int is stored per row, which yields the documented flat List[int].
+        else:
+            # None is a valid entry here, it requests a randomly sampled prefix index for that row.
+            assert row_constraints is None or isinstance(row_constraints, int), f"Per row prefix has to store an int or None for each row, got {row_constraints}"
+            if row_constraints is None:
+                prefix_index = randint(0, seq_length - 1)
+            else:
+                prefix_index = row_constraints
+                assert 0 <= prefix_index < seq_length - 1, f"Prefix index needs to be between documents indices, 0 <= {prefix_index} < {seq_length - 1} should be True."
+            prefix_indices.append(prefix_index)
+    return prefix_indices
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
@@ -25,7 +25,7 @@
 from megatron.data.gpt_dataset import build_train_valid_test_datasets
 from megatron.model import GPTModel, GPTModelPipe
 from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids
+from megatron.utils import get_ltor_masks_and_position_ids, get_prefix_indices
 from megatron.utils import average_losses_across_data_parallel_group
 
 import deepspeed
@@ -40,6 +40,7 @@ def model_provider(pre_process=True, post_process=True):
     see_memory_usage(f"Before Building Model", force=True)
 
     args = get_args()
+
     with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
                              remote_device=None if args.remote_device=='none' else args.remote_device,
                              config=args.deepspeed_config,
@@ -53,7 +54,7 @@ def model_provider(pre_process=True, post_process=True):
             # We need to call model.set_batch_fn after deepspeed.initialize
             model._megatron_batch_fn = get_batch_pipe
 
-            # Predompute the attention mask and store it in args. This avoids having to
+            # Precompute the attention mask and store it in args. This avoids having to
             # pipeline it as an activation during training. The mask is constant, and thus
             # we can reuse it.
             attention_mask = torch.tril(torch.ones(
@@ -108,7 +109,10 @@ def get_batch(data_iterator):
         tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
+        args.eod_mask_loss,
+        prefix_indices=None,
+        loss_on_targets_only=args.loss_on_targets_only
+    )
 
     return tokens, labels, loss_mask, attention_mask, position_ids
 
@@ -129,13 +133,16 @@ def get_batch_pipe(data):
     labels = tokens_[:, 1:].contiguous()
     tokens = tokens_[:, :-1].contiguous()
 
-    # Get the masks and postition ids.
+    # Get the masks and position ids.
     attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
         tokens,
         tokenizer.eod,
         args.reset_position_ids,
         args.reset_attention_mask,
-        args.eod_mask_loss)
+        args.eod_mask_loss,
+        prefix_indices=None,
+        loss_on_targets_only=args.loss_on_targets_only
+    )
 
     return (tokens, position_ids, attention_mask), (labels, loss_mask)