T5 prompt learning fixes (#4771)
* RPE, hidden size and config fixes

Signed-off-by: MaximumEntropy <[email protected]>

* Update to reflect new config names

Signed-off-by: MaximumEntropy <[email protected]>

* Sentencepiece fixes

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

* Fix finetuning

Signed-off-by: MaximumEntropy <[email protected]>

* Add encoder seq len to gpt

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

* Add finetune eval script

Signed-off-by: MaximumEntropy <[email protected]>

* Fix name

Signed-off-by: MaximumEntropy <[email protected]>

* Update Jenkinsfile

Signed-off-by: MaximumEntropy <[email protected]>

* Update config

Signed-off-by: MaximumEntropy <[email protected]>

* Fix CI test

Signed-off-by: MaximumEntropy <[email protected]>

* Update check

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

* Backward compat

Signed-off-by: MaximumEntropy <[email protected]>

* Update CI test

Signed-off-by: MaximumEntropy <[email protected]>

* Split rank for Enc-Dec models

Signed-off-by: MaximumEntropy <[email protected]>

* Address comments

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

Signed-off-by: MaximumEntropy <[email protected]>
Co-authored-by: Virginia Adams <[email protected]>
2 people authored and XuesongYang committed Sep 12, 2022
1 parent d44bdd7 commit be60005
Showing 13 changed files with 169 additions and 62 deletions.
8 changes: 4 additions & 4 deletions Jenkinsfile
@@ -3433,7 +3433,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.max_steps=6 \
trainer.max_epochs=null \
model.tensor_model_parallel_size=1 \
model.pretrained_language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
model.existing_tasks=[] \
model.new_tasks=['squad'] \
model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
@@ -3443,7 +3443,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test"
sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \
virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo' \
pretrained_language_model_file='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
data.global_batch_size=4 \
data.micro_batch_size=4"
@@ -3459,7 +3459,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.max_steps=6 \
trainer.max_epochs=null \
model.tensor_model_parallel_size=2 \
model.pretrained_language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
model.existing_tasks=[] \
model.new_tasks=['squad'] \
model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
@@ -3469,7 +3469,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2"
sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \
virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2.nemo' \
pretrained_language_model_file='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -41,7 +41,7 @@ model:
seed: 1234
nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference'
encoder_seq_length: 2048
encoder_seq_length: 2048
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism
global_batch_size: 8
@@ -0,0 +1,44 @@
name: megatron_t5_finetune_eval

trainer:
devices: 1
num_nodes: 1
accelerator: gpu
precision: 16
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
benchmark: False

exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_t5_finetune_eval
create_checkpoint_callback: False

model:
restore_from_path: ??? # Path to a finetuned T5 .nemo file
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
megatron_amp_O2: False # Enable O2 optimization for megatron amp

data:
validation_ds:
src_file_name: null # Path to the txt file corresponding to the source data.
tgt_file_name: null # Path to the txt file corresponding to the target data.
names: null # If src/tgt file names are ListConfigs, the corresponding label is used to log metrics.
global_batch_size: 64
micro_batch_size: 64
shuffle: False
num_workers: 0
pin_memory: True
max_src_seq_length: 512
max_tgt_seq_length: 128
drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
write_predictions_to_file: False
output_file_path_prefix: null # Prefix of the file to write predictions to.
metric:
name: "exact_string_match" # Name of the evaluation metric to use.
average: micro # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null # Number of classes for the metric. Works only for 'F1', 'accuracy' and 'average_precision' etc. Refer to torchmetrics for metrics where this is supported.
class_labels: null # If the targets in your dataset are strings and not integers/float, you need to provide a list of class labels (size = num_classes) so we can convert from strings to integer categories to compute the metric.
labels_are_strings: True # NOTE: This is only required to properly handle metrics like f1, accuracy, average_precision etc. This does not affect exact_string_match.
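
For orientation, a minimal driver sketch of how a config like the one above could be consumed. This is not the script added by this commit; the class name and the restore/override pattern come from the diffs further down, while the import path, config filename, and trainer wiring are assumptions.

# Hedged sketch only: assumes NeMo and PyTorch Lightning are installed and that the YAML above
# is saved locally as megatron_t5_finetune_eval.yaml. Distributed/Megatron setup is omitted.
from omegaconf import OmegaConf
from pytorch_lightning import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel

cfg = OmegaConf.load("megatron_t5_finetune_eval.yaml")
trainer = Trainer(devices=cfg.trainer.devices, accelerator=cfg.trainer.accelerator, precision=cfg.trainer.precision)

# Pull the stored config out of the finetuned checkpoint, swap in the evaluation data config,
# then restore the model with the patched config (same pattern as megatron_t5_seq2seq_eval.py below).
t5_cfg = MegatronT5FinetuneModel.restore_from(cfg.model.restore_from_path, trainer=trainer, return_config=True)
OmegaConf.set_struct(t5_cfg, False)
t5_cfg.data = cfg.model.data
model = MegatronT5FinetuneModel.restore_from(cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg)
trainer.validate(model)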
@@ -33,4 +33,8 @@ model:
max_seq_length: 512
drop_last: False
write_predictions_to_file: False
prediction_file_path_prefix: null # Prefix of the file to write predictions to.
output_file_path_prefix: null # Prefix of the file to write predictions to.
metric:
name: "exact_string_match" # Name of the evaluation metric to use.
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null
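
The "exact_string_match" metric named above is, in essence, the fraction of predictions that match their targets exactly. A toy, framework-free sketch of that semantics (an assumption for illustration; the real metric lives inside NeMo and may normalise differently):

# Toy illustration of exact string match; NeMo's actual implementation may differ in details
# such as whitespace or case handling.
def exact_string_match(preds, labels):
    assert len(preds) == len(labels)
    hits = sum(p.strip() == t.strip() for p, t in zip(preds, labels))
    return hits / max(len(labels), 1)

print(exact_string_match(["Paris", "1947"], ["Paris", "1948"]))  # 0.5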
@@ -30,23 +30,22 @@ exp_manager:
monitor: val_loss
save_top_k: 2
mode: min
save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.virtual_prompt_save_path set below
save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
filename: "megatron_t5_prompt_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
model_parallel_size: ${model.tensor_model_parallel_size}
save_best_model: True

model:
seed: 1234
virtual_prompt_save_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
encoder_seq_length: 2048
global_batch_size: 8
micro_batch_size: 8

restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
pretrained_language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
existing_tasks: []
new_tasks: ["squad"]

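Because this hunk renames model.pretrained_language_model_path to model.language_model_path and model.virtual_prompt_save_path to model.nemo_path, user configs written against the old names need a one-off rename. A hedged sketch of such a migration helper (illustrative only, not part of NeMo):

# Illustrative helper (not part of NeMo): rename old prompt-learning keys to the new ones above.
from omegaconf import OmegaConf, open_dict

RENAMES = {
    "pretrained_language_model_path": "language_model_path",
    "virtual_prompt_save_path": "nemo_path",
}

def migrate_prompt_learning_model_cfg(model_cfg):
    with open_dict(model_cfg):
        for old_key, new_key in RENAMES.items():
            if old_key in model_cfg and new_key not in model_cfg:
                model_cfg[new_key] = model_cfg.pop(old_key)
    return model_cfg

old_style = OmegaConf.create({"pretrained_language_model_path": "t5.nemo", "virtual_prompt_save_path": "p_tune.nemo"})
print(migrate_prompt_learning_model_cfg(old_style))
# {'language_model_path': 't5.nemo', 'nemo_path': 'p_tune.nemo'}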
@@ -14,7 +14,7 @@ data:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model
pretrained_language_model_file: ??? # path to a pretrained T5 nemo file
language_model_path: ??? # path to a pretrained T5 nemo file
virtual_prompt_model_file: ??? # path to a MegatronT5PromptLearningModel nemo file


24 changes: 24 additions & 0 deletions examples/nlp/language_modeling/megatron_ckpt_to_nemo.py
@@ -72,6 +72,13 @@ def get_args():
parser.add_argument("--gpus_per_node", type=int, required=True, default=None)
parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None)
parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None)
parser.add_argument(
"--pipeline_model_parallel_split_rank",
type=int,
required=False,
default=None,
help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.",
)
parser.add_argument(
"--model_type", type=str, required=True, default="gpt", choices=["gpt", "t5", "bert", "nmt", "bart", "retro"]
)
@@ -96,11 +103,28 @@ def convert(local_rank, rank, world_size, args):

app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
# Auto set split rank for T5, BART, NMT if split rank is None.
if (
args.pipeline_model_parallel_size > 1
and args.pipeline_model_parallel_split_rank is None
and args.model_type in ['t5', 'bart', 'nmt']
):
if args.pipeline_model_parallel_split_rank is not None:
app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_split_rank
else:
if args.pipeline_model_parallel_size % 2 != 0:
raise ValueError(
f"Pipeline model parallel size {args.pipeline_model_parallel_size} must be even if split rank is not specified."
)
else:
# If split rank is not set, default it to pipeline_model_parallel_size // 2, since in most cases the encoder and decoder have the same number of layers.
app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_size // 2
app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

parallel_state.initialize_model_parallel(
tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
pipeline_model_parallel_split_rank_=app_state.pipeline_model_parallel_split_rank,
)

app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
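The new logic above boils down to a simple defaulting rule for enc-dec models (t5/bart/nmt): an explicit --pipeline_model_parallel_split_rank wins; otherwise an even pipeline size is split down the middle, and an odd size is rejected. A standalone restatement of that rule (this mirrors, but is not, the script's code path):

# Standalone restatement of the split-rank defaulting rule for enc-dec models.
def resolve_split_rank(pipeline_model_parallel_size, split_rank=None):
    if pipeline_model_parallel_size <= 1:
        return None  # no pipeline split needed
    if split_rank is not None:
        return split_rank  # an explicit value always wins
    if pipeline_model_parallel_size % 2 != 0:
        raise ValueError(
            f"Pipeline model parallel size {pipeline_model_parallel_size} must be even if split rank is not specified."
        )
    # Assume an equal number of encoder and decoder layers.
    return pipeline_model_parallel_size // 2

assert resolve_split_rank(4) == 2          # default: encoder on ranks 0-1, decoder on ranks 2-3
assert resolve_split_rank(4, split_rank=1) == 1
assert resolve_split_rank(1) is None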
@@ -64,18 +64,19 @@ def main(cfg) -> None:
pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
)

# Load prompt tuned model, virtual_prompt_model_file and pretrained_language_model_file must be provided in config
if (
cfg.get('virtual_prompt_model_file', None) is not None
and cfg.get('pretrained_language_model_file', None) is not None
):
# Load prompt tuned model, virtual_prompt_model_file and language_model_path must be provided in config
if cfg.get('virtual_prompt_model_file', None) is not None and cfg.get('language_model_path', None) is not None:

# Update frozen T5 model path in case it has changed
prompt_learning_cfg = MegatronT5PromptLearningModel.restore_from(
cfg.virtual_prompt_model_file, trainer=trainer, return_config=True
)
with open_dict(prompt_learning_cfg):
prompt_learning_cfg.pretrained_language_model_path = cfg.pretrained_language_model_file
# This is for backward compatibility with old checkpoints that used `pretrained_language_model_path` instead of `language_model_path`.
if hasattr(prompt_learning_cfg, 'pretrained_language_model_path'):
prompt_learning_cfg.pretrained_language_model_path = cfg.language_model_path
else:
prompt_learning_cfg.language_model_path = cfg.language_model_path
prompt_learning_cfg.micro_batch_size = cfg.data.get('micro_batch_size', 4)
prompt_learning_cfg.global_batch_size = cfg.data.get('global_batch_size', 4)

@@ -85,7 +86,7 @@ def main(cfg) -> None:
)

else:
raise ValueError("virtual_prompt_model_file and pretrained_language_model_file must be provided in config")
raise ValueError("virtual_prompt_model_file and language_model_path must be provided in config")

# check whether the DDP is initialized
if parallel_state.is_unitialized():
34 changes: 16 additions & 18 deletions examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py
@@ -63,15 +63,9 @@ def main(cfg) -> None:
if isinstance(callback, Timer):
trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

# Get the T5 Base configuration.
if hasattr(t5_cfg.data.validation_ds, 'task_name'):
t5_cfg = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
else:
t5_cfg = MegatronT5FinetuneModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
t5_cfg = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)

# Override the T5 configuration with the one from the config file.
# NOTE: Only data can be overridden here since the file being restored should already correspond to a GLUE/XNLI finetuned model.
@@ -80,16 +74,20 @@ def main(cfg) -> None:
t5_cfg.masked_softmax_fusion = False
t5_cfg.precision = cfg.trainer.precision
# Overwrite data configs
t5_cfg.data = cfg.model.data
# XNLI has eval languages in the yaml config.
if hasattr(cfg.model, 'eval_languages'):
t5_cfg.eval_languages = cfg.model.eval_languages
if cfg.model.data.validation_ds.src_file_name is not None:
logging.info(
'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name
if cfg.model.data.validation_ds.tgt_file_name is not None:
logging.info(
'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name

t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size
t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size

if hasattr(t5_cfg.data.validation_ds, 'task_name'):
model = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
)
else:
model = MegatronT5FinetuneModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
)
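Note the design change in this hunk: instead of wholesale-replacing the stored data config (t5_cfg.data = cfg.model.data), the script now overrides only the fields the user actually supplied, which presumably keeps the rest of the finetuned model's data settings intact. A small self-contained illustration of that merge behaviour (toy values, not real paths):

# Toy illustration of "override only what is provided" using OmegaConf objects.
from omegaconf import OmegaConf

t5_cfg = OmegaConf.create({"data": {"validation_ds": {"src_file_name": "stored_src.txt", "tgt_file_name": "stored_tgt.txt"}}})
user_cfg = OmegaConf.create({"data": {"validation_ds": {"src_file_name": "new_src.txt", "tgt_file_name": None}}})

if user_cfg.data.validation_ds.src_file_name is not None:
    t5_cfg.data.validation_ds.src_file_name = user_cfg.data.validation_ds.src_file_name
if user_cfg.data.validation_ds.tgt_file_name is not None:
    t5_cfg.data.validation_ds.tgt_file_name = user_cfg.data.validation_ds.tgt_file_name

print(OmegaConf.to_container(t5_cfg))
# {'data': {'validation_ds': {'src_file_name': 'new_src.txt', 'tgt_file_name': 'stored_tgt.txt'}}}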
@@ -86,8 +86,12 @@ def _get_examples(self):
+ self.tgt_tokenizer.text_to_ids(tgt.strip())
+ [self.tgt_tokenizer.eos_id]
)
if len(src) <= self.max_src_seq_length and len(tgt) < self.max_tgt_seq_length:
self.examples.append({'src': src, 'tgt': tgt})
# Truncate to max sequence length.
if len(src) > self.max_src_seq_length:
src = src[-self.max_src_seq_length + 1 :]
if len(tgt) > self.max_tgt_seq_length:
tgt = tgt[-self.max_tgt_seq_length + 1 :]
self.examples.append({'src': src, 'tgt': tgt})

logging.info(f'Dataset Length : {len(self.examples)}')

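The dataset change above swaps "drop over-length examples" for left truncation: the slice keeps only the last max_*_seq_length - 1 token ids (presumably leaving one position of headroom). A worked example of the slice arithmetic:

# Worked example of the truncation slice above, with plain ints standing in for token ids.
max_src_seq_length = 6
src = list(range(10))                  # 10 token ids: too long
src = src[-max_src_seq_length + 1 :]   # keep the last max_src_seq_length - 1 = 5 ids
print(src)                             # [5, 6, 7, 8, 9]
assert len(src) < max_src_seq_length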
@@ -21,6 +21,7 @@
from pytorch_lightning.trainer.trainer import Trainer
from torch import Tensor

from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.modules.common import (
PromptEncoder,
@@ -65,7 +66,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):

self.tokenizer = self.frozen_model.tokenizer

self.hidden_size = self.frozen_model.cfg.hidden_size
if hasattr(self.frozen_model.cfg, "encoder") and hasattr(self.frozen_model.cfg, "decoder"):
self.hidden_size = (
self.frozen_model.cfg.encoder.hidden_size
) # Encoder and decoder need to have the same hidden size and we check for this in the frozen enc-dec model.
else:
self.hidden_size = self.frozen_model.cfg.hidden_size

# TODO: Handle this when moving GPT prompt learning to the base class.
self.word_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.word_embeddings
self.existing_tasks = list(self.cfg.get('existing_tasks', []))
self.new_tasks = list(self.cfg.get('new_tasks', []))
@@ -101,7 +109,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):

# Prepare pseudo token ids for virtual/virtual prompt tokens
self.pseudo_tokens = get_pseudo_tokens(self.max_virtual_tokens)
self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens})
if isinstance(self.tokenizer, SentencePieceTokenizer):
self.tokenizer.add_special_tokens(self.pseudo_tokens)
else:
self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens})
self.pseudo_token_ids = self.tokenizer.tokens_to_ids(self.pseudo_tokens)
self.pseudo_token_ids_start = self.pseudo_token_ids[0]
self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
@@ -396,8 +407,8 @@ def on_train_end(self):
self.cfg.virtual_prompt_style = VirtualPromptStyle.INFERENCE.value

# Save the best nemo model
self.save_to(save_path=self.cfg.virtual_prompt_save_path)
logging.info(f"The final model was saved to {self.cfg.virtual_prompt_save_path}")
self.save_to(save_path=self.cfg.nemo_path)
logging.info(f"The final model was saved to {self.cfg.nemo_path}")

def setup(self, stage=None):
if stage == 'predict' or self.virtual_prompt_style == VirtualPromptStyle.INFERENCE:
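The hidden-size lookup introduced above has to cope with two frozen-model config layouts: new-style enc-dec configs that nest sizes under encoder/decoder, and older flat configs with a single hidden_size. A self-contained illustration with toy configs (not real NeMo checkpoints):

# Toy configs illustrating the two layouts handled above; hasattr() distinguishes them,
# matching the branch in the diff.
from omegaconf import OmegaConf

new_style = OmegaConf.create({"encoder": {"hidden_size": 512}, "decoder": {"hidden_size": 512}})
old_style = OmegaConf.create({"hidden_size": 512})

def resolve_hidden_size(frozen_cfg):
    if hasattr(frozen_cfg, "encoder") and hasattr(frozen_cfg, "decoder"):
        return frozen_cfg.encoder.hidden_size  # enc-dec configs must agree on this value
    return frozen_cfg.hidden_size

assert resolve_hidden_size(new_style) == resolve_hidden_size(old_style) == 512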
@@ -439,12 +439,11 @@ def inference_epoch_end(self, outputs, mode, data_cfg):
for pred, label, input, category in zip(
batch['preds'], batch['labels'], batch['inputs'], batch['categories']
):
if input + label not in gt_inp_set:
gt_inp_set.add(input + label)
deduplicated_outputs['preds'].append(pred)
deduplicated_outputs['labels'].append(label)
deduplicated_outputs['categories'].append(category)
deduplicated_outputs['inputs'].append(input)
gt_inp_set.add(input + label)
deduplicated_outputs['preds'].append(pred)
deduplicated_outputs['labels'].append(label)
deduplicated_outputs['categories'].append(category)
deduplicated_outputs['inputs'].append(input)
self.write_predictions_to_file(
deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}"
)
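The last hunk removes the uniqueness gate: gt_inp_set is still populated, but it no longer filters which predictions are appended, so every example is collected and written out. A toy before/after comparison of that behaviour:

# Toy before/after comparison of the de-duplication change (strings stand in for batch entries).
inputs = ["q1", "q1", "q2"]
labels = ["a1", "a1", "a2"]
preds = ["p1", "p1-duplicate-input", "p2"]

def collect(deduplicate):
    seen, kept = set(), []
    for inp, lab, prd in zip(inputs, labels, preds):
        if deduplicate and (inp + lab) in seen:
            continue  # old behaviour: skip repeated (input, label) pairs
        seen.add(inp + lab)
        kept.append(prd)
    return kept

print(collect(deduplicate=True))   # ['p1', 'p2']                       -- before this commit
print(collect(deduplicate=False))  # ['p1', 'p1-duplicate-input', 'p2'] -- after this commit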