T5 prompt learning fixes (#4771)
* RPE, hidden size and config fixes

Signed-off-by: MaximumEntropy <[email protected]>

* Update to reflect new config names

Signed-off-by: MaximumEntropy <[email protected]>

* Sentencepiece fixes

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

* Fix finetuning

Signed-off-by: MaximumEntropy <[email protected]>

* Add encoder seq len to gpt

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

* Add finetune eval script

Signed-off-by: MaximumEntropy <[email protected]>

* Fix name

Signed-off-by: MaximumEntropy <[email protected]>

* Update Jenkinsfile

Signed-off-by: MaximumEntropy <[email protected]>

* Update config

Signed-off-by: MaximumEntropy <[email protected]>

* Fix CI test

Signed-off-by: MaximumEntropy <[email protected]>

* Update check

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

* Backward compat

Signed-off-by: MaximumEntropy <[email protected]>

* Update CI test

Signed-off-by: MaximumEntropy <[email protected]>

* Split rank for Enc-Dec models

Signed-off-by: MaximumEntropy <[email protected]>

* Address comments

Signed-off-by: MaximumEntropy <[email protected]>

* Style

Signed-off-by: MaximumEntropy <[email protected]>

Signed-off-by: MaximumEntropy <[email protected]>
Co-authored-by: Virginia Adams <[email protected]>
2 people authored and XuesongYang committed Sep 12, 2022
1 parent d44bdd7 commit be60005
Showing 13 changed files with 169 additions and 62 deletions.
8 changes: 4 additions & 4 deletions Jenkinsfile
@@ -3433,7 +3433,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.max_steps=6 \
trainer.max_epochs=null \
model.tensor_model_parallel_size=1 \
model.pretrained_language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
model.existing_tasks=[] \
model.new_tasks=['squad'] \
model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
@@ -3443,7 +3443,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test"
sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \
virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test.nemo' \
pretrained_language_model_file='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo' \
data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
data.global_batch_size=4 \
data.micro_batch_size=4"
@@ -3459,7 +3459,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
trainer.max_steps=6 \
trainer.max_epochs=null \
model.tensor_model_parallel_size=2 \
model.pretrained_language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
model.language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
model.existing_tasks=[] \
model.new_tasks=['squad'] \
model.data.train_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
@@ -3469,7 +3469,7 @@ assert_frame_equal(training_curve, gt_curve, rtol=1e-3, atol=1e-3)"'''
sh "rm -rf /home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2"
sh "python examples/nlp/language_modeling/megatron_t5_prompt_learning_eval.py \
virtual_prompt_model_file='/home/TestData/nlp/prompt_learning/t5_p_tuning_test_tp2.nemo' \
pretrained_language_model_file='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
language_model_path='/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m_tp2.nemo' \
data.test_ds=['/home/TestData/nlp/prompt_learning/squad_CI_test.jsonl'] \
tensor_model_parallel_size=2 \
trainer.devices=2 \
@@ -41,7 +41,7 @@ model:
seed: 1234
nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
virtual_prompt_style: 'p-tuning' # one of 'prompt-tuning', 'p-tuning', or 'inference'
encoder_seq_length: 2048
encoder_seq_length: 2048
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism
global_batch_size: 8
@@ -0,0 +1,44 @@
name: megatron_t5_finetune_eval

trainer:
devices: 1
num_nodes: 1
accelerator: gpu
precision: 16
logger: False # logger provided by exp_manager
enable_checkpointing: False
replace_sampler_ddp: False
benchmark: False

exp_manager:
explicit_log_dir: null
exp_dir: null
name: megatron_t5_finetune_eval
create_checkpoint_callback: False

model:
restore_from_path: ??? # Path to a finetuned T5 .nemo file
gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory)
megatron_amp_O2: False # Enable O2 optimization for megatron amp

data:
validation_ds:
src_file_name: null # Path to the txt file corresponding to the source data.
tgt_file_name: null # Path to the txt file corresponding to the target data.
names: null # If src/tgt file names are ListConfigs, the corresponding label is used to log metrics.
global_batch_size: 64
micro_batch_size: 64
shuffle: False
num_workers: 0
pin_memory: True
max_src_seq_length: 512
max_tgt_seq_length: 128
drop_last: False # TODO: Figure out if there is a way to avoid dropping last.
write_predictions_to_file: False
output_file_path_prefix: null # Prefix of the file to write predictions to.
metric:
name: "exact_string_match" # Name of the evaluation metric to use.
average: micro # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null # Number of classes for the metric. Works only for 'F1', 'accuracy' and 'average_precision' etc. Refer to torchmetrics for metrics where this is supported.
class_labels: null # If the targets in your dataset are strings and not integers/float, you need to provide a list of class labels (size = num_classes) so we can convert from strings to integer categories to compute the metric.
labels_are_strings: True # NOTE: This is only required to properly handle metrics like f1, accuracy, average_precision etc. This does not affect exact_string_match.
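
For orientation, a minimal driver sketch of how a config like the one above could be consumed. This is not the script added by this commit; the class name and the restore/override pattern come from the diffs further down, while the import path, config filename, and trainer wiring are assumptions.

# Hedged sketch only: assumes NeMo and PyTorch Lightning are installed and that the YAML above
# is saved locally as megatron_t5_finetune_eval.yaml. Distributed/Megatron setup is omitted.
from omegaconf import OmegaConf
from pytorch_lightning import Trainer

from nemo.collections.nlp.models.language_modeling.megatron_finetune_model import MegatronT5FinetuneModel

cfg = OmegaConf.load("megatron_t5_finetune_eval.yaml")
trainer = Trainer(devices=cfg.trainer.devices, accelerator=cfg.trainer.accelerator, precision=cfg.trainer.precision)

# Pull the stored config out of the finetuned checkpoint, swap in the evaluation data config,
# then restore the model with the patched config (same pattern as megatron_t5_seq2seq_eval.py below).
t5_cfg = MegatronT5FinetuneModel.restore_from(cfg.model.restore_from_path, trainer=trainer, return_config=True)
OmegaConf.set_struct(t5_cfg, False)
t5_cfg.data = cfg.model.data
model = MegatronT5FinetuneModel.restore_from(cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg)
trainer.validate(model)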
@@ -33,4 +33,8 @@ model:
max_seq_length: 512
drop_last: False
write_predictions_to_file: False
prediction_file_path_prefix: null # Prefix of the file to write predictions to.
output_file_path_prefix: null # Prefix of the file to write predictions to.
metric:
name: "exact_string_match" # Name of the evaluation metric to use.
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null
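
The "exact_string_match" metric named above is, in essence, the fraction of predictions that match their targets exactly. A toy, framework-free sketch of that semantics (an assumption for illustration; the real metric lives inside NeMo and may normalise differently):

# Toy illustration of exact string match; NeMo's actual implementation may differ in details
# such as whitespace or case handling.
def exact_string_match(preds, labels):
    assert len(preds) == len(labels)
    hits = sum(p.strip() == t.strip() for p, t in zip(preds, labels))
    return hits / max(len(labels), 1)

print(exact_string_match(["Paris", "1947"], ["Paris", "1948"]))  # 0.5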
@@ -30,23 +30,22 @@ exp_manager:
monitor: val_loss
save_top_k: 2
mode: min
save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.virtual_prompt_save_path set below
save_nemo_on_train_end: False # Should be false, correct prompt learning model file is saved at model.nemo_path set below
filename: "megatron_t5_prompt_tune--{${exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}"
model_parallel_size: ${model.tensor_model_parallel_size}
save_best_model: True

model:
seed: 1234
virtual_prompt_save_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
nemo_path: ${name}.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved
virtual_prompt_style: "p-tuning" # one of 'prompt-tuning', 'p-tuning', or 'inference'
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
encoder_seq_length: 2048
global_batch_size: 8
micro_batch_size: 8

restore_path: null # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with
pretrained_language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
language_model_path: ??? # Path to the pretrained T5 language model .nemo file, always required
existing_tasks: []
new_tasks: ["squad"]

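Because this hunk renames model.pretrained_language_model_path to model.language_model_path and model.virtual_prompt_save_path to model.nemo_path, user configs written against the old names need a one-off rename. A hedged sketch of such a migration helper (illustrative only, not part of NeMo):

# Illustrative helper (not part of NeMo): rename old prompt-learning keys to the new ones above.
from omegaconf import OmegaConf, open_dict

RENAMES = {
    "pretrained_language_model_path": "language_model_path",
    "virtual_prompt_save_path": "nemo_path",
}

def migrate_prompt_learning_model_cfg(model_cfg):
    with open_dict(model_cfg):
        for old_key, new_key in RENAMES.items():
            if old_key in model_cfg and new_key not in model_cfg:
                model_cfg[new_key] = model_cfg.pop(old_key)
    return model_cfg

old_style = OmegaConf.create({"pretrained_language_model_path": "t5.nemo", "virtual_prompt_save_path": "p_tune.nemo"})
print(migrate_prompt_learning_model_cfg(old_style))
# {'language_model_path': 't5.nemo', 'nemo_path': 'p_tune.nemo'}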
@@ -14,7 +14,7 @@ data:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder and decoder model
pretrained_language_model_file: ??? # path to a pretrained T5 nemo file
language_model_path: ??? # path to a pretrained T5 nemo file
virtual_prompt_model_file: ??? # path to a MegatronT5PromptLearningModel nemo file


24 changes: 24 additions & 0 deletions examples/nlp/language_modeling/megatron_ckpt_to_nemo.py
@@ -72,6 +72,13 @@ def get_args():
parser.add_argument("--gpus_per_node", type=int, required=True, default=None)
parser.add_argument("--tensor_model_parallel_size", type=int, required=True, default=None)
parser.add_argument("--pipeline_model_parallel_size", type=int, required=True, default=None)
parser.add_argument(
"--pipeline_model_parallel_split_rank",
type=int,
required=False,
default=None,
help="If pipeline parallel size > 1, this is the rank at which the encoder ends and the decoder begins.",
)
parser.add_argument(
"--model_type", type=str, required=True, default="gpt", choices=["gpt", "t5", "bert", "nmt", "bart", "retro"]
)
@@ -96,11 +103,28 @@ def convert(local_rank, rank, world_size, args):

app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
# Auto set split rank for T5, BART, NMT if split rank is None.
if (
args.pipeline_model_parallel_size > 1
and args.pipeline_model_parallel_split_rank is None
and args.model_type in ['t5', 'bart', 'nmt']
):
if args.pipeline_model_parallel_split_rank is not None:
app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_split_rank
else:
if args.pipeline_model_parallel_size % 2 != 0:
raise ValueError(
f"Pipeline model parallel size {args.pipeline_model_parallel_size} must be even if split rank is not specified."
)
else:
# If split rank is not set, default it to pipeline_model_parallel_size // 2, since in most cases the encoder and decoder have the same number of layers.
app_state.pipeline_model_parallel_split_rank = args.pipeline_model_parallel_size // 2
app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

parallel_state.initialize_model_parallel(
tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
pipeline_model_parallel_split_rank_=app_state.pipeline_model_parallel_split_rank,
)

app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
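The new logic above boils down to a simple defaulting rule for enc-dec models (t5/bart/nmt): an explicit --pipeline_model_parallel_split_rank wins; otherwise an even pipeline size is split down the middle, and an odd size is rejected. A standalone restatement of that rule (this mirrors, but is not, the script's code path):

# Standalone restatement of the split-rank defaulting rule for enc-dec models.
def resolve_split_rank(pipeline_model_parallel_size, split_rank=None):
    if pipeline_model_parallel_size <= 1:
        return None  # no pipeline split needed
    if split_rank is not None:
        return split_rank  # an explicit value always wins
    if pipeline_model_parallel_size % 2 != 0:
        raise ValueError(
            f"Pipeline model parallel size {pipeline_model_parallel_size} must be even if split rank is not specified."
        )
    # Assume an equal number of encoder and decoder layers.
    return pipeline_model_parallel_size // 2

assert resolve_split_rank(4) == 2          # default: encoder on ranks 0-1, decoder on ranks 2-3
assert resolve_split_rank(4, split_rank=1) == 1
assert resolve_split_rank(1) is None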
@@ -64,18 +64,19 @@ def main(cfg) -> None:
pipeline_model_parallel_split_rank_=cfg.pipeline_model_parallel_split_rank,
)

# Load prompt tuned model, virtual_prompt_model_file and pretrained_language_model_file must be provided in config
if (
cfg.get('virtual_prompt_model_file', None) is not None
and cfg.get('pretrained_language_model_file', None) is not None
):
# Load prompt tuned model, virtual_prompt_model_file and language_model_path must be provided in config
if cfg.get('virtual_prompt_model_file', None) is not None and cfg.get('language_model_path', None) is not None:

# Update frozen T5 model path in case it has changed
prompt_learning_cfg = MegatronT5PromptLearningModel.restore_from(
cfg.virtual_prompt_model_file, trainer=trainer, return_config=True
)
with open_dict(prompt_learning_cfg):
prompt_learning_cfg.pretrained_language_model_path = cfg.pretrained_language_model_file
# This is for backward compatibility with old checkpoints that used `pretrained_language_model_path` instead of `language_model_path`.
if hasattr(prompt_learning_cfg, 'pretrained_language_model_path'):
prompt_learning_cfg.pretrained_language_model_path = cfg.language_model_path
else:
prompt_learning_cfg.language_model_path = cfg.language_model_path
prompt_learning_cfg.micro_batch_size = cfg.data.get('micro_batch_size', 4)
prompt_learning_cfg.global_batch_size = cfg.data.get('global_batch_size', 4)

@@ -85,7 +86,7 @@ def main(cfg) -> None:
)

else:
raise ValueError("virtual_prompt_model_file and pretrained_language_model_file must be provided in config")
raise ValueError("virtual_prompt_model_file and language_model_path must be provided in config")

# check whether the DDP is initialized
if parallel_state.is_unitialized():
34 changes: 16 additions & 18 deletions examples/nlp/language_modeling/megatron_t5_seq2seq_eval.py
@@ -63,15 +63,9 @@ def main(cfg) -> None:
if isinstance(callback, Timer):
trainer.callbacks[idx] = StatelessTimer(cfg.trainer.max_time,)

# Get the T5 Base configuration.
if hasattr(t5_cfg.data.validation_ds, 'task_name'):
t5_cfg = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
else:
t5_cfg = MegatronT5FinetuneModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)
t5_cfg = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, return_config=True
)

# Override the T5 configuration with the one from the config file.
# NOTE: Only data can be overridden here since the file being restored should already correspond to a GLUE/XNLI finetuned model.
@@ -80,16 +74,20 @@ def main(cfg) -> None:
t5_cfg.masked_softmax_fusion = False
t5_cfg.precision = cfg.trainer.precision
# Overwrite data configs
t5_cfg.data = cfg.model.data
# XNLI has eval languages in the yaml config.
if hasattr(cfg.model, 'eval_languages'):
t5_cfg.eval_languages = cfg.model.eval_languages
if cfg.model.data.validation_ds.src_file_name is not None:
logging.info(
'Found validation_ds.src_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.src_file_name = cfg.model.data.validation_ds.src_file_name
if cfg.model.data.validation_ds.tgt_file_name is not None:
logging.info(
'Found validation_ds.tgt_file_name in the config file. Overriding the finetuned model config file with the values from the new config file.'
)
t5_cfg.data.validation_ds.tgt_file_name = cfg.model.data.validation_ds.tgt_file_name

t5_cfg.data.validation_ds.micro_batch_size = cfg.model.data.validation_ds.micro_batch_size
t5_cfg.data.validation_ds.global_batch_size = cfg.model.data.validation_ds.global_batch_size

if hasattr(t5_cfg.data.validation_ds, 'task_name'):
model = MegatronT5GLUEModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
)
else:
model = MegatronT5FinetuneModel.restore_from(
restore_path=cfg.model.restore_from_path, trainer=trainer, override_config_path=t5_cfg
)
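Note the design change in this hunk: instead of wholesale-replacing the stored data config (t5_cfg.data = cfg.model.data), the script now overrides only the fields the user actually supplied, which presumably keeps the rest of the finetuned model's data settings intact. A small self-contained illustration of that merge behaviour (toy values, not real paths):

# Toy illustration of "override only what is provided" using OmegaConf objects.
from omegaconf import OmegaConf

t5_cfg = OmegaConf.create({"data": {"validation_ds": {"src_file_name": "stored_src.txt", "tgt_file_name": "stored_tgt.txt"}}})
user_cfg = OmegaConf.create({"data": {"validation_ds": {"src_file_name": "new_src.txt", "tgt_file_name": None}}})

if user_cfg.data.validation_ds.src_file_name is not None:
    t5_cfg.data.validation_ds.src_file_name = user_cfg.data.validation_ds.src_file_name
if user_cfg.data.validation_ds.tgt_file_name is not None:
    t5_cfg.data.validation_ds.tgt_file_name = user_cfg.data.validation_ds.tgt_file_name

print(OmegaConf.to_container(t5_cfg))
# {'data': {'validation_ds': {'src_file_name': 'new_src.txt', 'tgt_file_name': 'stored_tgt.txt'}}}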
@@ -86,8 +86,12 @@ def _get_examples(self):
+ self.tgt_tokenizer.text_to_ids(tgt.strip())
+ [self.tgt_tokenizer.eos_id]
)
if len(src) <= self.max_src_seq_length and len(tgt) < self.max_tgt_seq_length:
self.examples.append({'src': src, 'tgt': tgt})
# Truncate to max sequence length.
if len(src) > self.max_src_seq_length:
src = src[-self.max_src_seq_length + 1 :]
if len(tgt) > self.max_tgt_seq_length:
tgt = tgt[-self.max_tgt_seq_length + 1 :]
self.examples.append({'src': src, 'tgt': tgt})

logging.info(f'Dataset Length : {len(self.examples)}')

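The dataset change above swaps "drop over-length examples" for left truncation: the slice keeps only the last max_*_seq_length - 1 token ids (presumably leaving one position of headroom). A worked example of the slice arithmetic:

# Worked example of the truncation slice above, with plain ints standing in for token ids.
max_src_seq_length = 6
src = list(range(10))                  # 10 token ids: too long
src = src[-max_src_seq_length + 1 :]   # keep the last max_src_seq_length - 1 = 5 ids
print(src)                             # [5, 6, 7, 8, 9]
assert len(src) < max_src_seq_length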
@@ -21,6 +21,7 @@
from pytorch_lightning.trainer.trainer import Trainer
from torch import Tensor

from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.collections.nlp.models.language_modeling.megatron_base_model import MegatronBaseModel
from nemo.collections.nlp.modules.common import (
PromptEncoder,
@@ -65,7 +66,14 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):

self.tokenizer = self.frozen_model.tokenizer

self.hidden_size = self.frozen_model.cfg.hidden_size
if hasattr(self.frozen_model.cfg, "encoder") and hasattr(self.frozen_model.cfg, "decoder"):
self.hidden_size = (
self.frozen_model.cfg.encoder.hidden_size
) # Encoder and decoder need to have the same hidden size and we check for this in the frozen enc-dec model.
else:
self.hidden_size = self.frozen_model.cfg.hidden_size

# TODO: Handle this when moving GPT prompt learning to the base class.
self.word_embeddings = self.frozen_model.enc_dec_model.encoder_embedding.word_embeddings
self.existing_tasks = list(self.cfg.get('existing_tasks', []))
self.new_tasks = list(self.cfg.get('new_tasks', []))
@@ -101,7 +109,10 @@ def __init__(self, cfg: DictConfig, trainer: Trainer):

# Prepare pseudo token ids for virtual/virtual prompt tokens
self.pseudo_tokens = get_pseudo_tokens(self.max_virtual_tokens)
self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens})
if isinstance(self.tokenizer, SentencePieceTokenizer):
self.tokenizer.add_special_tokens(self.pseudo_tokens)
else:
self.tokenizer.add_special_tokens({'additional_special_tokens': self.pseudo_tokens})
self.pseudo_token_ids = self.tokenizer.tokens_to_ids(self.pseudo_tokens)
self.pseudo_token_ids_start = self.pseudo_token_ids[0]
self.pad_token_id = self.tokenizer.pad_id if self.tokenizer.pad_id is not None else self.tokenizer.unk_id
@@ -396,8 +407,8 @@ def on_train_end(self):
self.cfg.virtual_prompt_style = VirtualPromptStyle.INFERENCE.value

# Save the best nemo model
self.save_to(save_path=self.cfg.virtual_prompt_save_path)
logging.info(f"The final model was saved to {self.cfg.virtual_prompt_save_path}")
self.save_to(save_path=self.cfg.nemo_path)
logging.info(f"The final model was saved to {self.cfg.nemo_path}")

def setup(self, stage=None):
if stage == 'predict' or self.virtual_prompt_style == VirtualPromptStyle.INFERENCE:
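The hidden-size lookup introduced above has to cope with two frozen-model config layouts: new-style enc-dec configs that nest sizes under encoder/decoder, and older flat configs with a single hidden_size. A self-contained illustration with toy configs (not real NeMo checkpoints):

# Toy configs illustrating the two layouts handled above; hasattr() distinguishes them,
# matching the branch in the diff.
from omegaconf import OmegaConf

new_style = OmegaConf.create({"encoder": {"hidden_size": 512}, "decoder": {"hidden_size": 512}})
old_style = OmegaConf.create({"hidden_size": 512})

def resolve_hidden_size(frozen_cfg):
    if hasattr(frozen_cfg, "encoder") and hasattr(frozen_cfg, "decoder"):
        return frozen_cfg.encoder.hidden_size  # enc-dec configs must agree on this value
    return frozen_cfg.hidden_size

assert resolve_hidden_size(new_style) == resolve_hidden_size(old_style) == 512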
@@ -439,12 +439,11 @@ def inference_epoch_end(self, outputs, mode, data_cfg):
for pred, label, input, category in zip(
batch['preds'], batch['labels'], batch['inputs'], batch['categories']
):
if input + label not in gt_inp_set:
gt_inp_set.add(input + label)
deduplicated_outputs['preds'].append(pred)
deduplicated_outputs['labels'].append(label)
deduplicated_outputs['categories'].append(category)
deduplicated_outputs['inputs'].append(input)
gt_inp_set.add(input + label)
deduplicated_outputs['preds'].append(pred)
deduplicated_outputs['labels'].append(label)
deduplicated_outputs['categories'].append(category)
deduplicated_outputs['inputs'].append(input)
self.write_predictions_to_file(
deduplicated_outputs, f"{data_cfg.output_file_path_prefix}_{filename_log_key}"
)
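The last hunk removes the uniqueness gate: gt_inp_set is still populated, but it no longer filters which predictions are appended, so every example is collected and written out. A toy before/after comparison of that behaviour:

# Toy before/after comparison of the de-duplication change (strings stand in for batch entries).
inputs = ["q1", "q1", "q2"]
labels = ["a1", "a1", "a2"]
preds = ["p1", "p1-duplicate-input", "p2"]

def collect(deduplicate):
    seen, kept = set(), []
    for inp, lab, prd in zip(inputs, labels, preds):
        if deduplicate and (inp + lab) in seen:
            continue  # old behaviour: skip repeated (input, label) pairs
        seen.add(inp + lab)
        kept.append(prd)
    return kept

print(collect(deduplicate=True))   # ['p1', 'p2']                       -- before this commit
print(collect(deduplicate=False))  # ['p1', 'p1-duplicate-input', 'p2'] -- after this commit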