diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 75188e38f68c..83ef8a8b4339 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -2798,7 +2798,7 @@ jobs:
           model.megatron_amp_O2=True \
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
-          model.tensor_model_parallel_size=2 \
+          model.pipeline_model_parallel_size=2 \
           model.optim.name=fused_adam \
           model.optim.lr=2e-4 \
           model.optim.sched.warmup_steps=2 \
@@ -2829,7 +2829,7 @@ jobs:
           trainer.gradient_clip_val=1.0 \
           exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \
           exp_manager.resume_if_exists=True \
-          model.tensor_model_parallel_size=2 \
+          model.pipeline_model_parallel_size=2 \
           model.optim.name=fused_adam \
           model.optim.lr=2e-4 \
           model.optim.sched.warmup_steps=2 \
@@ -5170,8 +5170,10 @@ jobs:
           trainer.max_steps=10 \
           trainer.limit_val_batches=7 \
           trainer.val_check_interval=10 \
+          trainer.precision=16 \
           exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \
           model.mcore_gpt=True \
+          model.megatron_amp_O2=False \
           model.data.data_impl=mock \
           model.data.data_prefix=[]
       - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main"
diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
index 67a4802d83f6..2ff6a2ae0a85 100644
--- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
+++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py
@@ -345,11 +345,6 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw
                 transformer_block_type=self.transformer_block_type,
             )

-        if self.add_pooler:
-            self.pooler = Pooler(
-                self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel
-            )
-
         # Output
         if self.post_process:
             # TODO: Make sure you are passing in the mpu_vocab_size properly
@@ -370,6 +365,11 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw
                 skip_weight_param_allocation=self.pre_process and self.share_embeddings_and_output_weights,
             )

+            if self.add_pooler:
+                self.pooler = Pooler(
+                    self.config.hidden_size, self.config.init_method, self.config, self.config.sequence_parallel
+                )
+
         self.binary_head = None
         if self.add_binary_head:
             # TODO: Shoudl switch this to TE ?
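
For context, the bert_model.py hunks move the Pooler construction inside the `if self.post_process:` branch, so pooler weights are only allocated on the pipeline stage that runs post-processing. Below is a minimal sketch of that pattern using stand-in `torch.nn.Linear` modules and a hypothetical `TinyBertStage` class, not NeMo's actual `Pooler`/`BertLMHead` implementations:

    # Minimal sketch: output-side modules are built only on the last pipeline stage.
    import torch.nn as nn


    class TinyBertStage(nn.Module):
        def __init__(self, hidden_size: int, post_process: bool, add_pooler: bool = True):
            super().__init__()
            self.post_process = post_process
            # Stand-in for the transformer block present on every stage.
            self.encoder = nn.Linear(hidden_size, hidden_size)

            self.lm_head = None
            self.pooler = None
            if self.post_process:
                # Only the final stage owns the output head and pooler,
                # mirroring the relocation in the diff above.
                self.lm_head = nn.Linear(hidden_size, hidden_size)
                if add_pooler:
                    self.pooler = nn.Linear(hidden_size, hidden_size)


    # An intermediate stage no longer allocates pooler weights it never uses.
    middle = TinyBertStage(hidden_size=16, post_process=False)
    last = TinyBertStage(hidden_size=16, post_process=True)
    assert middle.pooler is None and last.pooler is not None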