From f4721cab783719c037788464ad27c302a9f042d3 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Fri, 8 Dec 2023 21:47:10 -0800 Subject: [PATCH 1/2] Make pipelined TP comm overlap available with mcore Signed-off-by: Sangkug Lym --- .../nlp/models/language_modeling/megatron_base_model.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 99b0c81ac790..6eb2e4474c56 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -792,12 +792,14 @@ def _validate_and_override_config(self): ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' if self.cfg.get('ub_tp_comm_overlap', False): - if not self.cfg.get('transformer_engine', False) or not self.cfg.get('sequence_parallel', False): + if self.cfg.get('ub_tp_comm_overlap', False) and not self.cfg.get('sequence_parallel', False): logging.info( - "Userbuffer tensor-parallel communication overlap is available with both Transformer Engine and sequence-parallelism." + "Pipelined tensor-parallel communication overlap is available with sequence-parallelism. " + "Setting `ub_tp_comm_overlap` to False." ) with open_dict(self.cfg): self.cfg.ub_tp_comm_overlap = False + if self.cfg.get('fsdp', False): logging.info( "Userbuffer tensor-parallel communication overlap is not available with FSDP." 
From ad9b504213d9a0a6ae7483c4311f1679e6fefd47 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Wed, 3 Jan 2024 09:52:07 +0900 Subject: [PATCH 2/2] remove unnecessary condition Signed-off-by: Sangkug Lym --- .../nlp/models/language_modeling/megatron_base_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 6eb2e4474c56..009ac851b066 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -792,7 +792,7 @@ def _validate_and_override_config(self): ) % vp_size == 0, 'Make sure the number of model chunks is the same across all pipeline stages.' if self.cfg.get('ub_tp_comm_overlap', False): - if self.cfg.get('ub_tp_comm_overlap', False) and not self.cfg.get('sequence_parallel', False): + if not self.cfg.get('sequence_parallel', False): logging.info( "Pipelined tensor-parallel communication overlap is available with sequence-parallelism. " "Setting `ub_tp_comm_overlap` to False."