Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Load ub_cfg from hydra config #7003

Merged
merged 12 commits into from
Aug 12, 2023
3 changes: 3 additions & 0 deletions examples/nlp/language_modeling/conf/megatron_gpt_config.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
defaults:
- optional tp_overlap@model.ub_tp_comm_overlap_cfg:

name: megatron_gpt
restore_from_path: null # used when starting from a .nemo file

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# UB communicator configurations
# Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16

# Bulk overlap with AllGather
qkv_dgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

qkv_wgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

fc1_dgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

fc1_wgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

# Ring-exchange overlap with AllGather
qkv_fprop:
method: ring_exchange
aggregate: 0

proj_dgrad:
method: ring_exchange
aggregate: 0

fc1_fprop:
method: ring_exchange
aggregate: 0

fc2_dgrad:
method: ring_exchange
aggregate: 0

# Chunked-collective overlap with ReduceScatter
proj_fprop:
method: pipeline
num_sm: 4
num_splits: 4
set_sm_margin: 0

fc2_fprop:
method: pipeline
num_sm: 4
num_splits: 4
set_sm_margin: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# UB communicator configurations
# Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16

# Bulk overlap with AllGather
qkv_dgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

qkv_wgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

fc1_dgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

fc1_wgrad:
method: bulk
num_sm: 2
set_sm_margin: 0

# Ring-exchange overlap with AllGather
qkv_fprop:
method: ring_exchange
aggregate: 0

proj_dgrad:
method: ring_exchange
aggregate: 0

fc1_fprop:
method: ring_exchange
aggregate: 0

fc2_dgrad:
method: ring_exchange
aggregate: 0

# Chunked-collective overlap with ReduceScatter
proj_fprop:
method: pipeline
num_sm: 8
num_splits: 4
set_sm_margin: 0

fc2_fprop:
method: pipeline
num_sm: 4
num_splits: 4
set_sm_margin: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# UB communicator configurations
# Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8

# Bulk overlap with AllGather / ReduceScatter
qkv_dgrad:
method: bulk
num_sm: 4
cga_size: 2
set_sm_margin: 0

qkv_wgrad:
method: bulk
num_sm: 8
cga_size: 2
set_sm_margin: 0

fc1_dgrad:
method: bulk
num_sm: 2
cga_size: 2
set_sm_margin: 0

fc1_wgrad:
method: bulk
num_sm: 4
cga_size: 2
set_sm_margin: 0

# Ring-exchange overlap with AllGather
qkv_fprop:
method: ring_exchange
aggregate: 0

proj_dgrad:
method: ring_exchange
aggregate: 0

fc1_fprop:
method: ring_exchange
aggregate: 0

fc2_dgrad:
method: ring_exchange
aggregate: 1

# Chunked-collective overlap with ReduceScatter
proj_fprop:
method: pipeline
num_sm: 24
cga_size: 2
num_splits: 4
set_sm_margin: 1

fc2_fprop:
method: pipeline
num_sm: 20
cga_size: 2
num_splits: 4
set_sm_margin: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# UB communicator configurations
# Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8

# Bulk overlap with AllGather
qkv_dgrad:
method: bulk
num_sm: 8
cga_size: 2
set_sm_margin: 0

qkv_wgrad:
method: bulk
num_sm: 16
cga_size: 2
set_sm_margin: 0

fc1_dgrad:
method: bulk
num_sm: 4
cga_size: 2
set_sm_margin: 0

fc1_wgrad:
method: bulk
num_sm: 16
cga_size: 2
set_sm_margin: 0

# Ring-exchange overlap with AllGather
qkv_fprop:
method: ring_exchange
aggregate: 0

proj_dgrad:
method: ring_exchange
aggregate: 1

fc1_fprop:
method: ring_exchange
aggregate: 0

fc2_dgrad:
method: ring_exchange
aggregate: 0

# Chunked-collective overlap with ReduceScatter
proj_fprop:
method: pipeline
num_sm: 16
cga_size: 2
num_splits: 4
set_sm_margin: 1

fc2_fprop:
method: pipeline
num_sm: 24
cga_size: 2
num_splits: 4
set_sm_margin: 1
Original file line number Diff line number Diff line change
Expand Up @@ -515,21 +515,11 @@ def initialize_ub_func(self):
self.cfg.get('encoder_seq_length') * self.cfg.get('micro_batch_size'),
self.cfg.get('hidden_size'),
]
ub_cfg_file_name = self.cfg.get('ub_tp_comm_overlap_cfg', None)
ub_cfgs = None
if ub_cfg_file_name is not None:
try:
import yaml

with open(ub_cfg_file_name, 'r') as ub_cfg_file:
ub_cfgs = yaml.safe_load(ub_cfg_file)
except (ImportError, TypeError):
logging.error(f"Fail to read ub_tp_comm_overlap config file: {ub_cfg_file_name}.")
te_module.initialize_ub(
shape=input_shape,
tp_size=self.cfg.get('tensor_model_parallel_size'),
use_fp8=self.cfg.get('fp8'),
ub_cfgs=ub_cfgs,
ub_cfgs=self.cfg.get('ub_tp_comm_overlap_cfg', None),
erhoo82 marked this conversation as resolved.
Show resolved Hide resolved
)
self.initialize_ub = False

Expand Down
Loading