From c80953433749905a95a0067b502f0bda5df298ab Mon Sep 17 00:00:00 2001
From: Sangkug Lym
Date: Tue, 21 May 2024 15:36:12 -0700
Subject: [PATCH 1/2] remove tp_comm_atomic knobs

Signed-off-by: Sangkug Lym
---
 launcher_scripts/conf/peft/llama/sft.yaml                 | 2 --
 launcher_scripts/conf/peft/nemotron/sft.yaml              | 2 --
 launcher_scripts/conf/peft/qwen2/sft.yaml                 | 2 --
 launcher_scripts/conf/peft/starcoder2/sft.yaml            | 2 --
 launcher_scripts/conf/training/gpt3/126m.yaml             | 2 --
 launcher_scripts/conf/training/gpt3/175b.yaml             | 2 --
 launcher_scripts/conf/training/gpt3/175b_16k.yaml         | 2 --
 launcher_scripts/conf/training/gpt3/175b_32k.yaml         | 2 --
 launcher_scripts/conf/training/gpt3/175b_fp8.yaml         | 2 --
 launcher_scripts/conf/training/gpt3/1b_improved.yaml      | 2 --
 launcher_scripts/conf/training/gpt3/20b.yaml              | 2 --
 launcher_scripts/conf/training/gpt3/400m_improved.yaml    | 2 --
 launcher_scripts/conf/training/gpt3/40b.yaml              | 2 --
 launcher_scripts/conf/training/gpt3/40b_16k.yaml          | 2 --
 launcher_scripts/conf/training/gpt3/40b_32k.yaml          | 2 --
 launcher_scripts/conf/training/gpt3/40b_64k.yaml          | 2 --
 launcher_scripts/conf/training/gpt3/40b_improved.yaml     | 2 --
 launcher_scripts/conf/training/gpt3/5b.yaml               | 2 --
 launcher_scripts/conf/training/gpt3/5b_16k.yaml           | 2 --
 launcher_scripts/conf/training/gpt3/5b_32k.yaml           | 2 --
 launcher_scripts/conf/training/gpt3/5b_64k.yaml           | 2 --
 launcher_scripts/conf/training/gpt3/7b_improved.yaml      | 2 --
 launcher_scripts/conf/training/gpt3/mlperf.yaml           | 4 ----
 launcher_scripts/conf/training/llama/llama2_13b.yaml      | 2 --
 launcher_scripts/conf/training/llama/llama2_70b.yaml      | 2 --
 launcher_scripts/conf/training/llama/llama2_7b.yaml       | 2 --
 launcher_scripts/conf/training/nemotron/nemotron_15b.yaml | 2 --
 launcher_scripts/conf/training/nemotron/nemotron_22b.yaml | 2 --
 launcher_scripts/conf/training/nemotron/nemotron_8b.yaml  | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_14b.yaml       | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_4b.yaml        | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_72b.yaml       | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_7b.yaml        | 2 --
 33 files changed, 68 deletions(-)

diff --git a/launcher_scripts/conf/peft/llama/sft.yaml b/launcher_scripts/conf/peft/llama/sft.yaml
index e3a4d5a6d2..4278ec3d3e 100644
--- a/launcher_scripts/conf/peft/llama/sft.yaml
+++ b/launcher_scripts/conf/peft/llama/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/peft/nemotron/sft.yaml b/launcher_scripts/conf/peft/nemotron/sft.yaml
index 57337925cf..ef472b4a9a 100644
--- a/launcher_scripts/conf/peft/nemotron/sft.yaml
+++ b/launcher_scripts/conf/peft/nemotron/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/peft/qwen2/sft.yaml b/launcher_scripts/conf/peft/qwen2/sft.yaml
index 09cb2b3430..74450918ed 100644
--- a/launcher_scripts/conf/peft/qwen2/sft.yaml
+++ b/launcher_scripts/conf/peft/qwen2/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/peft/starcoder2/sft.yaml b/launcher_scripts/conf/peft/starcoder2/sft.yaml
index d0cd414680..09ba86cad4 100644
--- a/launcher_scripts/conf/peft/starcoder2/sft.yaml
+++ b/launcher_scripts/conf/peft/starcoder2/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml
index 8cf1c4766d..918f5dd93e 100755
--- a/launcher_scripts/conf/training/gpt3/126m.yaml
+++ b/launcher_scripts/conf/training/gpt3/126m.yaml
@@ -144,8 +144,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml
index 4c2b372a52..b31a30c280 100755
--- a/launcher_scripts/conf/training/gpt3/175b.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b_16k.yaml b/launcher_scripts/conf/training/gpt3/175b_16k.yaml
index c2760d151b..8b4d97712a 100755
--- a/launcher_scripts/conf/training/gpt3/175b_16k.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b_32k.yaml b/launcher_scripts/conf/training/gpt3/175b_32k.yaml
index 661fef9914..dc1274a535 100755
--- a/launcher_scripts/conf/training/gpt3/175b_32k.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
index a44f26c30e..dab17ea003 100755
--- a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml
index 15c8db11d0..55af39da41 100644
--- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml
index 96c970667f..3d92bce02f 100755
--- a/launcher_scripts/conf/training/gpt3/20b.yaml
+++ b/launcher_scripts/conf/training/gpt3/20b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml
index 34bdbfa189..dd7ce23236 100644
--- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml
index f71f8629e0..eb14489e6a 100755
--- a/launcher_scripts/conf/training/gpt3/40b.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_16k.yaml b/launcher_scripts/conf/training/gpt3/40b_16k.yaml
index 5fb24b39fc..926e5fae12 100755
--- a/launcher_scripts/conf/training/gpt3/40b_16k.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_32k.yaml b/launcher_scripts/conf/training/gpt3/40b_32k.yaml
index 23f50fa1fb..2c58f7c296 100755
--- a/launcher_scripts/conf/training/gpt3/40b_32k.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_64k.yaml b/launcher_scripts/conf/training/gpt3/40b_64k.yaml
index a0e3e59e92..3a57f69c24 100755
--- a/launcher_scripts/conf/training/gpt3/40b_64k.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_64k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml
index 40452896f5..cf1a8fb153 100644
--- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml
index 659fc0ceee..d8d1109929 100755
--- a/launcher_scripts/conf/training/gpt3/5b.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/5b_16k.yaml b/launcher_scripts/conf/training/gpt3/5b_16k.yaml
index 93455b4b3b..1d93ef1204 100755
--- a/launcher_scripts/conf/training/gpt3/5b_16k.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/5b_32k.yaml b/launcher_scripts/conf/training/gpt3/5b_32k.yaml
index 4aaecc5ea1..854a16ecd5 100755
--- a/launcher_scripts/conf/training/gpt3/5b_32k.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/5b_64k.yaml b/launcher_scripts/conf/training/gpt3/5b_64k.yaml
index c7446d4d8f..4b059900f5 100644
--- a/launcher_scripts/conf/training/gpt3/5b_64k.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b_64k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml
index cff7b4a6a6..d7080d82e2 100644
--- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/mlperf.yaml b/launcher_scripts/conf/training/gpt3/mlperf.yaml
index 55ddb03c9f..cce28bba30 100644
--- a/launcher_scripts/conf/training/gpt3/mlperf.yaml
+++ b/launcher_scripts/conf/training/gpt3/mlperf.yaml
@@ -268,7 +268,3 @@ model:
   gc_interval: 100
   name: megatron_gpt_full_te_layer_autocast
   fp8_params: true
-  tp_comm_split_ag: true
-  tp_comm_split_rs: false
-  tp_comm_atomic_ag: false
-  tp_comm_atomic_rs: true
diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml
index 406309a534..2512468de5 100644
--- a/launcher_scripts/conf/training/llama/llama2_13b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -136,8 +136,6 @@ model:
   ub_tp_comm_overlap: false
   overlap_p2p_comm: true
   batch_p2p_comm: false
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml
index 7fd3863d3a..c91dce0a48 100644
--- a/launcher_scripts/conf/training/llama/llama2_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -134,8 +134,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml
index a36466cbb9..3bd2e307af 100644
--- a/launcher_scripts/conf/training/llama/llama2_7b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -136,8 +136,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
index 1f171f5e02..3403cc1ff9 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   nsys_profile:
     enabled: False
diff --git a/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
index bf74eb514a..c73b74a6ca 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   gc_interval: 100
 
diff --git a/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
index b8a21bb5e6..200ef44bb5 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   nsys_profile:
     enabled: False
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
index 70d02245b0..a0af6a0ca8 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
@@ -138,8 +138,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
index 6217eb0145..fdcf0c4def 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
@@ -140,8 +140,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
index 0ba8267c42..00e56ec063 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
@@ -137,8 +137,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
index 98bd353954..89d14083fd 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
@@ -140,8 +140,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam

From f34fc914ba22ffd9cfe13561842266adb8877f04 Mon Sep 17 00:00:00 2001
From: Sangkug Lym
Date: Tue, 21 May 2024 15:43:41 -0700
Subject: [PATCH 2/2] update mlperf config

Signed-off-by: Sangkug Lym
---
 launcher_scripts/conf/training/gpt3/mlperf.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/launcher_scripts/conf/training/gpt3/mlperf.yaml b/launcher_scripts/conf/training/gpt3/mlperf.yaml
index cce28bba30..1befc3f24b 100644
--- a/launcher_scripts/conf/training/gpt3/mlperf.yaml
+++ b/launcher_scripts/conf/training/gpt3/mlperf.yaml
@@ -189,18 +189,18 @@ model:
       aggregate: 0
     fc2_dgrad:
       method: ring_exchange
-      aggregate: 1
+      aggregate: 0
     proj_fprop:
       method: pipeline
       num_sm: 24
       cga_size: 2
       num_splits: 4
       set_sm_margin: 1
+      atomic_gemm: 1
+      fp8_buf: 1
     fc2_fprop:
-      method: pipeline
-      num_sm: 4
-      cga_size: 2
-      num_splits: 4
+      method: ring_exchange
+      num_sm: 1
       set_sm_margin: 1
   use_flash_attention: false
   cpu_offloading: false