From c80953433749905a95a0067b502f0bda5df298ab Mon Sep 17 00:00:00 2001
From: Sangkug Lym
Date: Tue, 21 May 2024 15:36:12 -0700
Subject: [PATCH 1/2] remove tp_comm_atomic knobs

Signed-off-by: Sangkug Lym
---
 launcher_scripts/conf/peft/llama/sft.yaml                 | 2 --
 launcher_scripts/conf/peft/nemotron/sft.yaml              | 2 --
 launcher_scripts/conf/peft/qwen2/sft.yaml                 | 2 --
 launcher_scripts/conf/peft/starcoder2/sft.yaml            | 2 --
 launcher_scripts/conf/training/gpt3/126m.yaml             | 2 --
 launcher_scripts/conf/training/gpt3/175b.yaml             | 2 --
 launcher_scripts/conf/training/gpt3/175b_16k.yaml         | 2 --
 launcher_scripts/conf/training/gpt3/175b_32k.yaml         | 2 --
 launcher_scripts/conf/training/gpt3/175b_fp8.yaml         | 2 --
 launcher_scripts/conf/training/gpt3/1b_improved.yaml      | 2 --
 launcher_scripts/conf/training/gpt3/20b.yaml              | 2 --
 launcher_scripts/conf/training/gpt3/400m_improved.yaml    | 2 --
 launcher_scripts/conf/training/gpt3/40b.yaml              | 2 --
 launcher_scripts/conf/training/gpt3/40b_16k.yaml          | 2 --
 launcher_scripts/conf/training/gpt3/40b_32k.yaml          | 2 --
 launcher_scripts/conf/training/gpt3/40b_64k.yaml          | 2 --
 launcher_scripts/conf/training/gpt3/40b_improved.yaml     | 2 --
 launcher_scripts/conf/training/gpt3/5b.yaml               | 2 --
 launcher_scripts/conf/training/gpt3/5b_16k.yaml           | 2 --
 launcher_scripts/conf/training/gpt3/5b_32k.yaml           | 2 --
 launcher_scripts/conf/training/gpt3/5b_64k.yaml           | 2 --
 launcher_scripts/conf/training/gpt3/7b_improved.yaml      | 2 --
 launcher_scripts/conf/training/gpt3/mlperf.yaml           | 4 ----
 launcher_scripts/conf/training/llama/llama2_13b.yaml      | 2 --
 launcher_scripts/conf/training/llama/llama2_70b.yaml      | 2 --
 launcher_scripts/conf/training/llama/llama2_7b.yaml       | 2 --
 launcher_scripts/conf/training/nemotron/nemotron_15b.yaml | 2 --
 launcher_scripts/conf/training/nemotron/nemotron_22b.yaml | 2 --
 launcher_scripts/conf/training/nemotron/nemotron_8b.yaml  | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_14b.yaml       | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_4b.yaml        | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_72b.yaml       | 2 --
 launcher_scripts/conf/training/qwen2/qwen2_7b.yaml        | 2 --
 33 files changed, 68 deletions(-)

diff --git a/launcher_scripts/conf/peft/llama/sft.yaml b/launcher_scripts/conf/peft/llama/sft.yaml
index e3a4d5a6d2..4278ec3d3e 100644
--- a/launcher_scripts/conf/peft/llama/sft.yaml
+++ b/launcher_scripts/conf/peft/llama/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/peft/nemotron/sft.yaml b/launcher_scripts/conf/peft/nemotron/sft.yaml
index 57337925cf..ef472b4a9a 100644
--- a/launcher_scripts/conf/peft/nemotron/sft.yaml
+++ b/launcher_scripts/conf/peft/nemotron/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/peft/qwen2/sft.yaml b/launcher_scripts/conf/peft/qwen2/sft.yaml
index 09cb2b3430..74450918ed 100644
--- a/launcher_scripts/conf/peft/qwen2/sft.yaml
+++ b/launcher_scripts/conf/peft/qwen2/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/peft/starcoder2/sft.yaml b/launcher_scripts/conf/peft/starcoder2/sft.yaml
index d0cd414680..09ba86cad4 100644
--- a/launcher_scripts/conf/peft/starcoder2/sft.yaml
+++ b/launcher_scripts/conf/peft/starcoder2/sft.yaml
@@ -76,8 +76,6 @@ model:
   sync_batch_comm: False
   overlap_p2p_comm: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   megatron_amp_O2: True
 
   mcore_gpt: True
diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml
index 8cf1c4766d..918f5dd93e 100755
--- a/launcher_scripts/conf/training/gpt3/126m.yaml
+++ b/launcher_scripts/conf/training/gpt3/126m.yaml
@@ -144,8 +144,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml
index 4c2b372a52..b31a30c280 100755
--- a/launcher_scripts/conf/training/gpt3/175b.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b_16k.yaml b/launcher_scripts/conf/training/gpt3/175b_16k.yaml
index c2760d151b..8b4d97712a 100755
--- a/launcher_scripts/conf/training/gpt3/175b_16k.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b_32k.yaml b/launcher_scripts/conf/training/gpt3/175b_32k.yaml
index 661fef9914..dc1274a535 100755
--- a/launcher_scripts/conf/training/gpt3/175b_32k.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
index a44f26c30e..dab17ea003 100755
--- a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
+++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml
index 15c8db11d0..55af39da41 100644
--- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml
index 96c970667f..3d92bce02f 100755
--- a/launcher_scripts/conf/training/gpt3/20b.yaml
+++ b/launcher_scripts/conf/training/gpt3/20b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml
index 34bdbfa189..dd7ce23236 100644
--- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml
index f71f8629e0..eb14489e6a 100755
--- a/launcher_scripts/conf/training/gpt3/40b.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_16k.yaml b/launcher_scripts/conf/training/gpt3/40b_16k.yaml
index 5fb24b39fc..926e5fae12 100755
--- a/launcher_scripts/conf/training/gpt3/40b_16k.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_32k.yaml b/launcher_scripts/conf/training/gpt3/40b_32k.yaml
index 23f50fa1fb..2c58f7c296 100755
--- a/launcher_scripts/conf/training/gpt3/40b_32k.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_64k.yaml b/launcher_scripts/conf/training/gpt3/40b_64k.yaml
index a0e3e59e92..3a57f69c24 100755
--- a/launcher_scripts/conf/training/gpt3/40b_64k.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_64k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml
index 40452896f5..cf1a8fb153 100644
--- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml
index 659fc0ceee..d8d1109929 100755
--- a/launcher_scripts/conf/training/gpt3/5b.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b.yaml
@@ -147,8 +147,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/5b_16k.yaml b/launcher_scripts/conf/training/gpt3/5b_16k.yaml
index 93455b4b3b..1d93ef1204 100755
--- a/launcher_scripts/conf/training/gpt3/5b_16k.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b_16k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/5b_32k.yaml b/launcher_scripts/conf/training/gpt3/5b_32k.yaml
index 4aaecc5ea1..854a16ecd5 100755
--- a/launcher_scripts/conf/training/gpt3/5b_32k.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b_32k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/5b_64k.yaml b/launcher_scripts/conf/training/gpt3/5b_64k.yaml
index c7446d4d8f..4b059900f5 100644
--- a/launcher_scripts/conf/training/gpt3/5b_64k.yaml
+++ b/launcher_scripts/conf/training/gpt3/5b_64k.yaml
@@ -149,8 +149,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   # miscellaneous
   seed: 1234
diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml
index cff7b4a6a6..d7080d82e2 100644
--- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml
+++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml
@@ -150,8 +150,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/gpt3/mlperf.yaml b/launcher_scripts/conf/training/gpt3/mlperf.yaml
index 55ddb03c9f..cce28bba30 100644
--- a/launcher_scripts/conf/training/gpt3/mlperf.yaml
+++ b/launcher_scripts/conf/training/gpt3/mlperf.yaml
@@ -268,7 +268,3 @@ model:
   gc_interval: 100
   name: megatron_gpt_full_te_layer_autocast
   fp8_params: true
-  tp_comm_split_ag: true
-  tp_comm_split_rs: false
-  tp_comm_atomic_ag: false
-  tp_comm_atomic_rs: true
diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml
index 406309a534..2512468de5 100644
--- a/launcher_scripts/conf/training/llama/llama2_13b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -136,8 +136,6 @@ model:
   ub_tp_comm_overlap: false
   overlap_p2p_comm: true
   batch_p2p_comm: false
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml
index 7fd3863d3a..c91dce0a48 100644
--- a/launcher_scripts/conf/training/llama/llama2_70b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -134,8 +134,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml
index a36466cbb9..3bd2e307af 100644
--- a/launcher_scripts/conf/training/llama/llama2_7b.yaml
+++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -136,8 +136,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
index 1f171f5e02..3403cc1ff9 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_15b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   nsys_profile:
     enabled: False
diff --git a/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
index bf74eb514a..c73b74a6ca 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_22b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: True
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   gc_interval: 100
 
diff --git a/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml b/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
index b8a21bb5e6..200ef44bb5 100644
--- a/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
+++ b/launcher_scripts/conf/training/nemotron/nemotron_8b.yaml
@@ -155,8 +155,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   fp8_wgrad: True
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
 
   nsys_profile:
     enabled: False
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
index 70d02245b0..a0af6a0ca8 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_14b.yaml
@@ -138,8 +138,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
index 6217eb0145..fdcf0c4def 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_4b.yaml
@@ -140,8 +140,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
index 0ba8267c42..00e56ec063 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_72b.yaml
@@ -137,8 +137,6 @@ model:
   fp8_amax_compute_algo: most_recent
   use_emha: false
   ub_tp_comm_overlap: true
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   overlap_p2p_comm: true
   batch_p2p_comm: false
diff --git a/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
index 98bd353954..89d14083fd 100644
--- a/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
+++ b/launcher_scripts/conf/training/qwen2/qwen2_7b.yaml
@@ -140,8 +140,6 @@ model:
   fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history
   use_emha: False
   ub_tp_comm_overlap: False
-  tp_comm_atomic_ag: False
-  tp_comm_atomic_rs: False
   use_flash_attention: true
   optim:
     name: distributed_fused_adam

From f34fc914ba22ffd9cfe13561842266adb8877f04 Mon Sep 17 00:00:00 2001
From: Sangkug Lym
Date: Tue, 21 May 2024 15:43:41 -0700
Subject: [PATCH 2/2] update mlperf config

Signed-off-by: Sangkug Lym
---
 launcher_scripts/conf/training/gpt3/mlperf.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/launcher_scripts/conf/training/gpt3/mlperf.yaml b/launcher_scripts/conf/training/gpt3/mlperf.yaml
index cce28bba30..1befc3f24b 100644
--- a/launcher_scripts/conf/training/gpt3/mlperf.yaml
+++ b/launcher_scripts/conf/training/gpt3/mlperf.yaml
@@ -189,18 +189,18 @@ model:
       aggregate: 0
     fc2_dgrad:
       method: ring_exchange
-      aggregate: 1
+      aggregate: 0
     proj_fprop:
       method: pipeline
       num_sm: 24
       cga_size: 2
       num_splits: 4
       set_sm_margin: 1
+      atomic_gemm: 1
+      fp8_buf: 1
     fc2_fprop:
-      method: pipeline
-      num_sm: 4
-      cga_size: 2
-      num_splits: 4
+      method: ring_exchange
+      num_sm: 1
       set_sm_margin: 1
   use_flash_attention: false
   cpu_offloading: false