[MoE] Add mixtral 3B/7B fp8 configs and grok bf16/fp8 configs #438

Merged · 1 commit · Oct 7, 2024
27 changes: 27 additions & 0 deletions examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
@@ -0,0 +1,27 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=grok/grok1_proxy \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="grok1_proxy_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
+training.model.optim.grad_sync_dtype=bf16 \
training.trainer.num_nodes=64 \
+training.model.context_parallel_size=2 \
training.model.sequence_parallel=True \
training.model.tensor_model_parallel_size=4 \
training.model.pipeline_model_parallel_size=8 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.gc_interval=40
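
These example scripts read their inputs from environment variables rather than hard-coding paths. A minimal invocation sketch with hypothetical dataset and tokenizer locations (replace them with your own; NEMO_FRAMEWORK_LAUNCHER_DIR falls back to /opt/NeMo-Framework-Launcher if unset):

# Hypothetical paths for illustration only
export DATA_DIR=/lustre/data/grok1_proxy_preprocessed
export TOK_PATH=/lustre/tokenizers/grok1/tokenizer.model
bash examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
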
32 changes: 32 additions & 0 deletions examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=grok/grok1_proxy \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="grok1_proxy_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
+training.model.optim.grad_sync_dtype=bf16 \
training.trainer.num_nodes=64 \
+training.model.context_parallel_size=2 \
training.model.sequence_parallel=True \
training.model.tensor_model_parallel_size=4 \
training.model.pipeline_model_parallel_size=8 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.gc_interval=40 \
training.model.fp8=True \
+training.model.fp8_params=True \
+training.model.optim.overlap_param_gather_with_optimizer_step=True \
+training.model.optim.average_in_collective=True
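
The FP8 variant above is the BF16 script plus a handful of precision-related overrides; an easy way to review the delta between the two files added here:

# Compare the BF16 and FP8 Grok proxy launch scripts
diff -u examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh \
examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh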

20 changes: 20 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x3b_fp8.sh
@@ -0,0 +1,20 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.fp8=True \
+training.model.fp8_params=True \
+training.model.optim.overlap_param_gather_with_optimizer_step=False \
+training.model.optim.average_in_collective=True
22 changes: 22 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x7b_fp8.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
training.model.fp8=True \
+training.model.fp8_params=True \
+training.model.optim.overlap_param_gather_with_optimizer_step=True \
+training.model.optim.average_in_collective=True \
training.model.sequence_parallel=False
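
Before submitting a 64-node job it can help to inspect the configuration these overrides compose to. A sketch assuming main.py behaves as a standard Hydra entry point, in which case --cfg job prints the resolved config and exits without launching anything:

# Print the composed config for the Mixtral 8x7B FP8 run without submitting a job
# (--cfg job is standard Hydra behavior; assumed, not verified against this launcher)
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
training.model.fp8=True \
--cfg job
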
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/grok/grok1_proxy.yaml
@@ -91,8 +91,8 @@ model:
num_moe_experts: 8
attention_type: multihead
share_embeddings_and_output_weights: false
-overlap_p2p_comm: false
-batch_p2p_comm: true
+overlap_p2p_comm: true
+batch_p2p_comm: false
seq_len_interpolation_factor: null
num_query_groups: 8
tokenizer:
@@ -149,7 +149,7 @@ model:
optim:
name: mcore_distributed_optim
overlap_grad_sync: true
-overlap_param_sync: false
+overlap_param_sync: true
lr: 0.00012
weight_decay: 0.1
betas:
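
The new defaults above (point-to-point communication overlap instead of batched P2P, plus overlapped parameter sync in the distributed optimizer) can be flipped back for a single run without editing the YAML. A sketch using the same Hydra override pattern as the launch scripts; the override paths are inferred from the config structure shown above and should be double-checked against your release:

# Revert to the previous communication/optimizer-sync defaults for one run (paths inferred, not verified)
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=grok/grok1_proxy \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.model.tokenizer.model=${TOK_PATH} \
training.model.overlap_p2p_comm=False \
training.model.batch_p2p_comm=True \
training.model.optim.overlap_param_sync=False
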
2 changes: 1 addition & 1 deletion launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -160,7 +160,7 @@ model:
warmup_steps: 107
constant_steps: 11873
min_lr: 1.0e-05
-gc_interval: 0
+gc_interval: 40
precision: bf16
mcore_customization_config:
new_decoder_architecture: false
2 changes: 1 addition & 1 deletion launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -161,7 +161,7 @@ model:
warmup_steps: 636
constant_steps: 11873
min_lr: 1.0e-05
-gc_interval: 0
+gc_interval: 60
precision: bf16
mcore_customization_config:
new_decoder_architecture: false