Merge branch 'main' into alit/mamba

JRD971000 authored Jul 12, 2024
2 parents 8c7e32b + 8250482 commit e046ac7
Showing 16 changed files with 93 additions and 2 deletions.
26 changes: 26 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.pipeline_model_parallel_size=1 \
training.model.virtual_pipeline_model_parallel_size=null \
training.model.expert_model_parallel_size=8 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
training.model.optim.name=mcore_distributed_optim \
training.model.optim.overlap_grad_sync=True \
training.model.optim.overlap_param_sync=True \
training.model.optim.grad_sync_dtype=bf16 \
env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
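For context, a minimal usage sketch for this new 8x3B recipe follows; the paths are illustrative placeholders and are not part of the commit. The script only requires DATA_DIR and TOK_PATH to be set, and falls back to /opt/NeMo-Framework-Launcher for the launcher directory.

# Illustrative invocation (placeholder paths; assumes the NeMo Framework container):
export DATA_DIR=/path/to/preprocessed_data          # hypothetical dataset location
export TOK_PATH=/path/to/mixtral/tokenizer.model    # hypothetical tokenizer path
bash examples/training/mixtral/h100/mixtral_8x3b_bf16.sh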
30 changes: 30 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.tensor_model_parallel_size=1 \
training.model.pipeline_model_parallel_size=4 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.expert_model_parallel_size=8 \
training.model.sequence_parallel=False \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
training.model.overlap_p2p_comm=True \
training.model.batch_p2p_comm=False \
training.model.optim.name=mcore_distributed_optim \
training.model.optim.overlap_grad_sync=True \
training.model.optim.overlap_param_sync=True \
training.model.optim.grad_sync_dtype=bf16 \
env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
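The last override in both scripts sets NCCL_P2P_NET_CHUNKSIZE through the launcher's env_vars passthrough. A rough shell equivalent, under the assumption that env_vars.* entries are simply exported into the job environment, would be:

# Assumed shell equivalent of the env_vars override (2097152 bytes = 2 MiB),
# presumably enlarging the chunk size used for NCCL point-to-point network traffic:
export NCCL_P2P_NET_CHUNKSIZE=2097152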
2 changes: 1 addition & 1 deletion launcher_scripts/conf/training/gpt3/mlperf-24n.yaml
@@ -302,7 +302,7 @@ model:
name: megatron_gpt_full_te_layer_autocast
use_tp_pp_dp_mapping: false
fp8_params: true
-enable_cuda_graph: 1
+enable_cuda_graph: False # TODO: set to true once cuda graph functionality is fully supported
defer_embedding_wgrad_compute: true
use_te_rng_tracker: true
tp_comm_overlap_ag: true
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -47,6 +47,9 @@ exp_manager:
model:
mcore_gpt: true
moe_grouped_gemm: true
+moe_token_dispatcher_type: alltoall
+moe_pad_expert_input_to_capacity: True
+moe_expert_capacity_factor: 1.0
micro_batch_size: 1
global_batch_size: 128
rampup_batch_size: null
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -48,6 +48,8 @@ model:
mcore_gpt: true
moe_grouped_gemm: true
moe_token_dispatcher_type: alltoall
+moe_pad_expert_input_to_capacity: True
+moe_expert_capacity_factor: 1.0
moe_aux_loss_coeff: 0.01
micro_batch_size: 1
global_batch_size: 256
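Both Mixtral configs now pad expert inputs to a fixed capacity (capacity factor 1.0) with the alltoall token dispatcher, which appears intended to give each expert a fixed-size input per step. If these settings need tuning for an experiment, the same dotted-path overrides used in the scripts above should apply; a hedged sketch, not part of the commit:

# Hypothetical one-off override of the new MoE capacity settings
# (values shown mirror the YAML defaults added here):
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=mixtral/mixtral_8x7b \
    stages=[training] \
    data_dir=${DATA_DIR} \
    launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
    training.model.moe_pad_expert_input_to_capacity=True \
    training.model.moe_expert_capacity_factor=1.0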
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/11b.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/170m.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 64
global_batch_size: 2048 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/23b.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/390m.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 32
global_batch_size: 2048 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/3b.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/11b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/220m.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 64
global_batch_size: 2048 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/23b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/3b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/41b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 6
global_batch_size: 1920 # will use more micro batches to reach global batch size
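The ten T5/mT5 config changes above are identical: they pin mcore_t5 and transformer_engine to False, keeping these models on the legacy (non-Megatron-core, non-Transformer-Engine) code path. Should someone want to experiment with the mcore path anyway, the usual dotted-path overrides should apply; a hedged sketch, not part of the commit and with no guarantee that the path is supported:

# Hypothetical override to opt a T5 run back into the mcore/Transformer Engine path
# (support status is not established by this commit):
python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=t5/220m \
    stages=[training] \
    data_dir=${DATA_DIR} \
    launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
    training.model.mcore_t5=True \
    training.model.transformer_engine=True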
@@ -106,7 +106,7 @@ spec:
- 'cd /opt/NeMo;
git rev-parse HEAD;
nvidia-smi;
-export PYTHONPATH=/opt/NeMo:\${PYTHONPATH};
+export PYTHONPATH=/opt/NeMo:${PYTHONPATH};
{{ if ne $config.wandbKey "nil" }}
wandb login {{ $config.wandbKey }} &&
{{ end }}
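The one-line template change above appears to drop a backslash escape so that the container's existing PYTHONPATH is expanded and preserved rather than appended as literal text. A minimal shell illustration of the difference, assuming the command reaches the shell unmodified:

export PYTHONPATH=/opt/NeMo:\${PYTHONPATH}   # backslash keeps the literal string ${PYTHONPATH}
export PYTHONPATH=/opt/NeMo:${PYTHONPATH}    # expands any existing PYTHONPATH as intended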
