Merge branch 'main' into alit/mamba

JRD971000 authored Jul 12, 2024
2 parents 8c7e32b + 8250482 commit e046ac7
Showing 16 changed files with 93 additions and 2 deletions.
26 changes: 26 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x3b_bf16.sh
@@ -0,0 +1,26 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.pipeline_model_parallel_size=1 \
training.model.virtual_pipeline_model_parallel_size=null \
training.model.expert_model_parallel_size=8 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
training.model.optim.name=mcore_distributed_optim \
training.model.optim.overlap_grad_sync=True \
training.model.optim.overlap_param_sync=True \
training.model.optim.grad_sync_dtype=bf16 \
env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
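For context, a minimal usage sketch for this new 8x3B recipe follows; the paths are illustrative placeholders and are not part of the commit. The script only requires DATA_DIR and TOK_PATH to be set, and falls back to /opt/NeMo-Framework-Launcher for the launcher directory.

# Illustrative invocation (placeholder paths; assumes the NeMo Framework container):
export DATA_DIR=/path/to/preprocessed_data          # hypothetical dataset location
export TOK_PATH=/path/to/mixtral/tokenizer.model    # hypothetical tokenizer path
bash examples/training/mixtral/h100/mixtral_8x3b_bf16.sh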
30 changes: 30 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x7b_bf16.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.tensor_model_parallel_size=1 \
training.model.pipeline_model_parallel_size=4 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.expert_model_parallel_size=8 \
training.model.sequence_parallel=False \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
training.model.overlap_p2p_comm=True \
training.model.batch_p2p_comm=False \
training.model.optim.name=mcore_distributed_optim \
training.model.optim.overlap_grad_sync=True \
training.model.optim.overlap_param_sync=True \
training.model.optim.grad_sync_dtype=bf16 \
env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
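The last override in both scripts sets NCCL_P2P_NET_CHUNKSIZE through the launcher's env_vars passthrough. A rough shell equivalent, under the assumption that env_vars.* entries are simply exported into the job environment, would be:

# Assumed shell equivalent of the env_vars override (2097152 bytes = 2 MiB),
# presumably enlarging the chunk size used for NCCL point-to-point network traffic:
export NCCL_P2P_NET_CHUNKSIZE=2097152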
2 changes: 1 addition & 1 deletion launcher_scripts/conf/training/gpt3/mlperf-24n.yaml
@@ -302,7 +302,7 @@ model:
name: megatron_gpt_full_te_layer_autocast
use_tp_pp_dp_mapping: false
fp8_params: true
-enable_cuda_graph: 1
+enable_cuda_graph: False # TODO: set to true once cuda graph functionality is fully supported
defer_embedding_wgrad_compute: true
use_te_rng_tracker: true
tp_comm_overlap_ag: true
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -47,6 +47,9 @@ exp_manager:
model:
mcore_gpt: true
moe_grouped_gemm: true
+moe_token_dispatcher_type: alltoall
+moe_pad_expert_input_to_capacity: True
+moe_expert_capacity_factor: 1.0
micro_batch_size: 1
global_batch_size: 128
rampup_batch_size: null
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -48,6 +48,8 @@ model:
mcore_gpt: true
moe_grouped_gemm: true
moe_token_dispatcher_type: alltoall
+moe_pad_expert_input_to_capacity: True
+moe_expert_capacity_factor: 1.0
moe_aux_loss_coeff: 0.01
micro_batch_size: 1
global_batch_size: 256
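Both Mixtral configs now pad expert inputs to a fixed capacity (capacity factor 1.0) with the alltoall token dispatcher, which appears intended to give each expert a fixed-size input per step. If these settings need tuning for an experiment, the same dotted-path overrides used in the scripts above should apply; a hedged sketch, not part of the commit:

# Hypothetical one-off override of the new MoE capacity settings
# (values shown mirror the YAML defaults added here):
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=mixtral/mixtral_8x7b \
    stages=[training] \
    data_dir=${DATA_DIR} \
    launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
    training.model.moe_pad_expert_input_to_capacity=True \
    training.model.moe_expert_capacity_factor=1.0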
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/11b.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/170m.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 64
global_batch_size: 2048 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/23b.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/390m.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 32
global_batch_size: 2048 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/mt5/3b.yaml
@@ -53,6 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/11b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/220m.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 64
global_batch_size: 2048 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/23b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/3b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/t5/41b.yaml
@@ -51,6 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False

# model parallelism
micro_batch_size: 6
global_batch_size: 1920 # will use more micro batches to reach global batch size
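The ten T5/mT5 config changes above are identical: they pin mcore_t5 and transformer_engine to False, keeping these models on the legacy (non-Megatron-core, non-Transformer-Engine) code path. Should someone want to experiment with the mcore path anyway, the usual dotted-path overrides should apply; a hedged sketch, not part of the commit and with no guarantee that the path is supported:

# Hypothetical override to opt a T5 run back into the mcore/Transformer Engine path
# (support status is not established by this commit):
python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=t5/220m \
    stages=[training] \
    data_dir=${DATA_DIR} \
    launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
    training.model.mcore_t5=True \
    training.model.transformer_engine=True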
@@ -106,7 +106,7 @@ spec:
- 'cd /opt/NeMo;
git rev-parse HEAD;
nvidia-smi;
-export PYTHONPATH=/opt/NeMo:\${PYTHONPATH};
+export PYTHONPATH=/opt/NeMo:${PYTHONPATH};
{{ if ne $config.wandbKey "nil" }}
wandb login {{ $config.wandbKey }} &&
{{ end }}
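The one-line template change above appears to drop a backslash escape so that the container's existing PYTHONPATH is expanded and preserved rather than appended as literal text. A minimal shell illustration of the difference, assuming the command reaches the shell unmodified:

export PYTHONPATH=/opt/NeMo:\${PYTHONPATH}   # backslash keeps the literal string ${PYTHONPATH}
export PYTHONPATH=/opt/NeMo:${PYTHONPATH}    # expands any existing PYTHONPATH as intended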
