add fp8 configs and grok
Signed-off-by: gaod <[email protected]>
gdengk committed Oct 7, 2024
1 parent d0d3902 commit 5df9c65
Showing 7 changed files with 106 additions and 5 deletions.
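
To reproduce this view locally, a minimal sketch assuming a clone of NeMo-Framework-Launcher that already contains this commit:

# Per-file change summary for this commit, then the full diff.
git show --stat 5df9c65
git show 5df9c65
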
27 changes: 27 additions & 0 deletions examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
@@ -0,0 +1,27 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=grok/grok1_proxy \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="grok1_proxy_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
+training.model.optim.grad_sync_dtype=bf16 \
training.trainer.num_nodes=64 \
+training.model.context_parallel_size=2 \
training.model.sequence_parallel=True \
training.model.tensor_model_parallel_size=4 \
training.model.pipeline_model_parallel_size=8 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.gc_interval=40
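
The overrides prefixed with + (for example +training.model.context_parallel_size=2) use Hydra's append syntax to add keys that are not present in the base grok1_proxy config. A minimal launch sketch, assuming the launcher lives at /opt/NeMo-Framework-Launcher and using placeholder paths for the preprocessed dataset and tokenizer (substitute your own):

# Placeholder paths; point these at your preprocessed data and tokenizer model.
export DATA_DIR=/path/to/preprocessed_data
export TOK_PATH=/path/to/tokenizer.model
bash examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
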
32 changes: 32 additions & 0 deletions examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh
@@ -0,0 +1,32 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=grok/grok1_proxy \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="grok1_proxy_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
training.model.moe_grouped_gemm=False \
training.model.gradient_accumulation_fusion=True \
+training.model.optim.grad_sync_dtype=bf16 \
training.trainer.num_nodes=64 \
+training.model.context_parallel_size=2 \
training.model.sequence_parallel=True \
training.model.tensor_model_parallel_size=4 \
training.model.pipeline_model_parallel_size=8 \
training.model.virtual_pipeline_model_parallel_size=8 \
training.model.gc_interval=40 \
training.model.fp8=True \
+training.model.fp8_params=True \
+training.model.optim.overlap_param_gather_with_optimizer_step=True \
+training.model.optim.average_in_collective=True \
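
This fp8 variant adds the FP8-related overrides (fp8, fp8_params, overlap_param_gather_with_optimizer_step, average_in_collective) on top of the bf16 launch above, so both precision variants can be submitted for comparison; a hedged sketch, assuming both scripts live at the paths shown and DATA_DIR/TOK_PATH are already exported:

# Submit the bf16 and fp8 Grok-1 proxy configs one after the other.
for variant in bf16 fp8; do
    bash examples/training/grok1-proxy/h100/grok1_proxy_${variant}.sh
done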

20 changes: 20 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x3b_fp8.sh
@@ -0,0 +1,20 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x3b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x3b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.fp8=True \
+training.model.fp8_params=True \
+training.model.optim.overlap_param_gather_with_optimizer_step=False \
+training.model.optim.average_in_collective=True \
22 changes: 22 additions & 0 deletions examples/training/mixtral/h100/mixtral_8x7b_fp8.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=mixtral/mixtral_8x7b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="mixtral_8x7b_bf16" \
training.run.time_limit=0:30:00 \
training.model.tokenizer.model=${TOK_PATH} \
+env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
training.model.fp8=True \
+training.model.fp8_params=True \
+training.model.optim.overlap_param_gather_with_optimizer_step=True \
+training.model.optim.average_in_collective=True \
training.model.sequence_parallel=False \
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/grok/grok1_proxy.yaml
@@ -91,8 +91,8 @@ model:
num_moe_experts: 8
attention_type: multihead
share_embeddings_and_output_weights: false
- overlap_p2p_comm: false
- batch_p2p_comm: true
+ overlap_p2p_comm: true
+ batch_p2p_comm: false
seq_len_interpolation_factor: null
num_query_groups: 8
tokenizer:
@@ -149,7 +149,7 @@ model:
optim:
name: mcore_distributed_optim
overlap_grad_sync: true
- overlap_param_sync: false
+ overlap_param_sync: true
lr: 0.00012
weight_decay: 0.1
betas:
2 changes: 1 addition & 1 deletion launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -160,7 +160,7 @@ model:
warmup_steps: 107
constant_steps: 11873
min_lr: 1.0e-05
- gc_interval: 0
+ gc_interval: 40
precision: bf16
mcore_customization_config:
new_decoder_architecture: false
2 changes: 1 addition & 1 deletion launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -161,7 +161,7 @@ model:
warmup_steps: 636
constant_steps: 11873
min_lr: 1.0e-05
- gc_interval: 0
+ gc_interval: 60
precision: bf16
mcore_customization_config:
new_decoder_architecture: false
