From 5df9c6587a9dca4c9c793c4974fe1bf74da5484d Mon Sep 17 00:00:00 2001
From: gaod
Date: Mon, 7 Oct 2024 09:49:36 -0700
Subject: [PATCH] add fp8 configs and grok

Signed-off-by: gaod
---
 .../grok1-proxy/h100/grok1_proxy_bf16.sh      | 27 ++++++++++++++++
 .../grok1-proxy/h100/grok1_proxy_fp8.sh       | 32 +++++++++++++++++++
 .../training/mixtral/h100/mixtral_8x3b_fp8.sh | 20 ++++++++++++
 .../training/mixtral/h100/mixtral_8x7b_fp8.sh | 22 +++++++++++++
 .../conf/training/grok/grok1_proxy.yaml       |  6 ++--
 .../conf/training/mixtral/mixtral_8x3b.yaml   |  2 +-
 .../conf/training/mixtral/mixtral_8x7b.yaml   |  2 +-
 7 files changed, 106 insertions(+), 5 deletions(-)
 create mode 100644 examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
 create mode 100644 examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh
 create mode 100644 examples/training/mixtral/h100/mixtral_8x3b_fp8.sh
 create mode 100644 examples/training/mixtral/h100/mixtral_8x7b_fp8.sh

diff --git a/examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh b/examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
new file mode 100644
index 0000000000..4524f9c4c0
--- /dev/null
+++ b/examples/training/grok1-proxy/h100/grok1_proxy_bf16.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training=grok/grok1_proxy \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="grok1_proxy_bf16" \
+training.run.time_limit=0:30:00 \
+training.model.tokenizer.model=${TOK_PATH} \
++env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
+training.model.moe_grouped_gemm=False \
+training.model.gradient_accumulation_fusion=True \
++training.model.optim.grad_sync_dtype=bf16 \
+training.trainer.num_nodes=64 \
++training.model.context_parallel_size=2 \
+training.model.sequence_parallel=True \
+training.model.tensor_model_parallel_size=4 \
+training.model.pipeline_model_parallel_size=8 \
+training.model.virtual_pipeline_model_parallel_size=8 \
+training.model.gc_interval=40
diff --git a/examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh b/examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh
new file mode 100644
index 0000000000..fe1d8770da
--- /dev/null
+++ b/examples/training/grok1-proxy/h100/grok1_proxy_fp8.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training=grok/grok1_proxy \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="grok1_proxy_fp8" \
+training.run.time_limit=0:30:00 \
+training.model.tokenizer.model=${TOK_PATH} \
++env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
+training.model.moe_grouped_gemm=False \
+training.model.gradient_accumulation_fusion=True \
++training.model.optim.grad_sync_dtype=bf16 \
+training.trainer.num_nodes=64 \
++training.model.context_parallel_size=2 \
+training.model.sequence_parallel=True \
+training.model.tensor_model_parallel_size=4 \
+training.model.pipeline_model_parallel_size=8 \
+training.model.virtual_pipeline_model_parallel_size=8 \
+training.model.gc_interval=40 \
+training.model.fp8=True \
++training.model.fp8_params=True \
++training.model.optim.overlap_param_gather_with_optimizer_step=True \
++training.model.optim.average_in_collective=True \
+
diff --git a/examples/training/mixtral/h100/mixtral_8x3b_fp8.sh b/examples/training/mixtral/h100/mixtral_8x3b_fp8.sh
new file mode 100644
index 0000000000..44ee4ac387
--- /dev/null
+++ b/examples/training/mixtral/h100/mixtral_8x3b_fp8.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training=mixtral/mixtral_8x3b \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="mixtral_8x3b_fp8" \
+training.run.time_limit=0:30:00 \
+training.model.tokenizer.model=${TOK_PATH} \
+training.model.fp8=True \
++training.model.fp8_params=True \
++training.model.optim.overlap_param_gather_with_optimizer_step=False \
++training.model.optim.average_in_collective=True \
diff --git a/examples/training/mixtral/h100/mixtral_8x7b_fp8.sh b/examples/training/mixtral/h100/mixtral_8x7b_fp8.sh
new file mode 100644
index 0000000000..012a1374cf
--- /dev/null
+++ b/examples/training/mixtral/h100/mixtral_8x7b_fp8.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+#Users should setup their cluster type in /launcher_scripts/conf/config.yaml
+NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
+DATA_DIR=${DATA_DIR}
+TOK_PATH=${TOK_PATH}
+
+HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
+training=mixtral/mixtral_8x7b \
+stages=[training] \
+data_dir=${DATA_DIR} \
+launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
+base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
+training.run.name="mixtral_8x7b_fp8" \
+training.run.time_limit=0:30:00 \
+training.model.tokenizer.model=${TOK_PATH} \
++env_vars.NCCL_P2P_NET_CHUNKSIZE=2097152 \
+training.model.fp8=True \
++training.model.fp8_params=True \
++training.model.optim.overlap_param_gather_with_optimizer_step=True \
++training.model.optim.average_in_collective=True \
+training.model.sequence_parallel=False \
diff --git a/launcher_scripts/conf/training/grok/grok1_proxy.yaml b/launcher_scripts/conf/training/grok/grok1_proxy.yaml
index 7719fa72c1..6da0d0ee77 100644
--- a/launcher_scripts/conf/training/grok/grok1_proxy.yaml
+++ b/launcher_scripts/conf/training/grok/grok1_proxy.yaml
@@ -91,8 +91,8 @@ model:
   num_moe_experts: 8
   attention_type: multihead
   share_embeddings_and_output_weights: false
-  overlap_p2p_comm: false
-  batch_p2p_comm: true
+  overlap_p2p_comm: true
+  batch_p2p_comm: false
   seq_len_interpolation_factor: null
   num_query_groups: 8
   tokenizer:
@@ -149,7 +149,7 @@ model:
   optim:
     name: mcore_distributed_optim
     overlap_grad_sync: true
-    overlap_param_sync: false
+    overlap_param_sync: true
     lr: 0.00012
     weight_decay: 0.1
     betas:
diff --git a/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml b/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
index 0e12a99ae3..9a1fa1ea47 100644
--- a/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
+++ b/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -160,7 +160,7 @@ model:
       warmup_steps: 107
       constant_steps: 11873
       min_lr: 1.0e-05
-  gc_interval: 0
+  gc_interval: 40
   precision: bf16
   mcore_customization_config:
     new_decoder_architecture: false
diff --git a/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml b/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
index 02766d4b80..9184b3abbd 100644
--- a/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
+++ b/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -161,7 +161,7 @@ model:
       warmup_steps: 636
       constant_steps: 11873
       min_lr: 1.0e-05
-  gc_interval: 0
+  gc_interval: 60
   precision: bf16
   mcore_customization_config:
     new_decoder_architecture: false
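
For reference, a minimal usage sketch of how one of the new recipe scripts could be invoked once this patch is applied. The environment variable names and the script path come from the patch itself; the dataset and tokenizer paths below are placeholders you would replace with your own, and the cluster type is assumed to be configured separately in launcher_scripts/conf/config.yaml as the script comments note.

    #!/bin/bash
    # Usage sketch (assumed placeholder paths): point the launcher at your checkout,
    # preprocessed dataset, and tokenizer model, then run one of the new FP8 recipes.
    export NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher   # default used by the scripts
    export DATA_DIR=/path/to/preprocessed_data                        # placeholder dataset directory
    export TOK_PATH=/path/to/tokenizer.model                          # placeholder tokenizer model

    # Cluster type must already be set in launcher_scripts/conf/config.yaml.
    bash ${NEMO_FRAMEWORK_LAUNCHER_DIR}/examples/training/mixtral/h100/mixtral_8x3b_fp8.sh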