diff --git a/scripts/performance/configs/llama/llama3_workload_base_configs.py b/scripts/performance/configs/llama/llama3_workload_base_configs.py index affd813dba..bea561672b 100644 --- a/scripts/performance/configs/llama/llama3_workload_base_configs.py +++ b/scripts/performance/configs/llama/llama3_workload_base_configs.py @@ -500,8 +500,8 @@ num_gpus=32, peft="none", tensor_model_parallel_size=1, - pipeline_model_parallel_size=4, - virtual_pipeline_model_parallel_size=5, + pipeline_model_parallel_size=2, + virtual_pipeline_model_parallel_size=20, micro_batch_size=1, global_batch_size=32, cuda_graph_impl="transformer_engine", @@ -517,9 +517,9 @@ BASE_LLAMA3_70B_CONFIG, num_gpus=32, peft="none", - tensor_model_parallel_size=2, - pipeline_model_parallel_size=4, - virtual_pipeline_model_parallel_size=5, + tensor_model_parallel_size=1, + pipeline_model_parallel_size=8, + virtual_pipeline_model_parallel_size=10, micro_batch_size=1, global_batch_size=32, cuda_graph_impl="transformer_engine", @@ -527,7 +527,10 @@ ) LLAMA3_70B_SFT_CONFIG_GB200_BF16_V1 = _LLAMA3_70B_SFT_CONFIG_GB200 -LLAMA3_70B_SFT_CONFIG_GB200_FP8_CS_V1 = _LLAMA3_70B_SFT_CONFIG_GB200 +LLAMA3_70B_SFT_CONFIG_GB200_FP8_CS_V1 = replace( + _LLAMA3_70B_SFT_CONFIG_GB200, + pipeline_model_parallel_size=4, +) LLAMA3_70B_SFT_CONFIG_GB200_FP8_MX_V1 = _LLAMA3_70B_SFT_CONFIG_GB200 @@ -543,8 +546,11 @@ ) LLAMA3_70B_SFT_CONFIG_H100_BF16_V1 = _LLAMA3_70B_SFT_CONFIG_H100 -LLAMA3_70B_SFT_CONFIG_H100_FP8_CS_V1 = _LLAMA3_70B_SFT_CONFIG_H100 - +LLAMA3_70B_SFT_CONFIG_H100_FP8_CS_V1 = replace( + _LLAMA3_70B_SFT_CONFIG_H100, + cuda_graph_impl="transformer_engine", + cuda_graph_scope="mlp", +) # ============================================================================= # Llama3 70B finetune (LoRA) presets - V1 (only version) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 84ffe3a970..ff4eb52239 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -292,6 +292,12 @@ def _set_model_specific_environment_variables( if model_family_name == "deepseek": if compute_dtype == "fp8_mx": del_cudnn_ln = False + if model_family_name in ["llama"] and train_task in ["sft"]: + # TODO: Verify for H100 and 8b + del_cudnn_ln = False + if gpu in ["h100"] and model_recipe_name in ["llama3_70b"] and compute_dtype == "fp8_cs": + executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + executor.env_vars["NCCL_GRAPH_REGISTER"] = "0" if del_cudnn_ln: if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars: executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN") @@ -448,6 +454,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo if self.model_recipe_name in ["llama3_70b", "llama31_405b"] and self.train_task == "pretrain" else None ) + nccl_pp_comm_chunksize = ( + 2097152 if self.model_family_name in ["llama"] and self.train_task in ["sft"] else None + ) self._set_nccl_pp_comm_chunksize(task, executor, nccl_pp_comm_chunksize, pp_size) # Configure manual garbage collection