diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
index d30c9b2f14..4c4235c7d2 100644
--- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
+++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
@@ -210,7 +210,11 @@ def deepseek_v3_pretrain_config_h100(
     cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
     cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
     cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
-    cfg.model.pipeline_model_parallel_layout = "Et|(tt|)*30mL"
+    if base_cfg.pp_layout:
+        cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
+    else:
+        # Recompute layout based on updated PP/VP sizes
+        set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)
 
     set_deepseek_v3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
index 6de05c2eba..ca06801718 100644
--- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
+++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
@@ -120,7 +120,7 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V1 = replace(
     BASE_DEEPSEEK_V3_CONFIG,
     num_gpus=1024,
-    tensor_model_parallel_size=4,  # TODO: TP=2 is OOM. Resolve it and revert it to recover perf
+    tensor_model_parallel_size=2,
     pipeline_model_parallel_size=8,
     virtual_pipeline_model_parallel_size=4,
     expert_model_parallel_size=64,
@@ -128,6 +128,7 @@
     recompute_modules=["mla_up_proj", "mlp"],
     moe_flex_dispatcher_backend="hybridep",
     moe_a2a_overlap=False,
+    pp_layout="Et|(tt|)*30mL",
 )
 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V1
@@ -189,7 +190,11 @@
 )
 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V2,
+    virtual_pipeline_model_parallel_size=2,
+    pp_layout=None,
+)
 
 
 # =============================================================================
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 40b3701419..84ffe3a970 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -267,6 +267,14 @@ def _set_model_specific_environment_variables(
 ):
     if compute_dtype in ["fp8_cs", "fp8_mx"]:
         executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+    elif (
+        model_family_name in ["deepseek"]
+        and model_recipe_name in ["deepseek_v3"]
+        and train_task == "pretrain"
+        and gpu in ["h100"]
+    ):
+        executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+        del_cudnn_ln = True
 
     if gpu in ["h100"]:
         if model_family_name == "llama" and model_recipe_name == "llama3_8b" and train_task == "pretrain":