diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
index 5dce8e242a..67db288f25 100644
--- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
+++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
@@ -95,7 +95,7 @@ def deepseek_v3_pretrain_config_gb200(
         pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
         virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
         moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-        layout=None,
+        layout=base_cfg.pp_layout,
     )
     set_deepseek_v3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
index ec38533a6d..62baca5d45 100644
--- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
+++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
@@ -42,28 +42,20 @@
     BASE_DEEPSEEK_V3_CONFIG,
     num_gpus=256,
     global_batch_size=2048,
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=4,
-    expert_model_parallel_size=64,
-    moe_flex_dispatcher_backend="hybridep",
-    moe_a2a_overlap=False,
-    cuda_graph_impl="transformer_engine",
-    cuda_graph_scope=["attn", "moe_router", "moe_preprocess"],
-    recompute_modules=["moe_act"],
-)
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
-    BASE_DEEPSEEK_V3_CONFIG,
     micro_batch_size=2,
     pipeline_model_parallel_size=2,
     virtual_pipeline_model_parallel_size=8,
     pp_layout="Et*4|(t*4|)*14tmL",
     expert_model_parallel_size=32,
+    moe_flex_dispatcher_backend="hybridep",
+    moe_a2a_overlap=False,
     cuda_graph_scope=[],
     recompute_modules=["mla_up_proj"],
 )
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
@@ -142,10 +134,7 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
-    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
-    global_batch_size=4096,
-)
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 03db10a459..40b3701419 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -281,6 +281,9 @@ def _set_model_specific_environment_variables(
     if model_family_name == "llama" and model_recipe_name == "llama31_405b" and train_task == "pretrain":
         if compute_dtype == "fp8_cs":
             del_cudnn_ln = False
+    if model_family_name == "deepseek":
+        if compute_dtype == "fp8_mx":
+            del_cudnn_ln = False
     if del_cudnn_ln:
         if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars:
             executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")