diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
index 680c83466f..5dce8e242a 100644
--- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
+++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
@@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300(
         pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
         virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
         moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-        layout=None,
+        layout=base_cfg.pp_layout,
     )
     set_deepseek_v3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
index c4f61555a0..ec38533a6d 100644
--- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
+++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py
@@ -54,7 +54,16 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
+    BASE_DEEPSEEK_V3_CONFIG,
+    micro_batch_size=2,
+    pipeline_model_parallel_size=2,
+    virtual_pipeline_model_parallel_size=8,
+    pp_layout="Et*4|(t*4|)*14tmL",
+    expert_model_parallel_size=32,
+    cuda_graph_scope=[],
+    recompute_modules=["mla_up_proj"],
+)
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
@@ -133,7 +142,10 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
+    global_batch_size=4096,
+)
 
 
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 5fb0595da6..4a81071943 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
             self.train_task,
         )
 
+        # Set NVFP4-specific environment variables
+        if self.compute_dtype == "nvfp4":
+            executor.env_vars["NVTE_USE_FAST_MATH"] = "1"
+
 
 @dataclass
 class PyTorchProfilerPluginScriptArgs:
diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py
index 5f623480ec..ca103b243f 100644
--- a/scripts/performance/utils/overrides.py
+++ b/scripts/performance/utils/overrides.py
@@ -359,7 +359,7 @@ def set_post_overrides(
     dp = int(num_gpus / (tp * pp * cp))
     logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
     ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved.
-    if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx":
+    if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
         recipe.optimizer.overlap_param_gather_with_optimizer_step = True
     if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
         recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
diff --git a/scripts/performance/utils/utils.py b/scripts/performance/utils/utils.py
index d29b87011c..3de04c7166 100644
--- a/scripts/performance/utils/utils.py
+++ b/scripts/performance/utils/utils.py
@@ -62,6 +62,9 @@ class WorkloadBaseConfig:
     moe_a2a_overlap: Optional[bool] = False
     peft: Optional[str] = None
 
+    # Pipeline parallelism layout
+    pp_layout: Optional[str] = None
+
     @property
     def sequence_parallel(self) -> bool:
         """Get the sequence parallel flag."""