From de3d019144a64bc8e6ef11391cc17c18cc8f5e09 Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Mon, 26 Jan 2026 16:39:05 -0800 Subject: [PATCH 1/3] dsv3 nvfp4 gb300 Signed-off-by: Dingqing Yang --- .../configs/deepseek/deepseek_llm_pretrain.py | 2 +- .../deepseek_workload_base_configs.py | 20 +++++++++++++++++-- scripts/performance/utils/overrides.py | 2 +- scripts/performance/utils/utils.py | 3 +++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py index 680c83466f..5dce8e242a 100644 --- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py +++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py @@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300( pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout=None, + layout=base_cfg.pp_layout, ) set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py index c4f61555a0..9d9c979b59 100644 --- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py +++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py @@ -54,7 +54,20 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 -DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 +DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace( + BASE_DEEPSEEK_V3_CONFIG, + num_gpus=256, + global_batch_size=2048, + 
micro_batch_size=2, + pipeline_model_parallel_size=2, + virtual_pipeline_model_parallel_size=8, + pp_layout="Et*4|(t*4|)*14tmL", + expert_model_parallel_size=32, + moe_flex_dispatcher_backend="hybridep", + moe_a2a_overlap=False, + cuda_graph_scope=[], + recompute_modules=["mla_up_proj"], +) DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace( @@ -133,7 +146,10 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 -DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 +DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace( + DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1, + global_batch_size=4096, +) DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace( diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py index 291e303461..b8fb0b7a39 100644 --- a/scripts/performance/utils/overrides.py +++ b/scripts/performance/utils/overrides.py @@ -357,7 +357,7 @@ def set_post_overrides( dp = int(num_gpus / (tp * pp * cp)) logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}") ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved. 
- if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx": + if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"): recipe.optimizer.overlap_param_gather_with_optimizer_step = True if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig): recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True diff --git a/scripts/performance/utils/utils.py b/scripts/performance/utils/utils.py index d29b87011c..3de04c7166 100644 --- a/scripts/performance/utils/utils.py +++ b/scripts/performance/utils/utils.py @@ -62,6 +62,9 @@ class WorkloadBaseConfig: moe_a2a_overlap: Optional[bool] = False peft: Optional[str] = None + # Pipeline parallelism layout + pp_layout: Optional[str] = None + @property def sequence_parallel(self) -> bool: """Get the sequence parallel flag.""" From daf08abe8a8691a57edf3761f4605e686559731e Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Tue, 27 Jan 2026 00:21:12 -0800 Subject: [PATCH 2/3] enable fast math Signed-off-by: Dingqing Yang --- scripts/performance/perf_plugins.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 5fb0595da6..4a81071943 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo self.train_task, ) + # Set NVFP4-specific environment variables + if self.compute_dtype == "nvfp4": + executor.env_vars["NVTE_USE_FAST_MATH"] = "1" + @dataclass class PyTorchProfilerPluginScriptArgs: From 0aaad4d9a021fc9651f755ba1f3d5265d407aebc Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Tue, 27 Jan 2026 11:14:03 -0800 Subject: [PATCH 3/3] remove redundant override Signed-off-by: Dingqing Yang --- .../configs/deepseek/deepseek_workload_base_configs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py index 9d9c979b59..ec38533a6d 100644 --- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py +++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py @@ -56,15 +56,11 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace( BASE_DEEPSEEK_V3_CONFIG, - num_gpus=256, - global_batch_size=2048, micro_batch_size=2, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=8, pp_layout="Et*4|(t*4|)*14tmL", expert_model_parallel_size=32, - moe_flex_dispatcher_backend="hybridep", - moe_a2a_overlap=False, cuda_graph_scope=[], recompute_modules=["mla_up_proj"], )