@@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300(
        pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
        virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
        moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-       layout=None,
+       layout=base_cfg.pp_layout,
    )
    set_deepseek_v3_common_configs(cfg)
    set_workload_base_configs(cfg, base_cfg)
@@ -54,7 +54,16 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
+    BASE_DEEPSEEK_V3_CONFIG,
+    micro_batch_size=2,
+    pipeline_model_parallel_size=2,
+    virtual_pipeline_model_parallel_size=8,
+    pp_layout="Et*4|(t*4|)*14tmL",
+    expert_model_parallel_size=32,
+    cuda_graph_scope=[],
+    recompute_modules=["mla_up_proj"],
+)


DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
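Note on the new pp_layout string: assuming Megatron-style pipeline-layout syntax (E = embedding, t = transformer decoder layer, m = MTP layer, L = loss head; "|" separates virtual stages, "*n" repeats, parentheses group), my reading of "Et*4|(t*4|)*14tmL" is a 16-virtual-stage split, which lines up with pipeline_model_parallel_size=2 times virtual_pipeline_model_parallel_size=8. A quick arithmetic check in Python:

# Hedged reading of pp_layout="Et*4|(t*4|)*14tmL" under the syntax assumed above:
#   1 stage:   E + 4t     -> embedding plus 4 decoder layers
#   14 stages: 4t each    -> 56 decoder layers
#   1 stage:   t + m + L  -> final decoder layer, MTP layer, loss head
assert 1 + 14 + 1 == 2 * 8       # virtual stages == pp * vp
assert 4 + 14 * 4 + 1 == 61      # DeepSeek-V3's 61 decoder layers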
@@ -133,7 +142,10 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
+    global_batch_size=4096,
+)


DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
scripts/performance/perf_plugins.py (4 additions, 0 deletions)
@@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executor"):
            self.train_task,
        )

+       # Set NVFP4-specific environment variables
+       if self.compute_dtype == "nvfp4":
+           executor.env_vars["NVTE_USE_FAST_MATH"] = "1"

@dataclass
class PyTorchProfilerPluginScriptArgs:
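For context on the hook above: anything placed in executor.env_vars during a plugin's setup() is exported into the launched job's environment, so the flag travels with nvfp4 runs only. A minimal, self-contained sketch of the pattern (the Executor class below is an illustrative stand-in, not the real run.Executor):

from dataclasses import dataclass, field

@dataclass
class Executor:
    # Stand-in for run.Executor; only the env_vars dict matters here.
    env_vars: dict = field(default_factory=dict)

def set_dtype_env_vars(executor: Executor, compute_dtype: str) -> None:
    # Mirrors the diff: NVTE_USE_FAST_MATH (presumably a Transformer Engine
    # fast-math toggle) is set only for nvfp4, leaving other dtypes untouched.
    if compute_dtype == "nvfp4":
        executor.env_vars["NVTE_USE_FAST_MATH"] = "1"

ex = Executor()
set_dtype_env_vars(ex, "nvfp4")
assert ex.env_vars == {"NVTE_USE_FAST_MATH": "1"}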
scripts/performance/utils/overrides.py (1 addition, 1 deletion)
@@ -359,7 +359,7 @@ def set_post_overrides(
    dp = int(num_gpus / (tp * pp * cp))
    logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
    ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved.
-   if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx":
+   if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
        recipe.optimizer.overlap_param_gather_with_optimizer_step = True
        if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
            recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
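A concrete instance of the gating condition, with illustrative numbers (not taken from any recipe): 512 GPUs at tp=2, pp=4, cp=1 give dp=64, and with vp=8 the overlap is enabled for bf16 but now stays off for both fp8_mx and nvfp4.

# Illustrative numbers only; mirrors the condition in set_post_overrides.
num_gpus, tp, pp, cp, vp = 512, 2, 4, 1, 8
dp = int(num_gpus / (tp * pp * cp))
assert dp == 64
for compute_dtype in ("bf16", "fp8_mx", "nvfp4"):
    overlap = dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4")
    print(compute_dtype, overlap)  # bf16 True, fp8_mx False, nvfp4 False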
scripts/performance/utils/utils.py (3 additions, 0 deletions)
@@ -62,6 +62,9 @@ class WorkloadBaseConfig:
    moe_a2a_overlap: Optional[bool] = False
    peft: Optional[str] = None

+   # Pipeline parallelism layout
+   pp_layout: Optional[str] = None

    @property
    def sequence_parallel(self) -> bool:
        """Get the sequence parallel flag."""
Contributor:
This is a nice change to make the PP layout configurable.
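For readers skimming the thread: the field defaults to None, which preserves the old behavior, and the recipe builder now forwards it via layout=base_cfg.pp_layout instead of hard-coding None. A trimmed, self-contained sketch of that flow (the class below is a shortened stand-in, not the full WorkloadBaseConfig):

from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class WorkloadBaseConfig:
    pipeline_model_parallel_size: int = 1
    virtual_pipeline_model_parallel_size: Optional[int] = None
    pp_layout: Optional[str] = None  # None keeps the default layer split

# Per-recipe override, as in DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1:
nvfp4_v1 = replace(
    WorkloadBaseConfig(),
    pipeline_model_parallel_size=2,
    virtual_pipeline_model_parallel_size=8,
    pp_layout="Et*4|(t*4|)*14tmL",
)

# The recipe builder then passes layout=cfg.pp_layout instead of layout=None.
assert nvfp4_v1.pp_layout == "Et*4|(t*4|)*14tmL"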

