From de3d019144a64bc8e6ef11391cc17c18cc8f5e09 Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Mon, 26 Jan 2026 16:39:05 -0800 Subject: [PATCH 1/3] dsv3 nvfp4 gb300 Signed-off-by: Dingqing Yang --- .../configs/deepseek/deepseek_llm_pretrain.py | 2 +- .../deepseek_workload_base_configs.py | 20 +++++++++++++++++-- scripts/performance/utils/overrides.py | 2 +- scripts/performance/utils/utils.py | 3 +++ 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py index 680c83466f..5dce8e242a 100644 --- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py +++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py @@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300( pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size, virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size, moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend, - layout=None, + layout=base_cfg.pp_layout, ) set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py index c4f61555a0..9d9c979b59 100644 --- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py +++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py @@ -54,7 +54,20 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 -DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 +DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace( + BASE_DEEPSEEK_V3_CONFIG, + num_gpus=256, + global_batch_size=2048, + 
micro_batch_size=2, + pipeline_model_parallel_size=2, + virtual_pipeline_model_parallel_size=8, + pp_layout="Et*4|(t*4|)*14tmL", + expert_model_parallel_size=32, + moe_flex_dispatcher_backend="hybridep", + moe_a2a_overlap=False, + cuda_graph_scope=[], + recompute_modules=["mla_up_proj"], +) DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace( @@ -133,7 +146,10 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 -DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2 +DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace( + DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1, + global_batch_size=4096, +) DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace( diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py index 291e303461..b8fb0b7a39 100644 --- a/scripts/performance/utils/overrides.py +++ b/scripts/performance/utils/overrides.py @@ -357,7 +357,7 @@ def set_post_overrides( dp = int(num_gpus / (tp * pp * cp)) logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}") ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved. 
- if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx": + if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"): recipe.optimizer.overlap_param_gather_with_optimizer_step = True if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig): recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True diff --git a/scripts/performance/utils/utils.py b/scripts/performance/utils/utils.py index d29b87011c..3de04c7166 100644 --- a/scripts/performance/utils/utils.py +++ b/scripts/performance/utils/utils.py @@ -62,6 +62,9 @@ class WorkloadBaseConfig: moe_a2a_overlap: Optional[bool] = False peft: Optional[str] = None + # Pipeline parallelism layout + pp_layout: Optional[str] = None + @property def sequence_parallel(self) -> bool: """Get the sequence parallel flag.""" From daf08abe8a8691a57edf3761f4605e686559731e Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Tue, 27 Jan 2026 00:21:12 -0800 Subject: [PATCH 2/3] enable fast math Signed-off-by: Dingqing Yang --- scripts/performance/perf_plugins.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 5fb0595da6..4a81071943 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo self.train_task, ) + # Set NVFP4-specific environment variables + if self.compute_dtype == "nvfp4": + executor.env_vars["NVTE_USE_FAST_MATH"] = "1" + @dataclass class PyTorchProfilerPluginScriptArgs: From 0aaad4d9a021fc9651f755ba1f3d5265d407aebc Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Tue, 27 Jan 2026 11:14:03 -0800 Subject: [PATCH 3/3] remove redundant override Signed-off-by: Dingqing Yang --- .../configs/deepseek/deepseek_workload_base_configs.py | 4 ---- 1 file changed, 4 deletions(-) diff --git 
a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py index 9d9c979b59..ec38533a6d 100644 --- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py +++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py @@ -56,15 +56,11 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace( BASE_DEEPSEEK_V3_CONFIG, - num_gpus=256, - global_batch_size=2048, micro_batch_size=2, pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=8, pp_layout="Et*4|(t*4|)*14tmL", expert_model_parallel_size=32, - moe_flex_dispatcher_backend="hybridep", - moe_a2a_overlap=False, cuda_graph_scope=[], recompute_modules=["mla_up_proj"], )