@@ -95,7 +95,7 @@ def deepseek_v3_pretrain_config_gb200(
         pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
         virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
         moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-        layout=None,
+        layout=base_cfg.pp_layout,
     )
     set_deepseek_v3_common_configs(cfg)
     set_workload_base_configs(cfg, base_cfg)
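Note on the hunk above: the builder previously hard-coded layout=None, so a pp_layout set on the base config never reached the derived GB200 recipe; the fix forwards it. A minimal runnable sketch of that pattern, with hypothetical WorkloadConfig/build_cfg names standing in for the repo's types:

from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class WorkloadConfig:
    # Hypothetical stand-in for the repo's config dataclass.
    pipeline_model_parallel_size: int = 4
    pp_layout: Optional[str] = None  # pipeline layout string, e.g. "Et*4|(t*4|)*14tmL"

def build_cfg(base_cfg: WorkloadConfig, fixed: bool = True) -> WorkloadConfig:
    # The buggy version passed layout=None; the fixed one forwards the
    # base config's pp_layout so custom layouts survive into the recipe.
    return replace(
        base_cfg,
        pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
        pp_layout=base_cfg.pp_layout if fixed else None,
    )

base = WorkloadConfig(pipeline_model_parallel_size=2, pp_layout="Et*4|(t*4|)*14tmL")
assert build_cfg(base, fixed=False).pp_layout is None           # bug: layout lost
assert build_cfg(base, fixed=True).pp_layout == base.pp_layout  # fix: layout kept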
@@ -42,28 +42,20 @@
     BASE_DEEPSEEK_V3_CONFIG,
     num_gpus=256,
     global_batch_size=2048,
-    pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=4,
-    expert_model_parallel_size=64,
-    moe_flex_dispatcher_backend="hybridep",
-    moe_a2a_overlap=False,
-    cuda_graph_impl="transformer_engine",
-    cuda_graph_scope=["attn", "moe_router", "moe_preprocess"],
-    recompute_modules=["moe_act"],
-)
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
-    BASE_DEEPSEEK_V3_CONFIG,
     micro_batch_size=2,
     pipeline_model_parallel_size=2,
     virtual_pipeline_model_parallel_size=8,
     pp_layout="Et*4|(t*4|)*14tmL",
     expert_model_parallel_size=32,
+    moe_flex_dispatcher_backend="hybridep",
+    moe_a2a_overlap=False,
     cuda_graph_scope=[],
     recompute_modules=["mla_up_proj"],
 )
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1


 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
@@ -142,10 +134,7 @@
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
 DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
-    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
-    global_batch_size=4096,
-)
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2


 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
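The two GB300 hunks fold the NVFP4-specific override into the shared GB300 config, so all four precision variants (BF16, FP8_CS, FP8_MX, NVFP4) now alias one config per version instead of NVFP4 carrying its own replace(...). A minimal sketch of that aliasing pattern, with a hypothetical PretrainConfig dataclass and illustrative field values standing in for the repo's:

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class PretrainConfig:
    # Hypothetical stand-in for the repo's DeepSeek V3 config object.
    num_gpus: int = 256
    global_batch_size: int = 2048
    expert_model_parallel_size: int = 32

# One tuned config per version; every precision variant aliases it.
GB300_V1 = PretrainConfig()
GB300_BF16_V1 = GB300_V1
GB300_NVFP4_V1 = GB300_V1   # no longer carries its own replace(...) override

# When a version genuinely differs, derive it once with replace() and
# alias the variants again (a frozen dataclass makes sharing safe).
GB300_V2 = replace(GB300_V1, global_batch_size=4096)
GB300_NVFP4_V2 = GB300_V2

assert GB300_NVFP4_V1 is GB300_V1
assert GB300_NVFP4_V2.global_batch_size == 4096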
@@ -198,8 +198,8 @@
     QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1,
     num_gpus=256,
     pipeline_model_parallel_size=4,
-    virtual_pipeline_model_parallel_size=12,
-    expert_model_parallel_size=16,
+    expert_model_parallel_size=32,
+    cuda_graph_scope=["attn", "moe_router", "moe_preprocess"],
     global_batch_size=8192,
 )

@@ -217,6 +217,7 @@
 QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = replace(
     QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1,
     num_gpus=256,
+    expert_model_parallel_size=32,
     global_batch_size=8192,
 )
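Both Qwen3 hunks raise expert parallelism from 16 to 32 at 256 GPUs. A quick illustrative sanity check for such a layout; this is not the repo's validation code, just the generic divisibility constraints such parallel layouts must satisfy:

def check_moe_layout(num_gpus: int, ep: int, pp: int, gbs: int, mbs: int = 1) -> None:
    """Illustrative sanity checks for a MoE parallel layout (not repo code)."""
    # Expert- and pipeline-parallel groups must tile the GPU count exactly.
    assert num_gpus % ep == 0, f"EP={ep} does not divide num_gpus={num_gpus}"
    assert num_gpus % pp == 0, f"PP={pp} does not divide num_gpus={num_gpus}"
    # The global batch must split evenly into micro-batches.
    assert gbs % mbs == 0, f"global batch {gbs} not divisible by micro batch {mbs}"

# Values from the Qwen3 hunks above: 256 GPUs, EP 16 -> 32, PP=4, GBS=8192.
check_moe_layout(num_gpus=256, ep=32, pp=4, gbs=8192)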

scripts/performance/perf_plugins.py (3 additions, 0 deletions)

@@ -281,6 +281,9 @@ def _set_model_specific_environment_variables(
     if model_family_name == "llama" and model_recipe_name == "llama31_405b" and train_task == "pretrain":
         if compute_dtype == "fp8_cs":
             del_cudnn_ln = False
+    if model_family_name == "deepseek":
+        if compute_dtype == "fp8_mx":
+            del_cudnn_ln = False
     if del_cudnn_ln:
         if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars:
             executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")
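The perf_plugins.py change adds a DeepSeek branch next to the existing Llama 405B gate: for deepseek with fp8_mx, del_cudnn_ln stays False, so NVTE_NORM_FWD_USE_CUDNN is left in the executor's environment rather than popped. A condensed, illustrative restatement of that control flow; the recipe name and the dict stand-in for executor.env_vars are assumptions, not repo code:

def keep_cudnn_layernorm(model_family_name: str, model_recipe_name: str,
                         train_task: str, compute_dtype: str) -> bool:
    """Condensed restatement of the del_cudnn_ln gating above (illustrative)."""
    if (model_family_name == "llama" and model_recipe_name == "llama31_405b"
            and train_task == "pretrain" and compute_dtype == "fp8_cs"):
        return True
    # Added by this PR: DeepSeek with MXFP8 also keeps the cuDNN norm kernel.
    if model_family_name == "deepseek" and compute_dtype == "fp8_mx":
        return True
    return False

# Simplified stand-in for executor.env_vars; "deepseek_v3" is a hypothetical recipe name.
env_vars = {"NVTE_NORM_FWD_USE_CUDNN": "1"}
if not keep_cudnn_layernorm("deepseek", "deepseek_v3", "pretrain", "fp8_mx"):
    env_vars.pop("NVTE_NORM_FWD_USE_CUDNN", None)
assert "NVTE_NORM_FWD_USE_CUDNN" in env_vars  # kept for deepseek + fp8_mx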