@@ -30,6 +30,8 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
    cfg.mixed_precision.grad_reduce_in_fp32 = False
    cfg.ddp.grad_reduce_in_fp32 = False

    cfg.model.moe_router_force_load_balancing = True


def nemotron_3_nano_pretrain_config_gb300(
    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
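The common-config change above enables moe_router_force_load_balancing, a performance-benchmarking knob that makes the MoE router distribute tokens evenly across experts, so measured step times do not depend on how well the learned router happens to balance load on a given run. Below is a minimal sketch of the idea in plain PyTorch; it is an illustration only, not Megatron-Core's implementation, and the helper name force_balanced_topk is made up for this example.

import torch

def force_balanced_topk(num_tokens: int, num_experts: int, top_k: int) -> torch.Tensor:
    """Return [num_tokens, top_k] expert ids with an exactly even load."""
    # Round-robin assignment: token t gets experts (t*top_k + j) % num_experts,
    # so every expert receives num_tokens * top_k / num_experts tokens (up to rounding).
    base = torch.arange(num_tokens).unsqueeze(1) * top_k
    offsets = torch.arange(top_k).unsqueeze(0)
    return (base + offsets) % num_experts

assignment = force_balanced_topk(num_tokens=16, num_experts=8, top_k=2)
counts = torch.bincount(assignment.flatten(), minlength=8)
assert counts.max() == counts.min()  # every expert gets exactly 4 tokens

With a forced, perfectly even assignment, the all-to-all dispatch and expert GEMM shapes are identical on every step, which is what a throughput benchmark wants to measure.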
@@ -31,36 +31,60 @@
BASE_NEMOTRON_3_NANO_CONFIG = WorkloadBaseConfig(
    num_gpus=8,
    global_batch_size=512,
    micro_batch_size=2,
    tensor_model_parallel_size=1,
    expert_tensor_parallel_size=1,
    expert_model_parallel_size=8,
    moe_flex_dispatcher_backend="hybridep",
    cuda_graph_impl="transformer_engine",
    cuda_graph_scope=["attn", "mamba", "moe_router", "moe_preprocess"],
)

NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG

NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
    BASE_NEMOTRON_3_NANO_CONFIG,
    micro_batch_size=4,
)
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1

NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
    BASE_NEMOTRON_3_NANO_CONFIG,
    micro_batch_size=2,
)
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1

NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
    BASE_NEMOTRON_3_NANO_CONFIG,
    micro_batch_size=4,
)
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1

NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
    BASE_NEMOTRON_3_NANO_CONFIG,
    micro_batch_size=2,
)
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_NVFP4_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1

_NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace(
    BASE_NEMOTRON_3_NANO_CONFIG,
    num_gpus=16,
    global_batch_size=1024,
    micro_batch_size=1,
    recompute_modules=["moe", "layernorm"],
    cuda_graph_impl="transformer_engine",
)

NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = replace(
    _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
    recompute_modules=["moe", "layernorm"],
    cuda_graph_scope=["attn", "mamba"],
)
NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = replace(
    _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100,
    cuda_graph_scope=["mamba"],
    recompute_modules=["moe", "layernorm", "core_attn", "moe_act"],
)

__all__ = [
    "NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1",
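All per-platform configs above derive from BASE_NEMOTRON_3_NANO_CONFIG via dataclasses.replace, overriding only what differs per GPU: the Blackwell boards vary micro_batch_size (4 on GB300/B300, 2 on GB200/B200), and precision variants that need no further overrides simply alias the bf16 config. A self-contained sketch of the pattern, using a stand-in dataclass since the real WorkloadBaseConfig has many more fields:

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class _Workload:  # stand-in for WorkloadBaseConfig
    num_gpus: int = 8
    global_batch_size: int = 512
    micro_batch_size: int = 2
    cuda_graph_scope: tuple = ("attn", "mamba", "moe_router", "moe_preprocess")

BASE = _Workload()

# Larger-memory boards take a bigger micro batch; everything else is inherited.
GB300_BF16 = replace(BASE, micro_batch_size=4)
GB200_BF16 = replace(BASE, micro_batch_size=2)
GB300_FP8_MX = GB300_BF16  # precision variant with no extra overrides

assert GB300_BF16.global_batch_size == BASE.global_batch_size  # inherited field

The H100 variants make the opposite memory trade: they shrink cuda_graph_scope and recompute more modules (the fp8_cs variant additionally recomputes core_attn and moe_act), presumably spending recomputation to fit the larger 16-GPU run into the smaller H100 memory.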
scripts/performance/perf_plugins.py (3 additions, 0 deletions)
@@ -301,6 +301,9 @@ def _set_model_specific_environment_variables(
    if gpu in ["h100"] and model_recipe_name in ["llama3_70b"] and compute_dtype == "fp8_cs":
        executor.env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
        executor.env_vars["NCCL_GRAPH_REGISTER"] = "0"
    if model_recipe_name in ["nemotron_3_nano"]:
        del_cudnn_ln = False

    if del_cudnn_ln:
        if "NVTE_NORM_FWD_USE_CUDNN" in executor.env_vars:
            executor.env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")
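The perf_plugins.py change adds a recipe-specific opt-out to the environment-variable tuning: when del_cudnn_ln is set, the NVTE_NORM_FWD_USE_CUDNN override is removed from the executor's environment, and nemotron_3_nano now clears the flag so the variable is kept. A condensed, runnable sketch of just this guard; the function name _tune_env_vars, the plain-dict executor stand-in, and the default del_cudnn_ln=True are made up for this example (in perf_plugins.py the flag is computed earlier in the function).

def _tune_env_vars(env_vars: dict, model_recipe_name: str, del_cudnn_ln: bool = True) -> None:
    if model_recipe_name in ["nemotron_3_nano"]:
        del_cudnn_ln = False  # keep the cuDNN layernorm override for this recipe

    if del_cudnn_ln:
        # Mirrors the original membership check; env_vars.pop(key, None) would also work.
        if "NVTE_NORM_FWD_USE_CUDNN" in env_vars:
            env_vars.pop("NVTE_NORM_FWD_USE_CUDNN")

env = {"NVTE_NORM_FWD_USE_CUDNN": "1"}
_tune_env_vars(env, "nemotron_3_nano")
assert "NVTE_NORM_FWD_USE_CUDNN" in env  # preserved for nemotron_3_nano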