Merged
scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py
@@ -31,7 +31,9 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None:
     cfg.ddp.grad_reduce_in_fp32 = False


-def nemotron_3_nano_pretrain_config_gb300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
+def nemotron_3_nano_pretrain_config_gb300(
+    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
+) -> ConfigContainer:
Comment on lines +34 to +36
⚠️ Potential issue | 🟠 Major

Remove the unused mock parameter from all five function signatures.

The mock: bool = True argument is added to every pretrain config factory but is never read inside any of them. The PR title "remove mock arg" directly contradicts leaving it in, and ruff flags it as ARG001 in all five locations. This will fail linting/CI.

🔧 Proposed fix (shown for one function; apply the same diff to all five)
-def nemotron_3_nano_pretrain_config_gb300(
-    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
-) -> ConfigContainer:
+def nemotron_3_nano_pretrain_config_gb300(
+    precision: str = "bf16", config_variant: str = "v1"
+) -> ConfigContainer:

Also applies to: 58-60, 82-84, 106-108, 130-132

🧰 Tools
🪛 Ruff (0.15.2)

[warning] 35-35: Unused function argument: mock

(ARG001)

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py`, around
lines 34-36: remove the unused mock parameter from the five pretrain config
factory signatures, e.g. drop "mock: bool = True" from
nemotron_3_nano_pretrain_config_gb300 and the four other
nemotron_3_nano_pretrain_config_* functions in this file. Update each function
signature and confirm no internal references remain (this resolves ruff ARG001).
Also update any local callers or tests that pass mock positionally or by keyword
so they pass only the remaining parameters (precision and config_variant).
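
A minimal sketch of the post-fix signature and call sites, following the review's proposed diff; the callers shown are hypothetical, and ConfigContainer is the type already used in the file:

def nemotron_3_nano_pretrain_config_gb300(
    precision: str = "bf16", config_variant: str = "v1"
) -> "ConfigContainer":
    """GB300, baseline config."""
    ...

# Keyword call sites keep working unchanged once mock is gone:
cfg = nemotron_3_nano_pretrain_config_gb300(precision="bf16", config_variant="v1")

# A caller that passed mock positionally, e.g.
# nemotron_3_nano_pretrain_config_gb300("bf16", True), previously bound
# True to mock but would now silently bind it to config_variant, so
# positional call sites need a manual audit rather than relying on a crash.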

"""GB300, baseline config."""
base_cfg = get_workload_base_config(
model_family_name="nemotronh",
Expand All @@ -47,11 +49,15 @@ def nemotron_3_nano_pretrain_config_gb300(precision: str = "bf16", config_varian
cfg.mixed_precision = precision_config
set_nemotron_3_nano_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
if base_cfg.moe_flex_dispatcher_backend is not None:
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend

return cfg


def nemotron_3_nano_pretrain_config_gb200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
def nemotron_3_nano_pretrain_config_gb200(
precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
"""GB200, baseline config."""
base_cfg = get_workload_base_config(
model_family_name="nemotronh",
Expand All @@ -67,11 +73,15 @@ def nemotron_3_nano_pretrain_config_gb200(precision: str = "bf16", config_varian
cfg.mixed_precision = precision_config
set_nemotron_3_nano_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
if base_cfg.moe_flex_dispatcher_backend is not None:
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend

return cfg


def nemotron_3_nano_pretrain_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
def nemotron_3_nano_pretrain_config_b300(
precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
"""B300, baseline config."""
base_cfg = get_workload_base_config(
model_family_name="nemotronh",
Expand All @@ -87,11 +97,15 @@ def nemotron_3_nano_pretrain_config_b300(precision: str = "bf16", config_variant
cfg.mixed_precision = precision_config
set_nemotron_3_nano_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
if base_cfg.moe_flex_dispatcher_backend is not None:
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend

return cfg


def nemotron_3_nano_pretrain_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
def nemotron_3_nano_pretrain_config_b200(
precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
"""B200, baseline config."""
base_cfg = get_workload_base_config(
model_family_name="nemotronh",
Expand All @@ -107,11 +121,15 @@ def nemotron_3_nano_pretrain_config_b200(precision: str = "bf16", config_variant
cfg.mixed_precision = precision_config
set_nemotron_3_nano_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
if base_cfg.moe_flex_dispatcher_backend is not None:
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend

return cfg


def nemotron_3_nano_pretrain_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer:
def nemotron_3_nano_pretrain_config_h100(
precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
"""H100, baseline config."""
base_cfg = get_workload_base_config(
model_family_name="nemotronh",
Expand All @@ -127,5 +145,7 @@ def nemotron_3_nano_pretrain_config_h100(precision: str = "bf16", config_variant
cfg.mixed_precision = precision_config
set_nemotron_3_nano_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
if base_cfg.moe_flex_dispatcher_backend is not None:
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend

return cfg
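
The propagation added above is identical in all five factories. A minimal sketch of hoisting it into one helper; the helper name is hypothetical, while the attribute names come straight from the diff:

def _apply_moe_flex_dispatcher_backend(cfg, base_cfg):
    # Copy the dispatcher backend onto the model config only when the
    # workload base config sets one; the None check guards base configs
    # that do not opt in (the Nemotron base config sets "hybridep").
    if base_cfg.moe_flex_dispatcher_backend is not None:
        cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend

Each factory could then call this right after set_workload_base_configs(cfg, base_cfg), keeping the five bodies in lockstep.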
(second changed file: the Nemotron 3 Nano workload base configs; path not shown in this capture)

@@ -30,39 +30,36 @@

 BASE_NEMOTRON_3_NANO_CONFIG = WorkloadBaseConfig(
     num_gpus=8,
-    global_batch_size=3072,
+    global_batch_size=512,
     micro_batch_size=2,
-    tensor_model_parallel_size=4,
+    tensor_model_parallel_size=1,
     expert_tensor_parallel_size=1,
     expert_model_parallel_size=8,
+    moe_flex_dispatcher_backend="hybridep",
 )

-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace(
-    BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
-)
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG

-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
-    BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
-)
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG

-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace(
-    BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
-)
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG

+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG
+
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace(
+_NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace(
     BASE_NEMOTRON_3_NANO_CONFIG,
-    tensor_model_parallel_size=1,
+    num_gpus=16,
+    global_batch_size=1024,
+    micro_batch_size=1,
+    recompute_modules=["moe", "layernorm"],
 )
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1
-
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG
-NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = BASE_NEMOTRON_3_NANO_CONFIG
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100
+NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100

 __all__ = [
     "NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1",
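
A note on the second file: dataclasses.replace returns a new dataclass instance with only the named fields overridden, while plain assignment to BASE_NEMOTRON_3_NANO_CONFIG aliases the same object. A toy sketch with a stand-in dataclass (not the real WorkloadBaseConfig):

from dataclasses import dataclass, replace

@dataclass(frozen=True)
class WorkloadCfg:  # stand-in for WorkloadBaseConfig
    num_gpus: int = 8
    global_batch_size: int = 512

BASE = WorkloadCfg()
H100 = replace(BASE, num_gpus=16, global_batch_size=1024)  # fresh copy, two overrides
ALIAS = BASE  # no copy at all: both names point at one shared instance

assert H100.num_gpus == 16 and BASE.num_gpus == 8  # original untouched
assert ALIAS is BASE

Sharing one instance across the *_V1 names is safe as long as the config object is never mutated in place; replace() is the escape hatch whenever a platform needs different values, as the H100 entry does.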