diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py index f71d8e34b9..4f23420451 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_llm_pretrain.py @@ -31,7 +31,9 @@ def set_nemotron_3_nano_common_configs(cfg: ConfigContainer) -> None: cfg.ddp.grad_reduce_in_fp32 = False -def nemotron_3_nano_pretrain_config_gb300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: +def nemotron_3_nano_pretrain_config_gb300( + precision: str = "bf16", mock: bool = True, config_variant: str = "v1" +) -> ConfigContainer: """GB300, baseline config.""" base_cfg = get_workload_base_config( model_family_name="nemotronh", @@ -47,11 +49,15 @@ def nemotron_3_nano_pretrain_config_gb300(precision: str = "bf16", config_varian cfg.mixed_precision = precision_config set_nemotron_3_nano_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if base_cfg.moe_flex_dispatcher_backend is not None: + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend return cfg -def nemotron_3_nano_pretrain_config_gb200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: +def nemotron_3_nano_pretrain_config_gb200( + precision: str = "bf16", mock: bool = True, config_variant: str = "v1" +) -> ConfigContainer: """GB200, baseline config.""" base_cfg = get_workload_base_config( model_family_name="nemotronh", @@ -67,11 +73,15 @@ def nemotron_3_nano_pretrain_config_gb200(precision: str = "bf16", config_varian cfg.mixed_precision = precision_config set_nemotron_3_nano_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if base_cfg.moe_flex_dispatcher_backend is not None: + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend return cfg -def nemotron_3_nano_pretrain_config_b300(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: +def nemotron_3_nano_pretrain_config_b300( + precision: str = "bf16", mock: bool = True, config_variant: str = "v1" +) -> ConfigContainer: """B300, baseline config.""" base_cfg = get_workload_base_config( model_family_name="nemotronh", @@ -87,11 +97,15 @@ def nemotron_3_nano_pretrain_config_b300(precision: str = "bf16", config_variant cfg.mixed_precision = precision_config set_nemotron_3_nano_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if base_cfg.moe_flex_dispatcher_backend is not None: + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend return cfg -def nemotron_3_nano_pretrain_config_b200(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: +def nemotron_3_nano_pretrain_config_b200( + precision: str = "bf16", mock: bool = True, config_variant: str = "v1" +) -> ConfigContainer: """B200, baseline config.""" base_cfg = get_workload_base_config( model_family_name="nemotronh", @@ -107,11 +121,15 @@ def nemotron_3_nano_pretrain_config_b200(precision: str = "bf16", config_variant cfg.mixed_precision = precision_config set_nemotron_3_nano_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if base_cfg.moe_flex_dispatcher_backend is not None: + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend return cfg -def nemotron_3_nano_pretrain_config_h100(precision: str = "bf16", config_variant: str = "v1") -> ConfigContainer: +def nemotron_3_nano_pretrain_config_h100( + precision: str = "bf16", mock: bool = True, config_variant: str = "v1" +) -> ConfigContainer: """H100, baseline config.""" base_cfg = get_workload_base_config( model_family_name="nemotronh", @@ -127,5 +145,7 @@ def nemotron_3_nano_pretrain_config_h100(precision: str = "bf16", config_variant cfg.mixed_precision = precision_config set_nemotron_3_nano_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) + if base_cfg.moe_flex_dispatcher_backend is not None: + cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend return cfg diff --git a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py index a042d3bb57..8bc3db54d2 100644 --- a/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py +++ b/scripts/performance/configs/nemotronh/nemotron_3_nano_workload_base_configs.py @@ -30,39 +30,36 @@ BASE_NEMOTRON_3_NANO_CONFIG = WorkloadBaseConfig( num_gpus=8, - global_batch_size=3072, + global_batch_size=512, micro_batch_size=2, - tensor_model_parallel_size=4, + tensor_model_parallel_size=1, expert_tensor_parallel_size=1, expert_model_parallel_size=8, + moe_flex_dispatcher_backend="hybridep", ) -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = replace( - BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, -) -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = replace( - BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, -) -NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG +NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = replace( - BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, -) -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B300_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG + +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG +NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = BASE_NEMOTRON_3_NANO_CONFIG -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 = replace( + +_NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 = replace( BASE_NEMOTRON_3_NANO_CONFIG, - tensor_model_parallel_size=1, + num_gpus=16, + global_batch_size=1024, + micro_batch_size=1, + recompute_modules=["moe", "layernorm"], ) -NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_FP8_MX_V1 = NEMOTRON_3_NANO_PRETRAIN_CONFIG_B200_BF16_V1 - -NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = BASE_NEMOTRON_3_NANO_CONFIG -NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = BASE_NEMOTRON_3_NANO_CONFIG +NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_BF16_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 +NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100_FP8_CS_V1 = _NEMOTRON_3_NANO_PRETRAIN_CONFIG_H100 __all__ = [ "NEMOTRON_3_NANO_PRETRAIN_CONFIG_GB300_BF16_V1",