diff --git a/scripts/performance/configs/deepseek/__init__.py b/scripts/performance/configs/deepseek/__init__.py index cb7e039f9a..e81c35eba7 100644 --- a/scripts/performance/configs/deepseek/__init__.py +++ b/scripts/performance/configs/deepseek/__init__.py @@ -22,6 +22,8 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_LARGE_SCALE, DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1, DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2, + DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1, + DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2, DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1, DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V2, DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1, @@ -29,6 +31,8 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_LARGE_SCALE, DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1, DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2, + DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1, + DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2, DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1, DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2, DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1, @@ -36,6 +40,8 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_LARGE_SCALE, DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1, DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2, + DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1, + DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2, DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1, DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2, DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1, @@ -60,12 +66,15 @@ "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1", + "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1", @@ -77,12 +86,15 @@ "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2", + "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2", diff --git a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py index ce185719a6..5ca6f9e22d 100644 --- a/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py +++ b/scripts/performance/configs/deepseek/deepseek_workload_base_configs.py @@ -85,6 +85,7 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 +DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1 = replace( @@ -100,6 +101,7 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1 +DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1 = replace( @@ -115,6 +117,7 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1 +DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V1 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V1 = replace( @@ -159,6 +162,7 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 +DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2 = replace( @@ -173,6 +177,7 @@ ) DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2 +DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B300_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2 = replace( @@ -182,6 +187,7 @@ DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2 +DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_B200_V2 DEEPSEEK_V3_PRETRAIN_CONFIG_H100_V2 = replace( @@ -242,12 +248,15 @@ "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V1", + "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V1", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V1", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_BF16_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V1", "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_V1", @@ -259,12 +268,15 @@ "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_FP8_MX_V2", + "DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_NVFP4_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B300_NVFP4_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "DEEPSEEK_V3_PRETRAIN_CONFIG_B200_NVFP4_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_BF16_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_CS_V2", "DEEPSEEK_V3_PRETRAIN_CONFIG_H100_FP8_SC_V2", diff --git a/scripts/performance/configs/gpt_oss/__init__.py b/scripts/performance/configs/gpt_oss/__init__.py index 04b8c3d532..b12ed0efb3 100644 --- a/scripts/performance/configs/gpt_oss/__init__.py +++ b/scripts/performance/configs/gpt_oss/__init__.py @@ -17,30 +17,50 @@ from .gpt_oss_workload_base_configs import ( GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1, GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2, + GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1, + GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2, GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1, GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2, + GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1, + GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2, GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1, GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2, + GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1, + GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2, GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1, GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2, + GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1, + GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2, GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1, GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2, + GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1, + GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2, ) __all__ = [ # V1 (GBS=512) "GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1", # V2 (GBS=1280) "GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2", ] if HAVE_MEGATRON_BRIDGE: diff --git a/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py b/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py index ce2aaeaea0..0a5b2fd9c7 100644 --- a/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py +++ b/scripts/performance/configs/gpt_oss/gpt_oss_llm_pretrain.py @@ -20,6 +20,7 @@ from megatron.bridge.recipes.gpt_oss import gpt_oss_120b_pretrain_config from megatron.bridge.training.config import ConfigContainer +from megatron.bridge.training.flex_dispatcher_backend import apply_flex_dispatcher_backend logger = logging.getLogger(__name__) @@ -38,7 +39,6 @@ def gpt_oss_120b_pretrain_config_gb300( precision: str = "bf16", mock: bool = True, config_variant: str = "v1" ) -> ConfigContainer: """GB300, baseline config.""" - # GPT-OSS currently only has BF16 base configs enabled base_cfg = get_workload_base_config( model_family_name="gpt_oss", model_recipe_name="gpt_oss_120b", @@ -51,6 +51,8 @@ def gpt_oss_120b_pretrain_config_gb300( cfg = gpt_oss_120b_pretrain_config() cfg.mixed_precision = precision_config + if base_cfg.moe_flex_dispatcher_backend is not None: + apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend) set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -73,6 +75,8 @@ def gpt_oss_120b_pretrain_config_gb200( cfg = gpt_oss_120b_pretrain_config() cfg.mixed_precision = precision_config + if base_cfg.moe_flex_dispatcher_backend is not None: + apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend) set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -95,6 +99,8 @@ def gpt_oss_120b_pretrain_config_b300( cfg = gpt_oss_120b_pretrain_config() cfg.mixed_precision = precision_config + if base_cfg.moe_flex_dispatcher_backend is not None: + apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend) set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -117,6 +123,8 @@ def gpt_oss_120b_pretrain_config_b200( cfg = gpt_oss_120b_pretrain_config() cfg.mixed_precision = precision_config + if base_cfg.moe_flex_dispatcher_backend is not None: + apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend) set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -139,6 +147,8 @@ def gpt_oss_120b_pretrain_config_h100( cfg = gpt_oss_120b_pretrain_config() cfg.mixed_precision = precision_config + if base_cfg.moe_flex_dispatcher_backend is not None: + apply_flex_dispatcher_backend(cfg.model, base_cfg.moe_flex_dispatcher_backend) set_gpt_oss_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py b/scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py index daf0f2cd93..baf9cb6e5d 100644 --- a/scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py +++ b/scripts/performance/configs/gpt_oss/gpt_oss_workload_base_configs.py @@ -82,6 +82,12 @@ recompute_modules=["layernorm", "moe_act"], ) +GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1 +GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1 +GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1 +GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1 +GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1 = GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1 + # ============================================================================= # GPT-OSS 120B Pretrain - V2 (GBS=1280) @@ -116,18 +122,34 @@ global_batch_size=1280, ) +GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2 +GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2 +GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2 +GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2 +GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2 = GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2 + __all__ = [ # V1 (GBS=512) "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V1", "GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V1", + "GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V1", # V2 (GBS=1280) "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_B300_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_B300_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_B200_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_B200_FP8_MX_V2", "GPT_OSS_120B_PRETRAIN_CONFIG_H100_BF16_V2", + "GPT_OSS_120B_PRETRAIN_CONFIG_H100_FP8_MX_V2", ] diff --git a/scripts/performance/configs/qwen/__init__.py b/scripts/performance/configs/qwen/__init__.py index 5da300a021..12dd0f1425 100644 --- a/scripts/performance/configs/qwen/__init__.py +++ b/scripts/performance/configs/qwen/__init__.py @@ -46,6 +46,8 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_LARGE_SCALE, QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2, + QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V1, + QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1, @@ -53,6 +55,8 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_LARGE_SCALE, QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2, + QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V1, + QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1, @@ -60,6 +64,8 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_LARGE_SCALE, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V2, + QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V1, + QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1, @@ -67,6 +73,8 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_LARGE_SCALE, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V2, + QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V1, + QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V1, QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V2, QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_LARGE_SCALE, @@ -105,30 +113,38 @@ "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_V1", # Qwen3 235B A22B V2 (num_gpus=256 for Blackwell, GBS=8192 for all) "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_V2", # Qwen3 Next 80B A3B V1 diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py index 85671f4904..3388c2bb23 100644 --- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py +++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py @@ -80,6 +80,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1 +QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1 QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V1 = replace( @@ -107,6 +108,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1 +QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1 QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V1 = replace( @@ -132,6 +134,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1 +QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1 QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V1 = replace( @@ -156,6 +159,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V1 +QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V1 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V1 QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V1 = replace( @@ -207,6 +211,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V2 +QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V2 QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V2 = replace( @@ -225,6 +230,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 +QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2 QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V2 = replace( @@ -242,6 +248,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2 +QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2 QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V2 = replace( @@ -259,6 +266,7 @@ QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2 +QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2 = QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2 QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V2 = replace( @@ -498,30 +506,38 @@ "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V1", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V1", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_V1", # Qwen3 235B A22B V2 (num_gpus=256 for Blackwell, GBS=8192 for all) "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB300_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_GB200_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B300_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_CS_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_FP8_MX_V2", + "QWEN3_235B_A22B_PRETRAIN_CONFIG_B200_NVFP4_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_BF16_V2", "QWEN3_235B_A22B_PRETRAIN_CONFIG_H100_FP8_CS_V2", # Qwen3 30B A3B V1 (only version)