# Patch summary: Qwen3 30B-A3B pretrain performance configs for B200.
#
# File 1: scripts/performance/configs/qwen/qwen3_llm_pretrain.py
#   Hunk context is inside qwen3_30b_a3b_pretrain_config_b200(...).
#   - cfg.model.moe_token_dispatcher_type changes from "alltoall" to "flex".
#   - The neighbouring context line already copies
#     base_cfg.moe_flex_dispatcher_backend onto cfg.model; it is unchanged.
#
# File 2: scripts/performance/configs/qwen/qwen3_workload_base_configs.py
#   - QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 gains micro_batch_size=4 and
#     moe_flex_dispatcher_backend="hybridep", and its cuda_graph_scope list
#     gains a leading "attn" entry (now ["attn", "moe_router", "moe_preprocess"]).
#   - QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 is no longer built with its
#     own replace(...) call; it is bound directly to the BF16_V1 name.
#   - QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 is rebound from FP8_CS_V1 to
#     BF16_V1.
#   Net effect visible here: all three B200 names refer to one and the same
#   object, so the FP8_CS/FP8_MX variants also pick up micro_batch_size=4,
#   the "hybridep" backend and the "attn" graph scope.
#
# NOTE(review): the "flex" dispatcher type is set unconditionally in the hunk
#   shown; whether base_cfg always carries a usable moe_flex_dispatcher_backend
#   for every precision routed through this function is not visible in this
#   diff -- confirm against the full function.
# NOTE(review): `replace` is presumably dataclasses.replace (import not shown);
#   sharing one object across three names is only safe if nothing mutates the
#   config in place -- TODO confirm.
diff --git a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py index 522ecdbb10..c113920e56 100644 --- a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py +++ b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py @@ -286,7 +286,7 @@ def qwen3_30b_a3b_pretrain_config_b200( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend - cfg.model.moe_token_dispatcher_type = "alltoall" + cfg.model.moe_token_dispatcher_type = "flex" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py index 71910ef12a..85671f4904 100644 --- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py +++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py @@ -391,20 +391,17 @@ QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 = replace( BASE_QWEN3_30B_A3B_CONFIG, num_gpus=8, + micro_batch_size=4, + moe_flex_dispatcher_backend="hybridep", cuda_graph_impl="transformer_engine", - cuda_graph_scope=["moe_router", "moe_preprocess"], + cuda_graph_scope=["attn", "moe_router", "moe_preprocess"], ) -QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 = replace( - BASE_QWEN3_30B_A3B_CONFIG, - num_gpus=8, - cuda_graph_impl="transformer_engine", - cuda_graph_scope=["moe_router", "moe_preprocess"], -) +QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 -QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_CS_V1 +QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_FP8_MX_V1 = QWEN3_30B_A3B_PRETRAIN_CONFIG_B200_BF16_V1 QWEN3_30B_A3B_PRETRAIN_CONFIG_H100_BF16_V1 = replace(