diff --git a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py index 293a2f3c06..4c4235c7d2 100644 --- a/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py +++ b/scripts/performance/configs/deepseek/deepseek_llm_pretrain.py @@ -203,10 +203,6 @@ def deepseek_v3_pretrain_config_h100( ) precision_config = get_precision_config(precision) - pp_layout = None - if base_cfg.pp_layout: - pp_layout = base_cfg.pp_layout - cfg = pretrain_config() cfg.mixed_precision = precision_config @@ -214,7 +210,11 @@ def deepseek_v3_pretrain_config_h100( cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend - cfg.model.pipeline_model_parallel_layout = pp_layout or "Et|(tt|)*30mL" + if base_cfg.pp_layout: + cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout + else: + # Recompute layout based on updated PP/VP sizes + set_deepseek_v3_pipeline_model_parallel_layout(cfg.model) set_deepseek_v3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py index 86446da203..c113920e56 100644 --- a/scripts/performance/configs/qwen/qwen3_llm_pretrain.py +++ b/scripts/performance/configs/qwen/qwen3_llm_pretrain.py @@ -78,6 +78,7 @@ def qwen3_235b_a22b_pretrain_config_gb300( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "flex" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -103,6 +104,7 @@ def qwen3_235b_a22b_pretrain_config_gb200( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "flex" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -128,6 +130,7 @@ def qwen3_235b_a22b_pretrain_config_b300( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "alltoall" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -153,6 +156,7 @@ def qwen3_235b_a22b_pretrain_config_b200( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "alltoall" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -178,6 +182,7 @@ def qwen3_235b_a22b_pretrain_config_h100( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "alltoall" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -203,6 +208,7 @@ def qwen3_30b_a3b_pretrain_config_gb300( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "flex" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -228,6 +234,7 @@ def qwen3_30b_a3b_pretrain_config_gb200( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "flex" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) @@ -253,6 +260,7 @@ def qwen3_30b_a3b_pretrain_config_b300( cfg.mixed_precision = precision_config cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True) cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend + cfg.model.moe_token_dispatcher_type = "flex" set_qwen3_common_configs(cfg) set_workload_base_configs(cfg, base_cfg) diff --git a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py index ec853e3ff8..85671f4904 100644 --- a/scripts/performance/configs/qwen/qwen3_workload_base_configs.py +++ b/scripts/performance/configs/qwen/qwen3_workload_base_configs.py @@ -31,6 +31,7 @@ BASE_QWEN3_235B_A22B_CONFIG = WorkloadBaseConfig( expert_tensor_parallel_size=1, + moe_flex_dispatcher_backend="deepep", ) @@ -38,6 +39,7 @@ expert_model_parallel_size=8, expert_tensor_parallel_size=1, global_batch_size=512, + moe_flex_dispatcher_backend="deepep", ) BASE_QWEN3_NEXT_80B_A3B_CONFIG = WorkloadBaseConfig(