diff --git a/src/megatron/bridge/recipes/deepseek/deepseek_v3.py b/src/megatron/bridge/recipes/deepseek/deepseek_v3.py
index c31ecbc0bc..10f804432d 100644
--- a/src/megatron/bridge/recipes/deepseek/deepseek_v3.py
+++ b/src/megatron/bridge/recipes/deepseek/deepseek_v3.py
@@ -101,9 +101,9 @@ def deepseek_v3_pretrain_config() -> ConfigContainer:
     set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)
 
     # MoE Token Dispatcher settings
-    cfg.model.moe_token_dispatcher_type = "alltoall"  # Default from DeepSeekModelProvider
-    apply_flex_dispatcher_backend(cfg.model, None)
-    cfg.model.moe_flex_dispatcher_backend = "deepep"  # Options: None, deepep, hybridep
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
+    cfg.model.moe_token_dispatcher_type = "alltoall"
+    cfg.model.moe_flex_dispatcher_backend = "hybridep"  # Options: None, deepep, hybridep
     cfg.model.moe_hybridep_num_sms = 16  # Number of SMs for hybridep backend
 
     # Training config (DIFFERENT from _pretrain_common)
@@ -168,6 +168,7 @@ def deepseek_v3_pretrain_config() -> ConfigContainer:
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
     cfg.comm_overlap.delay_wgrad_compute = False
     cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = True  # Default from DeepSeekModelProvider
 
     # Checkpoint config (DIFFERENT from _pretrain_common: save_interval=2000 vs 500)
@@ -192,6 +193,8 @@ def deepseek_v3_pretrain_config() -> ConfigContainer:
     if cfg.model.apply_rope_fusion:
         cfg.dist.enable_megatron_core_experimental = True  # mla rope fusion is experimental
 
+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg
 
 
@@ -248,9 +251,9 @@ def deepseek_v3_pretrain_config_32nodes() -> ConfigContainer:
     set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)
 
     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"
-    apply_flex_dispatcher_backend(cfg.model, None)
-    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_flex_dispatcher_backend = "hybridep"  # Options: None, deepep, hybridep
     cfg.model.moe_hybridep_num_sms = 16
 
     # Training config
@@ -310,6 +313,7 @@ def deepseek_v3_pretrain_config_32nodes() -> ConfigContainer:
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
     cfg.comm_overlap.delay_wgrad_compute = False
     cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = True
 
     # Checkpoint config
@@ -331,4 +335,6 @@ def deepseek_v3_pretrain_config_32nodes() -> ConfigContainer:
     if cfg.model.apply_rope_fusion:
         cfg.dist.enable_megatron_core_experimental = True  # mla rope fusion is experimental
 
+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg
diff --git a/src/megatron/bridge/recipes/qwen/qwen3_moe.py b/src/megatron/bridge/recipes/qwen/qwen3_moe.py
index 3c8af8802d..10d6069b33 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3_moe.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3_moe.py
@@ -153,8 +153,9 @@ def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     cfg.model.init_method_std = 0.02
 
     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"
-    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_flex_dispatcher_backend = "deepep"  # Options: None, deepep, hybridep
     cfg.model.moe_hybridep_num_sms = 16
 
     # Training config
@@ -205,6 +206,7 @@ def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)  # Uncomment to enable
     # cfg.comm_overlap.delay_wgrad_compute = False  # Delay wgrad compute for overlap
     # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False  # MoE-specific: Overlap EP communication
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = False  # Overlap shared expert computation
 
     # Checkpoint config (paths set in _pretrain_common)
@@ -222,6 +224,8 @@ def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     # MoE Force Load Balancing
     cfg.model.moe_router_force_load_balancing = False
 
+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg
 
 
@@ -262,6 +266,7 @@ def qwen3_235b_a22b_pretrain_config() -> ConfigContainer:
     cfg.model.account_for_loss_in_pipeline_split = True
 
     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"
     cfg.model.moe_flex_dispatcher_backend = "deepep"
     cfg.model.moe_hybridep_num_sms = 16
@@ -314,6 +319,7 @@ def qwen3_235b_a22b_pretrain_config() -> ConfigContainer:
     # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)  # Uncomment to enable
     # cfg.comm_overlap.delay_wgrad_compute = False
     # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = False  # Overlap shared expert computation
 
     # Checkpoint config
@@ -331,6 +337,8 @@ def qwen3_235b_a22b_pretrain_config() -> ConfigContainer:
     # MoE Force Load Balancing
     cfg.model.moe_router_force_load_balancing = False
 
+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg
diff --git a/src/megatron/bridge/recipes/qwen/qwen3_next.py b/src/megatron/bridge/recipes/qwen/qwen3_next.py
index f21f2f5bcc..ef2f527694 100644
--- a/src/megatron/bridge/recipes/qwen/qwen3_next.py
+++ b/src/megatron/bridge/recipes/qwen/qwen3_next.py
@@ -148,11 +148,10 @@ def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
     cfg.model.mtp_loss_scaling_factor = 0.1  # Loss scaling factor for MTP
 
     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"  # Options: alltoall, allgather, flex
-    cfg.model.moe_flex_dispatcher_backend = (
-        "deepep"  # Options: None, deepep, hybridep (default from TransformerConfig)
-    )
-    cfg.model.moe_hybridep_num_sms = 16  # Number of SMs for hybridep backend (default from TransformerConfig)
+    cfg.model.moe_flex_dispatcher_backend = "deepep"  # Options: None, deepep, hybridep
+    cfg.model.moe_hybridep_num_sms = 16  # Number of SMs for hybridep backend
 
     # Training config
     cfg.train.manual_gc = True
@@ -205,6 +204,7 @@ def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
     # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)  # Uncomment to enable
     # cfg.comm_overlap.delay_wgrad_compute = False  # Delay wgrad compute for overlap
     # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False  # MoE-specific: Overlap EP communication
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = False  # Overlap shared expert computation
 
     # Checkpoint config
@@ -222,6 +222,8 @@ def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
    # MoE Force Load Balancing
    cfg.model.moe_router_force_load_balancing = False
 
+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg
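
For reference, a minimal sketch of the call ordering this patch establishes: each recipe sets its dispatcher defaults first and calls apply_flex_dispatcher_backend once at the end, so the helper sees the final value of moe_flex_dispatcher_backend. The helper body below is an assumption about its behavior (switching the token dispatcher to "flex" and disabling shared-expert overlap when a backend is selected, which is what the new "may be overridden" comments refer to), not the actual megatron.bridge implementation; the SimpleNamespace stand-in for cfg.model is likewise only for illustration.

    # Sketch only: the helper body is an ASSUMED behavior, not the real implementation.
    from types import SimpleNamespace

    def apply_flex_dispatcher_backend(model_cfg, backend):
        # Assumption: choosing a flex backend routes tokens through the "flex"
        # dispatcher and turns off shared-expert overlap.
        model_cfg.moe_flex_dispatcher_backend = backend
        if backend is not None:  # "deepep" or "hybridep"
            model_cfg.moe_token_dispatcher_type = "flex"
            model_cfg.moe_shared_expert_overlap = False

    # Stand-in for cfg.model, mirroring the recipe's ordering.
    model = SimpleNamespace(
        moe_token_dispatcher_type="alltoall",
        moe_flex_dispatcher_backend="hybridep",
        moe_hybridep_num_sms=16,
        moe_shared_expert_overlap=True,
    )
    apply_flex_dispatcher_backend(model, model.moe_flex_dispatcher_backend)
    # Under this assumption, moe_token_dispatcher_type is now "flex" and
    # moe_shared_expert_overlap is False, regardless of the values set earlier.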