16 changes: 11 additions & 5 deletions src/megatron/bridge/recipes/deepseek/deepseek_v3.py

@@ -101,9 +101,9 @@ def deepseek_v3_pretrain_config() -> ConfigContainer:
     set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

     # MoE Token Dispatcher settings
-    cfg.model.moe_token_dispatcher_type = "alltoall" # Default from DeepSeekModelProvider
-    apply_flex_dispatcher_backend(cfg.model, None)
-    cfg.model.moe_flex_dispatcher_backend = "deepep" # Options: None, deepep, hybridep
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
+    cfg.model.moe_token_dispatcher_type = "alltoall"
+    cfg.model.moe_flex_dispatcher_backend = "hybridep" # Options: None, deepep, hybridep
     cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend

     # Training config (DIFFERENT from _pretrain_common)
@@ -168,6 +168,7 @@ def deepseek_v3_pretrain_config() -> ConfigContainer:
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
     cfg.comm_overlap.delay_wgrad_compute = False
     cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = True # Default from DeepSeekModelProvider

     # Checkpoint config (DIFFERENT from _pretrain_common: save_interval=2000 vs 500)
@@ -192,6 +193,8 @@ def deepseek_v3_pretrain_config() -> ConfigContainer:
     if cfg.model.apply_rope_fusion:
         cfg.dist.enable_megatron_core_experimental = True # mla rope fusion is experimental

+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg

@@ -248,9 +251,9 @@ def deepseek_v3_pretrain_config_32nodes() -> ConfigContainer:
     set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"
-    apply_flex_dispatcher_backend(cfg.model, None)
-    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_flex_dispatcher_backend = "hybridep" # Options: None, deepep, hybridep
     cfg.model.moe_hybridep_num_sms = 16

     # Training config
@@ -310,6 +313,7 @@ def deepseek_v3_pretrain_config_32nodes() -> ConfigContainer:
     cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
     cfg.comm_overlap.delay_wgrad_compute = False
     cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = True

     # Checkpoint config
@@ -331,4 +335,6 @@ def deepseek_v3_pretrain_config_32nodes() -> ConfigContainer:
     if cfg.model.apply_rope_fusion:
         cfg.dist.enable_megatron_core_experimental = True # mla rope fusion is experimental

+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg
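
The body of apply_flex_dispatcher_backend is not part of this diff; only its call sites and the "may be overridden" notes above constrain it. As a rough sketch (the field handling below is inferred, not confirmed), the helper presumably routes dispatch through the "flex" dispatcher type, which the qwen3_next recipe lists as a valid option alongside alltoall and allgather:

# Hypothetical sketch of apply_flex_dispatcher_backend, inferred from the call
# sites and the "may be overridden" notes in this diff; not the real implementation.
def apply_flex_dispatcher_backend(model_cfg, backend):
    """Route MoE token dispatch through the flex dispatcher when a backend is set."""
    if backend is None:
        # No flex backend requested: keep the recipe's explicit dispatcher
        # (e.g. "alltoall") and leave moe_shared_expert_overlap untouched.
        return
    assert backend in ("deepep", "hybridep"), f"unknown flex backend: {backend}"
    # Per the notes above, these two fields may be overridden here:
    model_cfg.moe_token_dispatcher_type = "flex"  # flex dispatcher hosts the backend
    model_cfg.moe_shared_expert_overlap = False   # assumed incompatible with flex dispatch
    model_cfg.moe_flex_dispatcher_backend = backend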
10 changes: 9 additions & 1 deletion src/megatron/bridge/recipes/qwen/qwen3_moe.py

@@ -153,8 +153,9 @@ def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     cfg.model.init_method_std = 0.02

     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"
-    cfg.model.moe_flex_dispatcher_backend = "deepep"
+    cfg.model.moe_flex_dispatcher_backend = "deepep" # Options: None, deepep, hybridep
     cfg.model.moe_hybridep_num_sms = 16

     # Training config
@@ -205,6 +206,7 @@ def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable
     # cfg.comm_overlap.delay_wgrad_compute = False # Delay wgrad compute for overlap
     # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False # MoE-specific: Overlap EP communication
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = False # Overlap shared expert computation

     # Checkpoint config (paths set in _pretrain_common)
@@ -222,6 +224,8 @@ def qwen3_30b_a3b_pretrain_config() -> ConfigContainer:
     # MoE Force Load Balancing
     cfg.model.moe_router_force_load_balancing = False

+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg

@@ -262,6 +266,7 @@ def qwen3_235b_a22b_pretrain_config() -> ConfigContainer:
     cfg.model.account_for_loss_in_pipeline_split = True

     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall"
     cfg.model.moe_flex_dispatcher_backend = "deepep"
     cfg.model.moe_hybridep_num_sms = 16
@@ -314,6 +319,7 @@ def qwen3_235b_a22b_pretrain_config() -> ConfigContainer:
     # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable
     # cfg.comm_overlap.delay_wgrad_compute = False
     # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = False # Overlap shared expert computation

     # Checkpoint config
@@ -331,6 +337,8 @@ def qwen3_235b_a22b_pretrain_config() -> ConfigContainer:
     # MoE Force Load Balancing
     cfg.model.moe_router_force_load_balancing = False

+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg

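Since every recipe in this diff now applies the helper as its final step, overriding the backend on a returned config presumably requires re-applying it. A usage sketch (names taken from this diff; the re-apply requirement follows from the assumed behavior sketched above):

# Usage sketch under the assumptions above; not taken from the PR itself.
cfg = qwen3_30b_a3b_pretrain_config()  # recipe already applied the "deepep" backend
cfg.model.moe_flex_dispatcher_backend = "hybridep"  # switch backend after the fact
cfg.model.moe_hybridep_num_sms = 16  # SM budget used by the hybridep backend
apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)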
10 changes: 6 additions & 4 deletions src/megatron/bridge/recipes/qwen/qwen3_next.py

@@ -148,11 +148,10 @@ def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
     cfg.model.mtp_loss_scaling_factor = 0.1 # Loss scaling factor for MTP

     # MoE Token Dispatcher settings
+    # Note: moe_token_dispatcher_type may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_token_dispatcher_type = "alltoall" # Options: alltoall, allgather, flex
-    cfg.model.moe_flex_dispatcher_backend = (
-        "deepep" # Options: None, deepep, hybridep (default from TransformerConfig)
-    )
-    cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend (default from TransformerConfig)
+    cfg.model.moe_flex_dispatcher_backend = "deepep" # Options: None, deepep, hybridep
+    cfg.model.moe_hybridep_num_sms = 16 # Number of SMs for hybridep backend

     # Training config
     cfg.train.manual_gc = True
@@ -205,6 +204,7 @@ def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
     # cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False) # Uncomment to enable
     # cfg.comm_overlap.delay_wgrad_compute = False # Delay wgrad compute for overlap
     # cfg.comm_overlap.overlap_moe_expert_parallel_comm = False # MoE-specific: Overlap EP communication
+    # Note: moe_shared_expert_overlap may be overridden by apply_flex_dispatcher_backend at the end
     cfg.model.moe_shared_expert_overlap = False # Overlap shared expert computation

     # Checkpoint config
@@ -222,6 +222,8 @@ def qwen3_next_80b_a3b_pretrain_config() -> ConfigContainer:
     # MoE Force Load Balancing
     cfg.model.moe_router_force_load_balancing = False

+    apply_flex_dispatcher_backend(cfg.model, cfg.model.moe_flex_dispatcher_backend)
+
     return cfg