Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions scripts/performance/configs/deepseek/deepseek_llm_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,18 +203,18 @@ def deepseek_v3_pretrain_config_h100(
)
precision_config = get_precision_config(precision)

pp_layout = None
if base_cfg.pp_layout:
pp_layout = base_cfg.pp_layout

cfg = pretrain_config()
cfg.mixed_precision = precision_config

# Apply model-specific settings that were previously passed as constructor args
cfg.model.pipeline_model_parallel_size = base_cfg.pipeline_model_parallel_size
cfg.model.virtual_pipeline_model_parallel_size = base_cfg.virtual_pipeline_model_parallel_size
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.pipeline_model_parallel_layout = pp_layout or "Et|(tt|)*30mL"
if base_cfg.pp_layout:
cfg.model.pipeline_model_parallel_layout = base_cfg.pp_layout
else:
# Recompute layout based on updated PP/VP sizes
set_deepseek_v3_pipeline_model_parallel_layout(cfg.model)

set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand Down
8 changes: 8 additions & 0 deletions scripts/performance/configs/qwen/qwen3_llm_pretrain.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ def qwen3_235b_a22b_pretrain_config_gb300(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "flex"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -103,6 +104,7 @@ def qwen3_235b_a22b_pretrain_config_gb200(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "flex"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -128,6 +130,7 @@ def qwen3_235b_a22b_pretrain_config_b300(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "alltoall"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -153,6 +156,7 @@ def qwen3_235b_a22b_pretrain_config_b200(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "alltoall"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -178,6 +182,7 @@ def qwen3_235b_a22b_pretrain_config_h100(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=False)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "alltoall"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -203,6 +208,7 @@ def qwen3_30b_a3b_pretrain_config_gb300(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "flex"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -228,6 +234,7 @@ def qwen3_30b_a3b_pretrain_config_gb200(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "flex"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand All @@ -253,6 +260,7 @@ def qwen3_30b_a3b_pretrain_config_b300(
cfg.mixed_precision = precision_config
cfg.comm_overlap = CommOverlapConfig(tp_comm_overlap=True)
cfg.model.moe_flex_dispatcher_backend = base_cfg.moe_flex_dispatcher_backend
cfg.model.moe_token_dispatcher_type = "flex"

set_qwen3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,15 @@

# Baseline workload settings for Qwen3 235B-A22B: expert tensor parallel
# size 1 and the "deepep" flex-dispatcher backend.
# NOTE(review): presumably this is the base_cfg whose
# moe_flex_dispatcher_backend the qwen3 pretrain config builders copy onto
# cfg.model — confirm against the callers.
BASE_QWEN3_235B_A22B_CONFIG = WorkloadBaseConfig(
expert_tensor_parallel_size=1,
moe_flex_dispatcher_backend="deepep",
)


# Baseline workload settings for Qwen3 30B-A3B: expert model parallel size 8,
# expert tensor parallel size 1, global batch size 512, and the "deepep"
# flex-dispatcher backend.
# NOTE(review): presumably per-hardware variants derive from this base and
# override individual fields — verify where this constant is consumed.
BASE_QWEN3_30B_A3B_CONFIG = WorkloadBaseConfig(
expert_model_parallel_size=8,
expert_tensor_parallel_size=1,
global_batch_size=512,
moe_flex_dispatcher_backend="deepep",
)

BASE_QWEN3_NEXT_80B_A3B_CONFIG = WorkloadBaseConfig(
Expand Down
Loading