Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300(
pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
layout=None,
layout=base_cfg.pp_layout,
)
set_deepseek_v3_common_configs(cfg)
set_workload_base_configs(cfg, base_cfg)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,20 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
BASE_DEEPSEEK_V3_CONFIG,
num_gpus=256,
global_batch_size=2048,
micro_batch_size=2,
pipeline_model_parallel_size=2,
virtual_pipeline_model_parallel_size=8,
pp_layout="Et*4|(t*4|)*14tmL",
expert_model_parallel_size=32,
moe_flex_dispatcher_backend="hybridep",
moe_a2a_overlap=False,
cuda_graph_scope=[],
recompute_modules=["mla_up_proj"],

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we override only the fields that actually change?
I see that most of these lines are the same as in the baseline config.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, updated

)


DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
Expand Down Expand Up @@ -133,7 +146,10 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
global_batch_size=4096,
)


DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
Expand Down
2 changes: 1 addition & 1 deletion scripts/performance/utils/overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def set_post_overrides(
dp = int(num_gpus / (tp * pp * cp))
logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx, so it stays disabled for fp8_mx until the issue is resolved. nvfp4 is excluded by the same check.
if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx":
if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
recipe.optimizer.overlap_param_gather_with_optimizer_step = True
if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
Expand Down
3 changes: 3 additions & 0 deletions scripts/performance/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ class WorkloadBaseConfig:
moe_a2a_overlap: Optional[bool] = False
peft: Optional[str] = None

# Pipeline parallelism layout
pp_layout: Optional[str] = None

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a nice change to make the PP layout configurable.


@property
def sequence_parallel(self) -> bool:
"""Get the sequence parallel flag."""
Expand Down
Loading