@@ -60,7 +60,7 @@ def deepseek_v3_pretrain_config_gb300(
        pipeline_model_parallel_size=base_cfg.pipeline_model_parallel_size,
        virtual_pipeline_model_parallel_size=base_cfg.virtual_pipeline_model_parallel_size,
        moe_flex_dispatcher_backend=base_cfg.moe_flex_dispatcher_backend,
-       layout=None,
+       layout=base_cfg.pp_layout,
    )
    set_deepseek_v3_common_configs(cfg)
    set_workload_base_configs(cfg, base_cfg)
@@ -54,7 +54,16 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V1
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1 = replace(
+    BASE_DEEPSEEK_V3_CONFIG,
+    micro_batch_size=2,
+    pipeline_model_parallel_size=2,
+    virtual_pipeline_model_parallel_size=8,
+    pp_layout="Et*4|(t*4|)*14tmL",
+    expert_model_parallel_size=32,
+    cuda_graph_scope=[],
+    recompute_modules=["mla_up_proj"],
+)


DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V1 = replace(
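Note on the new pp_layout string: assuming Megatron-style pipeline-layout syntax (E = embedding, t = transformer decoder layer, m = MTP layer, L = loss head; "|" separates virtual stages, "*n" repeats, parentheses group), my reading of "Et*4|(t*4|)*14tmL" is a 16-virtual-stage split, which lines up with pipeline_model_parallel_size=2 times virtual_pipeline_model_parallel_size=8. A quick arithmetic check in Python:

# Hedged reading of pp_layout="Et*4|(t*4|)*14tmL" under the syntax assumed above:
#   1 stage:   E + 4t     -> embedding plus 4 decoder layers
#   14 stages: 4t each    -> 56 decoder layers
#   1 stage:   t + m + L  -> final decoder layer, MTP layer, loss head
assert 1 + 14 + 1 == 2 * 8       # virtual stages == pp * vp
assert 4 + 14 * 4 + 1 == 61      # DeepSeek-V3's 61 decoder layers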
@@ -133,7 +142,10 @@
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_BF16_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_CS_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_FP8_MX_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
-DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_V2
+DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V2 = replace(
+    DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1,
+    global_batch_size=4096,
+)


DEEPSEEK_V3_PRETRAIN_CONFIG_GB200_V2 = replace(
scripts/performance/perf_plugins.py (4 additions, 0 deletions)
@@ -446,6 +446,10 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executor"):
            self.train_task,
        )

+       # Set NVFP4-specific environment variables
+       if self.compute_dtype == "nvfp4":
+           executor.env_vars["NVTE_USE_FAST_MATH"] = "1"

@dataclass
class PyTorchProfilerPluginScriptArgs:
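For context on the hook above: anything placed in executor.env_vars during a plugin's setup() is exported into the launched job's environment, so the flag travels with nvfp4 runs only. A minimal, self-contained sketch of the pattern (the Executor class below is an illustrative stand-in, not the real run.Executor):

from dataclasses import dataclass, field

@dataclass
class Executor:
    # Stand-in for run.Executor; only the env_vars dict matters here.
    env_vars: dict = field(default_factory=dict)

def set_dtype_env_vars(executor: Executor, compute_dtype: str) -> None:
    # Mirrors the diff: NVTE_USE_FAST_MATH (presumably a Transformer Engine
    # fast-math toggle) is set only for nvfp4, leaving other dtypes untouched.
    if compute_dtype == "nvfp4":
        executor.env_vars["NVTE_USE_FAST_MATH"] = "1"

ex = Executor()
set_dtype_env_vars(ex, "nvfp4")
assert ex.env_vars == {"NVTE_USE_FAST_MATH": "1"}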
scripts/performance/utils/overrides.py (1 addition, 1 deletion)
@@ -359,7 +359,7 @@ def set_post_overrides(
    dp = int(num_gpus / (tp * pp * cp))
    logger.info(f"DP: {dp}; TP: {tp}; PP: {pp}; CP: {cp}; VP: {vp}")
    ## NOTE: overlap_param_gather_with_optimizer_step causes NaN grad norm for fp8_mx. Disabling it until the issue is resolved.
-   if dp > 1 and pp > 1 and vp > 1 and compute_dtype != "fp8_mx":
+   if dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4"):
        recipe.optimizer.overlap_param_gather_with_optimizer_step = True
        if hasattr(recipe, "comm_overlap") and isinstance(recipe.comm_overlap, CommOverlapConfig):
            recipe.comm_overlap.overlap_param_gather_with_optimizer_step = True
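A concrete instance of the gating condition, with illustrative numbers (not taken from any recipe): 512 GPUs at tp=2, pp=4, cp=1 give dp=64, and with vp=8 the overlap is enabled for bf16 but now stays off for both fp8_mx and nvfp4.

# Illustrative numbers only; mirrors the condition in set_post_overrides.
num_gpus, tp, pp, cp, vp = 512, 2, 4, 1, 8
dp = int(num_gpus / (tp * pp * cp))
assert dp == 64
for compute_dtype in ("bf16", "fp8_mx", "nvfp4"):
    overlap = dp > 1 and pp > 1 and vp > 1 and compute_dtype not in ("fp8_mx", "nvfp4")
    print(compute_dtype, overlap)  # bf16 True, fp8_mx False, nvfp4 False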
scripts/performance/utils/utils.py (3 additions, 0 deletions)
@@ -62,6 +62,9 @@ class WorkloadBaseConfig:
    moe_a2a_overlap: Optional[bool] = False
    peft: Optional[str] = None

+   # Pipeline parallelism layout
+   pp_layout: Optional[str] = None

    @property
    def sequence_parallel(self) -> bool:
        """Get the sequence parallel flag."""
Contributor:
This is a nice change to make the PP layout configurable.
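For readers skimming the thread: the field defaults to None, which preserves the old behavior, and the recipe builder now forwards it via layout=base_cfg.pp_layout instead of hard-coding None. A trimmed, self-contained sketch of that flow (the class below is a shortened stand-in, not the full WorkloadBaseConfig):

from dataclasses import dataclass, replace
from typing import Optional

@dataclass(frozen=True)
class WorkloadBaseConfig:
    pipeline_model_parallel_size: int = 1
    virtual_pipeline_model_parallel_size: Optional[int] = None
    pp_layout: Optional[str] = None  # None keeps the default layer split

# Per-recipe override, as in DEEPSEEK_V3_PRETRAIN_CONFIG_GB300_NVFP4_V1:
nvfp4_v1 = replace(
    WorkloadBaseConfig(),
    pipeline_model_parallel_size=2,
    virtual_pipeline_model_parallel_size=8,
    pp_layout="Et*4|(t*4|)*14tmL",
)

# The recipe builder then passes layout=cfg.pp_layout instead of layout=None.
assert nvfp4_v1.pp_layout == "Et*4|(t*4|)*14tmL"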

