Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def main(
)

if nccl_ub:
custom_env_vars.update({"NCCL_NVLS_ENABLE": "1"})
custom_env_vars.update({"NCCL_NVLS_ENABLE": "1", "NCCL_CTA_POLICY": "1"})

if not dgxc_cluster:
executor = slurm_executor(
Expand Down
28 changes: 19 additions & 9 deletions scripts/performance/utils/overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,7 @@ def _set_common_perf_overrides(recipe: ConfigContainer) -> ConfigContainer:
return recipe


def _set_megatron_fsdp_overrides(
recipe: ConfigContainer, use_megatron_fsdp: bool = False, nccl_ub: bool = False
) -> ConfigContainer:
def _set_megatron_fsdp_overrides(recipe: ConfigContainer, use_megatron_fsdp: bool = False) -> ConfigContainer:
"""Set the Megatron FSDP overrides."""
if not use_megatron_fsdp:
return
Expand All @@ -84,10 +82,6 @@ def _set_megatron_fsdp_overrides(
# average_in_collective is not supported with Megatron FSDP
recipe.ddp.average_in_collective = False

if nccl_ub:
recipe.ddp.nccl_ub = True
recipe.ddp.fsdp_manual_registration = True

recipe.model.init_model_with_meta_device = True
recipe.model.gradient_accumulation_fusion = True

Expand Down Expand Up @@ -210,7 +204,8 @@ def set_workload_base_configs(cfg: ConfigContainer, settings: WorkloadBaseConfig
cfg.train.global_batch_size = settings.global_batch_size
cfg.train.micro_batch_size = settings.micro_batch_size

_set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp, nccl_ub=settings.nccl_ub)
_set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp)
_set_nccl_ub_overrides(cfg, nccl_ub=settings.nccl_ub)
_set_cuda_graph_overrides(
cfg,
cuda_graph_impl=settings.cuda_graph_impl,
Expand Down Expand Up @@ -248,9 +243,24 @@ def set_cli_overrides(recipe: ConfigContainer, cli_overrides: List[str]) -> Conf
return recipe


def _set_nccl_ub_overrides(recipe: ConfigContainer, nccl_ub: bool = False) -> ConfigContainer:
"""Set the NCCL UB overrides."""
if nccl_ub:
recipe.ddp.nccl_ub = True
# The current version of NCCL does not support the AVG operation for reductions with symmetric kernels.
# To enable symmetric kernels, average_in_collective must be disabled.
recipe.ddp.average_in_collective = False

if recipe.ddp.use_megatron_fsdp and recipe.ddp.nccl_ub:
recipe.ddp.fsdp_manual_registration = True

return recipe


def set_user_overrides(recipe: ConfigContainer, args: argparse.Namespace) -> ConfigContainer:
"""Set the user overrides."""
_set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp, nccl_ub=args.nccl_ub)
_set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp)
_set_nccl_ub_overrides(recipe, nccl_ub=args.nccl_ub)
_set_cuda_graph_overrides(
recipe,
cuda_graph_impl=args.cuda_graph_impl,
Expand Down