diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 9408e1277c..867cfd5e6c 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -278,7 +278,7 @@ def main( ) if nccl_ub: - custom_env_vars.update({"NCCL_NVLS_ENABLE": "1"}) + custom_env_vars.update({"NCCL_NVLS_ENABLE": "1", "NCCL_CTA_POLICY": "1"}) if not dgxc_cluster: executor = slurm_executor( diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py index 17846b2137..7b4c08b5b1 100644 --- a/scripts/performance/utils/overrides.py +++ b/scripts/performance/utils/overrides.py @@ -71,9 +71,7 @@ def _set_common_perf_overrides(recipe: ConfigContainer) -> ConfigContainer: return recipe -def _set_megatron_fsdp_overrides( - recipe: ConfigContainer, use_megatron_fsdp: bool = False, nccl_ub: bool = False -) -> ConfigContainer: +def _set_megatron_fsdp_overrides(recipe: ConfigContainer, use_megatron_fsdp: bool = False) -> ConfigContainer: """Set the Megatron FSDP overrides.""" if not use_megatron_fsdp: return @@ -84,10 +82,6 @@ def _set_megatron_fsdp_overrides( # average_in_collective is not supported with Megatron FSDP recipe.ddp.average_in_collective = False - if nccl_ub: - recipe.ddp.nccl_ub = True - recipe.ddp.fsdp_manual_registration = True - recipe.model.init_model_with_meta_device = True recipe.model.gradient_accumulation_fusion = True @@ -210,7 +204,8 @@ def set_workload_base_configs(cfg: ConfigContainer, settings: WorkloadBaseConfig cfg.train.global_batch_size = settings.global_batch_size cfg.train.micro_batch_size = settings.micro_batch_size - _set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp, nccl_ub=settings.nccl_ub) + _set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp) + _set_nccl_ub_overrides(cfg, nccl_ub=settings.nccl_ub) _set_cuda_graph_overrides( cfg, cuda_graph_impl=settings.cuda_graph_impl, @@ -248,9 +243,24 @@ def set_cli_overrides(recipe: ConfigContainer, cli_overrides: List[str]) -> Conf return recipe +def _set_nccl_ub_overrides(recipe: ConfigContainer, nccl_ub: bool = False) -> ConfigContainer: + """Set the NCCL UB overrides.""" + if nccl_ub: + recipe.ddp.nccl_ub = True + # The current version of NCCL does not support the AVG operation for reductions with symmetric kernels. + # To enable symmetric kernels, average_in_collective must be disabled. + recipe.ddp.average_in_collective = False + + if recipe.ddp.use_megatron_fsdp and recipe.ddp.nccl_ub: + recipe.ddp.fsdp_manual_registration = True + + return recipe + + def set_user_overrides(recipe: ConfigContainer, args: argparse.Namespace) -> ConfigContainer: """Set the user overrides.""" - _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp, nccl_ub=args.nccl_ub) + _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp) + _set_nccl_ub_overrides(recipe, nccl_ub=args.nccl_ub) _set_cuda_graph_overrides( recipe, cuda_graph_impl=args.cuda_graph_impl,