From 8415d6d361777272d71d4857399d0810d9d81ca9 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Fri, 30 Jan 2026 20:58:27 -0800 Subject: [PATCH 1/4] fix Signed-off-by: Youngeun Kwon --- scripts/performance/utils/overrides.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py index 17846b2137..45d982d0e1 100644 --- a/scripts/performance/utils/overrides.py +++ b/scripts/performance/utils/overrides.py @@ -71,9 +71,7 @@ def _set_common_perf_overrides(recipe: ConfigContainer) -> ConfigContainer: return recipe -def _set_megatron_fsdp_overrides( - recipe: ConfigContainer, use_megatron_fsdp: bool = False, nccl_ub: bool = False -) -> ConfigContainer: +def _set_megatron_fsdp_overrides(recipe: ConfigContainer, use_megatron_fsdp: bool = False) -> ConfigContainer: """Set the Megatron FSDP overrides.""" if not use_megatron_fsdp: return @@ -84,10 +82,6 @@ def _set_megatron_fsdp_overrides( # average_in_collective is not supported with Megatron FSDP recipe.ddp.average_in_collective = False - if nccl_ub: - recipe.ddp.nccl_ub = True - recipe.ddp.fsdp_manual_registration = True - recipe.model.init_model_with_meta_device = True recipe.model.gradient_accumulation_fusion = True @@ -210,7 +204,8 @@ def set_workload_base_configs(cfg: ConfigContainer, settings: WorkloadBaseConfig cfg.train.global_batch_size = settings.global_batch_size cfg.train.micro_batch_size = settings.micro_batch_size - _set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp, nccl_ub=settings.nccl_ub) + _set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp) + _set_nccl_ub_overrides(cfg, nccl_ub=settings.nccl_ub) _set_cuda_graph_overrides( cfg, cuda_graph_impl=settings.cuda_graph_impl, @@ -248,9 +243,20 @@ def set_cli_overrides(recipe: ConfigContainer, cli_overrides: List[str]) -> Conf return recipe +def _set_nccl_ub_overrides(recipe: ConfigContainer, nccl_ub: bool = False) -> ConfigContainer: + """Set the NCCL UB overrides.""" + if nccl_ub: + recipe.ddp.nccl_ub = True + + if recipe.ddp.use_megatron_fsdp: + recipe.ddp.fsdp_manual_registration = True + + return recipe + def set_user_overrides(recipe: ConfigContainer, args: argparse.Namespace) -> ConfigContainer: """Set the user overrides.""" - _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp, nccl_ub=args.nccl_ub) + _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp) + _set_nccl_ub_overrides(recipe, nccl_ub=args.nccl_ub) _set_cuda_graph_overrides( recipe, cuda_graph_impl=args.cuda_graph_impl, From 3b98db1793b3575bfbd405a70958d2c3b446a909 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Fri, 30 Jan 2026 21:45:02 -0800 Subject: [PATCH 2/4] add nccl_cta_policy setting Signed-off-by: Youngeun Kwon --- scripts/performance/setup_experiment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 9408e1277c..867cfd5e6c 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -278,7 +278,7 @@ def main( ) if nccl_ub: - custom_env_vars.update({"NCCL_NVLS_ENABLE": "1"}) + custom_env_vars.update({"NCCL_NVLS_ENABLE": "1", "NCCL_CTA_POLICY": "1"}) if not dgxc_cluster: executor = slurm_executor( From acc60cfd785d4d94e61326f9f40a5804bf3a3929 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 2 Feb 2026 09:46:51 -0800 Subject: [PATCH 3/4] fix lint error Signed-off-by: Youngeun Kwon --- scripts/performance/utils/overrides.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py index 45d982d0e1..559e1ae65d 100644 --- a/scripts/performance/utils/overrides.py +++ b/scripts/performance/utils/overrides.py @@ -253,6 +253,7 @@ def _set_nccl_ub_overrides(recipe: ConfigContainer, nccl_ub: bool = False) -> Co return recipe + def set_user_overrides(recipe: ConfigContainer, args: argparse.Namespace) -> ConfigContainer: """Set the user overrides.""" _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp) From 78313c81d32a75e9ce279814aca2eb9a3972d538 Mon Sep 17 00:00:00 2001 From: Youngeun Kwon Date: Mon, 2 Feb 2026 15:52:59 -0800 Subject: [PATCH 4/4] disable avg in collective Signed-off-by: Youngeun Kwon --- scripts/performance/utils/overrides.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py index 559e1ae65d..36df218b06 100644 --- a/scripts/performance/utils/overrides.py +++ b/scripts/performance/utils/overrides.py @@ -247,6 +247,9 @@ def _set_nccl_ub_overrides(recipe: ConfigContainer, nccl_ub: bool = False) -> Co """Set the NCCL UB overrides.""" if nccl_ub: recipe.ddp.nccl_ub = True + # The current version of NCCL does not support the AVG operation for reductions with symmetric kernels. + # To enable symmetric kernels, average_in_collective must be disabled. + recipe.ddp.average_in_collective = False if recipe.ddp.use_megatron_fsdp: recipe.ddp.fsdp_manual_registration = True