diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py
index 9408e1277c..867cfd5e6c 100755
--- a/scripts/performance/setup_experiment.py
+++ b/scripts/performance/setup_experiment.py
@@ -278,7 +278,7 @@ def main(
     )
 
     if nccl_ub:
-        custom_env_vars.update({"NCCL_NVLS_ENABLE": "1"})
+        custom_env_vars.update({"NCCL_NVLS_ENABLE": "1", "NCCL_CTA_POLICY": "1"})
 
     if not dgxc_cluster:
         executor = slurm_executor(
diff --git a/scripts/performance/utils/overrides.py b/scripts/performance/utils/overrides.py
index 17846b2137..7b4c08b5b1 100644
--- a/scripts/performance/utils/overrides.py
+++ b/scripts/performance/utils/overrides.py
@@ -71,9 +71,7 @@ def _set_common_perf_overrides(recipe: ConfigContainer) -> ConfigContainer:
     return recipe
 
 
-def _set_megatron_fsdp_overrides(
-    recipe: ConfigContainer, use_megatron_fsdp: bool = False, nccl_ub: bool = False
-) -> ConfigContainer:
+def _set_megatron_fsdp_overrides(recipe: ConfigContainer, use_megatron_fsdp: bool = False) -> ConfigContainer:
     """Set the Megatron FSDP overrides."""
     if not use_megatron_fsdp:
         return
@@ -84,10 +82,6 @@ def _set_megatron_fsdp_overrides(
     # average_in_collective is not supported with Megatron FSDP
     recipe.ddp.average_in_collective = False
 
-    if nccl_ub:
-        recipe.ddp.nccl_ub = True
-        recipe.ddp.fsdp_manual_registration = True
-
     recipe.model.init_model_with_meta_device = True
     recipe.model.gradient_accumulation_fusion = True
 
@@ -210,7 +204,8 @@ def set_workload_base_configs(cfg: ConfigContainer, settings: WorkloadBaseConfig
     cfg.train.global_batch_size = settings.global_batch_size
     cfg.train.micro_batch_size = settings.micro_batch_size
 
-    _set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp, nccl_ub=settings.nccl_ub)
+    _set_megatron_fsdp_overrides(cfg, use_megatron_fsdp=settings.use_megatron_fsdp)
+    _set_nccl_ub_overrides(cfg, nccl_ub=settings.nccl_ub)
     _set_cuda_graph_overrides(
         cfg,
         cuda_graph_impl=settings.cuda_graph_impl,
@@ -248,9 +243,24 @@ def set_cli_overrides(recipe: ConfigContainer, cli_overrides: List[str]) -> Conf
     return recipe
 
 
+def _set_nccl_ub_overrides(recipe: ConfigContainer, nccl_ub: bool = False) -> ConfigContainer:
+    """Set the NCCL UB overrides on ``recipe.ddp`` (mutated in place) and return the same ``recipe``."""
+    if nccl_ub:
+        recipe.ddp.nccl_ub = True
+        # The current version of NCCL does not support the AVG reduction with symmetric kernels,
+        # so average_in_collective must be disabled for symmetric kernels to be enabled.
+        recipe.ddp.average_in_collective = False
+    # The check below reads the config, not the argument, so an nccl_ub already set on the recipe also counts.
+    if recipe.ddp.use_megatron_fsdp and recipe.ddp.nccl_ub:
+        recipe.ddp.fsdp_manual_registration = True
+
+    return recipe
+
+
 def set_user_overrides(recipe: ConfigContainer, args: argparse.Namespace) -> ConfigContainer:
     """Set the user overrides."""
-    _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp, nccl_ub=args.nccl_ub)
+    _set_megatron_fsdp_overrides(recipe, use_megatron_fsdp=args.use_megatron_fsdp)
+    _set_nccl_ub_overrides(recipe, nccl_ub=args.nccl_ub)
     _set_cuda_graph_overrides(
         recipe,
         cuda_graph_impl=args.cuda_graph_impl,