diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 361c38ec8a..be7fb30c0d 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -99,11 +99,9 @@ def kuberay_executor( "TRANSFORMERS_OFFLINE": "1", "HF_HOME": "/nemo-workspace/pagaray/hf_cache", "RAY_enable_infeasible_task_early_exit": "true", - "NCCL_IB_DISABLE": "1", - "NCCL_IB_HCA": "^openib", # Ignore OpenIB devices - "NCCL_NET": "Socket", - "NCCL_NET_GDR_LEVEL": "0", - "FI_PROVIDER": "tcp", + "NCCL_NET": "tcpxo", + "NCCL_SOCKET_IFNAME": "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8", + "NCCL_FASTRAK_CTRL_DEV": "eth0", } if custom_env_vars: env_vars.update(custom_env_vars) @@ -132,8 +130,11 @@ def kuberay_executor( spec_kwargs={ "schedulerName": "runai-scheduler", "image_pull_secrets": ["dockerregistry-dockerregistry-pagaray-ngc"], + "dnsConfig": {"options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen"}]}, }, # e.g. Run:ai - volume_mounts=[{"name": "workspace", "mountPath": dgxc_pvc_mount_path}], + volume_mounts=[ + {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, + ], volumes=[ { "name": "workspace", @@ -145,7 +146,7 @@ def kuberay_executor( "securityContext": { "allowPrivilegeEscalation": False, "runAsUser": 0, - }, + } }, ) diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 6afd3bcd50..c7951278b9 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -386,17 +386,17 @@ def main( ) ) - if use_recipes and dgxc_cluster is not None: - plugins.append( - FaultTolerancePlugin( - enable_ft_package=True, - calc_ft_timeouts=True, - num_in_job_restarts=10, - num_job_retries_on_failure=10, - initial_rank_heartbeat_timeout=1800, - rank_heartbeat_timeout=300, - ) - ) + # if use_recipes and dgxc_cluster is not None: + # plugins.append( + # FaultTolerancePlugin( + # enable_ft_package=True, + # calc_ft_timeouts=True, + # num_in_job_restarts=10, + # num_job_retries_on_failure=10, + # initial_rank_heartbeat_timeout=1800, + # rank_heartbeat_timeout=300, + # ) + # ) nemorun_script = run.Script( path=str(run_script_path),