From f08500aa006ea93d4cbc1a73f1e80463503fd06a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 10:33:38 +0000 Subject: [PATCH 01/18] chore: Disable nvrx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- scripts/performance/setup_experiment.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 6afd3bcd50..c7951278b9 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -386,17 +386,17 @@ def main( ) ) - if use_recipes and dgxc_cluster is not None: - plugins.append( - FaultTolerancePlugin( - enable_ft_package=True, - calc_ft_timeouts=True, - num_in_job_restarts=10, - num_job_retries_on_failure=10, - initial_rank_heartbeat_timeout=1800, - rank_heartbeat_timeout=300, - ) - ) + # if use_recipes and dgxc_cluster is not None: + # plugins.append( + # FaultTolerancePlugin( + # enable_ft_package=True, + # calc_ft_timeouts=True, + # num_in_job_restarts=10, + # num_job_retries_on_failure=10, + # initial_rank_heartbeat_timeout=1800, + # rank_heartbeat_timeout=300, + # ) + # ) nemorun_script = run.Script( path=str(run_script_path), From de117e9c7c79317fe9ba28fad5c9eb79d0124ab7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 12:13:18 +0000 Subject: [PATCH 02/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 361c38ec8a..64a28c7ca0 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -132,6 +132,9 @@ def kuberay_executor( spec_kwargs={ "schedulerName": "runai-scheduler", "image_pull_secrets": ["dockerregistry-dockerregistry-pagaray-ngc"], + "dnsConfig": { + "options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen", "value": "1"}] + }, }, # e.g. Run:ai volume_mounts=[{"name": "workspace", "mountPath": dgxc_pvc_mount_path}], volumes=[ From 0c019dedbf3b5e30046c40ae6d49e30909946cec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 17:30:16 +0000 Subject: [PATCH 03/18] add gcp vars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 64a28c7ca0..4aff867ebb 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -99,11 +99,9 @@ def kuberay_executor( "TRANSFORMERS_OFFLINE": "1", "HF_HOME": "/nemo-workspace/pagaray/hf_cache", "RAY_enable_infeasible_task_early_exit": "true", - "NCCL_IB_DISABLE": "1", - "NCCL_IB_HCA": "^openib", # Ignore OpenIB devices - "NCCL_NET": "Socket", - "NCCL_NET_GDR_LEVEL": "0", - "FI_PROVIDER": "tcp", + "NCCL_NET": "FasTrak", + "NCCL_SOCKET_IFNAME": "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8", + "NCCL_FASTRAK_CTRL_DEV": "eth0", } if custom_env_vars: env_vars.update(custom_env_vars) From 170d540d7d537fdab5578eee5bb81903728acc3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 18:50:02 +0000 Subject: [PATCH 04/18] add tcpxo daemon MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 4aff867ebb..be3fc293f4 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -133,6 +133,13 @@ def kuberay_executor( "dnsConfig": { "options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen", "value": "1"}] }, + "initContainers": [ + { + "name": "tcpxo-daemon", + "image": "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.8", + "volumeMounts": [{"name": "tcpxo-plugin", "mountPath": "/usr/local/nvidia/lib64"}], + } + ], }, # e.g. Run:ai volume_mounts=[{"name": "workspace", "mountPath": dgxc_pvc_mount_path}], volumes=[ @@ -140,6 +147,10 @@ def kuberay_executor( "name": "workspace", "persistentVolumeClaim": {"claimName": dgxc_pvc_claim_name}, }, + { + "name": "tcpxo-plugin", + "emptyDir": {}, + }, ], env_vars=env_vars, container_kwargs={ From e2dde535837c13696d7798d5dea94b67a9f4729c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 19:34:00 +0000 Subject: [PATCH 05/18] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index be3fc293f4..4aff867ebb 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -133,13 +133,6 @@ def kuberay_executor( "dnsConfig": { "options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen", "value": "1"}] }, - "initContainers": [ - { - "name": "tcpxo-daemon", - "image": "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.8", - "volumeMounts": [{"name": "tcpxo-plugin", "mountPath": "/usr/local/nvidia/lib64"}], - } - ], }, # e.g. Run:ai volume_mounts=[{"name": "workspace", "mountPath": dgxc_pvc_mount_path}], volumes=[ @@ -147,10 +140,6 @@ def kuberay_executor( "name": "workspace", "persistentVolumeClaim": {"claimName": dgxc_pvc_claim_name}, }, - { - "name": "tcpxo-plugin", - "emptyDir": {}, - }, ], env_vars=env_vars, container_kwargs={ From c31f0bbb911ff6963444e75baed82cf4f360c52c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 19:45:01 +0000 Subject: [PATCH 06/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 4aff867ebb..a443712d87 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -102,6 +102,7 @@ def kuberay_executor( "NCCL_NET": "FasTrak", "NCCL_SOCKET_IFNAME": "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8", "NCCL_FASTRAK_CTRL_DEV": "eth0", + "LD_LIBRARY_PATH": "/usr/local/nvidia/lib64:/usr/local/tensorrt/lib:/usr/lib/x86_64-linux-gnu", } if custom_env_vars: env_vars.update(custom_env_vars) @@ -134,7 +135,10 @@ def kuberay_executor( "options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen", "value": "1"}] }, }, # e.g. Run:ai - volume_mounts=[{"name": "workspace", "mountPath": dgxc_pvc_mount_path}], + volume_mounts=[ + {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, + {"mountPath": "/usr/local/nvidia", "name": "nvtcpxo-libraries", "readOnly": "true"}, + ], volumes=[ { "name": "workspace", From e920a8b11f531f4de67066a5364027310882abd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 20:05:52 +0000 Subject: [PATCH 07/18] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index a443712d87..c405dfabda 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -15,6 +15,8 @@ import os from typing import Dict, List +from numpy import true_divide + import nemo_run as run from nemo_run.config import get_nemorun_home from nemo_run.core.execution.kuberay import KubeRayExecutor, KubeRayWorkerGroup @@ -137,7 +139,7 @@ def kuberay_executor( }, # e.g. Run:ai volume_mounts=[ {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, - {"mountPath": "/usr/local/nvidia", "name": "nvtcpxo-libraries", "readOnly": "true"}, + {"mountPath": "/usr/local/nvidia", "name": "nvtcpxo-libraries", "readOnly": true_divide}, ], volumes=[ { From bf9c69c1e8f896ebdf97dc68433e1a20c5435553 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 20:22:21 +0000 Subject: [PATCH 08/18] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index c405dfabda..b8317ea1c4 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -15,8 +15,6 @@ import os from typing import Dict, List -from numpy import true_divide - import nemo_run as run from nemo_run.config import get_nemorun_home from nemo_run.core.execution.kuberay import KubeRayExecutor, KubeRayWorkerGroup @@ -139,7 +137,7 @@ def kuberay_executor( }, # e.g. Run:ai volume_mounts=[ {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, - {"mountPath": "/usr/local/nvidia", "name": "nvtcpxo-libraries", "readOnly": true_divide}, + {"mountPath": "/usr/local/nvidia", "name": "nvtcpxo-libraries", "readOnly": True}, ], volumes=[ { From bc28da85191ffb6d95418ab3be569ed6fe50e6de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 20:32:11 +0000 Subject: [PATCH 09/18] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index b8317ea1c4..9b764e86a9 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -137,7 +137,6 @@ def kuberay_executor( }, # e.g. Run:ai volume_mounts=[ {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, - {"mountPath": "/usr/local/nvidia", "name": "nvtcpxo-libraries", "readOnly": True}, ], volumes=[ { From b474568d1de167626f3456f6d941233130ce1af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 20:57:12 +0000 Subject: [PATCH 10/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 9b764e86a9..f3bdf46f2a 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -99,7 +99,7 @@ def kuberay_executor( "TRANSFORMERS_OFFLINE": "1", "HF_HOME": "/nemo-workspace/pagaray/hf_cache", "RAY_enable_infeasible_task_early_exit": "true", - "NCCL_NET": "FasTrak", + "NCCL_NET": "tcpxo", "NCCL_SOCKET_IFNAME": "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8", "NCCL_FASTRAK_CTRL_DEV": "eth0", "LD_LIBRARY_PATH": "/usr/local/nvidia/lib64:/usr/local/tensorrt/lib:/usr/lib/x86_64-linux-gnu", From b8f88114c45740edebec5ddf073bc8c93a71b0cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 21:33:14 +0000 Subject: [PATCH 11/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index f3bdf46f2a..4a1389ece1 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -150,6 +150,15 @@ def kuberay_executor( "allowPrivilegeEscalation": False, "runAsUser": 0, }, + "command": ["/bin/bash", "-lc", "--"], + "args": [ + "export NCCL_SOCKET_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 && " + "export NCCL_NET=tcpxo && " + "export NCCL_FASTRAK_CTRL_DEV=eth0 && " + "export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu && " + "ulimit -n 65536; " + "ray start $(printenv KUBERAY_GEN_RAY_START_CMD)" # reuse KubeRay's generated cmd + ], }, ) From 0856ca59f87cc6d53c496bc637b8b3a2c9fdffb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 21:57:44 +0000 Subject: [PATCH 12/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 4a1389ece1..fa09880d80 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -150,14 +150,16 @@ def kuberay_executor( "allowPrivilegeEscalation": False, "runAsUser": 0, }, - "command": ["/bin/bash", "-lc", "--"], + "command": ["/bin/bash", "-lc"], "args": [ - "export NCCL_SOCKET_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 && " - "export NCCL_NET=tcpxo && " - "export NCCL_FASTRAK_CTRL_DEV=eth0 && " - "export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu && " - "ulimit -n 65536; " - "ray start $(printenv KUBERAY_GEN_RAY_START_CMD)" # reuse KubeRay's generated cmd + """ + export NCCL_SOCKET_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 + export NCCL_NET=tcpxo + export NCCL_FASTRAK_CTRL_DEV=eth0 + export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu + ulimit -n 65536 + eval "$KUBERAY_GEN_RAY_START_CMD" + """ ], }, ) From 9a60c19bfda7234c7ac40ea42adfa1efc05cc008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 22:25:01 +0000 Subject: [PATCH 13/18] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index fa09880d80..1c2f4ed734 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -156,7 +156,7 @@ def kuberay_executor( export NCCL_SOCKET_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 export NCCL_NET=tcpxo export NCCL_FASTRAK_CTRL_DEV=eth0 - export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/lib/x86_64-linux-gnu + export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/lib:/usr/local/nvidia/lib64 ulimit -n 65536 eval "$KUBERAY_GEN_RAY_START_CMD" """ From 04697d01e292f217d8c6dedb930d371439e69a18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 22:38:55 +0000 Subject: [PATCH 14/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 1c2f4ed734..c6e74c0a3d 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -156,7 +156,8 @@ def kuberay_executor( export NCCL_SOCKET_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 export NCCL_NET=tcpxo export NCCL_FASTRAK_CTRL_DEV=eth0 - export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:/usr/local/tensorrt/lib:/usr/local/nvidia/lib64 + export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tensorrt/lib:/usr/lib/x86_64-linux-gnu + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.29.7 ulimit -n 65536 eval "$KUBERAY_GEN_RAY_START_CMD" """ From bf8b9db850099e66b9a61c53c83a71b42618a688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 23:00:20 +0000 Subject: [PATCH 15/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index c6e74c0a3d..ad1d087c7e 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -157,7 +157,7 @@ def kuberay_executor( export NCCL_NET=tcpxo export NCCL_FASTRAK_CTRL_DEV=eth0 export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tensorrt/lib:/usr/lib/x86_64-linux-gnu - export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.29.7 + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.29.7 ulimit -n 65536 eval "$KUBERAY_GEN_RAY_START_CMD" """ From 6539c01e6194f25922f7d68ad789dfb3e79e3b70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 23:15:49 +0000 Subject: [PATCH 16/18] test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index ad1d087c7e..8dc47cd9dc 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -137,31 +137,21 @@ def kuberay_executor( }, # e.g. Run:ai volume_mounts=[ {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, + {"name": "nccl-env", "mountPath": "/etc/profile.d/nccl.sh", "subPath": "nccl.sh"}, ], volumes=[ { "name": "workspace", "persistentVolumeClaim": {"claimName": dgxc_pvc_claim_name}, }, + {"name": "nccl-env", "configMap": {"name": "nccl-env-override"}}, ], env_vars=env_vars, container_kwargs={ "securityContext": { "allowPrivilegeEscalation": False, "runAsUser": 0, - }, - "command": ["/bin/bash", "-lc"], - "args": [ - """ - export NCCL_SOCKET_IFNAME=eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8 - export NCCL_NET=tcpxo - export NCCL_FASTRAK_CTRL_DEV=eth0 - export LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tensorrt/lib:/usr/lib/x86_64-linux-gnu - export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.29.7 - ulimit -n 65536 - eval "$KUBERAY_GEN_RAY_START_CMD" - """ - ], + } }, ) From 856e3418499aecd9e06890cd84c7ba4f9c02458f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 23:42:55 +0000 Subject: [PATCH 17/18] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index 8dc47cd9dc..b31ee971af 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -131,20 +131,16 @@ def kuberay_executor( spec_kwargs={ "schedulerName": "runai-scheduler", "image_pull_secrets": ["dockerregistry-dockerregistry-pagaray-ngc"], - "dnsConfig": { - "options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen", "value": "1"}] - }, + "dnsConfig": {"options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen"}]}, }, # e.g. Run:ai volume_mounts=[ {"name": "workspace", "mountPath": dgxc_pvc_mount_path}, - {"name": "nccl-env", "mountPath": "/etc/profile.d/nccl.sh", "subPath": "nccl.sh"}, ], volumes=[ { "name": "workspace", "persistentVolumeClaim": {"claimName": dgxc_pvc_claim_name}, }, - {"name": "nccl-env", "configMap": {"name": "nccl-env-override"}}, ], env_vars=env_vars, container_kwargs={ From b9567897b4e27d4d81b30a9e747c113db69546a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 7 Mar 2026 23:43:21 +0000 Subject: [PATCH 18/18] revert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- examples/evaluation/utils/executors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/evaluation/utils/executors.py b/examples/evaluation/utils/executors.py index b31ee971af..be7fb30c0d 100644 --- a/examples/evaluation/utils/executors.py +++ b/examples/evaluation/utils/executors.py @@ -102,7 +102,6 @@ def kuberay_executor( "NCCL_NET": "tcpxo", "NCCL_SOCKET_IFNAME": "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8", "NCCL_FASTRAK_CTRL_DEV": "eth0", - "LD_LIBRARY_PATH": "/usr/local/nvidia/lib64:/usr/local/tensorrt/lib:/usr/lib/x86_64-linux-gnu", } if custom_env_vars: env_vars.update(custom_env_vars)