From 8c2ecf388952b1a9a587132dcea4bad61878a9f0 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 27 Aug 2025 14:48:26 -0700 Subject: [PATCH 1/4] update ray.sub to sync ray files from all nodes Signed-off-by: Guyue Huang --- docs/nsys-profiling.md | 2 +- ray.sub | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/docs/nsys-profiling.md b/docs/nsys-profiling.md index 951251420c..8cf6943d7c 100644 --- a/docs/nsys-profiling.md +++ b/docs/nsys-profiling.md @@ -91,7 +91,7 @@ If you are not using model parallelism in Vllm, you should directly refer to `vl 3. **File Location**: Profile files are saved in `/tmp/ray/session*/logs/nsight/` directory on each worker node. Ensure you check both `ls /tmp/ray/session_[0-9]*/logs/nsight` and `ls /tmp/ray/session_latest/logs/nsight` for the profiles, since the "latest" pointer may be stale. -**Note for SLURM users with `ray.sub`**: When using `ray.sub` on SLURM, set `RAY_LOG_SYNC_FREQUENCY=$NUM_SEC` (e.g., `RAY_LOG_SYNC_FREQUENCY=30`) to ensure that the nsight profile files get copied from the container's ephemeral filesystem (`/tmp/ray`) to the persistent `$SLURM_JOB_ID-logs/ray` directory. +**Note for SLURM users with `ray.sub`**: When using `ray.sub` on SLURM, set `RAY_LOG_SYNC_FREQUENCY=$NUM_SEC` (e.g., `RAY_LOG_SYNC_FREQUENCY=30`) to ensure that the nsight profile files get copied from the container's ephemeral filesystem (`/tmp/ray`) to the persistent directory. The head node's files will be synced to `$SLURM_JOB_ID-logs/ray`, and other nodes' files will be synced to `$SLURM_JOB_ID-logs/ray/$node_ip/`, where `$node_ip` is the IP address of the node. 
## Analyze Profile Files diff --git a/ray.sub b/ray.sub index 1d37a2296c..65c243b372 100644 --- a/ray.sub +++ b/ray.sub @@ -266,6 +266,38 @@ monitor-sidecar() { } monitor-sidecar & +# Background process to sync ray logs every $RAY_LOG_SYNC_FREQUENCY seconds +log-sync-sidecar() { + set +x + if [[ -z "$RAY_LOG_SYNC_FREQUENCY" ]]; then + echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar" + return + fi + mkdir -p $LOG_DIR/ray + mkdir -p $LOG_DIR/ray/$node_i + while true; do + sleep $RAY_LOG_SYNC_FREQUENCY + if ls /tmp/ray/session_[0-9]* > /dev/null 2>&1; then + for session_dir in /tmp/ray/session_[0-9]*/; do + if [[ -d "\$session_dir/logs" ]]; then + session_name=\$(basename "\$session_dir") + mkdir -p "$LOG_DIR/ray/$node_i/\$session_name" + if command -v rsync > /dev/null 2>&1; then + rsync -ahP "\$session_dir/logs/" $head_node_ip:$LOG_DIR/ray/$node_i/\$session_name/logs/ 2>/dev/null || true + else + cp -r "\$session_dir/logs" $head_node:$LOG_DIR/ray/$node_i/\$session_name/ + fi + fi + done + fi + if [[ -f "$LOG_DIR/ENDED" ]]; then + echo "Log sync sidecar terminating..." 
+ break + fi + done +} +log-sync-sidecar & + # Patch nsight.py before starting Ray worker sed -i 's/context\.py_executable = " "\.join(self\.nsight_cmd) + " python"/context.py_executable = " ".join(self.nsight_cmd) + f" {context.py_executable}"/g' /opt/nemo_rl_venv/lib64/python*/site-packages/ray/_private/runtime_env/nsight.py From e730c9d89934acd433ae1968aadad395f04253c2 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 27 Aug 2025 15:51:20 -0700 Subject: [PATCH 2/4] Bugfix Signed-off-by: Guyue Huang --- ray.sub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray.sub b/ray.sub index 65c243b372..70c3679253 100644 --- a/ray.sub +++ b/ray.sub @@ -285,7 +285,7 @@ log-sync-sidecar() { if command -v rsync > /dev/null 2>&1; then rsync -ahP "\$session_dir/logs/" $head_node_ip:$LOG_DIR/ray/$node_i/\$session_name/logs/ 2>/dev/null || true else - cp -r "\$session_dir/logs" $head_node:$LOG_DIR/ray/$node_i/\$session_name/ + cp -r "\$session_dir/logs" $LOG_DIR/ray/$node_i/\$session_name/ fi fi done From 275b212f02d7cb7b34e204b8fdfc5b33f6663012 Mon Sep 17 00:00:00 2001 From: Guyue Huang Date: Wed, 27 Aug 2025 16:01:26 -0700 Subject: [PATCH 3/4] bugfix Signed-off-by: Guyue Huang --- ray.sub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray.sub b/ray.sub index 70c3679253..ad2d3c5f0d 100644 --- a/ray.sub +++ b/ray.sub @@ -283,7 +283,7 @@ log-sync-sidecar() { session_name=\$(basename "\$session_dir") mkdir -p "$LOG_DIR/ray/$node_i/\$session_name" if command -v rsync > /dev/null 2>&1; then - rsync -ahP "\$session_dir/logs/" $head_node_ip:$LOG_DIR/ray/$node_i/\$session_name/logs/ 2>/dev/null || true + rsync -ahP "\$session_dir/logs/" $LOG_DIR/ray/$node_i/\$session_name/logs/ 2>/dev/null || true else cp -r "\$session_dir/logs" $LOG_DIR/ray/$node_i/\$session_name/ fi From 148c7ca21f67010aae2a16dd60deb0eb3c6e8d6c Mon Sep 17 00:00:00 2001 From: Guyue Huang <140554423+guyueh1@users.noreply.github.com> Date: Fri, 29 Aug 2025 09:01:05 -0700 Subject: 
[PATCH 4/4] Update ray.sub Co-authored-by: Terry Kong Signed-off-by: Guyue Huang <140554423+guyueh1@users.noreply.github.com> --- ray.sub | 1 - 1 file changed, 1 deletion(-) diff --git a/ray.sub b/ray.sub index ad2d3c5f0d..ac1eece445 100644 --- a/ray.sub +++ b/ray.sub @@ -273,7 +273,6 @@ log-sync-sidecar() { echo "RAY_LOG_SYNC_FREQUENCY is not set, skipping log sync sidecar" return fi - mkdir -p $LOG_DIR/ray mkdir -p $LOG_DIR/ray/$node_i while true; do sleep $RAY_LOG_SYNC_FREQUENCY