Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 40 additions & 4 deletions ray.sub
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ maybe_gres_arg() {
# Check if any nodes in the partition have GRES configured
# Assumes a homogeneous allocation (not a heterogeneous job)
if sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep -q "gpu:"; then
# Do a quick assert here that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user error if someone isn't using GPUS_PER_NODE=8 on our clusters if it supports --gres=gpu:8.
if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | cut -d: -f2) ]]; then
# Do a quick assert here that gpus:8 == gpus:$GPUS_PER_NODE. It is probably a user error if someone isn't using GPUS_PER_NODE=8 on our clusters if it supports --gres=gpu:8 or gpu:a100:8
if [[ $GPUS_PER_NODE -ne $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:" | awk -F: '{print $NF}') ]]; then
echo "Error: GPUS_PER_NODE=$GPUS_PER_NODE but GRES detected is $(sinfo -p $SLURM_JOB_PARTITION -h -o "%G" | grep "gpu:") meaning GPUS_PER_NODE is not set to fully claim the GPUs on the nodes." >&2
exit 1
fi
Expand Down Expand Up @@ -124,7 +124,43 @@ nodes_array=($nodes)
ip_addresses_array=()

#######################################
# Resolve a node name to an IP address, trying several tools in turn:
# host -> getent hosts -> nslookup -> ping. If none of them yields an
# address, the node name itself is used as a fallback.
# Globals:   ip_address (written) - the resolved address, or the node name
# Arguments: $1 - node hostname to resolve
# Outputs:   [DEBUG]/[WARNING] progress lines on stdout
# Returns:   0 always; every lookup failure is tolerated
#######################################
resolve_node_ip() {
  local node=$1

  echo "[DEBUG] Resolving hostname: $node using enhanced resolution methods"
  ip_address=""

  # Each lookup ends in `|| true`: a failing tool is an expected outcome here
  # and must not abort the script if errexit/pipefail are enabled elsewhere
  # in the file (not visible from this section).

  # Method 1: Try host command
  echo "[DEBUG] Method 1: host command"
  ip_address=$(host "$node" 2>/dev/null | awk '/has address/ { print $4 }' | head -1 || true)
  echo "[DEBUG] host result: '$ip_address'"

  # Method 2: If host fails, try getent
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 2: getent hosts"
    ip_address=$(getent hosts "$node" 2>/dev/null | awk '{ print $1 }' | head -1 || true)
    echo "[DEBUG] getent result: '$ip_address'"
  fi

  # Method 3: If getent fails, try nslookup
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 3: nslookup"
    # nslookup also reports the resolver's own address as "ip#port"; skip any
    # such line so only an answer address can be selected.
    ip_address=$(nslookup "$node" 2>/dev/null | awk '/^Address: / && $2 !~ /#/ { print $2 }' | head -1 || true)
    echo "[DEBUG] nslookup result: '$ip_address'"
  fi

  # Method 4: If all DNS methods fail, try ping to extract IP
  if [[ -z "$ip_address" ]]; then
    echo "[DEBUG] Method 4: ping"
    # The banner looks like "PING host (10.0.0.1) 56(84) bytes of data.".
    # Anchor on the FIRST parenthesised group; a greedy ".*(" would instead
    # capture the trailing "(84)" packet size.
    ip_address=$(ping -c 1 "$node" 2>/dev/null | sed -n 's/^PING [^(]*(\([^)]*\)).*/\1/p' | head -1 || true)
    echo "[DEBUG] ping result: '$ip_address'"
  fi

  # If still no IP, use the hostname itself (might work if it's already an IP or resolvable)
  if [[ -z "$ip_address" ]]; then
    echo "[WARNING] Could not resolve IP for $node, using hostname as fallback"
    ip_address=$node
  fi
}

# $nodes is intentionally unquoted: it is a whitespace-separated node list.
for node in $nodes; do
  resolve_node_ip "$node"

  echo "[INFO] Node: $node -> IP: $ip_address"
  # Add the IP address to the array
  ip_addresses_array+=("$ip_address")
done
Expand Down Expand Up @@ -338,7 +374,7 @@ echo "All workers connected!"
# We can now launch a job on this cluster
# We do so by launching a driver process on the physical node that the head node is on
# This driver process is responsible for launching a job on the Ray cluster
CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID --json | jq -r '.jobs[].current_working_directory')
Comment thread
terrykong marked this conversation as resolved.
CONTAINER_CWD=$(scontrol show job $SLURM_JOB_ID | grep -oP 'WorkDir=\K[^ ]+' | head -1)
if [[ -n "$COMMAND" ]]; then
srun --no-container-mount-home --overlap --container-name=ray-head --container-workdir=$CONTAINER_CWD --nodes=1 --ntasks=1 -w "$head_node" -o $LOG_DIR/ray-driver.log bash -c "$COMMAND"
else
Expand Down