From fba70680900a14ed86715b15550337a929d6b104 Mon Sep 17 00:00:00 2001 From: chenzp15 Date: Sun, 27 Apr 2025 00:22:04 +0800 Subject: [PATCH 1/2] =?UTF-8?q?fix(train=5Fppo):=20=E4=BD=BF=E7=94=A8=20se?= =?UTF-8?q?tsid=20=E5=92=8C=20kill=20PGID=20=E6=94=B9=E8=BF=9B=E5=90=8E?= =?UTF-8?q?=E5=8F=B0=E6=9C=8D=E5=8A=A1=E5=99=A8=E6=B8=85=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 使用 `setsid` 在独立的进程组中启动 AgentGym 服务器。这确保了由 `conda run` 启动的所有子进程都属于同一个组。 - 修改 `trap EXIT` 清理逻辑,使用 `kill -- -PGID` 杀死整个进程组,而不是仅杀死初始的 `conda run` PID。这样可以更可靠地终止服务器及其所有潜在的子进程。 - 将存储的 ID 从 PID 改为 PGID。 - 移除了之前检查服务器进程是否存在的逻辑,因为 `setsid` 很快退出,该检查不再可靠。 此更改解决了主脚本退出时 AgentGym 服务器可能由于信号未通过 `conda run` 正确传播而无法正确终止的问题。 TODO: 检查 train_grpo.sh 并应用类似的服务端清理逻辑。 --- train_ppo.sh | 54 +++++++++++++++++++--------------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/train_ppo.sh b/train_ppo.sh index c574e580..c34779e8 100644 --- a/train_ppo.sh +++ b/train_ppo.sh @@ -103,7 +103,7 @@ esac # --- Start AgentGym Servers in Dedicated Environment --- TARGET_ENV_NAME="agentenv-${AGENTGYM_ENV_NAME}" -AGENTGYM_PIDS=() # Array to store PIDs +AGENTGYM_PGIDS=() # Array to store PGIDs (changed from PIDS) AGENTGYM_PORTS=() # Array to store ports # Check if target env exists @@ -132,52 +132,38 @@ for (( i=0; i<$NUM_SERVERS; i++ )); do echo "[Server $(($i+1))/$NUM_SERVERS] Launching on ${AGENTGYM_HOST}:${AGENTGYM_PORT}..." echo "[Server $(($i+1))/$NUM_SERVERS] Command: $CURRENT_LAUNCH_CMD" - # Run server in background using conda run + # Run server in background using conda run within a new process group (setsid) LOG_FILE="logs/${TARGET_ENV_NAME}_server_${AGENTGYM_PORT}.log" echo "[Server $(($i+1))/$NUM_SERVERS] Logging to $LOG_FILE" - # Use bash -c to handle potential env vars in launch cmd (like for sqlgym) - conda run --no-capture-output -n "$TARGET_ENV_NAME" bash -c "$CURRENT_LAUNCH_CMD" > "$LOG_FILE" 2>&1 & - PID=$! + # Use setsid to ensure the server runs in its own process group + setsid conda run --no-capture-output -n "$TARGET_ENV_NAME" bash -c "$CURRENT_LAUNCH_CMD" > "$LOG_FILE" 2>&1 & + PGID=$! # PID of setsid becomes the Process Group ID - # Check if PID was obtained - if [ -z "$PID" ]; then - echo "[Error] Failed to get PID for AgentGym server instance $i on port $AGENTGYM_PORT." + # Check if PGID was obtained + if [ -z "$PGID" ]; then + echo "[Error] Failed to get PGID for AgentGym server instance $i on port $AGENTGYM_PORT." # Attempt to kill already launched servers before exiting - for p in "${AGENTGYM_PIDS[@]}"; do kill $p 2>/dev/null; done + for pgid in "${AGENTGYM_PGIDS[@]}"; do kill -- -$pgid 2>/dev/null; done # Kill process group exit 1 fi - AGENTGYM_PIDS+=($PID) # Store PID - echo "[Server $(($i+1))/$NUM_SERVERS] Launched (PID: $PID)." + AGENTGYM_PGIDS+=($PGID) # Store PGID + echo "[Server $(($i+1))/$NUM_SERVERS] Launched (PGID: $PGID)." sleep 2 # Small delay between starting servers done # --- Wait and Check Servers --- -echo "[Server] Waiting for AgentGym servers (${AGENTGYM_PIDS[*]}) to initialize..." -sleep 15 # Adjust sleep time if needed - -# Check if all server processes are still running -ALL_SERVERS_RUNNING=true -for PID in "${AGENTGYM_PIDS[@]}"; do - if ! kill -0 $PID > /dev/null 2>&1; then - echo "[Error] AgentGym server (PID: $PID) failed to start or exited prematurely." - # Attempt to find the corresponding log file (this is a bit heuristic) - PORT=$(grep -oP -- "--port\\s+\\K\\d+" "logs/"*"${PID}"* 2>/dev/null || echo "unknown") - echo "[Error] Check server log potentially named logs/${TARGET_ENV_NAME}_server_${PORT}.log or similar." - ALL_SERVERS_RUNNING=false - fi -done +echo "[Server] Waiting a moment for AgentGym servers (${AGENTGYM_PGIDS[*]}) to initialize..." +sleep 5 # Reduced sleep, as the check is removed + +# Removed the unreliable check based on PID (setsid exits quickly) +# The trap mechanism below is the primary way to ensure cleanup. +echo "[Server] Assuming servers are starting... Cleanup will occur on script exit." -if [ "$ALL_SERVERS_RUNNING" = false ]; then - echo "[Error] Not all servers started successfully. Exiting." - # Kill remaining servers - for p in "${AGENTGYM_PIDS[@]}"; do kill $p 2>/dev/null; done - exit 1 -fi -echo "[Server] All AgentGym servers appear to be running." -# Setup trap to kill all server processes on script exit/interrupt -trap "echo '[Cleanup] Stopping AgentGym servers (PIDs: ${AGENTGYM_PIDS[*]})...'; kill ${AGENTGYM_PIDS[*]} 2>/dev/null || echo '[Cleanup] Servers already stopped.'; wait ${AGENTGYM_PIDS[*]} 2>/dev/null" EXIT +# Setup trap to kill all server process groups on script exit/interrupt +# Note the use of kill -- -$pgid to target the entire process group +trap "echo '[Cleanup] Stopping AgentGym server process groups (PGIDs: ${AGENTGYM_PGIDS[*]})...'; CLEANUP_PGIDS=(${AGENTGYM_PGIDS[*]}); for pgid in \${CLEANUP_PGIDS[@]}; do echo '[Cleanup] Killing process group -\$pgid'; kill -- -\$pgid 2>/dev/null; done; wait 2>/dev/null; echo '[Cleanup] Done.'" EXIT # --- Data and Experiment Naming --- export DATA_DIR=${DATA_DIR_OVERRIDE:-"data/$AGENTGYM_ENV_NAME"} # Default data dir based on env name From 62f325e43ec0b99df3a3880d4fe4808dac3e7cab Mon Sep 17 00:00:00 2001 From: chenzp15 Date: Sun, 27 Apr 2025 00:45:09 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix(train=5Fppo):=20=E6=94=B9=E8=BF=9B=20Ag?= =?UTF-8?q?entGym=20=E6=9C=8D=E5=8A=A1=E5=99=A8=E7=AE=A1=E7=90=86=E5=92=8C?= =?UTF-8?q?=E5=90=AF=E5=8A=A8=E6=A3=80=E6=9F=A5=20-=20**=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E5=9F=BA=E4=BA=8E=E7=BD=91=E7=BB=9C=E7=AB=AF=E5=8F=A3=20(`nc`)?= =?UTF-8?q?=20=E7=9A=84=E6=9C=8D=E5=8A=A1=E5=99=A8=E5=90=AF=E5=8A=A8?= =?UTF-8?q?=E6=A3=80=E6=9F=A5=E9=80=BB=E8=BE=91=EF=BC=8C=E6=9B=BF=E4=BB=A3?= =?UTF-8?q?=E5=8E=9F=E5=85=88=E4=B8=8D=E5=8F=AF=E9=9D=A0=E7=9A=84=20PID=20?= =?UTF-8?q?=E6=A3=80=E6=9F=A5=E3=80=82**=20-=20=E5=AE=9E=E7=8E=B0=E7=AB=AF?= =?UTF-8?q?=E5=8F=A3=E6=A3=80=E6=9F=A5=E7=9A=84=E9=87=8D=E8=AF=95=E6=9C=BA?= =?UTF-8?q?=E5=88=B6=EF=BC=8C=E5=B9=B6=E5=9C=A8=E6=A3=80=E6=9F=A5=E5=A4=B1?= =?UTF-8?q?=E8=B4=A5=E6=97=B6=E8=A7=A6=E5=8F=91=E6=B8=85=E7=90=86=E5=B9=B6?= =?UTF-8?q?=E9=80=80=E5=87=BA=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- train_ppo.sh | 63 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 5 deletions(-) diff --git a/train_ppo.sh b/train_ppo.sh index c34779e8..9c8a3415 100644 --- a/train_ppo.sh +++ b/train_ppo.sh @@ -153,12 +153,65 @@ for (( i=0; i<$NUM_SERVERS; i++ )); do done # --- Wait and Check Servers --- -echo "[Server] Waiting a moment for AgentGym servers (${AGENTGYM_PGIDS[*]}) to initialize..." -sleep 5 # Reduced sleep, as the check is removed +echo "[Server] Checking if AgentGym servers (${AGENTGYM_PORTS[*]}) are responsive..." +ALL_SERVERS_RUNNING=true +MAX_RETRIES=5 # Number of times to check each server +RETRY_DELAY=3 # Seconds to wait between retries +CONNECT_TIMEOUT=1 # Seconds for nc connection timeout + +for (( i=0; i<${#AGENTGYM_PORTS[@]}; i++ )); do + PORT=${AGENTGYM_PORTS[i]} + PGID=${AGENTGYM_PGIDS[i]} # Corresponding PGID for logging/debug + LOG_FILE="logs/${TARGET_ENV_NAME}_server_${PORT}.log" + SERVER_UP=false + + # Determine host to check (use localhost if host is 0.0.0.0) + CHECK_HOST=$AGENTGYM_HOST + if [ "$CHECK_HOST" == "0.0.0.0" ]; then + CHECK_HOST="127.0.0.1" + fi + + echo "[Server Check] Checking server on ${CHECK_HOST}:${PORT} (PGID: $PGID)..." + for (( attempt=1; attempt<=$MAX_RETRIES; attempt++ )); do + # Use netcat (nc) to check if port is open. -z: zero-I/O mode, -w: timeout + # Redirect errors to /dev/null to avoid clutter + if nc -z -w $CONNECT_TIMEOUT "$CHECK_HOST" "$PORT" > /dev/null 2>&1; then + echo "[Server Check] Server on port $PORT is responsive." + SERVER_UP=true + break # Exit retry loop for this server + else + if [ $attempt -lt $MAX_RETRIES ]; then + echo "[Server Check] Server on port $PORT not responsive (Attempt $attempt/$MAX_RETRIES). Retrying in $RETRY_DELAY seconds..." + sleep $RETRY_DELAY + else + echo "[Error] Server on port $PORT (PGID: $PGID) failed to respond after $MAX_RETRIES attempts." + echo "[Error] Check server log for details: $LOG_FILE" + fi + fi + done + + if [ "$SERVER_UP" = false ]; then + ALL_SERVERS_RUNNING=false + # No need to check remaining servers if one failed + break + fi +done + +if [ "$ALL_SERVERS_RUNNING" = false ]; then + echo "[Error] Not all AgentGym servers started successfully or became responsive. Initiating cleanup..." + # Manually trigger cleanup for potentially started PGIDs before exiting + # We duplicate part of the trap logic here for immediate cleanup on check failure + CLEANUP_PGIDS_ON_FAIL=(${AGENTGYM_PGIDS[*]}); + for pgid_fail in "${CLEANUP_PGIDS_ON_FAIL[@]}"; do + echo "[Cleanup] Killing process group -$pgid_fail due to failed startup check." + kill -- -$pgid_fail 2>/dev/null; + done + wait 2>/dev/null # Give kill commands a moment + echo "[Error] Exiting script due to server startup failure." + exit 1 # Exit with error code +fi -# Removed the unreliable check based on PID (setsid exits quickly) -# The trap mechanism below is the primary way to ensure cleanup. -echo "[Server] Assuming servers are starting... Cleanup will occur on script exit." +echo "[Server] All AgentGym servers appear to be responsive and running." # Setup trap to kill all server process groups on script exit/interrupt