diff --git a/dockerfiles/Dockerfile.sandbox b/dockerfiles/Dockerfile.sandbox index c42d41abbb..653a5c4436 100644 --- a/dockerfiles/Dockerfile.sandbox +++ b/dockerfiles/Dockerfile.sandbox @@ -28,14 +28,16 @@ # -o requirements/sandbox.lock # ============================================================================= -# Use the base image with Python 3.10 and Flask -FROM tiangolo/uwsgi-nginx-flask:python3.10 +FROM python:3.10-slim -# Install dependencies required for Lean 4, pypy3, and other tools +# Install system dependencies: nginx, build tools for pypy3/lean4/block_network ARG TARGETARCH RUN apt-get update && \ - apt-get install -y curl git net-tools bzip2 build-essential libseccomp-dev && \ - ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \ + apt-get install -y nginx curl git net-tools bzip2 build-essential libseccomp-dev && \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* + +# Install PyPy3 +RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \ case "$ARCH" in \ amd64) PYPY_ARCH=linux64 ;; \ arm64|aarch64) PYPY_ARCH=aarch64 ;; \ @@ -46,8 +48,7 @@ RUN apt-get update && \ tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \ ln -s /opt/pypy3.10-v7.3.17-$PYPY_ARCH/bin/pypy3 /usr/bin/pypy3 && \ /usr/bin/pypy3 -m ensurepip && \ - rm /tmp/pypy.tar.bz2 && \ - rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* + rm /tmp/pypy.tar.bz2 # Install Lean 4 toolchain RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \ @@ -151,20 +152,4 @@ RUN chmod +x /start-with-nginx.sh # Environment variables for multi-worker setup ENV NGINX_PORT=6000 -# Set default port for single worker mode -ENV LISTEN_PORT=6000 - -# Default uwsgi configuration -ARG UWSGI_CHEAPER -ENV UWSGI_CHEAPER=$UWSGI_CHEAPER - -ARG NUM_WORKERS -ENV NUM_WORKERS=$NUM_WORKERS - -ARG UWSGI_PROCESSES -ENV UWSGI_PROCESSES=$UWSGI_PROCESSES - -ENV LISTEN_PORT=6000 -RUN echo "uwsgi_read_timeout 14400s;" > /etc/nginx/conf.d/custom_timeout.conf - CMD ["/start-with-nginx.sh"] diff --git a/dockerfiles/sandbox/block_network.c b/dockerfiles/sandbox/block_network.c index 8c41d279ef..016e465bbf 100644 --- a/dockerfiles/sandbox/block_network.c +++ b/dockerfiles/sandbox/block_network.c @@ -45,7 +45,7 @@ int socket(int domain, int type, int protocol) { real_socket = dlsym(RTLD_NEXT, "socket"); } - /* Allow Unix domain sockets (needed for local IPC, uwsgi, etc.) */ + /* Allow Unix domain sockets (needed for local IPC, gunicorn, etc.) */ if (domain == AF_UNIX || domain == AF_LOCAL) { return real_socket(domain, type, protocol); } diff --git a/dockerfiles/sandbox/start-with-nginx.sh b/dockerfiles/sandbox/start-with-nginx.sh index 638b25e5f6..3931135330 100644 --- a/dockerfiles/sandbox/start-with-nginx.sh +++ b/dockerfiles/sandbox/start-with-nginx.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Start nginx load balancer with multiple uwsgi workers +# Start nginx load balancer with multiple gunicorn workers # Uses TCP sockets for workers, supporting both single-node and multi-node deployments. # # Multi-node is auto-detected from SLURM environment variables. @@ -13,20 +13,18 @@ # NGINX_PORT Port nginx listens on (default: 6000, set in Dockerfile) # # Optional — Worker Configuration: -# NUM_WORKERS Number of uWSGI workers per node (default: $(nproc --all)) +# NUM_WORKERS Number of gunicorn instances per node (default: $(nproc --all)) # SANDBOX_WORKER_BASE_PORT # Starting TCP port for workers (default: 50001). Workers # bind to sequential ports: base, base+1, ..., base+N-1. # If a port is already in use, the startup algorithm retries # with offset increments. -# STATEFUL_SANDBOX Set to 1 (default) for stateful mode: each uWSGI worker -# runs a single process to preserve Jupyter kernel sessions +# STATEFUL_SANDBOX Set to 1 (default) for stateful mode: each gunicorn instance +# runs a single worker process to preserve Jupyter kernel sessions # across requests. Set to 0 for stateless mode where -# UWSGI_PROCESSES and UWSGI_CHEAPER take effect. -# UWSGI_PROCESSES uWSGI processes per worker (default: 1). Only used when -# STATEFUL_SANDBOX=0. -# UWSGI_CHEAPER uWSGI cheaper mode: minimum number of active processes -# (default: 1). Only used when STATEFUL_SANDBOX=0. +# GUNICORN_WORKERS takes effect. +# GUNICORN_WORKERS Gunicorn worker processes per instance (default: 1). Only used +# when STATEFUL_SANDBOX=0. # # Optional — Multi-Node (SLURM): # SLURM_JOB_NODELIST SLURM-provided compressed nodelist (e.g., "node[001-016]"). @@ -47,18 +45,37 @@ # NEMO_SKILLS_SANDBOX_BLOCK_NETWORK # Set to 1 to enable network blocking for sandboxed code. # Uses /etc/ld.so.preload to intercept socket() calls in -# all new processes. Applied AFTER nginx/uWSGI start so +# all new processes. Applied AFTER nginx/gunicorn start so # the API remains functional. Note: in any mode, if a # worker crashes the monitoring loop will attempt to restart # it, but the new process will be unable to bind its socket. # The remaining workers continue serving. (default: 0) # +# Deprecated (logged warning if set): +# UWSGI_PROCESSES Use GUNICORN_WORKERS instead. +# UWSGI_CHEAPER No gunicorn equivalent; ignored. +# UWSGI_CPU_AFFINITY No gunicorn equivalent; ignored. +# # ============================================================================= set -e export NUM_WORKERS=${NUM_WORKERS:-$(nproc --all)} +# ============================================================================= +# Deprecation warnings for old uWSGI env vars +# ============================================================================= +if [ -n "$UWSGI_PROCESSES" ]; then + echo "WARNING: UWSGI_PROCESSES is deprecated. Use GUNICORN_WORKERS instead." + : "${GUNICORN_WORKERS:=$UWSGI_PROCESSES}" +fi +if [ -n "$UWSGI_CHEAPER" ]; then + echo "WARNING: UWSGI_CHEAPER is deprecated and has no gunicorn equivalent. Ignoring." +fi +if [ -n "$UWSGI_CPU_AFFINITY" ]; then + echo "WARNING: UWSGI_CPU_AFFINITY is deprecated and has no gunicorn equivalent. Ignoring." +fi + # ============================================================================= # Utility functions # ============================================================================= @@ -111,38 +128,28 @@ print(' '.join(expand_nodelist(sys.argv[1]))) " "$nodelist" 2>/dev/null } -# Start a single uWSGI worker in the background. +# Start a single gunicorn instance in the background. # Args: $1=worker_number $2=port # Prints: "pid:port" start_worker_fast() { local i=$1 local WORKER_PORT=$2 - cat > /tmp/worker${i}_uwsgi.ini << EOF -[uwsgi] -module = main -callable = app -processes = ${UWSGI_PROCESSES} -http-socket = 0.0.0.0:${WORKER_PORT} -vacuum = true -master = true -die-on-term = true -memory-report = true -listen = 100 -http-timeout = 300 -socket-timeout = 300 -disable-logging = false -log-date = true -log-prefix = [worker${i}] -logto = /var/log/worker${i}.log -EOF - - if [ -n "$UWSGI_CHEAPER" ]; then - echo "cheaper = ${UWSGI_CHEAPER}" >> /tmp/worker${i}_uwsgi.ini - fi - > /var/log/worker${i}.log - ( cd /app && env WORKER_NUM=$i uwsgi --ini /tmp/worker${i}_uwsgi.ini ) & + # Redirect stdout/stderr to the log file so the $() subshell that calls + # this function doesn't block waiting for the pipe to close. + # (uWSGI's --logto closed stdout implicitly; gunicorn does not.) + ( cd /app && env WORKER_NUM=$i \ + gunicorn main:app \ + --workers ${GUNICORN_WORKERS} \ + --bind 0.0.0.0:${WORKER_PORT} \ + --timeout 300 \ + --graceful-timeout 30 \ + --backlog 100 \ + --access-logfile /var/log/worker${i}.log \ + --error-logfile /var/log/worker${i}.log \ + --log-level info \ + ) >> /var/log/worker${i}.log 2>&1 & echo "$!:$WORKER_PORT" } @@ -158,6 +165,11 @@ worker_had_port_conflict() { grep -q "Address already in use" /var/log/worker${1}.log 2>/dev/null } +# Check if a TCP port is available (not in use). +port_is_free() { + ! ss -tln "sport = :$1" 2>/dev/null | grep -q LISTEN +} + worker_is_alive() { kill -0 "$1" 2>/dev/null } @@ -342,51 +354,18 @@ UPSTREAM_FILE="/tmp/upstream_servers.conf" echo "[$_H] Workers/node: $NUM_WORKERS | Base port: $SANDBOX_WORKER_BASE_PORT | Nginx: $NGINX_PORT" # ============================================================================= -# uWSGI configuration +# Gunicorn configuration # ============================================================================= : "${STATEFUL_SANDBOX:=1}" if [ "$STATEFUL_SANDBOX" -eq 1 ]; then - UWSGI_PROCESSES=1 - UWSGI_CHEAPER=1 + GUNICORN_WORKERS=1 else - : "${UWSGI_PROCESSES:=1}" - : "${UWSGI_CHEAPER:=1}" + : "${GUNICORN_WORKERS:=1}" fi -export UWSGI_PROCESSES UWSGI_CHEAPER - -echo "UWSGI settings: PROCESSES=$UWSGI_PROCESSES, CHEAPER=$UWSGI_CHEAPER" - -# Validate and fix uwsgi configuration -if [ -z "$UWSGI_PROCESSES" ]; then - UWSGI_PROCESSES=2 -fi - -if [ -z "$UWSGI_CHEAPER" ]; then - UWSGI_CHEAPER=1 -elif [ "$UWSGI_CHEAPER" -le 0 ]; then - echo "WARNING: UWSGI_CHEAPER ($UWSGI_CHEAPER) must be at least 1" - UWSGI_CHEAPER=1 - echo "Setting UWSGI_CHEAPER to $UWSGI_CHEAPER" -elif [ "$UWSGI_CHEAPER" -ge "$UWSGI_PROCESSES" ]; then - echo "WARNING: UWSGI_CHEAPER ($UWSGI_CHEAPER) must be lower than UWSGI_PROCESSES ($UWSGI_PROCESSES)" - if [ "$UWSGI_PROCESSES" -eq 1 ]; then - # For single process, disable cheaper mode entirely - echo "Disabling cheaper mode for single process setup" - UWSGI_CHEAPER="" - else - UWSGI_CHEAPER=$((UWSGI_PROCESSES - 1)) - echo "Setting UWSGI_CHEAPER to $UWSGI_CHEAPER" - fi -fi +export GUNICORN_WORKERS -export UWSGI_PROCESSES -if [ -n "$UWSGI_CHEAPER" ]; then - export UWSGI_CHEAPER - echo "UWSGI config - Processes: $UWSGI_PROCESSES, Cheaper: $UWSGI_CHEAPER" -else - echo "UWSGI config - Processes: $UWSGI_PROCESSES, Cheaper: disabled" -fi +echo "Gunicorn settings: WORKERS=$GUNICORN_WORKERS" # ============================================================================= # Log setup @@ -416,7 +395,7 @@ cleanup() { kill -TERM "$pid" 2>/dev/null || true fi done - pkill -f nginx || true + nginx -s quit 2>/dev/null || kill "$(cat /run/nginx.pid 2>/dev/null)" 2>/dev/null || true [ -n "$HEALTH_CHECK_DIR" ] && rm -rf "$HEALTH_CHECK_DIR" 2>/dev/null || true [ -n "$REMOTE_HEALTH_DIR" ] && rm -rf "$REMOTE_HEALTH_DIR" 2>/dev/null || true exit 0 @@ -432,48 +411,32 @@ for i in $(seq 1 $NUM_WORKERS); do ACTUAL_WORKER_PORTS+=("") done -# Phase 1: Spawn all workers simultaneously +# Phase 1: Pre-check ports and spawn all workers simultaneously. +# Gunicorn retries binding 5 times with 1s sleep (hardcoded in gunicorn/sock.py), +# so a port conflict wastes ~5s per worker. Pre-checking with ss avoids this. echo "[$_H] Starting $NUM_WORKERS workers (ports $SANDBOX_WORKER_BASE_PORT-$((SANDBOX_WORKER_BASE_PORT + NUM_WORKERS - 1)))..." START_SPAWN=$(date +%s) +CONFLICT_COUNT=0 for i in $(seq 1 $NUM_WORKERS); do port=$((SANDBOX_WORKER_BASE_PORT + i - 1)) - result=$(start_worker_fast $i $port) - WORKER_PIDS[$((i - 1))]="${result%%:*}" - ACTUAL_WORKER_PORTS[$((i - 1))]=$port -done - -echo "[$_H] All $NUM_WORKERS workers spawned in $(($(date +%s) - START_SPAWN))s" -# Phase 2: Retry workers that failed due to port conflicts -retry_round=0 -while [ $retry_round -lt $MAX_STARTUP_RETRIES ]; do - sleep 1 - - FAILED_WORKERS=() - for i in $(seq 1 $NUM_WORKERS); do - idx=$((i - 1)) - worker_is_alive "${WORKER_PIDS[$idx]}" && continue - worker_had_port_conflict $i && FAILED_WORKERS+=($i) - done - - [ ${#FAILED_WORKERS[@]} -eq 0 ] && break - - PORT_OFFSET=$(( (retry_round + 1) * PORT_INCREMENT )) - echo "[$_H] Retry $((retry_round + 1)): ${#FAILED_WORKERS[@]} port conflicts, offset +$PORT_OFFSET" - - for i in "${FAILED_WORKERS[@]}"; do - idx=$((i - 1)) - new_port=$((SANDBOX_WORKER_BASE_PORT + i - 1 + PORT_OFFSET)) - result=$(start_worker_fast $i $new_port) - WORKER_PIDS[$idx]="${result%%:*}" - ACTUAL_WORKER_PORTS[$idx]=$new_port + # Try up to MAX_STARTUP_RETRIES offsets if the port is occupied + for retry in $(seq 0 $MAX_STARTUP_RETRIES); do + if port_is_free $port; then + break + fi + CONFLICT_COUNT=$((CONFLICT_COUNT + 1)) + port=$((port + PORT_INCREMENT)) + echo "[$_H] Port conflict on worker $i, trying port $port" done - retry_round=$((retry_round + 1)) + result=$(start_worker_fast $i $port) + WORKER_PIDS[$((i - 1))]="${result%%:*}" + ACTUAL_WORKER_PORTS[$((i - 1))]=$port done -[ $retry_round -ge $MAX_STARTUP_RETRIES ] && echo "WARNING: Max startup retries reached" +echo "[$_H] All $NUM_WORKERS workers spawned in $(($(date +%s) - START_SPAWN))s ($CONFLICT_COUNT port conflicts resolved)" # ============================================================================= # Wait for local workers to be ready (parallel health checks) @@ -623,8 +586,8 @@ fi # Network blocking # ============================================================================= # ld.so.preload intercepts socket() in all NEW exec'd processes. This is safe -# for nginx/uWSGI that are already running. However, if the monitoring loop -# restarts a crashed worker, the new uWSGI process would be unable to bind its +# for nginx/gunicorn that are already running. However, if the monitoring loop +# restarts a crashed worker, the new gunicorn process would be unable to bind its # listening socket. We set NETWORK_BLOCKING_ACTIVE so the monitoring loop can # emit a clear diagnostic when this happens. NETWORK_BLOCKING_ACTIVE=0 @@ -658,7 +621,7 @@ else echo " Proxy: localhost:$NGINX_PORT -> $MASTER_NODE:$NGINX_PORT" echo " Local workers: $NUM_WORKERS (ports ${ACTUAL_WORKER_PORTS[0]}-${ACTUAL_WORKER_PORTS[$((NUM_WORKERS-1))]})" fi -echo " uWSGI: processes=$UWSGI_PROCESSES cheaper=${UWSGI_CHEAPER:-disabled}" +echo " Gunicorn: workers=$GUNICORN_WORKERS" # ============================================================================= # Monitoring loop @@ -695,7 +658,7 @@ while true; do fi done - if ! pgrep nginx > /dev/null; then + if ! kill -0 "$(cat /run/nginx.pid 2>/dev/null)" 2>/dev/null; then echo "[$_H] ERROR: Nginx died unexpectedly" cleanup exit 1 diff --git a/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py b/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py index 4f5fa570cd..974b8a2827 100644 --- a/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py +++ b/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py @@ -76,7 +76,7 @@ # Worker that runs inside the shell process and owns a TerminalInteractiveShell() def shell_worker(conn): # LAYER 2: Python-level socket blocking for IPython sessions - # The shell_worker is forked (not exec'd) from the uWSGI worker, so it does NOT + # The shell_worker is forked (not exec'd) from the gunicorn worker, so it does NOT # get the ld.so.preload library loaded. We must patch Python's socket module directly. # This blocks: socket.socket(), _socket.socket(), requests.get(), urllib, etc. if BLOCK_NETWORK: @@ -101,6 +101,11 @@ def __init__(self, family=-1, type=-1, proto=-1, fileno=None): socket_module.socket = BlockedSocket # Blocks: import socket; socket.socket() shell = TerminalInteractiveShell() + # TerminalInteractiveShell installs a SIGINT handler that calls sys.exit(0) + # instead of raising KeyboardInterrupt when _executing is False (which is + # the case when run_cell is called programmatically). Restore the default + # handler so SIGINT raises KeyboardInterrupt and our except clause catches it. + signal.signal(signal.SIGINT, signal.SIG_DFL) try: while True: try: diff --git a/nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh b/nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh index 03da9b7546..3269bc8f51 100755 --- a/nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh +++ b/nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh @@ -22,7 +22,6 @@ docker build --tag=${SANDBOX_NAME} --build-arg="NUM_WORKERS=$((`nproc --all`))" echo "Multi-worker mode: Starting $((`nproc --all`)) workers with session affinity" docker run --network=host --rm \ --memory=${NEMO_SKILLS_SANDBOX_MEM_LIMIT:-"16g"} \ - ${UWSGI_CPU_AFFINITY:+-e UWSGI_CPU_AFFINITY=${UWSGI_CPU_AFFINITY}} \ - ${UWSGI_PROCESSES:+-e UWSGI_PROCESSES=${UWSGI_PROCESSES}} \ + ${GUNICORN_WORKERS:+-e GUNICORN_WORKERS=${GUNICORN_WORKERS}} \ -v /nemo_run:/nemo_run \ --name=local-sandbox ${SANDBOX_NAME} diff --git a/nemo_skills/dataset/icpc/__init__.py b/nemo_skills/dataset/icpc/__init__.py index 89ac3d5c79..59cee4ad1d 100644 --- a/nemo_skills/dataset/icpc/__init__.py +++ b/nemo_skills/dataset/icpc/__init__.py @@ -22,9 +22,7 @@ # environment variables required by this benchmark SANDBOX_ENV_VARS = [ - "UWSGI_PROCESSES=1024", - "UWSGI_CPU_AFFINITY=8", - "UWSGI_CHEAPER=1023", + "GUNICORN_WORKERS=1024", "NUM_WORKERS=1", "STATEFUL_SANDBOX=0", ] diff --git a/nemo_skills/dataset/ioi/__init__.py b/nemo_skills/dataset/ioi/__init__.py index 2eeaf624d3..1678ac6d20 100644 --- a/nemo_skills/dataset/ioi/__init__.py +++ b/nemo_skills/dataset/ioi/__init__.py @@ -18,9 +18,7 @@ # environment variables required by this benchmark SANDBOX_ENV_VARS = [ - "UWSGI_PROCESSES=1024", - "UWSGI_CPU_AFFINITY=8", - "UWSGI_CHEAPER=1023", + "GUNICORN_WORKERS=1024", "NUM_WORKERS=1", "STATEFUL_SANDBOX=0", ] diff --git a/tests/test_sandbox_network_blocking.py b/tests/test_sandbox_network_blocking.py index e1f981be40..1dd27fcdb7 100644 --- a/tests/test_sandbox_network_blocking.py +++ b/tests/test_sandbox_network_blocking.py @@ -39,6 +39,7 @@ def blocked_sandbox(): """Start a sandbox with network blocking enabled.""" client = docker.from_env() port = get_free_port(strategy="random") + worker_base_port = get_free_port(strategy="random") name = f"sandbox-block-test-{port}" container = client.containers.run( @@ -46,7 +47,12 @@ def blocked_sandbox(): detach=True, name=name, network_mode="host", - environment={"NGINX_PORT": str(port), "NUM_WORKERS": "1", "NEMO_SKILLS_SANDBOX_BLOCK_NETWORK": "1"}, + environment={ + "NGINX_PORT": str(port), + "NUM_WORKERS": "1", + "NEMO_SKILLS_SANDBOX_BLOCK_NETWORK": "1", + "SANDBOX_WORKER_BASE_PORT": str(worker_base_port), + }, ) # Wait for health @@ -165,9 +171,9 @@ async def test_subprocess_env_clear_blocked(self, blocked_sandbox): """LLM tries: subprocess with env={} to clear LD_PRELOAD""" sandbox = LocalSandbox(host="127.0.0.1", port=str(blocked_sandbox)) code = """ -import subprocess +import subprocess, sys code = 'import socket; s=socket.socket(socket.AF_INET, socket.SOCK_STREAM); print("BYPASS_WORKED")' -result = subprocess.run(["python3", "-c", code], env={}, capture_output=True, text=True) +result = subprocess.run([sys.executable, "-c", code], env={}, capture_output=True, text=True) if "BYPASS_WORKED" in result.stdout: print("ENV_CLEAR_BYPASS_WORKED") else: