Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 8 additions & 23 deletions dockerfiles/Dockerfile.sandbox
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,16 @@
# -o requirements/sandbox.lock
# =============================================================================

# Use the base image with Python 3.10 and Flask
FROM tiangolo/uwsgi-nginx-flask:python3.10
FROM python:3.10-slim

# Install dependencies required for Lean 4, pypy3, and other tools
# Install system dependencies: nginx, build tools for pypy3/lean4/block_network
ARG TARGETARCH
RUN apt-get update && \
apt-get install -y curl git net-tools bzip2 build-essential libseccomp-dev && \
ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \
apt-get install -y nginx curl git net-tools bzip2 build-essential libseccomp-dev && \
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

# Install PyPy3
RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \
case "$ARCH" in \
amd64) PYPY_ARCH=linux64 ;; \
arm64|aarch64) PYPY_ARCH=aarch64 ;; \
Expand All @@ -46,8 +48,7 @@ RUN apt-get update && \
tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \
ln -s /opt/pypy3.10-v7.3.17-$PYPY_ARCH/bin/pypy3 /usr/bin/pypy3 && \
/usr/bin/pypy3 -m ensurepip && \
rm /tmp/pypy.tar.bz2 && \
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
rm /tmp/pypy.tar.bz2

# Install Lean 4 toolchain
RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \
Expand Down Expand Up @@ -151,20 +152,4 @@ RUN chmod +x /start-with-nginx.sh
# Environment variables for multi-worker setup
ENV NGINX_PORT=6000

# Set default port for single worker mode
ENV LISTEN_PORT=6000

# Default uwsgi configuration
ARG UWSGI_CHEAPER
ENV UWSGI_CHEAPER=$UWSGI_CHEAPER

ARG NUM_WORKERS
ENV NUM_WORKERS=$NUM_WORKERS

ARG UWSGI_PROCESSES
ENV UWSGI_PROCESSES=$UWSGI_PROCESSES

ENV LISTEN_PORT=6000
RUN echo "uwsgi_read_timeout 14400s;" > /etc/nginx/conf.d/custom_timeout.conf

CMD ["/start-with-nginx.sh"]
2 changes: 1 addition & 1 deletion dockerfiles/sandbox/block_network.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ int socket(int domain, int type, int protocol) {
real_socket = dlsym(RTLD_NEXT, "socket");
}

/* Allow Unix domain sockets (needed for local IPC, uwsgi, etc.) */
/* Allow Unix domain sockets (needed for local IPC, gunicorn, etc.) */
if (domain == AF_UNIX || domain == AF_LOCAL) {
return real_socket(domain, type, protocol);
}
Expand Down
183 changes: 73 additions & 110 deletions dockerfiles/sandbox/start-with-nginx.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Start nginx load balancer with multiple uwsgi workers
# Start nginx load balancer with multiple gunicorn workers
# Uses TCP sockets for workers, supporting both single-node and multi-node deployments.
#
# Multi-node is auto-detected from SLURM environment variables.
Expand All @@ -13,20 +13,18 @@
# NGINX_PORT Port nginx listens on (default: 6000, set in Dockerfile)
#
# Optional — Worker Configuration:
# NUM_WORKERS Number of uWSGI workers per node (default: $(nproc --all))
# NUM_WORKERS Number of gunicorn instances per node (default: $(nproc --all))
# SANDBOX_WORKER_BASE_PORT
# Starting TCP port for workers (default: 50001). Workers
# bind to sequential ports: base, base+1, ..., base+N-1.
# If a port is already in use, the startup algorithm retries
# with offset increments.
# STATEFUL_SANDBOX Set to 1 (default) for stateful mode: each uWSGI worker
# runs a single process to preserve Jupyter kernel sessions
# STATEFUL_SANDBOX Set to 1 (default) for stateful mode: each gunicorn instance
# runs a single worker process to preserve Jupyter kernel sessions
# across requests. Set to 0 for stateless mode where
# UWSGI_PROCESSES and UWSGI_CHEAPER take effect.
# UWSGI_PROCESSES uWSGI processes per worker (default: 1). Only used when
# STATEFUL_SANDBOX=0.
# UWSGI_CHEAPER uWSGI cheaper mode: minimum number of active processes
# (default: 1). Only used when STATEFUL_SANDBOX=0.
# GUNICORN_WORKERS takes effect.
# GUNICORN_WORKERS Gunicorn worker processes per instance (default: 1). Only used
# when STATEFUL_SANDBOX=0.
#
# Optional — Multi-Node (SLURM):
# SLURM_JOB_NODELIST SLURM-provided compressed nodelist (e.g., "node[001-016]").
Expand All @@ -47,18 +45,37 @@
# NEMO_SKILLS_SANDBOX_BLOCK_NETWORK
# Set to 1 to enable network blocking for sandboxed code.
# Uses /etc/ld.so.preload to intercept socket() calls in
# all new processes. Applied AFTER nginx/uWSGI start so
# all new processes. Applied AFTER nginx/gunicorn start so
# the API remains functional. Note: in any mode, if a
# worker crashes the monitoring loop will attempt to restart
# it, but the new process will be unable to bind its socket.
# The remaining workers continue serving. (default: 0)
#
# Deprecated (logged warning if set):
# UWSGI_PROCESSES Use GUNICORN_WORKERS instead.
# UWSGI_CHEAPER No gunicorn equivalent; ignored.
# UWSGI_CPU_AFFINITY No gunicorn equivalent; ignored.
#
# =============================================================================

set -e

# Default the per-node worker count to the number of available CPUs.
NUM_WORKERS=${NUM_WORKERS:-$(nproc --all)}
export NUM_WORKERS

# =============================================================================
# Deprecation warnings for old uWSGI env vars
# =============================================================================
# UWSGI_PROCESSES maps onto GUNICORN_WORKERS; the other two have no
# gunicorn equivalent and are only reported, never acted upon.
if [ -n "${UWSGI_PROCESSES:-}" ]; then
    echo "WARNING: UWSGI_PROCESSES is deprecated. Use GUNICORN_WORKERS instead."
    : "${GUNICORN_WORKERS:=$UWSGI_PROCESSES}"
fi
if [ -n "${UWSGI_CHEAPER:-}" ]; then
    echo "WARNING: UWSGI_CHEAPER is deprecated and has no gunicorn equivalent. Ignoring."
fi
if [ -n "${UWSGI_CPU_AFFINITY:-}" ]; then
    echo "WARNING: UWSGI_CPU_AFFINITY is deprecated and has no gunicorn equivalent. Ignoring."
fi

# =============================================================================
# Utility functions
# =============================================================================
Expand Down Expand Up @@ -111,38 +128,28 @@ print(' '.join(expand_nodelist(sys.argv[1])))
" "$nodelist" 2>/dev/null
}

# Start a single uWSGI worker in the background.
# Start a single gunicorn instance in the background.
# Args: $1=worker_number $2=port
# Prints: "pid:port"
# Start a single gunicorn instance in the background.
# Args:    $1=worker_number $2=port
# Outputs: prints "pid:port" on stdout for the caller to parse.
start_worker_fast() {
    local num=$1
    local port=$2
    local log_file="/var/log/worker${num}.log"

    # Truncate any stale log left over from a previous attempt on this slot.
    : > "$log_file"

    # Stdout/stderr must be redirected to the log file so the $() subshell
    # that calls this function doesn't block waiting for the pipe to close.
    # (uWSGI's --logto closed stdout implicitly; gunicorn does not.)
    (
        cd /app && env WORKER_NUM=$num \
            gunicorn main:app \
            --workers ${GUNICORN_WORKERS} \
            --bind 0.0.0.0:${port} \
            --timeout 300 \
            --graceful-timeout 30 \
            --backlog 100 \
            --access-logfile "$log_file" \
            --error-logfile "$log_file" \
            --log-level info
    ) >> "$log_file" 2>&1 &
    echo "$!:$port"
}

Expand All @@ -158,6 +165,11 @@ worker_had_port_conflict() {
grep -q "Address already in use" /var/log/worker${1}.log 2>/dev/null
}

# Return success when TCP port $1 has no listening socket bound to it.
# Uses ss's socket filter to restrict output to the given source port; the
# grep distinguishes "header only" output from an actual LISTEN entry.
port_is_free() {
    local port=$1
    if ss -tln "sport = :${port}" 2>/dev/null | grep -q LISTEN; then
        return 1
    fi
    return 0
}

# Return success when PID $1 refers to a live process.
# kill -0 sends no signal; it only probes for existence/permission.
worker_is_alive() {
    if kill -0 "$1" 2>/dev/null; then
        return 0
    fi
    return 1
}
Expand Down Expand Up @@ -342,51 +354,18 @@ UPSTREAM_FILE="/tmp/upstream_servers.conf"
echo "[$_H] Workers/node: $NUM_WORKERS | Base port: $SANDBOX_WORKER_BASE_PORT | Nginx: $NGINX_PORT"

# =============================================================================
# uWSGI configuration
# Gunicorn configuration
# =============================================================================
# STATEFUL_SANDBOX=1 (the default) pins each gunicorn instance to exactly one
# worker process so Jupyter kernel sessions survive across requests routed to
# the same instance. Stateless mode honors a caller-supplied GUNICORN_WORKERS.
: "${STATEFUL_SANDBOX:=1}"
if [ "$STATEFUL_SANDBOX" -ne 1 ]; then
    # Stateless: respect an explicit worker count, defaulting to 1.
    : "${GUNICORN_WORKERS:=1}"
else
    # Stateful: force a single worker per instance regardless of env.
    GUNICORN_WORKERS=1
fi

export UWSGI_PROCESSES UWSGI_CHEAPER

echo "UWSGI settings: PROCESSES=$UWSGI_PROCESSES, CHEAPER=$UWSGI_CHEAPER"

# Validate and fix uwsgi configuration
if [ -z "$UWSGI_PROCESSES" ]; then
UWSGI_PROCESSES=2
fi

if [ -z "$UWSGI_CHEAPER" ]; then
UWSGI_CHEAPER=1
elif [ "$UWSGI_CHEAPER" -le 0 ]; then
echo "WARNING: UWSGI_CHEAPER ($UWSGI_CHEAPER) must be at least 1"
UWSGI_CHEAPER=1
echo "Setting UWSGI_CHEAPER to $UWSGI_CHEAPER"
elif [ "$UWSGI_CHEAPER" -ge "$UWSGI_PROCESSES" ]; then
echo "WARNING: UWSGI_CHEAPER ($UWSGI_CHEAPER) must be lower than UWSGI_PROCESSES ($UWSGI_PROCESSES)"
if [ "$UWSGI_PROCESSES" -eq 1 ]; then
# For single process, disable cheaper mode entirely
echo "Disabling cheaper mode for single process setup"
UWSGI_CHEAPER=""
else
UWSGI_CHEAPER=$((UWSGI_PROCESSES - 1))
echo "Setting UWSGI_CHEAPER to $UWSGI_CHEAPER"
fi
fi
export GUNICORN_WORKERS

export UWSGI_PROCESSES
if [ -n "$UWSGI_CHEAPER" ]; then
export UWSGI_CHEAPER
echo "UWSGI config - Processes: $UWSGI_PROCESSES, Cheaper: $UWSGI_CHEAPER"
else
echo "UWSGI config - Processes: $UWSGI_PROCESSES, Cheaper: disabled"
fi
echo "Gunicorn settings: WORKERS=$GUNICORN_WORKERS"

# =============================================================================
# Log setup
Expand Down Expand Up @@ -416,7 +395,7 @@ cleanup() {
kill -TERM "$pid" 2>/dev/null || true
fi
done
pkill -f nginx || true
nginx -s quit 2>/dev/null || kill "$(cat /run/nginx.pid 2>/dev/null)" 2>/dev/null || true
[ -n "$HEALTH_CHECK_DIR" ] && rm -rf "$HEALTH_CHECK_DIR" 2>/dev/null || true
[ -n "$REMOTE_HEALTH_DIR" ] && rm -rf "$REMOTE_HEALTH_DIR" 2>/dev/null || true
exit 0
Expand All @@ -432,48 +411,32 @@ for i in $(seq 1 $NUM_WORKERS); do
ACTUAL_WORKER_PORTS+=("")
done

# Phase 1: Spawn all workers simultaneously
# Phase 1: Pre-check ports and spawn all workers simultaneously.
# Gunicorn retries binding 5 times with 1s sleep (hardcoded in gunicorn/sock.py),
# so a port conflict wastes ~5s per worker. Pre-checking with ss avoids this.
echo "[$_H] Starting $NUM_WORKERS workers (ports $SANDBOX_WORKER_BASE_PORT-$((SANDBOX_WORKER_BASE_PORT + NUM_WORKERS - 1)))..."
START_SPAWN=$(date +%s)
CONFLICT_COUNT=0

for i in $(seq 1 $NUM_WORKERS); do
port=$((SANDBOX_WORKER_BASE_PORT + i - 1))
result=$(start_worker_fast $i $port)
WORKER_PIDS[$((i - 1))]="${result%%:*}"
ACTUAL_WORKER_PORTS[$((i - 1))]=$port
done

echo "[$_H] All $NUM_WORKERS workers spawned in $(($(date +%s) - START_SPAWN))s"

# Phase 2: Retry workers that failed due to port conflicts
retry_round=0
while [ $retry_round -lt $MAX_STARTUP_RETRIES ]; do
sleep 1

FAILED_WORKERS=()
for i in $(seq 1 $NUM_WORKERS); do
idx=$((i - 1))
worker_is_alive "${WORKER_PIDS[$idx]}" && continue
worker_had_port_conflict $i && FAILED_WORKERS+=($i)
done

[ ${#FAILED_WORKERS[@]} -eq 0 ] && break

PORT_OFFSET=$(( (retry_round + 1) * PORT_INCREMENT ))
echo "[$_H] Retry $((retry_round + 1)): ${#FAILED_WORKERS[@]} port conflicts, offset +$PORT_OFFSET"

for i in "${FAILED_WORKERS[@]}"; do
idx=$((i - 1))
new_port=$((SANDBOX_WORKER_BASE_PORT + i - 1 + PORT_OFFSET))
result=$(start_worker_fast $i $new_port)
WORKER_PIDS[$idx]="${result%%:*}"
ACTUAL_WORKER_PORTS[$idx]=$new_port
# Try up to MAX_STARTUP_RETRIES offsets if the port is occupied
for retry in $(seq 0 $MAX_STARTUP_RETRIES); do
if port_is_free $port; then
break
fi
CONFLICT_COUNT=$((CONFLICT_COUNT + 1))
port=$((port + PORT_INCREMENT))
Comment on lines +424 to +430
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

potential race condition between port_is_free check and start_worker_fast binding. if another process binds to the port in this gap, gunicorn will still retry 5 times (~5s delay). the old approach with post-spawn retry handled this by checking logs after the process started. consider checking if worker actually started successfully on the expected port.

Suggested change
# Try up to MAX_STARTUP_RETRIES offsets if the port is occupied
for retry in $(seq 0 $MAX_STARTUP_RETRIES); do
if port_is_free $port; then
break
fi
CONFLICT_COUNT=$((CONFLICT_COUNT + 1))
port=$((port + PORT_INCREMENT))
# Try up to MAX_STARTUP_RETRIES offsets if the port is occupied
for retry in $(seq 0 $MAX_STARTUP_RETRIES); do
if port_is_free $port; then
result=$(start_worker_fast $i $port)
WORKER_PIDS[$((i - 1))]="${result%%:*}"
ACTUAL_WORKER_PORTS[$((i - 1))]=$port
break
fi
CONFLICT_COUNT=$((CONFLICT_COUNT + 1))
port=$((port + PORT_INCREMENT))
echo "[$_H] Port conflict on worker $i, trying port $port"
done

echo "[$_H] Port conflict on worker $i, trying port $port"
done

retry_round=$((retry_round + 1))
result=$(start_worker_fast $i $port)
WORKER_PIDS[$((i - 1))]="${result%%:*}"
ACTUAL_WORKER_PORTS[$((i - 1))]=$port
done
Comment on lines +414 to 437
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Port conflict resolution loop has no exhaustion guard.

If port_is_free never succeeds (all candidate ports occupied), the loop silently falls through and start_worker_fast is called on a busy port. After MAX_STARTUP_RETRIES offset increments, consider failing explicitly or logging a clear warning that the worker will likely fail to bind.

Also, the retry variable on line 425 is unused (ShellCheck SC2034) — use _ as the loop variable.

Proposed fix
-    for retry in $(seq 0 $MAX_STARTUP_RETRIES); do
+    for _ in $(seq 0 $MAX_STARTUP_RETRIES); do
         if port_is_free $port; then
             break
         fi
         CONFLICT_COUNT=$((CONFLICT_COUNT + 1))
         port=$((port + PORT_INCREMENT))
         echo "[$_H] Port conflict on worker $i, trying port $port"
     done
+    if ! port_is_free $port; then
+        echo "[$_H] WARNING: Could not find a free port for worker $i after $MAX_STARTUP_RETRIES retries (last tried $port)"
+    fi
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Phase 1: Pre-check ports and spawn all workers simultaneously.
# Gunicorn retries binding 5 times with 1s sleep (hardcoded in gunicorn/sock.py),
# so a port conflict wastes ~5s per worker. Pre-checking with ss avoids this.
echo "[$_H] Starting $NUM_WORKERS workers (ports $SANDBOX_WORKER_BASE_PORT-$((SANDBOX_WORKER_BASE_PORT + NUM_WORKERS - 1)))..."
START_SPAWN=$(date +%s)
CONFLICT_COUNT=0
for i in $(seq 1 $NUM_WORKERS); do
port=$((SANDBOX_WORKER_BASE_PORT + i - 1))
result=$(start_worker_fast $i $port)
WORKER_PIDS[$((i - 1))]="${result%%:*}"
ACTUAL_WORKER_PORTS[$((i - 1))]=$port
done
echo "[$_H] All $NUM_WORKERS workers spawned in $(($(date +%s) - START_SPAWN))s"
# Phase 2: Retry workers that failed due to port conflicts
retry_round=0
while [ $retry_round -lt $MAX_STARTUP_RETRIES ]; do
sleep 1
FAILED_WORKERS=()
for i in $(seq 1 $NUM_WORKERS); do
idx=$((i - 1))
worker_is_alive "${WORKER_PIDS[$idx]}" && continue
worker_had_port_conflict $i && FAILED_WORKERS+=($i)
done
[ ${#FAILED_WORKERS[@]} -eq 0 ] && break
PORT_OFFSET=$(( (retry_round + 1) * PORT_INCREMENT ))
echo "[$_H] Retry $((retry_round + 1)): ${#FAILED_WORKERS[@]} port conflicts, offset +$PORT_OFFSET"
for i in "${FAILED_WORKERS[@]}"; do
idx=$((i - 1))
new_port=$((SANDBOX_WORKER_BASE_PORT + i - 1 + PORT_OFFSET))
result=$(start_worker_fast $i $new_port)
WORKER_PIDS[$idx]="${result%%:*}"
ACTUAL_WORKER_PORTS[$idx]=$new_port
# Try up to MAX_STARTUP_RETRIES offsets if the port is occupied
for retry in $(seq 0 $MAX_STARTUP_RETRIES); do
if port_is_free $port; then
break
fi
CONFLICT_COUNT=$((CONFLICT_COUNT + 1))
port=$((port + PORT_INCREMENT))
echo "[$_H] Port conflict on worker $i, trying port $port"
done
retry_round=$((retry_round + 1))
result=$(start_worker_fast $i $port)
WORKER_PIDS[$((i - 1))]="${result%%:*}"
ACTUAL_WORKER_PORTS[$((i - 1))]=$port
done
# Phase 1: Pre-check ports and spawn all workers simultaneously.
# Gunicorn retries binding 5 times with 1s sleep (hardcoded in gunicorn/sock.py),
# so a port conflict wastes ~5s per worker. Pre-checking with ss avoids this.
echo "[$_H] Starting $NUM_WORKERS workers (ports $SANDBOX_WORKER_BASE_PORT-$((SANDBOX_WORKER_BASE_PORT + NUM_WORKERS - 1)))..."
START_SPAWN=$(date +%s)
CONFLICT_COUNT=0
for i in $(seq 1 $NUM_WORKERS); do
port=$((SANDBOX_WORKER_BASE_PORT + i - 1))
# Try up to MAX_STARTUP_RETRIES offsets if the port is occupied
for _ in $(seq 0 $MAX_STARTUP_RETRIES); do
if port_is_free $port; then
break
fi
CONFLICT_COUNT=$((CONFLICT_COUNT + 1))
port=$((port + PORT_INCREMENT))
echo "[$_H] Port conflict on worker $i, trying port $port"
done
if ! port_is_free $port; then
echo "[$_H] WARNING: Could not find a free port for worker $i after $MAX_STARTUP_RETRIES retries (last tried $port)"
fi
result=$(start_worker_fast $i $port)
WORKER_PIDS[$((i - 1))]="${result%%:*}"
ACTUAL_WORKER_PORTS[$((i - 1))]=$port
done
🧰 Tools
🪛 Shellcheck (0.11.0)

[warning] 425-425: retry appears unused. Verify use (or export if used externally).

(SC2034)

🤖 Prompt for AI Agents
In `@dockerfiles/sandbox/start-with-nginx.sh` around lines 414 - 437, The
port-conflict loop may exhaust all retries and still call start_worker_fast on
an occupied port; update the loop that uses port_is_free and MAX_STARTUP_RETRIES
to use a throwaway loop variable (_) instead of retry, and after the loop add an
explicit exhaustion check: if no free port was found within MAX_STARTUP_RETRIES
increments, log a clear error/warning including the worker index and attempted
port(s), increment CONFLICT_COUNT as needed, and either skip starting that
worker (set WORKER_PIDS[...] empty and ACTUAL_WORKER_PORTS[...] to an
empty/invalid marker) or exit with a nonzero status so failure is visible; keep
references to start_worker_fast, WORKER_PIDS, ACTUAL_WORKER_PORTS,
PORT_INCREMENT and CONFLICT_COUNT when implementing the handling.


[ $retry_round -ge $MAX_STARTUP_RETRIES ] && echo "WARNING: Max startup retries reached"
echo "[$_H] All $NUM_WORKERS workers spawned in $(($(date +%s) - START_SPAWN))s ($CONFLICT_COUNT port conflicts resolved)"

# =============================================================================
# Wait for local workers to be ready (parallel health checks)
Expand Down Expand Up @@ -623,8 +586,8 @@ fi
# Network blocking
# =============================================================================
# ld.so.preload intercepts socket() in all NEW exec'd processes. This is safe
# for nginx/uWSGI that are already running. However, if the monitoring loop
# restarts a crashed worker, the new uWSGI process would be unable to bind its
# for nginx/gunicorn that are already running. However, if the monitoring loop
# restarts a crashed worker, the new gunicorn process would be unable to bind its
# listening socket. We set NETWORK_BLOCKING_ACTIVE so the monitoring loop can
# emit a clear diagnostic when this happens.
NETWORK_BLOCKING_ACTIVE=0
Expand Down Expand Up @@ -658,7 +621,7 @@ else
echo " Proxy: localhost:$NGINX_PORT -> $MASTER_NODE:$NGINX_PORT"
echo " Local workers: $NUM_WORKERS (ports ${ACTUAL_WORKER_PORTS[0]}-${ACTUAL_WORKER_PORTS[$((NUM_WORKERS-1))]})"
fi
echo " uWSGI: processes=$UWSGI_PROCESSES cheaper=${UWSGI_CHEAPER:-disabled}"
echo " Gunicorn: workers=$GUNICORN_WORKERS"

# =============================================================================
# Monitoring loop
Expand Down Expand Up @@ -695,7 +658,7 @@ while true; do
fi
done

if ! pgrep nginx > /dev/null; then
if ! kill -0 "$(cat /run/nginx.pid 2>/dev/null)" 2>/dev/null; then
echo "[$_H] ERROR: Nginx died unexpectedly"
cleanup
exit 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
# Worker that runs inside the shell process and owns a TerminalInteractiveShell()
def shell_worker(conn):
# LAYER 2: Python-level socket blocking for IPython sessions
# The shell_worker is forked (not exec'd) from the uWSGI worker, so it does NOT
# The shell_worker is forked (not exec'd) from the gunicorn worker, so it does NOT
# get the ld.so.preload library loaded. We must patch Python's socket module directly.
# This blocks: socket.socket(), _socket.socket(), requests.get(), urllib, etc.
if BLOCK_NETWORK:
Expand All @@ -101,6 +101,11 @@ def __init__(self, family=-1, type=-1, proto=-1, fileno=None):
socket_module.socket = BlockedSocket # Blocks: import socket; socket.socket()

shell = TerminalInteractiveShell()
# TerminalInteractiveShell installs a SIGINT handler that calls sys.exit(0)
# instead of raising KeyboardInterrupt when _executing is False (which is
# the case when run_cell is called programmatically). Restore the default
# handler so SIGINT raises KeyboardInterrupt and our except clause catches it.
signal.signal(signal.SIGINT, signal.SIG_DFL)
Comment on lines +104 to +108
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🌐 Web query:

Python signal.SIG_DFL SIGINT behavior process termination

💡 Result:

In Python, there are two different “defaults” in play for SIGINT (Ctrl+C):

  • Python’s default SIGINT handler: CPython installs a handler that translates SIGINT into a KeyboardInterrupt exception (as long as the parent process hasn’t changed SIGINT handling). So the process usually does not immediately die at the OS level; instead Python raises KeyboardInterrupt in the main thread. [1]

  • signal.SIG_DFL (restore OS default): signal.SIG_DFL means “use the signal’s default action”. If you do:

    signal.signal(signal.SIGINT, signal.SIG_DFL)

    then SIGINT is no longer translated into KeyboardInterrupt; it reverts to the platform/OS default behavior. Python documents SIG_DFL as “perform the default function for the signal”. [1]

On POSIX/Unix, the default action for SIGINT is process termination (i.e., the kernel terminates the process as if by _exit(), with wait status indicating termination by signal). [2]

Net effect:

  • Default Python: Ctrl+C ⇒ KeyboardInterrupt (catchable in Python). [1]
  • After signal(SIGINT, SIG_DFL): Ctrl+C / SIGINT ⇒ process is terminated by the OS signal default, typically without a Python exception/traceback. [1][2]

Sources:
[1] Python signal module docs (SIGINT → KeyboardInterrupt, SIG_DFL restores default) (docs.python.org)
[2] POSIX/Open Group description of SIG_DFL default actions and termination semantics (pubs.opengroup.org)

Citations:


🌐 Web query:

Python signal.default_int_handler KeyboardInterrupt

💡 Result:

signal.default_int_handler is Python’s built-in default handler for SIGINT (typically generated by pressing Ctrl+C). Its behavior is simply to raise KeyboardInterrupt. [1]

Typical uses:

  • Restore Python’s Ctrl+C behavior after installing a custom handler

    import signal
    signal.signal(signal.SIGINT, signal.default_int_handler)

    This makes SIGINT raise KeyboardInterrupt again (instead of whatever your custom handler did). [1]

  • Note: signal.signal(SIGINT, signal.SIG_DFL) does not restore Python’s KeyboardInterrupt behavior; it restores the OS default action (usually terminating the process). To restore Python’s behavior, use signal.default_int_handler (or save/restore the previous handler). [4]

  • Why you might not see it installed: Python only installs its default SIGINTKeyboardInterrupt translation if the parent process hasn’t already set SIGINT to “ignore” (SIG_IGN). In that case, Python inherits the ignore disposition and won’t translate SIGINT into KeyboardInterrupt. [1], [8]

Sources: Python signal docs [1]; Stack Overflow on restoring default SIGINT handling [4]; Stack Overflow on inherited SIG_IGN preventing KeyboardInterrupt [8].


🏁 Script executed:

find . -name "local_sandbox_server.py" | head -20

Repository: NVIDIA-NeMo/Skills

Length of output: 283


🏁 Script executed:

fd "local_sandbox_server.py" -t f

Repository: NVIDIA-NeMo/Skills

Length of output: 283


Bug: signal.SIG_DFL terminates the process instead of raising KeyboardInterrupt.

signal.SIG_DFL restores the OS-level default for SIGINT, which is process termination on POSIX systems — it does not raise KeyboardInterrupt. This means the except KeyboardInterrupt block on line 142 is unreachable; when the ShellManager sends SIGINT, the shell worker will be killed by the OS rather than gracefully returning an {"status": "interrupted"} response.

Use signal.default_int_handler — Python's built-in handler that raises KeyboardInterrupt:

Proposed fix
-    signal.signal(signal.SIGINT, signal.SIG_DFL)
+    signal.signal(signal.SIGINT, signal.default_int_handler)
🤖 Prompt for AI Agents
In `@nemo_skills/code_execution/local_sandbox/local_sandbox_server.py` around
lines 104 - 108, The current call signal.signal(signal.SIGINT, signal.SIG_DFL)
will let the OS kill the process instead of raising KeyboardInterrupt, so change
the SIGINT handler to use Python's default_int_handler instead; locate the
signal setup in local_sandbox_server.py (the block mentioning
TerminalInteractiveShell) and replace the SIG_DFL installation with
signal.signal(signal.SIGINT, signal.default_int_handler) so the except
KeyboardInterrupt in the shell worker (referenced by the run_cell/interrupt
handling in the shell worker/ShellManager code) becomes reachable and returns
the {"status":"interrupted"} response.

try:
while True:
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ docker build --tag=${SANDBOX_NAME} --build-arg="NUM_WORKERS=$((`nproc --all`))"
echo "Multi-worker mode: Starting $((`nproc --all`)) workers with session affinity"
docker run --network=host --rm \
--memory=${NEMO_SKILLS_SANDBOX_MEM_LIMIT:-"16g"} \
${UWSGI_CPU_AFFINITY:+-e UWSGI_CPU_AFFINITY=${UWSGI_CPU_AFFINITY}} \
${UWSGI_PROCESSES:+-e UWSGI_PROCESSES=${UWSGI_PROCESSES}} \
${GUNICORN_WORKERS:+-e GUNICORN_WORKERS=${GUNICORN_WORKERS}} \
-v /nemo_run:/nemo_run \
--name=local-sandbox ${SANDBOX_NAME}
Loading