Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 35 additions & 39 deletions dockerfiles/Dockerfile.sandbox
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.

# Use the base image with Python 3.10 and Flask
FROM tiangolo/uwsgi-nginx-flask:python3.10
# =============================================================================
# Dependency Locking
# =============================================================================
# The sandbox uses pinned dependencies from two lock files:
# - requirements/sandbox-core.lock (from code_execution.txt — always installed)
# - requirements/sandbox.lock (from code_execution.txt + stem.txt — skipped when GITHUB_CI=1 or TARGETARCH=arm64)
#
# To regenerate after changing code_execution.txt or stem.txt:
# uv pip compile requirements/code_execution.txt \
# --python-version 3.10 -o requirements/sandbox-core.lock
# uv pip compile requirements/code_execution.txt requirements/stem.txt \
# --extra-index-url https://download.pytorch.org/whl/cpu \
# --python-version 3.10 -o requirements/sandbox.lock
# =============================================================================

FROM python:3.10-slim

# Install dependencies required for Lean 4, pypy3, and other tools
# Install system dependencies: nginx, build tools for pypy3/lean4/block_network
ARG TARGETARCH
RUN apt-get update && \
apt-get install -y curl git net-tools bzip2 build-essential libseccomp-dev && \
ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \
apt-get install -y nginx curl git net-tools bzip2 build-essential libseccomp-dev && \
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

# Install PyPy3
RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" && \
case "$ARCH" in \
amd64) PYPY_ARCH=linux64 ;; \
arm64|aarch64) PYPY_ARCH=aarch64 ;; \
Expand All @@ -30,8 +47,7 @@ RUN apt-get update && \
tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \
ln -s /opt/pypy3.10-v7.3.17-$PYPY_ARCH/bin/pypy3 /usr/bin/pypy3 && \
/usr/bin/pypy3 -m ensurepip && \
rm /tmp/pypy.tar.bz2 && \
rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
rm /tmp/pypy.tar.bz2

# Install Lean 4 toolchain
RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \
Expand Down Expand Up @@ -59,26 +75,25 @@ RUN cd /lean4/my_project && \
ENV LEAN_PATH="/lean4/my_project"
ENV PATH="/lean4/my_project:$PATH"

# Set up application code and install Python dependencies
COPY requirements/code_execution.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt


# Install STEM related libraries
COPY requirements/stem.txt /app/stem_requirements.txt


# Speed/size/env hygiene
ENV PIP_DISABLE_PIP_VERSION_CHECK=1 \
UV_SYSTEM_PYTHON=1 \
PATH="/root/.local/bin:${PATH}"

# Install uv. The installer is downloaded to a file and then executed (instead
# of being piped straight into sh) so that a failed download fails this layer;
# with the default /bin/sh a pipeline only reports the exit status of its last
# command, which would hide a curl error.
RUN curl -LsSf https://astral.sh/uv/install.sh -o /tmp/uv-install.sh && \
    sh /tmp/uv-install.sh && \
    rm /tmp/uv-install.sh

# Set up application code directory
WORKDIR /app

# Install core Python dependencies from lock file (always).
# --no-cache keeps uv's package cache out of the resulting image layer.
COPY requirements/sandbox-core.lock /app/requirements-core.lock
RUN uv pip install --system --no-cache -r /app/requirements-core.lock

# Install uv (adds to ~/.local/bin), then install deps
# Install full dependencies including STEM libraries (skip on CI/arm64)
COPY requirements/sandbox.lock /app/requirements.lock
RUN if [ "$GITHUB_CI" != "1" ] && [ "$TARGETARCH" != "arm64" ]; then \
curl -LsSf https://astral.sh/uv/install.sh | sh && \
uv pip install --upgrade pip && \
uv pip install -r /app/stem_requirements.txt --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu; \
uv pip install --system -r /app/requirements.lock --extra-index-url https://download.pytorch.org/whl/cpu; \
fi

# For scicode eval - create data directory and download test data
Expand Down Expand Up @@ -132,26 +147,7 @@ RUN gcc -shared -fPIC -o /usr/lib/libblock_network.so /tmp/block_network.c -ldl
COPY dockerfiles/sandbox/start-with-nginx.sh /start-with-nginx.sh
RUN chmod +x /start-with-nginx.sh

# Set the working directory to /app
WORKDIR /app

# Environment variables for multi-worker setup
ENV NGINX_PORT=6000

# Set default port for single worker mode
ENV LISTEN_PORT=6000

# Default uwsgi configuration
ARG UWSGI_CHEAPER
ENV UWSGI_CHEAPER=$UWSGI_CHEAPER

ARG NUM_WORKERS
ENV NUM_WORKERS=$NUM_WORKERS

ARG UWSGI_PROCESSES
ENV UWSGI_PROCESSES=$UWSGI_PROCESSES

ENV LISTEN_PORT=6000
RUN echo "uwsgi_read_timeout 14400s;" > /etc/nginx/conf.d/custom_timeout.conf

CMD ["/start-with-nginx.sh"]
2 changes: 1 addition & 1 deletion dockerfiles/sandbox/block_network.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ int socket(int domain, int type, int protocol) {
real_socket = dlsym(RTLD_NEXT, "socket");
}

/* Allow Unix domain sockets (needed for local IPC, uwsgi, etc.) */
/* Allow Unix domain sockets (needed for local IPC, gunicorn, etc.) */
if (domain == AF_UNIX || domain == AF_LOCAL) {
return real_socket(domain, type, protocol);
}
Expand Down
130 changes: 52 additions & 78 deletions dockerfiles/sandbox/start-with-nginx.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
# Start nginx load balancer with multiple uwsgi workers
# Start nginx load balancer with multiple gunicorn workers
# Uses TCP sockets for workers, supporting both single-node and multi-node deployments.
#
# Multi-node is auto-detected from SLURM environment variables.
Expand All @@ -13,20 +13,18 @@
# NGINX_PORT Port nginx listens on (default: 6000, set in Dockerfile)
#
# Optional — Worker Configuration:
# NUM_WORKERS Number of uWSGI workers per node (default: $(nproc --all))
# NUM_WORKERS Number of gunicorn instances per node (default: $(nproc --all))
# SANDBOX_WORKER_BASE_PORT
# Starting TCP port for workers (default: 50001). Workers
# bind to sequential ports: base, base+1, ..., base+N-1.
# If a port is already in use, the startup algorithm retries
# with offset increments.
# STATEFUL_SANDBOX Set to 1 (default) for stateful mode: each uWSGI worker
# runs a single process to preserve Jupyter kernel sessions
# STATEFUL_SANDBOX Set to 1 (default) for stateful mode: each gunicorn instance
# runs a single worker process to preserve Jupyter kernel sessions
# across requests. Set to 0 for stateless mode where
# UWSGI_PROCESSES and UWSGI_CHEAPER take effect.
# UWSGI_PROCESSES uWSGI processes per worker (default: 1). Only used when
# STATEFUL_SANDBOX=0.
# UWSGI_CHEAPER uWSGI cheaper mode: minimum number of active processes
# (default: 1). Only used when STATEFUL_SANDBOX=0.
# GUNICORN_WORKERS takes effect.
# GUNICORN_WORKERS Gunicorn worker processes per instance (default: 1). Only used
# when STATEFUL_SANDBOX=0.
#
# Optional — Multi-Node (SLURM):
# SLURM_JOB_NODELIST SLURM-provided compressed nodelist (e.g., "node[001-016]").
Expand All @@ -47,18 +45,37 @@
# NEMO_SKILLS_SANDBOX_BLOCK_NETWORK
# Set to 1 to enable network blocking for sandboxed code.
# Uses /etc/ld.so.preload to intercept socket() calls in
# all new processes. Applied AFTER nginx/uWSGI start so
# all new processes. Applied AFTER nginx/gunicorn start so
# the API remains functional. Note: in any mode, if a
# worker crashes the monitoring loop will attempt to restart
# it, but the new process will be unable to bind its socket.
# The remaining workers continue serving. (default: 0)
#
# Deprecated (logged warning if set):
# UWSGI_PROCESSES Use GUNICORN_WORKERS instead.
# UWSGI_CHEAPER No gunicorn equivalent; ignored.
# UWSGI_CPU_AFFINITY No gunicorn equivalent; ignored.
#
# =============================================================================

set -e

export NUM_WORKERS=${NUM_WORKERS:-$(nproc --all)}

# =============================================================================
# Legacy uWSGI environment variables: warn, and migrate where possible
# =============================================================================
# UWSGI_PROCESSES is mapped onto GUNICORN_WORKERS, but only when
# GUNICORN_WORKERS is unset or empty, so an explicit GUNICORN_WORKERS wins.
if [ -n "$UWSGI_PROCESSES" ]; then
    echo "WARNING: UWSGI_PROCESSES is deprecated. Use GUNICORN_WORKERS instead."
    GUNICORN_WORKERS=${GUNICORN_WORKERS:-$UWSGI_PROCESSES}
fi

# The remaining legacy variables have no gunicorn counterpart; they are only
# reported, never acted upon.
[ -z "$UWSGI_CHEAPER" ] || \
    echo "WARNING: UWSGI_CHEAPER is deprecated and has no gunicorn equivalent. Ignoring."
[ -z "$UWSGI_CPU_AFFINITY" ] || \
    echo "WARNING: UWSGI_CPU_AFFINITY is deprecated and has no gunicorn equivalent. Ignoring."

# =============================================================================
# Utility functions
# =============================================================================
Expand Down Expand Up @@ -111,38 +128,28 @@ print(' '.join(expand_nodelist(sys.argv[1])))
" "$nodelist" 2>/dev/null
}

# Start a single uWSGI worker in the background.
# Start a single gunicorn instance in the background.
# Args: $1=worker_number $2=port
# Prints: "pid:port"
start_worker_fast() {
local i=$1
local WORKER_PORT=$2

cat > /tmp/worker${i}_uwsgi.ini << EOF
[uwsgi]
module = main
callable = app
processes = ${UWSGI_PROCESSES}
http-socket = 0.0.0.0:${WORKER_PORT}
vacuum = true
master = true
die-on-term = true
memory-report = true
listen = 100
http-timeout = 300
socket-timeout = 300
disable-logging = false
log-date = true
log-prefix = [worker${i}]
logto = /var/log/worker${i}.log
EOF

if [ -n "$UWSGI_CHEAPER" ]; then
echo "cheaper = ${UWSGI_CHEAPER}" >> /tmp/worker${i}_uwsgi.ini
fi

> /var/log/worker${i}.log
( cd /app && env WORKER_NUM=$i uwsgi --ini /tmp/worker${i}_uwsgi.ini ) &
# Redirect stdout/stderr to the log file so the $() subshell that calls
# this function doesn't block waiting for the pipe to close.
# (uWSGI's --logto closed stdout implicitly; gunicorn does not.)
( cd /app && env WORKER_NUM=$i \
gunicorn main:app \
--workers ${GUNICORN_WORKERS} \
--bind 0.0.0.0:${WORKER_PORT} \
--timeout 300 \
--graceful-timeout 30 \
--backlog 100 \
--access-logfile /var/log/worker${i}.log \
--error-logfile /var/log/worker${i}.log \
--log-level info \
) >> /var/log/worker${i}.log 2>&1 &
echo "$!:$WORKER_PORT"
}

Expand Down Expand Up @@ -342,51 +349,18 @@ UPSTREAM_FILE="/tmp/upstream_servers.conf"
echo "[$_H] Workers/node: $NUM_WORKERS | Base port: $SANDBOX_WORKER_BASE_PORT | Nginx: $NGINX_PORT"

# =============================================================================
# uWSGI configuration
# Gunicorn configuration
# =============================================================================
: "${STATEFUL_SANDBOX:=1}"
if [ "$STATEFUL_SANDBOX" -eq 1 ]; then
UWSGI_PROCESSES=1
UWSGI_CHEAPER=1
GUNICORN_WORKERS=1
else
: "${UWSGI_PROCESSES:=1}"
: "${UWSGI_CHEAPER:=1}"
: "${GUNICORN_WORKERS:=1}"
fi

export UWSGI_PROCESSES UWSGI_CHEAPER
export GUNICORN_WORKERS

echo "UWSGI settings: PROCESSES=$UWSGI_PROCESSES, CHEAPER=$UWSGI_CHEAPER"

# Validate and fix uwsgi configuration
if [ -z "$UWSGI_PROCESSES" ]; then
UWSGI_PROCESSES=2
fi

if [ -z "$UWSGI_CHEAPER" ]; then
UWSGI_CHEAPER=1
elif [ "$UWSGI_CHEAPER" -le 0 ]; then
echo "WARNING: UWSGI_CHEAPER ($UWSGI_CHEAPER) must be at least 1"
UWSGI_CHEAPER=1
echo "Setting UWSGI_CHEAPER to $UWSGI_CHEAPER"
elif [ "$UWSGI_CHEAPER" -ge "$UWSGI_PROCESSES" ]; then
echo "WARNING: UWSGI_CHEAPER ($UWSGI_CHEAPER) must be lower than UWSGI_PROCESSES ($UWSGI_PROCESSES)"
if [ "$UWSGI_PROCESSES" -eq 1 ]; then
# For single process, disable cheaper mode entirely
echo "Disabling cheaper mode for single process setup"
UWSGI_CHEAPER=""
else
UWSGI_CHEAPER=$((UWSGI_PROCESSES - 1))
echo "Setting UWSGI_CHEAPER to $UWSGI_CHEAPER"
fi
fi

export UWSGI_PROCESSES
if [ -n "$UWSGI_CHEAPER" ]; then
export UWSGI_CHEAPER
echo "UWSGI config - Processes: $UWSGI_PROCESSES, Cheaper: $UWSGI_CHEAPER"
else
echo "UWSGI config - Processes: $UWSGI_PROCESSES, Cheaper: disabled"
fi
echo "Gunicorn settings: WORKERS=$GUNICORN_WORKERS"

# =============================================================================
# Log setup
Expand Down Expand Up @@ -416,7 +390,7 @@ cleanup() {
kill -TERM "$pid" 2>/dev/null || true
fi
done
pkill -f nginx || true
nginx -s quit 2>/dev/null || kill "$(cat /run/nginx.pid 2>/dev/null)" 2>/dev/null || true
[ -n "$HEALTH_CHECK_DIR" ] && rm -rf "$HEALTH_CHECK_DIR" 2>/dev/null || true
[ -n "$REMOTE_HEALTH_DIR" ] && rm -rf "$REMOTE_HEALTH_DIR" 2>/dev/null || true
exit 0
Expand Down Expand Up @@ -623,8 +597,8 @@ fi
# Network blocking
# =============================================================================
# ld.so.preload intercepts socket() in all NEW exec'd processes. This is safe
# for nginx/uWSGI that are already running. However, if the monitoring loop
# restarts a crashed worker, the new uWSGI process would be unable to bind its
# for nginx/gunicorn that are already running. However, if the monitoring loop
# restarts a crashed worker, the new gunicorn process would be unable to bind its
# listening socket. We set NETWORK_BLOCKING_ACTIVE so the monitoring loop can
# emit a clear diagnostic when this happens.
NETWORK_BLOCKING_ACTIVE=0
Expand Down Expand Up @@ -658,7 +632,7 @@ else
echo " Proxy: localhost:$NGINX_PORT -> $MASTER_NODE:$NGINX_PORT"
echo " Local workers: $NUM_WORKERS (ports ${ACTUAL_WORKER_PORTS[0]}-${ACTUAL_WORKER_PORTS[$((NUM_WORKERS-1))]})"
fi
echo " uWSGI: processes=$UWSGI_PROCESSES cheaper=${UWSGI_CHEAPER:-disabled}"
echo " Gunicorn: workers=$GUNICORN_WORKERS"

# =============================================================================
# Monitoring loop
Expand Down Expand Up @@ -695,7 +669,7 @@ while true; do
fi
done

if ! pgrep nginx > /dev/null; then
if ! kill -0 "$(cat /run/nginx.pid 2>/dev/null)" 2>/dev/null; then
echo "[$_H] ERROR: Nginx died unexpectedly"
cleanup
exit 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
# Worker that runs inside the shell process and owns a TerminalInteractiveShell()
def shell_worker(conn):
# LAYER 2: Python-level socket blocking for IPython sessions
# The shell_worker is forked (not exec'd) from the uWSGI worker, so it does NOT
# The shell_worker is forked (not exec'd) from the gunicorn worker, so it does NOT
# get the ld.so.preload library loaded. We must patch Python's socket module directly.
# This blocks: socket.socket(), _socket.socket(), requests.get(), urllib, etc.
if BLOCK_NETWORK:
Expand All @@ -101,6 +101,11 @@ def __init__(self, family=-1, type=-1, proto=-1, fileno=None):
socket_module.socket = BlockedSocket # Blocks: import socket; socket.socket()

shell = TerminalInteractiveShell()
# TerminalInteractiveShell installs a SIGINT handler that calls sys.exit(0)
# instead of raising KeyboardInterrupt when _executing is False (which is
# the case when run_cell is called programmatically). Reinstall Python's own
# SIGINT handler, signal.default_int_handler, which raises KeyboardInterrupt
# so our except clause catches it.
# Note: signal.SIG_DFL is NOT equivalent. It selects the operating system's
# default action for SIGINT, which terminates the process without raising
# any Python exception.
signal.signal(signal.SIGINT, signal.default_int_handler)
try:
while True:
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ docker build --tag=${SANDBOX_NAME} --build-arg="NUM_WORKERS=$((`nproc --all`))"
echo "Multi-worker mode: Starting $((`nproc --all`)) workers with session affinity"
docker run --network=host --rm \
--memory=${NEMO_SKILLS_SANDBOX_MEM_LIMIT:-"16g"} \
${UWSGI_CPU_AFFINITY:+-e UWSGI_CPU_AFFINITY=${UWSGI_CPU_AFFINITY}} \
${UWSGI_PROCESSES:+-e UWSGI_PROCESSES=${UWSGI_PROCESSES}} \
${GUNICORN_WORKERS:+-e GUNICORN_WORKERS=${GUNICORN_WORKERS}} \
-v /nemo_run:/nemo_run \
--name=local-sandbox ${SANDBOX_NAME}
4 changes: 1 addition & 3 deletions nemo_skills/dataset/icpc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@

# environment variables required by this benchmark
SANDBOX_ENV_VARS = [
"UWSGI_PROCESSES=1024",
"UWSGI_CPU_AFFINITY=8",
"UWSGI_CHEAPER=1023",
"GUNICORN_WORKERS=1024",
"NUM_WORKERS=1",
"STATEFUL_SANDBOX=0",
]
4 changes: 1 addition & 3 deletions nemo_skills/dataset/ioi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@

# environment variables required by this benchmark
SANDBOX_ENV_VARS = [
"UWSGI_PROCESSES=1024",
"UWSGI_CPU_AFFINITY=8",
"UWSGI_CHEAPER=1023",
"GUNICORN_WORKERS=1024",
"NUM_WORKERS=1",
"STATEFUL_SANDBOX=0",
]
Loading