Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
- name: Cleanup
if: always()
run: |
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.0 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker ps -a -q | xargs -r docker stop

gpu-tests-qwen:
Expand Down Expand Up @@ -91,5 +91,5 @@ jobs:
- name: Cleanup
if: always()
run: |
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.0 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker ps -a -q | xargs -r docker stop
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ jobs:
if: steps.changes.outputs.docker == 'true'
run: |
# these tags need to match the ones in tests/gpu-tests/test-local.yaml
docker build -t igitman/nemo-skills:0.7.0 -f dockerfiles/Dockerfile.nemo-skills .
docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills .
docker build -t igitman/nemo-skills-sandbox:0.7.1 -f dockerfiles/Dockerfile.sandbox .
- name: Pull Images
if: steps.changes.outputs.docker != 'true'
run: |
docker pull igitman/nemo-skills:0.7.0
docker pull igitman/nemo-skills:0.7.1
docker pull igitman/nemo-skills-sandbox:0.7.1
- name: Run all tests
env:
Expand Down
6 changes: 3 additions & 3 deletions cluster_configs/example-local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
executor: local

containers:
trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
sglang: igitman/nemo-skills-sglang:0.7.0
sglang: lmsysorg/sglang:v0.5.3rc1-cu126
nemo: igitman/nemo-skills-nemo:0.7.0
megatron: igitman/nemo-skills-megatron:0.7.0
sandbox: igitman/nemo-skills-sandbox:0.7.1
nemo-skills: igitman/nemo-skills:0.7.0
nemo-skills: igitman/nemo-skills:0.7.1
verl: igitman/nemo-skills-verl:0.7.0
nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0

Expand Down
6 changes: 3 additions & 3 deletions cluster_configs/example-slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
executor: slurm

containers:
trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
sglang: igitman/nemo-skills-sglang:0.7.0
sglang: lmsysorg/sglang:v0.5.3rc1-cu126
nemo: igitman/nemo-skills-nemo:0.7.0
megatron: igitman/nemo-skills-megatron:0.7.0
sandbox: igitman/nemo-skills-sandbox:0.7.1
nemo-skills: igitman/nemo-skills:0.7.0
nemo-skills: igitman/nemo-skills:0.7.1
verl: igitman/nemo-skills-verl:0.7.0
nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0

Expand Down
2 changes: 1 addition & 1 deletion dockerfiles/Dockerfile.nemo-rl
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
FROM base AS hermetic

ARG NEMO_RL_COMMIT
ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-c6e6f70adfed4954f1ebbf99c5043d242015b13f}
ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-9301d36cbf847212430b84a27cfe6990f773b7cf}

RUN git clone https://github.com/NVIDIA-NeMo/RL.git /opt/NeMo-RL && cd /opt/NeMo-RL && git checkout ${NEMO_RL_COMMIT} && git submodule update --init --recursive

Expand Down
36 changes: 27 additions & 9 deletions dockerfiles/Dockerfile.nemo-skills
Original file line number Diff line number Diff line change
@@ -1,12 +1,29 @@
FROM python:3.10-bookworm

RUN apt-get update && apt-get -y install curl git git-lfs

# installing apptainer
RUN apt install -y wget && \
cd /tmp && \
wget https://github.com/apptainer/apptainer/releases/download/v1.4.1/apptainer_1.4.1_amd64.deb && \
apt install -y ./apptainer_1.4.1_amd64.deb
# using ubuntu instead of debian for easier apptainer installation on arm64
FROM ubuntu:22.04

# Install Python and other dependencies
RUN apt-get update && \
apt-get install -y \
python3.10 \
python3-pip \
curl \
wget \
git \
git-lfs && \
ln -s /usr/bin/python3 /usr/bin/python && \
rm -rf /var/cache/apt/archives /var/lib/apt/lists/*

RUN pip install --upgrade pip setuptools

# Update package lists and install apptainer for arm64
# https://apptainer.org/docs/admin/1.1/installation.html
RUN apt update && \
apt install -y software-properties-common && \
add-apt-repository -y ppa:apptainer/ppa && \
apt update && apt -y install apptainer && \
add-apt-repository -y ppa:apptainer/ppa && \
apt update && apt install -y apptainer-suid && \
rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
Comment on lines +18 to +26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Remove duplicate PPA addition and consolidate apt operations.

Line 24 duplicates line 22 by adding the same PPA twice. The repository only needs to be added once, and both packages can be installed in a single operation.

Apply this diff to consolidate the operations:

-# Update package lists and install apptainer for arm64
-# https://apptainer.org/docs/admin/1.1/installation.html
-RUN apt update && \
-    apt install -y software-properties-common && \
-    add-apt-repository -y ppa:apptainer/ppa && \
-    apt update && apt -y install apptainer && \
-    add-apt-repository -y ppa:apptainer/ppa && \
-    apt update && apt install -y apptainer-suid && \
-    rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
+# Update package lists and install apptainer for arm64
+# https://apptainer.org/docs/admin/1.1/installation.html
+RUN apt-get update && \
+    apt-get install -y software-properties-common && \
+    add-apt-repository -y ppa:apptainer/ppa && \
+    apt-get update && \
+    apt-get install -y apptainer apptainer-suid && \
+    rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Update package lists and install apptainer for arm64
# https://apptainer.org/docs/admin/1.1/installation.html
RUN apt update && \
apt install -y software-properties-common && \
add-apt-repository -y ppa:apptainer/ppa && \
apt update && apt -y install apptainer && \
add-apt-repository -y ppa:apptainer/ppa && \
apt update && apt install -y apptainer-suid && \
rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
# Update package lists and install apptainer for arm64
# https://apptainer.org/docs/admin/1.1/installation.html
RUN apt-get update && \
apt-get install -y software-properties-common && \
add-apt-repository -y ppa:apptainer/ppa && \
apt-get update && \
apt-get install -y apptainer apptainer-suid && \
rm -rf /var/cache/apt/archives /var/lib/apt/lists/*
🤖 Prompt for AI Agents
In dockerfiles/Dockerfile.nemo-skills around lines 18 to 26, the Dockerfile adds
the same PPA twice and performs multiple separate apt updates/installs; remove
the duplicate add-apt-repository call (keep a single one), keep the install of
software-properties-common ahead of it (that package provides
add-apt-repository), then run a single apt-get update followed by one
apt-get install -y invocation that installs both apptainer and apptainer-suid,
and keep the cleanup (rm -rf /var/cache/apt/archives /var/lib/apt/lists/*) at
the end of that RUN to minimize layers and avoid redundant repository additions.


# for ifeval benchmark
# TODO: can we get just a single dir?
Expand All @@ -25,6 +42,7 @@ RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla
RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6
RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install -e .

RUN apt remove -y python3-blinker

RUN mkdir -p /opt/NeMo-Skills/requirements
COPY pyproject.toml README.md /opt/NeMo-Skills/
Expand Down
5 changes: 0 additions & 5 deletions dockerfiles/Dockerfile.sglang

This file was deleted.

19 changes: 17 additions & 2 deletions dockerfiles/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,29 @@ Some dockerfiles are directly included in this folder and for some others the in
To build one of the existing dockerfiles use a command like this

```
docker build -t igitman/nemo-skills-nemo:0.7.0 -f dockerfiles/Dockerfile.nemo .
docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills .
```
It might take a long time for some of the images.

## Building for arm64/aarch64

To build for arm64 architecture (e.g. to use with GB200 machines) first follow the installation process at
https://docs.docker.com/build/building/multi-platform/#install-qemu-manually

Then run the same `docker build` command with `--platform linux/arm64` added.

## Building trtllm image

We directly use official TensorRT-LLM ngc containers. Current version is `nvcr.io/nvidia/tensorrt-llm/release:0.21.0`.
We directly use official `nvcr.io/nvidia/tensorrt-llm/release:1.0.0` image for both amd64 and arm64.

## Building sglang image

We directly use official `lmsysorg/sglang:v0.5.3rc1-cu126` image.

For arm64 we instead use `lmsysorg/sglang:blackwell-cu129-arm64` image.

## Building vllm image

We directly use official `vllm/vllm-openai:v0.10.1.1` image.

For arm64 we instead use `vllm/vllm-openai:v0.10.2` image.
16 changes: 0 additions & 16 deletions dockerfiles/sglang.patch

This file was deleted.

2 changes: 1 addition & 1 deletion docs/basics/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ config might look like
executor: local

containers:
trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
nemo: igitman/nemo-skills-nemo:0.7.0
# ... there are some more containers defined here
Expand Down
6 changes: 3 additions & 3 deletions nemo_skills/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@

# only used in ns setup command to initialize with defaults
_containers = {
"trtllm": "nvcr.io/nvidia/tensorrt-llm/release:0.21.0",
"trtllm": "nvcr.io/nvidia/tensorrt-llm/release:1.0.0",
"vllm": "vllm/vllm-openai:v0.10.1.1",
"sglang": "igitman/nemo-skills-sglang:0.7.0",
"sglang": "lmsysorg/sglang:v0.5.3rc1-cu126",
"nemo": "igitman/nemo-skills-nemo:0.7.0",
"megatron": "igitman/nemo-skills-megatron:0.7.0",
"sandbox": "igitman/nemo-skills-sandbox:0.7.1",
"nemo-skills": "igitman/nemo-skills:0.7.0",
"nemo-skills": "igitman/nemo-skills:0.7.1",
"verl": "igitman/nemo-skills-verl:0.7.0",
"nemo-rl": "igitman/nemo-skills-nemo-rl:0.7.0",
}
6 changes: 3 additions & 3 deletions tests/gpu-tests/test-local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@
executor: local

containers:
trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
sglang: igitman/nemo-skills-sglang:0.7.0
sglang: lmsysorg/sglang:v0.5.3rc1-cu126
nemo: igitman/nemo-skills-nemo:0.7.0
megatron: igitman/nemo-skills-megatron:0.7.0
sandbox: igitman/nemo-skills-sandbox:0.7.1
nemo-skills: igitman/nemo-skills:0.7.0
nemo-skills: igitman/nemo-skills:0.7.1
verl: igitman/nemo-skills-verl:0.7.0
nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0

Expand Down
2 changes: 0 additions & 2 deletions tests/slurm-tests/run_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,5 @@ python tests/slurm-tests/super_49b_evals/run_test.py --cluster $CLUSTER --worksp
sleep 10
python tests/slurm-tests/qwen3_4b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/qwen3_4b_evals --expname_prefix qwen3_4b_evals_$CURRENT_DATE &
sleep 10
python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --backend nemo-aligner --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/omr_simple_recipe/nemo-aligner --expname_prefix omr_simple_recipe_nemo_aligner_$CURRENT_DATE &
sleep 10
python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --backend nemo-rl --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/omr_simple_recipe/nemo-rl --expname_prefix omr_simple_recipe_nemo_rl_$CURRENT_DATE &
wait
2 changes: 1 addition & 1 deletion tests/test_code_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ async def test_lean4_mathlib_code_execution():
"""
expected_output = "7\n"

output, session_id = await sandbox.execute_code(correct_code_mathlib, language="lean4")
output, session_id = await sandbox.execute_code(correct_code_mathlib, language="lean4", timeout=60)

# Assertions for the mathlib code
assert session_id is None
Expand Down