diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index c7748f096f..d3949bfc74 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -52,7 +52,7 @@ jobs: - name: Cleanup if: always() run: | - docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.0 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' + docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' docker ps -a -q | xargs -r docker stop gpu-tests-qwen: @@ -91,5 +91,5 @@ jobs: - name: Cleanup if: always() run: | - docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.0 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' + docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' docker ps -a -q | xargs -r docker stop diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e17e89f46..585292c0ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -43,12 +43,12 @@ jobs: if: steps.changes.outputs.docker == 'true' run: | # these tags need to match the ones in tests/gpu-tests/test-local.yaml - docker build -t igitman/nemo-skills:0.7.0 -f dockerfiles/Dockerfile.nemo-skills . + docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills . docker build -t igitman/nemo-skills-sandbox:0.7.1 -f dockerfiles/Dockerfile.sandbox . - name: Pull Images if: steps.changes.outputs.docker != 'true' run: | - docker pull igitman/nemo-skills:0.7.0 + docker pull igitman/nemo-skills:0.7.1 docker pull igitman/nemo-skills-sandbox:0.7.1 - name: Run all tests env: diff --git a/cluster_configs/example-local.yaml b/cluster_configs/example-local.yaml index e773e01f28..3722399106 100644 --- a/cluster_configs/example-local.yaml +++ b/cluster_configs/example-local.yaml @@ -15,13 +15,13 @@ executor: local containers: - trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0 + trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 - sglang: igitman/nemo-skills-sglang:0.7.0 + sglang: lmsysorg/sglang:v0.5.3rc1-cu126 nemo: igitman/nemo-skills-nemo:0.7.0 megatron: igitman/nemo-skills-megatron:0.7.0 sandbox: igitman/nemo-skills-sandbox:0.7.1 - nemo-skills: igitman/nemo-skills:0.7.0 + nemo-skills: igitman/nemo-skills:0.7.1 verl: igitman/nemo-skills-verl:0.7.0 nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0 diff --git a/cluster_configs/example-slurm.yaml b/cluster_configs/example-slurm.yaml index d462ba4b9d..fd4714ca60 100644 --- a/cluster_configs/example-slurm.yaml +++ b/cluster_configs/example-slurm.yaml @@ -15,13 +15,13 @@ executor: slurm containers: - trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0 + trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 - sglang: igitman/nemo-skills-sglang:0.7.0 + sglang: lmsysorg/sglang:v0.5.3rc1-cu126 nemo: igitman/nemo-skills-nemo:0.7.0 megatron: igitman/nemo-skills-megatron:0.7.0 sandbox: igitman/nemo-skills-sandbox:0.7.1 - nemo-skills: igitman/nemo-skills:0.7.0 + nemo-skills: igitman/nemo-skills:0.7.1 verl: igitman/nemo-skills-verl:0.7.0 nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0 diff --git a/dockerfiles/Dockerfile.nemo-rl b/dockerfiles/Dockerfile.nemo-rl index d5e9139868..199876ccc9 100644 --- a/dockerfiles/Dockerfile.nemo-rl +++ b/dockerfiles/Dockerfile.nemo-rl @@ -49,7 +49,7 @@ ENV NEMO_RL_VENV_DIR=/opt/ray_venvs FROM base AS hermetic ARG NEMO_RL_COMMIT -ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-c6e6f70adfed4954f1ebbf99c5043d242015b13f} +ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-9301d36cbf847212430b84a27cfe6990f773b7cf} RUN git clone https://github.com/NVIDIA-NeMo/RL.git /opt/NeMo-RL && cd /opt/NeMo-RL && git checkout ${NEMO_RL_COMMIT} && git submodule update --init --recursive diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills index 31db72703d..fa0fcea631 100644 --- a/dockerfiles/Dockerfile.nemo-skills +++ b/dockerfiles/Dockerfile.nemo-skills @@ -1,12 +1,29 @@ -FROM python:3.10-bookworm - -RUN apt-get update && apt-get -y install curl git git-lfs - -# installing apptainer -RUN apt install -y wget && \ - cd /tmp && \ - wget https://github.com/apptainer/apptainer/releases/download/v1.4.1/apptainer_1.4.1_amd64.deb && \ - apt install -y ./apptainer_1.4.1_amd64.deb +# using ubuntu instead of debian for easier apptainer installation on arm64 +FROM ubuntu:22.04 + +# Install Python and other dependencies +RUN apt-get update && \ + apt-get install -y \ + python3.10 \ + python3-pip \ + curl \ + wget \ + git \ + git-lfs && \ + ln -s /usr/bin/python3 /usr/bin/python && \ + rm -rf /var/cache/apt/archives /var/lib/apt/lists/* + +RUN pip install --upgrade pip setuptools + +# Update package lists and install apptainer for arm64 +# https://apptainer.org/docs/admin/1.1/installation.html +RUN apt update && \ + apt install -y software-properties-common && \ + add-apt-repository -y ppa:apptainer/ppa && \ + apt update && apt -y install apptainer && \ + add-apt-repository -y ppa:apptainer/ppa && \ + apt update && apt install -y apptainer-suid && \ + rm -rf /var/cache/apt/archives /var/lib/apt/lists/* # for ifeval benchmark # TODO: can we get just a single dir? @@ -25,6 +42,7 @@ RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6 RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install -e . +RUN apt remove -y python3-blinker RUN mkdir -p /opt/NeMo-Skills/requirements COPY pyproject.toml README.md /opt/NeMo-Skills/ diff --git a/dockerfiles/Dockerfile.sglang b/dockerfiles/Dockerfile.sglang deleted file mode 100644 index 5a1f2a59a5..0000000000 --- a/dockerfiles/Dockerfile.sglang +++ /dev/null @@ -1,5 +0,0 @@ -FROM lmsysorg/sglang:v0.4.10.post2-cu126 - -# patching for sharding states support for DeepSeek-R1 -COPY dockerfiles/sglang.patch /sgl-workspace/sglang.patch -RUN cd /sgl-workspace/sglang && git apply /sgl-workspace/sglang.patch diff --git a/dockerfiles/README.md b/dockerfiles/README.md index 3983612009..af195cc92a 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -4,14 +4,29 @@ Some dockerfiles are directly included in this folder and for some others the in To build one of the existing dockerfiles use a command like this ``` -docker build -t igitman/nemo-skills-nemo:0.7.0 -f dockerfiles/Dockerfile.nemo . +docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills . ``` It might take a long time for some of the images. +## Building for arm64/aarch64 + +To build for arm64 architecture (e.g. to use with GB200 machines) first follow the installation process at +https://docs.docker.com/build/building/multi-platform/#install-qemu-manually + +Then run the same docker command but adding `--platform linux/arm64`. + ## Building trtllm image -We directly use official TensorRT-LLM ngc containers. Current version is `nvcr.io/nvidia/tensorrt-llm/release:0.21.0`. +We directly use official `nvcr.io/nvidia/tensorrt-llm/release:1.0.0` image for both amd64 and arm64. + +## Building sglang image + +We directly use official `lmsysorg/sglang:v0.5.3rc1-cu126` image. + +For arm64 we instead use `lmsysorg/sglang:blackwell-cu129-arm64` image. ## Building vllm image We directly use official `vllm/vllm-openai:v0.10.1.1` image. + +For arm64 we instead use `vllm/vllm-openai:v0.10.2` image. diff --git a/dockerfiles/sglang.patch b/dockerfiles/sglang.patch deleted file mode 100644 index 1d24da3b5d..0000000000 --- a/dockerfiles/sglang.patch +++ /dev/null @@ -1,16 +0,0 @@ -diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py -index e2c6a37..4ee6347 100644 ---- a/python/sglang/srt/model_loader/loader.py -+++ b/python/sglang/srt/model_loader/loader.py -@@ -653,6 +653,11 @@ class ShardedStateLoader(BaseModelLoader): - state_dict.pop(key) - if state_dict: - raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!") -+ -+ if hasattr(model, "post_load_weights"): -+ print("Post loading weights") -+ model.post_load_weights() -+ - return model.eval() - - @staticmethod diff --git a/docs/basics/index.md b/docs/basics/index.md index 8264a493bb..f6e336ac67 100644 --- a/docs/basics/index.md +++ b/docs/basics/index.md @@ -98,7 +98,7 @@ config might look like executor: local containers: - trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0 + trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 nemo: igitman/nemo-skills-nemo:0.7.0 # ... there are some more containers defined here diff --git a/nemo_skills/__init__.py b/nemo_skills/__init__.py index 7affaa16b7..2ecd8ef473 100644 --- a/nemo_skills/__init__.py +++ b/nemo_skills/__init__.py @@ -16,13 +16,13 @@ # only used in ns setup command to initialize with defaults _containers = { - "trtllm": "nvcr.io/nvidia/tensorrt-llm/release:0.21.0", + "trtllm": "nvcr.io/nvidia/tensorrt-llm/release:1.0.0", "vllm": "vllm/vllm-openai:v0.10.1.1", - "sglang": "igitman/nemo-skills-sglang:0.7.0", + "sglang": "lmsysorg/sglang:v0.5.3rc1-cu126", "nemo": "igitman/nemo-skills-nemo:0.7.0", "megatron": "igitman/nemo-skills-megatron:0.7.0", "sandbox": "igitman/nemo-skills-sandbox:0.7.1", - "nemo-skills": "igitman/nemo-skills:0.7.0", + "nemo-skills": "igitman/nemo-skills:0.7.1", "verl": "igitman/nemo-skills-verl:0.7.0", "nemo-rl": "igitman/nemo-skills-nemo-rl:0.7.0", } diff --git a/tests/gpu-tests/test-local.yaml b/tests/gpu-tests/test-local.yaml index 85e2ec62f6..fa661985d5 100644 --- a/tests/gpu-tests/test-local.yaml +++ b/tests/gpu-tests/test-local.yaml @@ -15,13 +15,13 @@ executor: local containers: - trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0 + trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 - sglang: igitman/nemo-skills-sglang:0.7.0 + sglang: lmsysorg/sglang:v0.5.3rc1-cu126 nemo: igitman/nemo-skills-nemo:0.7.0 megatron: igitman/nemo-skills-megatron:0.7.0 sandbox: igitman/nemo-skills-sandbox:0.7.1 - nemo-skills: igitman/nemo-skills:0.7.0 + nemo-skills: igitman/nemo-skills:0.7.1 verl: igitman/nemo-skills-verl:0.7.0 nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0 diff --git a/tests/slurm-tests/run_all.sh b/tests/slurm-tests/run_all.sh index db91db56cb..11a097ef32 100755 --- a/tests/slurm-tests/run_all.sh +++ b/tests/slurm-tests/run_all.sh @@ -10,7 +10,5 @@ python tests/slurm-tests/super_49b_evals/run_test.py --cluster $CLUSTER --worksp sleep 10 python tests/slurm-tests/qwen3_4b_evals/run_test.py --cluster $CLUSTER --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/qwen3_4b_evals --expname_prefix qwen3_4b_evals_$CURRENT_DATE & sleep 10 -python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --backend nemo-aligner --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/omr_simple_recipe/nemo-aligner --expname_prefix omr_simple_recipe_nemo_aligner_$CURRENT_DATE & -sleep 10 python tests/slurm-tests/omr_simple_recipe/run_test.py --cluster $CLUSTER --backend nemo-rl --workspace /workspace/nemo-skills-slurm-ci/$CURRENT_DATE/omr_simple_recipe/nemo-rl --expname_prefix omr_simple_recipe_nemo_rl_$CURRENT_DATE & wait diff --git a/tests/test_code_execution.py b/tests/test_code_execution.py index d7bb609e2e..c7e29535ef 100644 --- a/tests/test_code_execution.py +++ b/tests/test_code_execution.py @@ -315,7 +315,7 @@ async def test_lean4_mathlib_code_execution(): """ expected_output = "7\n" - output, session_id = await sandbox.execute_code(correct_code_mathlib, language="lean4") + output, session_id = await sandbox.execute_code(correct_code_mathlib, language="lean4", timeout=60) # Assertions for the mathlib code assert session_id is None