Merged
12 changes: 10 additions & 2 deletions .github/workflows/gpu_tests.yml
@@ -40,6 +40,10 @@ jobs:
pip install -e .
pip install -r requirements/common-tests.txt
ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24
- name: Build Docker image
run: |
cd ${{ github.run_id }}
docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills .
- name: Run GPU tests
timeout-minutes: 240
env:
@@ -52,7 +56,7 @@ jobs:
- name: Cleanup
if: always()
run: |
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker run --rm -v /tmp:/tmp -v /home:/home nemo-skills-image bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker ps -a -q | xargs -r docker stop

gpu-tests-qwen:
@@ -79,6 +83,10 @@ jobs:
pip install -e .
pip install -r requirements/common-tests.txt
ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24
- name: Build Docker image
run: |
cd ${{ github.run_id }}
docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills .
- name: Run GPU tests
timeout-minutes: 240
env:
@@ -91,5 +99,5 @@ jobs:
- name: Cleanup
if: always()
run: |
docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker run --rm -v /tmp:/tmp -v /home:/home nemo-skills-image bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
docker ps -a -q | xargs -r docker stop
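The cleanup line works even when no containers are left because of `xargs -r`. A daemon-free sketch of that behavior, with `echo` standing in for `docker stop` (`-r`, a.k.a. `--no-run-if-empty`, is a GNU findutils extension):

```shell
# `xargs -r` skips the command entirely when stdin is empty, so the
# cleanup step succeeds with zero containers instead of running
# `docker stop` with no arguments and failing.
empty=$(printf '' | xargs -r echo docker stop)
some=$(printf 'abc123\ndef456\n' | xargs -r echo docker stop)
echo "no containers -> [$empty]"   # no containers -> []
echo "two containers -> [$some]"   # two containers -> [docker stop abc123 def456]
```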
22 changes: 3 additions & 19 deletions .github/workflows/tests.yml
@@ -25,37 +25,21 @@ jobs:
with:
python-version: "3.10"
cache: pip
- name: Detect Docker changes
id: changes
uses: dorny/paths-filter@v3
with:
filters: |
docker:
- 'dockerfiles/Dockerfile.sandbox'
- 'dockerfiles/Dockerfile.nemo-skills'
- 'nemo_skills/code_execution/local_sandbox/**'
- 'requirements/**'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e .[dev]
- name: Build Images
if: steps.changes.outputs.docker == 'true'
run: |
# these tags need to match the ones in tests/gpu-tests/test-local.yaml
docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills .
docker build -t igitman/nemo-skills-sandbox:0.7.1 -f dockerfiles/Dockerfile.sandbox .
- name: Pull Images
if: steps.changes.outputs.docker != 'true'
run: |
docker pull igitman/nemo-skills:0.7.1
docker pull igitman/nemo-skills-sandbox:0.7.1
docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills .
docker build -t nemo-skills-sandbox-image -f dockerfiles/Dockerfile.sandbox .
- name: Run all tests
env:
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
docker run --rm --network=host igitman/nemo-skills-sandbox:0.7.1 &
docker run --rm --network=host nemo-skills-sandbox-image &
sleep 10
set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
ns prepare_data gsm8k math-500
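The `set -o pipefail` above is needed because the test invocation is piped; without it, a pipeline reports only the last stage's exit status. A minimal bash demonstration:

```shell
# Exit status of a failing command piped through `cat`, with and without
# pipefail. The `if` guards keep the demo safe under `set -e`.
set +o pipefail
if sh -c 'exit 3' | cat; then a=0; else a=$?; fi
echo "without pipefail: $a"   # without pipefail: 0 (failure masked by cat)
set -o pipefail
if sh -c 'exit 3' | cat; then b=0; else b=$?; fi
echo "with pipefail: $b"      # with pipefail: 3 (failure propagates)
set +o pipefail               # restore the default
```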
2 changes: 2 additions & 0 deletions MANIFEST.in
@@ -1,2 +1,4 @@
recursive-include nemo_skills *.yaml
recursive-include nemo_skills *.txt
graft dockerfiles
graft requirements
16 changes: 8 additions & 8 deletions cluster_configs/example-local.yaml
@@ -18,12 +18,12 @@ containers:
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
sglang: lmsysorg/sglang:v0.5.3rc1-cu126
nemo: igitman/nemo-skills-nemo:0.7.0
megatron: igitman/nemo-skills-megatron:0.7.0
sandbox: igitman/nemo-skills-sandbox:0.7.1
nemo-skills: igitman/nemo-skills:0.7.1
verl: igitman/nemo-skills-verl:0.7.0
nemo-rl: igitman/nemo-skills-nemo-rl:0.7.1
# dockerfile: for now can only specify relative to repo root
megatron: dockerfile:dockerfiles/Dockerfile.megatron
sandbox: dockerfile:dockerfiles/Dockerfile.sandbox
nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills
verl: dockerfile:dockerfiles/Dockerfile.verl
nemo-rl: dockerfile:dockerfiles/Dockerfile.nemo-rl

# add required mounts for models/data here
# the code is mounted automatically inside /nemo_run/code
@@ -34,8 +34,8 @@ containers:
# - /mnt/datadrive/models:/models
# - /mnt/datadrive/data:/data
# - /home/<username>/workspace:/workspace
# you can also override container libraries by directly mounting over them. E.g. to override NeMo-Aligner do
# - <...>/NeMo-Aligner:/opt/NeMo-Aligner
# you can also override container libraries by directly mounting over them. E.g. to override NeMo-RL do
# - <...>/NeMo-RL:/opt/NeMo-RL

# define any environment variables. Note that HF_HOME is required by default and needs to be a mounted path!
# env_vars:
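The `dockerfile:` prefix in the config above tells nemo-skills to build the image locally before launching the job. If you prefer to pre-build everything yourself, a dry-run sketch of collecting the build commands from such a config (the `<key>-image` tag naming is an illustrative assumption; the real resolution happens inside nemo-skills):

```shell
# Collect a `docker build` command for every `dockerfile:`-prefixed
# container. Commands are printed, not executed; pipe to `sh` to build.
config=$(mktemp)
cat > "$config" <<'EOF'
containers:
  sandbox: dockerfile:dockerfiles/Dockerfile.sandbox
  nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills
  vllm: vllm/vllm-openai:v0.10.1.1
EOF
cmds=$(grep ': dockerfile:' "$config" | while read -r line; do
  name=${line%%:*}            # container key, e.g. nemo-skills
  path=${line#*dockerfile:}   # path after the dockerfile: prefix
  printf 'docker build -t %s-image -f %s .\n' "$name" "$path"
done)
printf '%s\n' "$cmds"
rm -f "$config"
```

Plain registry references like the `vllm` entry are left untouched; only `dockerfile:` values are turned into builds.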
11 changes: 2 additions & 9 deletions cluster_configs/example-slurm.yaml
@@ -15,15 +15,8 @@
executor: slurm

containers:
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
sglang: lmsysorg/sglang:v0.5.3rc1-cu126
nemo: igitman/nemo-skills-nemo:0.7.0
megatron: igitman/nemo-skills-megatron:0.7.0
sandbox: igitman/nemo-skills-sandbox:0.7.1
nemo-skills: igitman/nemo-skills:0.7.1
verl: igitman/nemo-skills-verl:0.7.0
nemo-rl: igitman/nemo-skills-nemo-rl:0.7.1
# follow steps in https://nvidia-nemo.github.io/Skills/basics/#slurm-inference
# to complete this section

job_name_prefix: "nemo_skills:"

169 changes: 0 additions & 169 deletions dockerfiles/Dockerfile.nemo

This file was deleted.

17 changes: 9 additions & 8 deletions dockerfiles/Dockerfile.nemo-skills
@@ -36,14 +36,6 @@ RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install -e .

RUN apt remove -y python3-blinker

RUN mkdir -p /opt/NeMo-Skills/requirements
COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
COPY requirements /opt/NeMo-Skills/requirements/
# installing sdp in container only
RUN pip install git+https://github.com/NVIDIA/NeMo-speech-data-processor@29b9b1ec0ceaf3ffa441c1d01297371b3f8e11d2
RUN cd /opt/NeMo-Skills && pip install -e .

# ifbench
RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
RUN cd /opt/benchmarks/IFBench && pip install -r requirements.txt
@@ -55,3 +47,12 @@ RUN cd /opt/benchmarks/IFBench && git apply ifbench.patch
RUN pip install langdetect absl-py immutabledict nltk ipython && \
python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); \
nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger_eng'); download('en_core_web_sm')"

# we aren't copying main nemo_skills folder as it will always be mounted from host
# but we do want to install all requirements in the container directly
RUN mkdir -p /opt/NeMo-Skills/requirements
COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY requirements /opt/NeMo-Skills/requirements/
# installing sdp in container only
RUN pip install git+https://github.com/NVIDIA/NeMo-speech-data-processor@29b9b1ec0ceaf3ffa441c1d01297371b3f8e11d2
RUN pip install -r /opt/NeMo-Skills/requirements/main.txt
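Because the image now carries only the requirements, the `nemo_skills` sources have to be mounted in at runtime. A dry-run sketch of such an invocation; the `/opt/NeMo-Skills` mount target and the editable install are assumptions based on the COPY destinations above:

```shell
# Build (and print) a docker run command for an image that expects the
# source tree mounted from the host. Printed rather than executed so it
# can be inspected first; drop the echo to actually run it.
repo=$PWD   # host checkout of NeMo-Skills
cmd="docker run --rm -v $repo:/opt/NeMo-Skills nemo-skills-image bash -c 'pip install -e /opt/NeMo-Skills && ns --help'"
echo "$cmd"
```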
Comment on lines +51 to +58

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

Verify that all container usage contexts properly mount the nemo_skills code.

This is a significant architectural change: the image no longer contains the nemo_skills source code and instead expects it to be mounted from the host at runtime. While this is excellent for local development workflows (allowing live code changes), it introduces a dependency on proper mount configuration.

Ensure that:

  1. All places that use this container (CI workflows, documentation examples, cluster configs) properly mount the code directory
  2. Error messages are clear if the container is run without the required mount
  3. Documentation clearly explains this requirement

🏁 Script executed:

```shell
#!/bin/bash
# Verify that container usage includes proper code mounting

echo "=== Checking workflow files for code mounting ==="
rg -n -B2 -A2 'nemo-skills-image|nemo-skills-sandbox-image' .github/workflows/

echo -e "\n=== Checking if resolve_container_image or related code handles mounting ==="
rg -n -B3 -A3 'resolve_container_image' nemo_skills/pipeline/

echo -e "\n=== Checking documentation for mount requirements ==="
rg -n -B2 -A2 'dockerfile:dockerfiles/Dockerfile.nemo-skills' docs/
```

Length of output: 6127


Let me examine the core Docker execution code and mount handling:


---

🏁 Script executed:

```shell
#!/bin/bash
# Look at the DockerExecutor class and how it handles mounting

echo "=== Finding DockerExecutor class ==="
rg -n 'class DockerExecutor' nemo_skills/

echo -e "\n=== Looking at DockerExecutor instantiation in exp.py ==="
rg -n -B5 -A10 'DockerExecutor(' nemo_skills/pipeline/utils/exp.py

echo -e "\n=== Looking at mounts.py file ==="
rg -n 'def get_mounts_from_config' nemo_skills/

echo -e "\n=== Checking how docker executor actually runs containers ==="
rg -n 'docker run' nemo_skills/
```

Length of output: 460



---

🏁 Script executed:

```shell
#!/bin/bash
# Let's look at actual mount configurations

echo "=== Looking at get_mounts_from_config in mounts.py ==="
rg -n -A30 'def get_mounts_from_config' nemo_skills/pipeline/utils/mounts.py

echo -e "\n=== Looking at start_local_sandbox.sh docker run command ==="
cat nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh | head -50

echo -e "\n=== Checking for nemo_skills source mount patterns ==="
rg -n 'nemo_skills' nemo_skills/pipeline/utils/mounts.py | head -20
```

Length of output: 2872



---

🏁 Script executed:

```shell
#!/bin/bash
# Check the Dockerfile to see what paths it expects at runtime

echo "=== Full Dockerfile.nemo-skills ==="
cat dockerfiles/Dockerfile.nemo-skills

echo -e "\n=== Looking for default cluster configs ==="
fd -n 'cluster.*\.yaml|config.*\.yaml' docs/ examples/ | head -20

echo -e "\n=== Checking if nemo_skills is installed as a package ==="
rg -n 'pip install.*nemo.skills|setup.py|pyproject.toml' dockerfiles/Dockerfile.nemo-skills
```

Length of output: 2899



---

🏁 Script executed:

```shell
#!/bin/bash
# Find cluster configs and understand mount setup

echo "=== Look for cluster config examples ==="
find docs examples -name "*.yaml" -o -name "*.yml" 2>/dev/null | head -20

echo -e "\n=== Check exp.py for local executor mount setup ==="
rg -n -B10 -A20 'executor.*local' nemo_skills/pipeline/utils/exp.py | head -60

echo -e "\n=== Check where nemo_skills code is imported at runtime ==="
rg -n 'from nemo_skills|import nemo_skills' nemo_skills/pipeline/ | head -10
```

Length of output: 3691



---

🏁 Script executed:

```shell
#!/bin/bash
# Final check: look for documentation about the mounting requirement

echo "=== Check README for mounting instructions ==="
rg -n -B2 -A2 'mount|volume|docker.*run' README.md | head -40

echo -e "\n=== Look for any setup or quickstart docs ==="
find docs -name "*.md" -type f | xargs grep -l "docker.*mount\|volume" 2>/dev/null | head -5

echo -e "\n=== Check if there's a getting started doc ==="
ls -la docs/*.md docs/*/index.md 2>/dev/null | head -20
```

Length of output: 572



---

🏁 Script executed:

```shell
#!/bin/bash
# Quick check: does pyproject.toml get installed as a package in the container?

echo "=== Check if pyproject.toml is used to install nemo_skills as package ==="
rg -n 'pip install.*pyproject|pip install -e' dockerfiles/Dockerfile.nemo-skills

echo -e "\n=== Check what's in main.txt requirements ==="
head -30 requirements/main.txt

echo -e "\n=== Check if nemo_skills is even listed in requirements ==="
grep -i 'nemo.skills\|nemo-skills' requirements/main.txt
```

Length of output: 1426


Fix the incomplete architectural transition: source code mounting not implemented in workflows or documentation.

This is a breaking change that's incompletely implemented:

  1. Dockerfile expects mounting (line 51 comment) but the code isn't in the container
  2. Workflows don't mount the source code:
    • tests.yml: docker run --rm --network=host nemo-skills-sandbox-image (no mounts)
    • gpu_tests.yml: docker run --rm -v /tmp:/tmp -v /home:/home nemo-skills-image (mounts /tmp and /home only)
  3. No automatic mounting mechanism exists in DockerExecutor: `volumes=mounts` only includes user-specified mounts from the cluster config
  4. No documentation explains this requirement

Required fixes:

  • Add source code mount to CI workflows (e.g., -v $(pwd):/opt/NeMo-Skills)
  • Document the mounting requirement in README and docs
  • Consider adding default mounting for local executor or clear error handling if mount is missing
🤖 Prompt for AI Agents
In dockerfiles/Dockerfile.nemo-skills around lines 51 to 58, the Dockerfile and
project assume the NeMo-Skills source will be mounted into the container but CI
workflows, the DockerExecutor, and documentation were not updated to provide or
require that mount; update CI workflow files (tests.yml and gpu_tests.yml) to
mount the repository into the container (e.g., -v $(pwd):/opt/NeMo-Skills or
equivalent CI-safe path), update DockerExecutor to either add a sensible default
source mount for local runs or detect missing mount and fail early with a clear
error, and add a short note in README/docs explaining that the container expects
the host source to be mounted at /opt/NeMo-Skills and how to run CI/locally with
the mount; ensure the mount path used in workflows matches the path referenced
in the Dockerfile.

2 changes: 1 addition & 1 deletion dockerfiles/README.md
Expand Up @@ -4,7 +4,7 @@ Some dockerfiles are directly included in this folder and for some others the in

The dockerfiles can be built using the standard docker build command. e.g.,
```shell
docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills .
docker build -t nemo-skills-image:0.7.1 -f dockerfiles/Dockerfile.nemo-skills .
```

In addition, we provide a utility script which provides sane build defaults
33 changes: 32 additions & 1 deletion docs/basics/index.md
@@ -98,9 +98,12 @@ config might look like
executor: local

containers:
# some containers are public and we pull them
trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0
vllm: vllm/vllm-openai:v0.10.1.1
nemo: igitman/nemo-skills-nemo:0.7.0
# some containers are custom and we will build them locally before running the job
# you can always pre-build them as well
nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills
# ... there are some more containers defined here

env_vars:
@@ -172,6 +175,34 @@ leverage a Slurm cluster[^2]. Let's setup our cluster config for that case by ru
This time pick `slurm` for the config type and fill out all other required information
(such as ssh access, account, partition, etc.).

!!! note
If you're an NVIDIA employee, we have pre-configured cluster configs for internal usage with pre-built sqsh
containers available at <https://gitlab-master.nvidia.com/igitman/nemo-skills-configs>. You can most likely
skip the step below and reuse one of the existing configurations.

You will also need to build `.sqsh` files for all containers, or upload all `dockerfile:...` containers to
a registry (e.g. Docker Hub) and reference the uploaded versions. To build the sqsh files you can use the following commands:

1. Build images locally and upload to some container registry. E.g.
```bash
docker build -t gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1 -f dockerfiles/Dockerfile.nemo-skills .
docker push gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1
```
2. Start an interactive shell, e.g. with the following (assuming there is a "cpu" partition)
```bash
srun -A <account> --partition cpu --job-name build-sqsh --time=1:00:00 --exclusive --pty /bin/bash -l
```
3. Import the image, e.g.:
```bash
enroot import -o /path/to/nemo-skills-image.sqsh --docker://gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1
```
4. Specify this image path in your cluster config
```yaml
containers:
nemo-skills: /path/to/nemo-skills-image.sqsh
```
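The four steps above can be collected into one dry-run script. The registry and image names are the docs' own examples and the sqsh path is the same placeholder, so substitute your own values; commands are printed rather than run:

```shell
# Dry-run sketch of the sqsh build flow: build + push locally, then
# import on the cluster (the import runs inside the srun shell of step 2).
img="gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1"
sqsh="/path/to/nemo-skills-image.sqsh"   # placeholder path from the docs
build_cmd="docker build -t $img -f dockerfiles/Dockerfile.nemo-skills ."
push_cmd="docker push $img"
import_cmd="enroot import -o $sqsh docker://$img"
printf '%s\n' "$build_cmd" "$push_cmd" "$import_cmd"
```

The resulting `.sqsh` path is what goes under `containers:` in the cluster config (step 4).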
Comment on lines +178 to +204

⚠️ Potential issue | 🟡 Minor

Fix markdown linting issues in the documentation.

The new section provides helpful guidance, but there are a couple of formatting issues:

  1. Line 180: The bare URL should be formatted as a proper markdown link or wrapped in angle brackets
  2. Line 204: The closing code fence is missing a language specifier (should be ```yaml instead of just ```)

Apply these fixes:

For line 180 - replace the bare URL with a proper markdown link:

-    containers available at https://gitlab-master.nvidia.com/igitman/nemo-skills-configs. You can most likely
+    containers available at [GitLab](https://gitlab-master.nvidia.com/igitman/nemo-skills-configs). You can most likely

For line 204 - the code block starting earlier should close with a language identifier. Based on the content showing yaml examples, verify the code fence at line 204 properly closes the yaml block.

🧰 Tools
🪛 markdownlint-cli2 (0.18.1)

180-180: Bare URL used

(MD034, no-bare-urls)


204-204: Fenced code blocks should have a language specified

(MD040, fenced-code-language)

🤖 Prompt for AI Agents
In docs/basics/index.md around lines 178-204, fix two markdown lint issues:
replace the bare URL on line 180 with a proper markdown link (e.g.
[pre-configured
configs](https://gitlab-master.nvidia.com/igitman/nemo-skills-configs) or wrap
it in angle brackets) and ensure the code block that demonstrates the YAML
cluster config is closed correctly by using a matching fenced code block with
the yaml language identifier (change the trailing ``` to ```yaml).


Now that we have a slurm config set up, we can try running some jobs. Generally, you will need to upload models / data
to the cluster manually and then reference a proper mounted path. But for small-scale things we can also leverage the
[code packaging](./code-packaging.md) functionality that nemo-skills provides. Whenever you run any of the ns commands
2 changes: 1 addition & 1 deletion docs/basics/sandbox.md
@@ -18,7 +18,7 @@ Most of the time, the pipeline scripts will launch sandbox automatically when re
it manually, you can use the following command

```bash
docker run --rm --network=host igitman/nemo-skills-sandbox:0.7.1
./nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh
```
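The tests workflow launches the sandbox the same way, in the background, and sleeps before first use. The pattern looks like this; `sleep 60` stands in for the actual script so the sketch runs without docker:

```shell
# Background-launch pattern from the test workflow: start the sandbox,
# remember its pid, give it time to come up, and stop it when done.
start_sandbox() { sleep 60; }    # stand-in for start_local_sandbox.sh
start_sandbox &
pid=$!
sleep 1                          # CI uses `sleep 10` before first use
kill -0 "$pid"                   # exit status 0 means it is still up
echo "sandbox running as pid $pid"
kill "$pid" 2>/dev/null
wait "$pid" 2>/dev/null || true  # reap; non-zero status from the kill is fine
echo "sandbox stopped"
```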

If docker is not available, you can still run a sandbox (although less efficient version) like this