diff --git a/.github/workflows/gpu_tests.yml b/.github/workflows/gpu_tests.yml index d3949bfc74..c069a01172 100644 --- a/.github/workflows/gpu_tests.yml +++ b/.github/workflows/gpu_tests.yml @@ -40,6 +40,10 @@ jobs: pip install -e . pip install -r requirements/common-tests.txt ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24 + - name: Build Docker image + run: | + cd ${{ github.run_id }} + docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills . - name: Run GPU tests timeout-minutes: 240 env: @@ -52,7 +56,7 @@ jobs: - name: Cleanup if: always() run: | - docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' + docker run --rm -v /tmp:/tmp -v /home:/home nemo-skills-image bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' docker ps -a -q | xargs -r docker stop gpu-tests-qwen: @@ -79,6 +83,10 @@ jobs: pip install -e . pip install -r requirements/common-tests.txt ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24 + - name: Build Docker image + run: | + cd ${{ github.run_id }} + docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills . 
- name: Run GPU tests timeout-minutes: 240 env: @@ -91,5 +99,5 @@ jobs: - name: Cleanup if: always() run: | - docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.7.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' + docker run --rm -v /tmp:/tmp -v /home:/home nemo-skills-image bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/' docker ps -a -q | xargs -r docker stop diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 585292c0ee..521a5cc103 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -25,37 +25,21 @@ jobs: with: python-version: "3.10" cache: pip - - name: Detect Docker changes - id: changes - uses: dorny/paths-filter@v3 - with: - filters: | - docker: - - 'dockerfiles/Dockerfile.sandbox' - - 'dockerfiles/Dockerfile.nemo-skills' - - 'nemo_skills/code_execution/local_sandbox/**' - - 'requirements/**' - name: Install dependencies run: | python -m pip install --upgrade pip pip install -e .[dev] - name: Build Images - if: steps.changes.outputs.docker == 'true' run: | # these tags need to match the ones in tests/gpu-tests/test-local.yaml - docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills . - docker build -t igitman/nemo-skills-sandbox:0.7.1 -f dockerfiles/Dockerfile.sandbox . - - name: Pull Images - if: steps.changes.outputs.docker != 'true' - run: | - docker pull igitman/nemo-skills:0.7.1 - docker pull igitman/nemo-skills-sandbox:0.7.1 + docker build -t nemo-skills-image -f dockerfiles/Dockerfile.nemo-skills . + docker build -t nemo-skills-sandbox-image -f dockerfiles/Dockerfile.sandbox . 
- name: Run all tests env: NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | - docker run --rm --network=host igitman/nemo-skills-sandbox:0.7.1 & + docker run --rm --network=host nemo-skills-sandbox-image & sleep 10 set -o pipefail # this will make sure next line returns non-0 exit code if tests fail ns prepare_data gsm8k math-500 diff --git a/MANIFEST.in b/MANIFEST.in index a69e235884..112893bcab 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,4 @@ recursive-include nemo_skills *.yaml recursive-include nemo_skills *.txt +graft dockerfiles +graft requirements diff --git a/cluster_configs/example-local.yaml b/cluster_configs/example-local.yaml index 7588fb80f8..d7f025fec5 100644 --- a/cluster_configs/example-local.yaml +++ b/cluster_configs/example-local.yaml @@ -18,12 +18,12 @@ containers: trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 sglang: lmsysorg/sglang:v0.5.3rc1-cu126 - nemo: igitman/nemo-skills-nemo:0.7.0 - megatron: igitman/nemo-skills-megatron:0.7.0 - sandbox: igitman/nemo-skills-sandbox:0.7.1 - nemo-skills: igitman/nemo-skills:0.7.1 - verl: igitman/nemo-skills-verl:0.7.0 - nemo-rl: igitman/nemo-skills-nemo-rl:0.7.1 + # dockerfile: for now can only specify relative to repo root + megatron: dockerfile:dockerfiles/Dockerfile.megatron + sandbox: dockerfile:dockerfiles/Dockerfile.sandbox + nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills + verl: dockerfile:dockerfiles/Dockerfile.verl + nemo-rl: dockerfile:dockerfiles/Dockerfile.nemo-rl # add required mounts for models/data here # the code is mounted automatically inside /nemo_run/code @@ -34,8 +34,8 @@ containers: # - /mnt/datadrive/models:/models # - /mnt/datadrive/data:/data # - /home//workspace:/workspace -# you can also override container libraries by directly mounting over them. E.g. 
to override NeMo-Aligner do -# - <...>/NeMo-Aligner:/opt/NeMo-Aligner +# you can also override container libraries by directly mounting over them. E.g. to override NeMo-RL do +# - <...>/NeMo-RL:/opt/NeMo-RL # define any environment variables. Note that HF_HOME is required by default and needs to be a mounted path! # env_vars: diff --git a/cluster_configs/example-slurm.yaml b/cluster_configs/example-slurm.yaml index 7a32565ebc..3cf914cafa 100644 --- a/cluster_configs/example-slurm.yaml +++ b/cluster_configs/example-slurm.yaml @@ -15,15 +15,8 @@ executor: slurm containers: - trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 - vllm: vllm/vllm-openai:v0.10.1.1 - sglang: lmsysorg/sglang:v0.5.3rc1-cu126 - nemo: igitman/nemo-skills-nemo:0.7.0 - megatron: igitman/nemo-skills-megatron:0.7.0 - sandbox: igitman/nemo-skills-sandbox:0.7.1 - nemo-skills: igitman/nemo-skills:0.7.1 - verl: igitman/nemo-skills-verl:0.7.0 - nemo-rl: igitman/nemo-skills-nemo-rl:0.7.1 + # follow steps in https://nvidia-nemo.github.io/Skills/basics/#slurm-inference + # to complete this section job_name_prefix: "nemo_skills:" diff --git a/dockerfiles/Dockerfile.nemo b/dockerfiles/Dockerfile.nemo deleted file mode 100644 index a333b44f6a..0000000000 --- a/dockerfiles/Dockerfile.nemo +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# copied from https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile -# with pinned NeMo-Aligner version for reproducibility - -# To build NeMo-Aligner from a base PyTorch container: -# -# docker buildx build -t aligner:latest . -# -# To update NeMo-Aligner from a pre-built NeMo-Framework container: -# -# docker buildx build --target=aligner-bump -t aligner:latest . -# - -# Number of parallel threads for compute heavy build jobs -# if you get errors building TE or Apex, decrease this to 4 -ARG MAX_JOBS=8 -# Git refs for dependencies -ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea -ARG PYTRITON_VERSION=0.5.10 -ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634 # On: main -ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3 # On: main -ARG ALIGNER_COMMIT=35fcfd9df754aff56f71cb3ba3382cc02384361a -ARG TRTLLM_VERSION=v0.13.0 -ARG PROTOBUF_VERSION=4.24.4 -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 - -FROM ${BASE_IMAGE} AS aligner-bump -ARG ALIGNER_COMMIT -WORKDIR /opt -# NeMo Aligner -RUN <<"EOF" bash -exu -if [[ ! -d NeMo-Aligner ]]; then - git clone https://github.com/NVIDIA/NeMo-Aligner.git -fi -cd NeMo-Aligner -git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge' -git checkout -f $ALIGNER_COMMIT -# case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it -# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail -git pull --rebase || true - -pip install --no-cache-dir --no-deps -e . -EOF - -FROM ${BASE_IMAGE} as final -LABEL "nemo.library"="nemo-aligner" -WORKDIR /opt -# needed in case git complains that it can't detect a valid email, this email is fake but works -RUN git config --global user.email "worker@nvidia.com" -# install latest apex -ARG APEX_TAG -RUN pip uninstall -y apex && \ - git clone https://github.com/NVIDIA/apex && \ - cd apex && \ - if [ ! 
-z $APEX_TAG ]; then \ - git fetch origin $APEX_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./ - -# Git LFS -RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \ - apt-get install git-lfs && \ - git lfs install && \ - apt-get clean - -# TRTLLM -ARG TRTLLM_VERSION -RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \ - cd TensorRT-LLM && \ - git checkout ${TRTLLM_VERSION} && \ - . docker/common/install_tensorrt.sh && \ - python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \ - pip install -e . -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/ - -# install TransformerEngine -ARG MAX_JOBS -ARG TE_TAG -RUN pip uninstall -y transformer-engine && \ - git clone https://github.com/NVIDIA/TransformerEngine.git && \ - cd TransformerEngine && \ - if [ ! -z $TE_TAG ]; then \ - git fetch origin $TE_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - git submodule init && git submodule update && \ - NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install . - -RUN pip install fire - -# place any util pkgs here -ARG PYTRITON_VERSION -RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION -ARG PROTOBUF_VERSION -RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION -RUN pip install --upgrade-strategy only-if-needed jsonlines - -# NeMo -ARG NEMO_TAG -RUN git clone https://github.com/NVIDIA/NeMo.git && \ - cd NeMo && \ - git pull && \ - if [ ! 
-z $NEMO_TAG ]; then \ - git fetch origin $NEMO_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - pip uninstall -y nemo_toolkit sacrebleu && \ - pip install -e ".[nlp]" && \ - cd nemo/collections/nlp/data/language_modeling/megatron && make - -# MLM -ARG MLM_TAG -RUN pip uninstall -y megatron-core && \ - git clone https://github.com/NVIDIA/Megatron-LM.git && \ - cd Megatron-LM && \ - git pull && \ - if [ ! -z $MLM_TAG ]; then \ - git fetch origin $MLM_TAG && \ - git checkout FETCH_HEAD; \ - fi && \ - pip install -e . - -COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner -RUN cd /opt/NeMo-Aligner && \ - pip install --no-deps -e . - -RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch - -# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs -RUN <<"EOF" bash -exu -cd NeMo -# Ensures we don't cherry-pick "future" origin/main commits -git fetch -a -# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651 -# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652 -# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863 -# (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654 -for pr_and_commit in \ - "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \ - "10652 60e677423667c029dd05875da72bf0719774f844" \ - "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \ -; do - pr=$(cut -f1 -d' ' <<<"$pr_and_commit") - head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit") - git fetch origin $head_pr_commit:PR-${pr} - # cherry-picks all commits between main and the top of the PR - git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr} - # Tag cherry-picks to help - git tag 
cherry-pick-PR-${pr} -done -EOF - -# patching gpt sft dataset to properly support packing -# TODO: remove when integrated in NeMo -COPY nemo_skills/training/gpt_sft_dataset.py /opt/NeMo/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py diff --git a/dockerfiles/Dockerfile.nemo-skills b/dockerfiles/Dockerfile.nemo-skills index 1683368c4a..25f6499808 100644 --- a/dockerfiles/Dockerfile.nemo-skills +++ b/dockerfiles/Dockerfile.nemo-skills @@ -36,14 +36,6 @@ RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install -e . RUN apt remove -y python3-blinker -RUN mkdir -p /opt/NeMo-Skills/requirements -COPY pyproject.toml README.md /opt/NeMo-Skills/ -COPY nemo_skills /opt/NeMo-Skills/nemo_skills/ -COPY requirements /opt/NeMo-Skills/requirements/ -# installing sdp in container only -RUN pip install git+https://github.com/NVIDIA/NeMo-speech-data-processor@29b9b1ec0ceaf3ffa441c1d01297371b3f8e11d2 -RUN cd /opt/NeMo-Skills && pip install -e . - # ifbench RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1 RUN cd /opt/benchmarks/IFBench && pip install -r requirements.txt @@ -55,3 +47,12 @@ RUN cd /opt/benchmarks/IFBench && git apply ifbench.patch RUN pip install langdetect absl-py immutabledict nltk ipython && \ python -c "import nltk; from spacy.cli import download; nltk.download('punkt'); nltk.download('punkt_tab'); \ nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger_eng'); download('en_core_web_sm')" + +# we aren't copying main nemo_skills folder as it will always be mounted from host +# but we do want to install all requirements in the container directly +RUN mkdir -p /opt/NeMo-Skills/requirements +COPY pyproject.toml README.md /opt/NeMo-Skills/ +COPY requirements /opt/NeMo-Skills/requirements/ +# installing sdp in container only +RUN pip install git+https://github.com/NVIDIA/NeMo-speech-data-processor@29b9b1ec0ceaf3ffa441c1d01297371b3f8e11d2 +RUN pip install -r 
/opt/NeMo-Skills/requirements/main.txt diff --git a/dockerfiles/README.md index 0e5197d142..86182fabad 100644 --- a/dockerfiles/README.md +++ b/dockerfiles/README.md @@ -4,7 +4,7 @@ Some dockerfiles are directly included in this folder and for some others the in The dockerfiles can be built using the standard docker build command. e.g., ```shell -docker build -t igitman/nemo-skills:0.7.1 -f dockerfiles/Dockerfile.nemo-skills . +docker build -t nemo-skills-image:0.7.1 -f dockerfiles/Dockerfile.nemo-skills . ``` In addition, we provide a utility script which provides sane build defaults diff --git a/docs/basics/index.md b/docs/basics/index.md index 49efee39f6..436496c104 100644 --- a/docs/basics/index.md +++ b/docs/basics/index.md @@ -98,9 +98,12 @@ config might look like executor: local containers: + # some containers are public and we pull them trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 - nemo: igitman/nemo-skills-nemo:0.7.0 + # some containers are custom and we will build them locally before running the job + # you can always pre-build them as well + nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills # ... there are some more containers defined here env_vars: @@ -172,6 +175,34 @@ leverage a Slurm cluster[^2]. Let's setup our cluster config for that case by ru This time pick `slurm` for the config type and fill out all other required information (such as ssh access, account, partition, etc.). +!!! note + If you're an NVIDIA employee, we have pre-configured cluster configs for internal usage with pre-built sqsh + containers available at https://gitlab-master.nvidia.com/igitman/nemo-skills-configs. You can most likely + skip the step below and reuse one of the existing configurations. + +You will also need to build .sqsh files for all containers or upload all `dockerfile:...` containers to +some registry (e.g. dockerhub) and reference the uploaded versions. 
To build sqsh files you can use the following commands: + +1. Build images locally and upload to some container registry. E.g. + ```bash + docker build -t gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1 -f dockerfiles/Dockerfile.nemo-skills . + docker push gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1 + ``` +2. Start an interactive shell, e.g. with the following (assuming there is a "cpu" partition) + ```bash + srun -A <account> --partition cpu --job-name build-sqsh --time=1:00:00 --exclusive --pty /bin/bash -l + ``` +3. Import the image, e.g.: + ```bash + enroot import -o /path/to/nemo-skills-image.sqsh --docker://gitlab-master.nvidia.com/igitman/nemo-skills-containers:nemo-skills-0.7.1 + ``` +4. Specify this image path in your cluster config + ```yaml + containers: + nemo-skills: /path/to/nemo-skills-image.sqsh + ``` + + Now that we have a slurm config setup, we can try running some jobs. Generally, you will need to upload models / data on cluster manually and then reference a proper mounted path. But for small-scale things we can also leverage the [code packaging](./code-packaging.md) functionality that nemo-skills provide. 
Whenever you run any of the ns commands diff --git a/docs/basics/sandbox.md b/docs/basics/sandbox.md index 25a66b00e1..2a4ae23ada 100644 --- a/docs/basics/sandbox.md +++ b/docs/basics/sandbox.md @@ -18,7 +18,7 @@ Most of the time, the pipeline scripts will launch sandbox automatically when re it manually, you can use the following command ```bash -docker run --rm --network=host igitman/nemo-skills-sandbox:0.7.1 +./nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh ``` If docker is not available, you can still run a sandbox (although less efficient version) like this diff --git a/nemo_skills/__init__.py b/nemo_skills/__init__.py index 5817bc9fa1..af63dea75a 100644 --- a/nemo_skills/__init__.py +++ b/nemo_skills/__init__.py @@ -19,9 +19,9 @@ "trtllm": "nvcr.io/nvidia/tensorrt-llm/release:1.0.0", "vllm": "vllm/vllm-openai:v0.10.1.1", "sglang": "lmsysorg/sglang:v0.5.3rc1-cu126", - "megatron": "igitman/nemo-skills-megatron:0.7.0", - "sandbox": "igitman/nemo-skills-sandbox:0.7.1", - "nemo-skills": "igitman/nemo-skills:0.7.1", - "verl": "igitman/nemo-skills-verl:0.7.0", - "nemo-rl": "igitman/nemo-skills-nemo-rl:0.7.1", + "megatron": "dockerfile:dockerfiles/Dockerfile.megatron", + "sandbox": "dockerfile:dockerfiles/Dockerfile.sandbox", + "nemo-skills": "dockerfile:dockerfiles/Dockerfile.nemo-skills", + "verl": "dockerfile:dockerfiles/Dockerfile.verl", + "nemo-rl": "dockerfile:dockerfiles/Dockerfile.nemo-rl", } diff --git a/nemo_skills/pipeline/setup.py b/nemo_skills/pipeline/setup.py index 46d760df29..c767a9ce5a 100644 --- a/nemo_skills/pipeline/setup.py +++ b/nemo_skills/pipeline/setup.py @@ -22,6 +22,7 @@ from nemo_skills import _containers from nemo_skills.pipeline.app import app from nemo_skills.pipeline.utils import is_mounted_filepath +from nemo_skills.pipeline.utils.docker_images import resolve_container_image def is_docker_available(): @@ -35,12 +36,20 @@ def is_docker_available(): # Helper function to pull Docker containers def 
pull_docker_containers(containers): for container_name, container_image in containers.items(): - typer.echo(f"Pulling {container_name}: {container_image}...") - try: - subprocess.run(["docker", "pull", container_image], check=True) - typer.echo(f"Successfully pulled {container_image}") - except subprocess.SubprocessError as e: - typer.echo(f"Failed to pull {container_image}: {e}") + if container_image.startswith("dockerfile:"): + typer.echo(f"Building {container_name} from {container_image}...") + try: + resolved_image = resolve_container_image(container_image, {"executor": "local"}) + typer.echo(f"Successfully built {resolved_image}") + except Exception as e: + typer.echo(f"Failed to build {container_image}: {e}") + else: + typer.echo(f"Pulling {container_name}: {container_image}...") + try: + subprocess.run(["docker", "pull", container_image], check=True) + typer.echo(f"Successfully pulled {container_image}") + except subprocess.SubprocessError as e: + typer.echo(f"Failed to pull {container_image}: {e}") @app.command() @@ -81,8 +90,10 @@ def setup(): if not overwrite: continue - # initializing default containers - config = {"executor": config_type, "containers": _containers} + # initialize config with executor only; containers handled per executor type + config = {"executor": config_type} + if config_type == "local": + config["containers"] = dict(_containers) mounts = typer.prompt( "\nWe execute all commands in docker containers, so you need to " @@ -201,21 +212,37 @@ def setup(): } # Create the config file + yaml_content = yaml.dump(config, sort_keys=False, indent=4) + if config_type == "slurm": + slurm_comment = ( + "executor: slurm\n\n" + "containers:\n" + " # follow steps in https://nvidia-nemo.github.io/Skills/basics/#slurm-inference\n" + " # to complete this section\n\n" + ) + yaml_content = yaml_content.replace("executor: slurm\n", slurm_comment) with open(config_file, "wt") as fout: - yaml.dump(config, fout, sort_keys=False, indent=4) + 
fout.write(yaml_content) - typer.echo( - f"\nCreated {config_type} config file at {config_file}.\n" - f"The containers section was initialized with default values, but you can always change them manually.\n" - f"You can find more information on what containers we use in " - f"https://github.com/NVIDIA-NeMo/Skills/tree/main/dockerfiles" - ) + if config_type == "local": + typer.echo( + f"\nCreated {config_type} config file at {config_file}.\n" + f"The containers section was initialized with default values, but you can always change them manually.\n" + f"You can find more information on what containers we use in " + f"https://github.com/NVIDIA-NeMo/Skills/tree/main/dockerfiles" + ) + else: + typer.echo( + f"\nCreated {config_type} config file at {config_file}.\n" + "We left the containers section empty. Follow the instructions at " + "https://nvidia-nemo.github.io/Skills/basics/#slurm-inference to configure your cluster containers." + ) if config_type == "local": pull_containers = typer.confirm( - "\nWould you like to pull all the necessary Docker containers now? " + "\nWould you like to pull/build all the necessary Docker containers now? 
" "This might take some time but ensures everything is ready to use.\n" - "You can skip this step and we will pull the containers automatically when you run the first job.", + "You can skip this step and we will pull/build the containers automatically when you run the first job.", default=True, ) diff --git a/nemo_skills/pipeline/utils/__init__.py b/nemo_skills/pipeline/utils/__init__.py index 875bb5a552..99da28fa42 100644 --- a/nemo_skills/pipeline/utils/__init__.py +++ b/nemo_skills/pipeline/utils/__init__.py @@ -31,6 +31,7 @@ tunnel_hash, update_ssh_tunnel_config, ) +from nemo_skills.pipeline.utils.docker_images import resolve_container_image from nemo_skills.pipeline.utils.exp import ( CustomJobDetails, add_task, diff --git a/nemo_skills/pipeline/utils/docker_images.py b/nemo_skills/pipeline/utils/docker_images.py new file mode 100644 index 0000000000..a1036f74f2 --- /dev/null +++ b/nemo_skills/pipeline/utils/docker_images.py @@ -0,0 +1,97 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import hashlib +import logging +import re +import subprocess +from pathlib import Path + +from nemo_skills.utils import get_logger_name + +LOG = logging.getLogger(get_logger_name(__file__)) + +_DOCKERFILE_PREFIX = "dockerfile:" +_REPO_ROOT = Path(__file__).resolve().parents[3] + + +def _sanitize_image_component(value: str) -> str: + sanitized = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") + return sanitized + + +def _resolve_dockerfile_path(dockerfile_path_str: str) -> Path: + dockerfile_path = Path(dockerfile_path_str.strip()) + if dockerfile_path.is_absolute(): + raise ValueError("Dockerfile path must be specified relative to the repository root.") + + resolved = (_REPO_ROOT / dockerfile_path).resolve() + try: + resolved.relative_to(_REPO_ROOT) + except ValueError as exc: + raise ValueError(f"Dockerfile path '{dockerfile_path}' escapes the repository root.") from exc + + if not resolved.exists(): + raise FileNotFoundError( + f"Dockerfile '{dockerfile_path}' not found relative to repository root '{_REPO_ROOT}'." 
+ ) + if not resolved.is_file(): + raise ValueError(f"Dockerfile path '{dockerfile_path}' does not resolve to a file.") + + return resolved + + +def _build_local_docker_image(dockerfile_spec: str) -> str: + dockerfile_path = _resolve_dockerfile_path(dockerfile_spec) + rel_identifier = dockerfile_path.relative_to(_REPO_ROOT).as_posix() + image_name = f"locally-built-{_sanitize_image_component(rel_identifier)}" + digest = hashlib.sha256(dockerfile_path.read_bytes()).hexdigest()[:12] + image_ref = f"{image_name}:{digest}" + context_dir = _REPO_ROOT + + LOG.info("Building Docker image %s from %s (context: %s)", image_ref, dockerfile_path, context_dir) + try: + subprocess.run( + [ + "docker", + "build", + "-f", + str(dockerfile_path), + "-t", + image_ref, + str(context_dir), + ], + check=True, + ) + except FileNotFoundError as exc: + raise RuntimeError( + "Docker is required to build images from dockerfile specifications, but it was not found in PATH." + ) from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError(f"Failed to build Docker image from {dockerfile_path}") from exc + + return image_ref + + +def resolve_container_image(container: str, cluster_config: dict) -> str: + if not container.startswith(_DOCKERFILE_PREFIX): + return container + + if cluster_config["executor"] != "local": + raise ValueError("dockerfile container specifications are only supported for the local executor.") + + dockerfile_spec = container[len(_DOCKERFILE_PREFIX) :].strip() + if not dockerfile_spec: + raise ValueError("dockerfile container specification must include a path.") + return _build_local_docker_image(dockerfile_spec) diff --git a/nemo_skills/pipeline/utils/exp.py b/nemo_skills/pipeline/utils/exp.py index df5fa98767..be2328b012 100644 --- a/nemo_skills/pipeline/utils/exp.py +++ b/nemo_skills/pipeline/utils/exp.py @@ -34,6 +34,7 @@ temporary_env_update, tunnel_hash, ) +from nemo_skills.pipeline.utils.docker_images import resolve_container_image from 
nemo_skills.pipeline.utils.mounts import ( check_remote_mount_directories, get_mounts_from_config, @@ -196,8 +197,9 @@ def get_executor( if cluster_config["executor"] == "local": env_vars["PYTHONUNBUFFERED"] = "1" # this makes sure logs are streamed right away + resolved_container = resolve_container_image(container, cluster_config) return DockerExecutor( - container_image=container, + container_image=resolved_container, packager=packager, ipc_mode="host", volumes=mounts, diff --git a/nemo_skills/pipeline/utils/mounts.py b/nemo_skills/pipeline/utils/mounts.py index 16250bef61..b55aa9b5bb 100644 --- a/nemo_skills/pipeline/utils/mounts.py +++ b/nemo_skills/pipeline/utils/mounts.py @@ -360,7 +360,7 @@ def resolve_mount_paths(cluster_config: dict, mount_paths: str | list | dict, cr def check_remote_mount_directories(directories: list, cluster_config: dict, exit_on_failure: bool = True): - """Create a remote directory on the cluster.""" + """Check if a directory exists on the cluster.""" if cluster_config is None: raise ValueError("Cluster config is not provided.") if isinstance(directories, str): @@ -373,22 +373,8 @@ def check_remote_mount_directories(directories: list, cluster_config: dict, exit ] if cluster_config.get("executor") != "slurm": - tunnel = run.LocalTunnel(job_dir=None) - missing_source_locations = [] - for directory in directories: - result = tunnel.run(f'test -e {directory} && echo "Directory Exists"', hide=True, warn=True) - if "Directory Exists" not in result.stdout: - missing_source_locations.append(directory) - tunnel.cleanup() - if len(missing_source_locations) > 0 and exit_on_failure: - missing_source_locations = [ - f"{loc} DOES NOT exist at source destination" for loc in missing_source_locations - ] - missing_source_locations = "\n".join(missing_source_locations) - raise FileNotFoundError( - f"Some files or directories do not exist at the source location for mounting !!\n\n" - f"{missing_source_locations}" - ) + # there is no error locally if 
mounts aren't present, so we are skipping the check + return elif cluster_config.get("executor") == "slurm": tunnel = get_tunnel(cluster_config) missing_source_locations = [] diff --git a/tests/conftest.py b/tests/conftest.py index d391f99d48..f975044637 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ def docker_run(command, image_name=None, volume_paths=None): config = yaml.safe_load(f.read()) if image_name is None: - image_name = config["containers"]["nemo-skills"] + image_name = "bash" if volume_paths is None: volume_paths = config["mounts"] @@ -39,7 +39,7 @@ def docker_run(command, image_name=None, volume_paths=None): volumes[os.path.abspath(src)] = {"bind": dst, "mode": "rw"} # Run the container - full_command = f"/bin/bash -c '{command}'" + full_command = f"bash -c '{command}'" result = client.containers.run( image_name, command=full_command, diff --git a/tests/gpu-tests/test-local.yaml b/tests/gpu-tests/test-local.yaml index dff1d5b9fd..57ba7d635a 100644 --- a/tests/gpu-tests/test-local.yaml +++ b/tests/gpu-tests/test-local.yaml @@ -18,12 +18,11 @@ containers: trtllm: nvcr.io/nvidia/tensorrt-llm/release:1.0.0 vllm: vllm/vllm-openai:v0.10.1.1 sglang: lmsysorg/sglang:v0.5.3rc1-cu126 - nemo: igitman/nemo-skills-nemo:0.7.0 - megatron: igitman/nemo-skills-megatron:0.7.0 - sandbox: igitman/nemo-skills-sandbox:0.7.1 - nemo-skills: igitman/nemo-skills:0.7.1 - verl: igitman/nemo-skills-verl:0.7.0 - nemo-rl: igitman/nemo-skills-nemo-rl:0.7.1 + sandbox: dockerfile:dockerfiles/Dockerfile.sandbox + nemo-skills: dockerfile:dockerfiles/Dockerfile.nemo-skills + megatron: dockerfile:dockerfiles/Dockerfile.megatron + verl: dockerfile:dockerfiles/Dockerfile.verl + nemo-rl: dockerfile:dockerfiles/Dockerfile.nemo-rl mounts: - /tmp:/tmp