diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml
index 89827efdad..59af0dc80b 100644
--- a/.github/workflows/_run_test.yml
+++ b/.github/workflows/_run_test.yml
@@ -70,6 +70,9 @@ jobs:
         run: |
           docker pull nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }}
 
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
       - name: Start container
         run: |
           nvidia-smi
@@ -81,6 +84,7 @@ jobs:
             --env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \
             --env REINFORCER_REPO_DIR=/opt/reinforcer \
             --env HF_TOKEN \
+            --volume $GITHUB_WORKSPACE:/opt/reinforcer \
             --volume $GITHUB_ACTION_DIR:$GITHUB_ACTION_DIR \
             --volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \
             --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \
@@ -91,6 +95,7 @@ jobs:
 
       - name: Run unit tests
         run: |
+          docker exec nemo_container_${{ github.run_id }} git config --global --add safe.directory /opt/reinforcer
           docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "${{ inputs.UNIT_TEST_SCRIPT }}"
 
       - name: Run doc tests
@@ -107,6 +112,7 @@ jobs:
       - name: after_script
         if: always() && inputs.AFTER_SCRIPT != ':'
         run: |
+          # Run the after script
           cmd=$(cat <<"RUN_TEST_EOF"
           ${{ inputs.AFTER_SCRIPT }}
           RUN_TEST_EOF
@@ -125,5 +131,8 @@ jobs:
       - name: Container shutdown
         if: always()
         run: |
+          # Ensure any added files in the mounted directory are owned by the runner user to allow it to clean up
+          # Best-effort: a failed chown (e.g. container already exited) must not skip the stop/rm below
+          docker exec nemo_container_${{ github.run_id }} bash -c "find /opt/reinforcer -path '/opt/reinforcer/datasets' -prune -o -exec chown $(id -u):$(id -g) {} +" || true
           docker container stop nemo_container_${{ github.run_id }} || true
           docker container rm nemo_container_${{ github.run_id }} || true
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 347f68e3ca..b7d39c841f 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -18,14 +18,16 @@ RUN chmod 755 /home/ray/.cache
 
 FROM base AS hermetic
 WORKDIR /opt/reinforcer
-# This is less efficient as this invalidates the cache more frequently, but
-# creates a smaller image. Adding reinforcer afterwards and doing
-# `uv pip install --no-deps --editable .` causes a "sync" of some of the environment,
-# which defeats the purpose of pre-installing.
-# In the future we may optimize this: https://github.com/NVIDIA/reinforcer/issues/129
-COPY --chown=ray --chmod=755 . /opt/reinforcer
+
+# First copy only the dependency files
+COPY --chown=ray --chmod=755 pyproject.toml uv.lock ./
+
+ENV UV_PROJECT_ENVIRONMENT=/opt/reinforcer_venv
+ENV VIRTUAL_ENV=/opt/reinforcer_venv
+
+# Create the virtual environment and sync dependencies into it
 RUN <<"EOF"
-uv venv .venv
+uv venv /opt/reinforcer_venv
 
 # uv sync has a more reliable resolver than simple uv pip install which can fail
 # Sync each training + inference backend one at a time (since they may conflict)
@@ -33,11 +35,10 @@ uv venv .venv
 # Do everything in one layer to prevent large layers.
 
 uv sync --locked --extra vllm --no-install-project
-uv sync --locked --all-groups
+uv sync --locked --all-groups --no-install-project
 EOF
 
-ENV VIRTUAL_ENV=/opt/reinforcer/.venv
-ENV PATH="/opt/reinforcer/.venv/bin:$PATH"
+ENV PATH="/opt/reinforcer_venv/bin:$PATH"
 
 # The ray images automatically activate the anaconda venv. We will
 # comment this out of the .bashrc to give the same UX between docker
@@ -50,7 +51,4 @@ sed -i '/# >>> conda initialize >>>/,/# <<< conda initialize <<