Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/_run_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ jobs:
# NOTE: under certain circumstances, the checkout action cannot clean up the workspace properly, so
# this workaround is needed to ensure that the workspace is clean by removing all files created by root.
#
# Tracking issue: https://github.com/NVIDIA/reinforcer/issues/76
#
# The error observed looked like this from the checkout action:
# Run actions/checkout@v4
# ...
Expand All @@ -85,15 +87,32 @@ jobs:

- name: Start container
run: |
# TODO: re-enable uv caching (--env UV_CACHE_DIR=/uv_cache --volume /mnt/datadrive/TestData/reinforcer/uv_cache:/uv_cache).
# It is disabled for now since enabling it results in:
#
# Using CPython 3.12.9 interpreter at: /home/ray/anaconda3/bin/python3
# Creating virtual environment at: .venv
# × Failed to download and build `antlr4-python3-runtime==4.9.3`
# ├─▶ Failed to create temporary virtualenv
# ╰─▶ Permission denied (os error 13)
# help: `antlr4-python3-runtime` (v4.9.3) was included because
# `nemo-reinforcer` (v0.0.1) depends on `math-verify` (v0.7.0) which
# depends on `latex2sympy2-extended==1.10.1` (v1.10.1) which depends on
# `antlr4-python3-runtime>=4.9.3, <=4.13.2`
#
# This appears to be specific to our CI machines, since the failure is not reproducible locally.

docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \
--env TRANSFORMERS_OFFLINE=0 \
--env HYDRA_FULL_ERROR=1 \
--env HF_HOME=/home/TestData/reinforcer/hf_home \
--env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \
--env REINFORCER_REPO_DIR=/opt/reinforcer \
--volume $PWD:/opt/reinforcer \
--volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \
--volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \
--volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub \
--volume /mnt/datadrive/TestData/reinforcer/hf_datasets_cache:/home/TestData/reinforcer/hf_datasets_cache \
nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \
bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ jobs:
if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
with:
RUNNER: self-hosted-azure
TIMEOUT: 10
TIMEOUT: 15
SCRIPT: |
cd ${REINFORCER_REPO_DIR}
uv run --extra test bash -x ./tests/run_unit.sh
Expand All @@ -171,7 +171,7 @@ jobs:
# TODO: For now, allow these to fail since the checks are not robust.
IS_OPTIONAL: true
RUNNER: self-hosted-azure
TIMEOUT: 8
TIMEOUT: 15
SCRIPT: |
cd ${REINFORCER_REPO_DIR}
uv run bash ./tests/functional/${{ matrix.test_case }}
Expand Down
3 changes: 2 additions & 1 deletion tests/functional/grpo.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
export RAY_DEDUP_LOGS=0
export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache
export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}

rm -rf $LOG_DIR
mkdir -p $LOG_DIR

cd $PROJECT_ROOT
Expand Down
5 changes: 4 additions & 1 deletion tests/functional/sft.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,18 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
export RAY_DEDUP_LOGS=0
export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache
export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}

rm -rf $LOG_DIR
mkdir -p $LOG_DIR

cd $PROJECT_ROOT
python -u $PROJECT_ROOT/examples/run_sft.py \
policy.model_name=meta-llama/Llama-3.2-1B \
cluster.gpus_per_node=2 \
sft.max_num_steps=10 \
sft.val_batches=1 \
logger.tensorboard_enabled=true \
logger.log_dir=$LOG_DIR \
logger.wandb_enabled=false \
Expand Down
19 changes: 18 additions & 1 deletion tests/run_functional_in_docker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ TEST_SCRIPT=$(realpath $1)
CONTAINER=${CONTAINER}

export HF_HOME=${HF_HOME:-$(realpath $SCRIPT_DIR/../hf_home)}
export HF_DATASETS_CACHE=${HF_DATASETS_CACHE:-$(realpath $SCRIPT_DIR/../hf_datasets_cache)}
export UV_CACHE_DIR=${UV_CACHE_DIR:-$(realpath $SCRIPT_DIR/../uv_cache)}
mkdir -p $HF_HOME
mkdir -p $HF_DATASETS_CACHE
mkdir -p $UV_CACHE_DIR

# Check if running in GitLab CI
INTERACTIVE_FLAG=""
Expand All @@ -44,4 +48,17 @@ fi
# We have found that 111 does not always work and can leave the filesystem permissions in a bad state.

# Run the script inside the Docker container with GPU support
docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' -v "$PROJECT_ROOT:$PROJECT_ROOT" -v $HF_HOME:/hf_home -e WANDB_API_KEY -e HF_TOKEN -e HF_HOME=/hf_home -e HOME=/tmp/ -w $SCRIPT_DIR "$CONTAINER" -- bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT"
docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' \
-v "$PROJECT_ROOT:$PROJECT_ROOT" \
-v $HF_HOME:/hf_home \
-v $HF_DATASETS_CACHE:/hf_datasets_cache \
-v $UV_CACHE_DIR:/uv_cache \
-e WANDB_API_KEY \
-e HF_TOKEN \
-e HF_HOME=/hf_home \
-e HF_DATASETS_CACHE=/hf_datasets_cache \
-e UV_CACHE_DIR=/uv_cache \
-e HOME=/tmp/ \
-w $SCRIPT_DIR \
"$CONTAINER" -- \
bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT"