From 5aa0ddec6d413df883971047802ec01d516b200b Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Sun, 23 Mar 2025 23:15:00 -0700 Subject: [PATCH 1/4] force tests to run for longer to download the dataset once Signed-off-by: Terry Kong now run grpo Signed-off-by: Terry Kong more args Signed-off-by: Terry Kong trying again Signed-off-by: Terry Kong trying with ultra short uv cache dirname Signed-off-by: Terry Kong lower val batches b/c took 10min to do val Signed-off-by: Terry Kong give this a shot Signed-off-by: Terry Kong With all datasets downloaded, re-enable functional tests Signed-off-by: Terry Kong lint Signed-off-by: Terry Kong fix some stuff Signed-off-by: Terry Kong --- .github/workflows/_run_test.yml | 17 +++++++++++++++++ .github/workflows/cicd-main.yml | 4 ++-- tests/functional/grpo.sh | 3 ++- tests/functional/sft.sh | 4 +++- tests/run_functional_in_docker.sh | 19 ++++++++++++++++++- 5 files changed, 42 insertions(+), 5 deletions(-) diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index d75a6a7178..8e8b034dd4 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -85,15 +85,32 @@ jobs: - name: Start container run: | + # TODO: re-enable uv caching (--env UV_CACHE_DIR=/uv_cache --volume /mnt/datadrive/TestData/reinforcer/uv_cache:/uv_cache). + # It is disabled for now since enabling it results in: + # + # Using CPython 3.12.9 interpreter at: /home/ray/anaconda3/bin/python3 + # Creating virtual environment at: .venv + # × Failed to download and build `antlr4-python3-runtime==4.9.3` + # ├─▶ Failed to create temporary virtualenv + # ╰─▶ Permission denied (os error 13) + # help: `antlr4-python3-runtime` (v4.9.3) was included because + # `nemo-reinforcer` (v0.0.1) depends on `math-verify` (v0.7.0) which + # depends on `latex2sympy2-extended==1.10.1` (v1.10.1) which depends on + # `antlr4-python3-runtime>=4.9.3, <=4.13.2` + # + # Something about our CI machines causes this issue since it is not reproducible locally. 
+ docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \ --env TRANSFORMERS_OFFLINE=0 \ --env HYDRA_FULL_ERROR=1 \ --env HF_HOME=/home/TestData/reinforcer/hf_home \ + --env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \ --env REINFORCER_REPO_DIR=/opt/reinforcer \ --volume $PWD:/opt/reinforcer \ --volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \ --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \ --volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub \ + --volume /mnt/datadrive/TestData/reinforcer/hf_datasets_cache:/home/TestData/reinforcer/hf_datasets_cache \ nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \ bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))" diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 03b2179ff4..0e2a289c62 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -150,7 +150,7 @@ jobs: if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} with: RUNNER: self-hosted-azure - TIMEOUT: 10 + TIMEOUT: 15 SCRIPT: | cd ${REINFORCER_REPO_DIR} uv run --extra test bash -x ./tests/run_unit.sh @@ -171,7 +171,7 @@ jobs: # TODO: For now, allow these to fail since the checks are not robust. 
IS_OPTIONAL: true RUNNER: self-hosted-azure - TIMEOUT: 8 + TIMEOUT: 15 SCRIPT: | cd ${REINFORCER_REPO_DIR} uv run bash ./tests/functional/${{ matrix.test_case }} diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh index 16f7e7530d..faaed1903c 100755 --- a/tests/functional/grpo.sh +++ b/tests/functional/grpo.sh @@ -11,9 +11,10 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json RUN_LOG=$LOG_DIR/$(basename $0 .sh).log export RAY_DEDUP_LOGS=0 -export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache +export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} +rm -rf $LOG_DIR mkdir -p $LOG_DIR cd $PROJECT_ROOT diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh index b9836886d6..ca290a7e90 100755 --- a/tests/functional/sft.sh +++ b/tests/functional/sft.sh @@ -11,15 +11,17 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json RUN_LOG=$LOG_DIR/$(basename $0 .sh).log export RAY_DEDUP_LOGS=0 -export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache +export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache} export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-} +rm -rf $LOG_DIR mkdir -p $LOG_DIR cd $PROJECT_ROOT python -u $PROJECT_ROOT/examples/run_sft.py \ cluster.gpus_per_node=2 \ sft.max_num_steps=10 \ + sft.val_batches=1 \ logger.tensorboard_enabled=true \ logger.log_dir=$LOG_DIR \ logger.wandb_enabled=false \ diff --git a/tests/run_functional_in_docker.sh b/tests/run_functional_in_docker.sh index e3fd403ba1..439982f4fc 100755 --- a/tests/run_functional_in_docker.sh +++ b/tests/run_functional_in_docker.sh @@ -27,7 +27,11 @@ TEST_SCRIPT=$(realpath $1) CONTAINER=${CONTAINER} export HF_HOME=${HF_HOME:-$(realpath $SCRIPT_DIR/../hf_home)} +export HF_DATASETS_CACHE=${HF_DATASETS_CACHE:-$(realpath $SCRIPT_DIR/../hf_datasets_cache)} +export UV_CACHE_DIR=${UV_CACHE_DIR:-$(realpath $SCRIPT_DIR/../uv_cache)} mkdir -p $HF_HOME +mkdir -p $HF_DATASETS_CACHE 
+mkdir -p $UV_CACHE_DIR # Check if running in GitLab CI INTERACTIVE_FLAG="" @@ -44,4 +48,17 @@ fi # We have found that 111 does not always work and can leave the filesystem permissions in a bad state. # Run the script inside the Docker container with GPU support -docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' -v "$PROJECT_ROOT:$PROJECT_ROOT" -v $HF_HOME:/hf_home -e WANDB_API_KEY -e HF_TOKEN -e HF_HOME=/hf_home -e HOME=/tmp/ -w $SCRIPT_DIR "$CONTAINER" -- bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT" +docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' \ + -v "$PROJECT_ROOT:$PROJECT_ROOT" \ + -v $HF_HOME:/hf_home \ + -v $HF_DATASETS_CACHE:/hf_datasets_cache \ + -v $UV_CACHE_DIR:/uv_cache \ + -e WANDB_API_KEY \ + -e HF_TOKEN \ + -e HF_HOME=/hf_home \ + -e HF_DATASETS_CACHE=/hf_datasets_cache \ + -e UV_CACHE_DIR=/uv_cache \ + -e HOME=/tmp/ \ + -w $SCRIPT_DIR \ + "$CONTAINER" -- \ + bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT" From 72a950ea92e62b31ccf137f8ae1544d50c2ca993 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 24 Mar 2025 14:16:46 -0700 Subject: [PATCH 2/4] make it 1b Signed-off-by: Terry Kong --- examples/configs/sft.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 1282285fc3..7d2efb452f 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -17,7 +17,7 @@ checkpointing: save_period: 10 policy: - model_name: "meta-llama/Meta-Llama-3-8B" + model_name: "meta-llama/Llama-3.2-1B" train_global_batch_size: 128 train_micro_batch_size: 1 max_total_sequence_length: 2048 From 546873c8e112e512fc3853dfe953d27d38ec14a5 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 24 Mar 2025 15:34:54 -0700 Subject: [PATCH 3/4] add tracking issue Signed-off-by: Terry Kong --- .github/workflows/_run_test.yml | 2 ++ 1 file changed, 2 insertions(+) 
diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml index 8e8b034dd4..813719503c 100644 --- a/.github/workflows/_run_test.yml +++ b/.github/workflows/_run_test.yml @@ -68,6 +68,8 @@ jobs: # NOTE: under certain circumstances, the checkout action cannot clean up the workspace properly, so # this workaround is needed to ensure that the workspace is clean by removing all files created by root. # + # Tracking issue: https://github.com/NVIDIA/reinforcer/issues/76 + # # The error observed looked like this from the checkout action: # Run actions/checkout@v4 # ... From 69998d2d661e9764cd5817d737863fa6a08fbf52 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 24 Mar 2025 15:38:48 -0700 Subject: [PATCH 4/4] revert some changes so that they can be dealt with later Signed-off-by: Terry Kong --- examples/configs/sft.yaml | 2 +- tests/functional/sft.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 7d2efb452f..1282285fc3 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -17,7 +17,7 @@ checkpointing: save_period: 10 policy: - model_name: "meta-llama/Llama-3.2-1B" + model_name: "meta-llama/Meta-Llama-3-8B" train_global_batch_size: 128 train_micro_batch_size: 1 max_total_sequence_length: 2048 diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh index ca290a7e90..82d263c9da 100755 --- a/tests/functional/sft.sh +++ b/tests/functional/sft.sh @@ -19,6 +19,7 @@ mkdir -p $LOG_DIR cd $PROJECT_ROOT python -u $PROJECT_ROOT/examples/run_sft.py \ + policy.model_name=meta-llama/Llama-3.2-1B \ cluster.gpus_per_node=2 \ sft.max_num_steps=10 \ sft.val_batches=1 \