From 5aa0ddec6d413df883971047802ec01d516b200b Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Sun, 23 Mar 2025 23:15:00 -0700
Subject: [PATCH 1/4] force tests to run for longer to download the dataset
 once

Signed-off-by: Terry Kong <terryk@nvidia.com>

now run grpo

Signed-off-by: Terry Kong <terryk@nvidia.com>

more args

Signed-off-by: Terry Kong <terryk@nvidia.com>

trying again

Signed-off-by: Terry Kong <terryk@nvidia.com>

trying with ultra short uv cache dirname

Signed-off-by: Terry Kong <terryk@nvidia.com>

lower val batches b/c took 10min to do val

Signed-off-by: Terry Kong <terryk@nvidia.com>

give this a shot

Signed-off-by: Terry Kong <terryk@nvidia.com>

With all datasets downloaded, re-enable functional tests

Signed-off-by: Terry Kong <terryk@nvidia.com>

lint

Signed-off-by: Terry Kong <terryk@nvidia.com>

fix some stuff

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 .github/workflows/_run_test.yml   | 17 +++++++++++++++++
 .github/workflows/cicd-main.yml   |  4 ++--
 tests/functional/grpo.sh          |  3 ++-
 tests/functional/sft.sh           |  4 +++-
 tests/run_functional_in_docker.sh | 19 ++++++++++++++++++-
 5 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml
index d75a6a7178..8e8b034dd4 100644
--- a/.github/workflows/_run_test.yml
+++ b/.github/workflows/_run_test.yml
@@ -85,15 +85,32 @@ jobs:
 
         - name: Start container
           run: |
+            # TODO: re-enable caching (--env UV_CACHE_DIR=/uv_cache --volume /mnt/datadrive/TestData/reinforcer/uv_cache:/uv_cache).
+            #       It is disabled for now since it results in:
+            # 
+            # Using CPython 3.12.9 interpreter at: /home/ray/anaconda3/bin/python3
+            # Creating virtual environment at: .venv
+            #   × Failed to download and build `antlr4-python3-runtime==4.9.3`
+            #   ├─▶ Failed to create temporary virtualenv
+            #   ╰─▶ Permission denied (os error 13)
+            #   help: `antlr4-python3-runtime` (v4.9.3) was included because
+            #         `nemo-reinforcer` (v0.0.1) depends on `math-verify` (v0.7.0) which
+            #         depends on `latex2sympy2-extended==1.10.1` (v1.10.1) which depends on
+            #         `antlr4-python3-runtime>=4.9.3, <=4.13.2`
+            #
+            # Something about our CI machines causes this issue since it is not reproducible locally.
+            
             docker run --rm -d --name nemo_container_${{ github.run_id }} --runtime=nvidia --gpus all --shm-size=64g \
               --env TRANSFORMERS_OFFLINE=0 \
               --env HYDRA_FULL_ERROR=1 \
               --env HF_HOME=/home/TestData/reinforcer/hf_home \
+              --env HF_DATASETS_CACHE=/home/TestData/reinforcer/hf_datasets_cache \
               --env REINFORCER_REPO_DIR=/opt/reinforcer \
               --volume $PWD:/opt/reinforcer \
               --volume /mnt/datadrive/TestData/reinforcer/datasets:/opt/reinforcer/datasets:ro \
               --volume /mnt/datadrive/TestData/reinforcer/checkpoints:/home/TestData/reinforcer/checkpoints:ro \
               --volume /mnt/datadrive/TestData/reinforcer/hf_home/hub:/home/TestData/reinforcer/hf_home/hub \
+              --volume /mnt/datadrive/TestData/reinforcer/hf_datasets_cache:/home/TestData/reinforcer/hf_datasets_cache \
               nemoci.azurecr.io/nemo_reinforcer_container:${{ github.run_id }} \
               bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
   
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 03b2179ff4..0e2a289c62 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -150,7 +150,7 @@ jobs:
     if: ${{ needs.pre-flight.outputs.run_ci == 'true' }}
     with:
       RUNNER: self-hosted-azure
-      TIMEOUT: 10
+      TIMEOUT: 15
       SCRIPT: |
         cd ${REINFORCER_REPO_DIR}
         uv run --extra test bash -x ./tests/run_unit.sh
@@ -171,7 +171,7 @@ jobs:
       # TODO: For now, allow these to fail since the checks are not robust.
       IS_OPTIONAL: true
       RUNNER: self-hosted-azure
-      TIMEOUT: 8
+      TIMEOUT: 15
       SCRIPT: |
         cd ${REINFORCER_REPO_DIR}
         uv run bash ./tests/functional/${{ matrix.test_case }}
diff --git a/tests/functional/grpo.sh b/tests/functional/grpo.sh
index 16f7e7530d..faaed1903c 100755
--- a/tests/functional/grpo.sh
+++ b/tests/functional/grpo.sh
@@ -11,9 +11,10 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
 JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
 RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
 export RAY_DEDUP_LOGS=0
-export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache
+export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
+rm -rf "$LOG_DIR"
 mkdir -p $LOG_DIR
 
 cd $PROJECT_ROOT
diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh
index b9836886d6..ca290a7e90 100755
--- a/tests/functional/sft.sh
+++ b/tests/functional/sft.sh
@@ -11,15 +11,17 @@ LOG_DIR=$SCRIPT_DIR/$(basename $0 .sh)-logs
 JSON_METRICS=$LOG_DIR/$(basename $0 .sh).json
 RUN_LOG=$LOG_DIR/$(basename $0 .sh).log
 export RAY_DEDUP_LOGS=0
-export UV_CACHE_DIR=$PROJECT_ROOT/uv_cache
+export UV_CACHE_DIR=${UV_CACHE_DIR:-$PROJECT_ROOT/uv_cache}
 export PYTHONPATH=${PROJECT_ROOT}:${PYTHONPATH:-}
 
+rm -rf "$LOG_DIR"
 mkdir -p $LOG_DIR
 
 cd $PROJECT_ROOT
 python -u $PROJECT_ROOT/examples/run_sft.py \
     cluster.gpus_per_node=2 \
     sft.max_num_steps=10 \
+    sft.val_batches=1 \
     logger.tensorboard_enabled=true \
     logger.log_dir=$LOG_DIR \
     logger.wandb_enabled=false \
diff --git a/tests/run_functional_in_docker.sh b/tests/run_functional_in_docker.sh
index e3fd403ba1..439982f4fc 100755
--- a/tests/run_functional_in_docker.sh
+++ b/tests/run_functional_in_docker.sh
@@ -27,7 +27,11 @@ TEST_SCRIPT=$(realpath $1)
 CONTAINER=${CONTAINER}
 
 export HF_HOME=${HF_HOME:-$(realpath $SCRIPT_DIR/../hf_home)}
+export HF_DATASETS_CACHE=${HF_DATASETS_CACHE:-$(realpath $SCRIPT_DIR/../hf_datasets_cache)}
+export UV_CACHE_DIR=${UV_CACHE_DIR:-$(realpath $SCRIPT_DIR/../uv_cache)}
 mkdir -p $HF_HOME
+mkdir -p $HF_DATASETS_CACHE
+mkdir -p $UV_CACHE_DIR
 
 # Check if running in GitLab CI
 INTERACTIVE_FLAG=""
@@ -44,4 +48,17 @@ fi
 # We have found that 111 does not always work and can leave the filesystem permissions in a bad state.
 
 # Run the script inside the Docker container with GPU support
-docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' -v "$PROJECT_ROOT:$PROJECT_ROOT" -v $HF_HOME:/hf_home -e WANDB_API_KEY -e HF_TOKEN -e HF_HOME=/hf_home -e HOME=/tmp/ -w $SCRIPT_DIR "$CONTAINER" -- bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT"
+docker run -u root $INTERACTIVE_FLAG --ulimit memlock=-1 --ulimit stack=67108864 --rm --gpus '"device=0,1"' \
+  -v "$PROJECT_ROOT:$PROJECT_ROOT" \
+  -v "$HF_HOME:/hf_home" \
+  -v "$HF_DATASETS_CACHE:/hf_datasets_cache" \
+  -v "$UV_CACHE_DIR:/uv_cache" \
+  -e WANDB_API_KEY \
+  -e HF_TOKEN \
+  -e HF_HOME=/hf_home \
+  -e HF_DATASETS_CACHE=/hf_datasets_cache \
+  -e UV_CACHE_DIR=/uv_cache \
+  -e HOME=/tmp/ \
+  -w "$SCRIPT_DIR" \
+  "$CONTAINER" -- \
+  bash -x -c "umask 000 && uv run bash -x $TEST_SCRIPT"

From 72a950ea92e62b31ccf137f8ae1544d50c2ca993 Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Mon, 24 Mar 2025 14:16:46 -0700
Subject: [PATCH 2/4] make it 1b

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 examples/configs/sft.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 1282285fc3..7d2efb452f 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -17,7 +17,7 @@ checkpointing:
   save_period: 10
 
 policy:
-  model_name: "meta-llama/Meta-Llama-3-8B"
+  model_name: "meta-llama/Llama-3.2-1B"
   train_global_batch_size: 128
   train_micro_batch_size: 1
   max_total_sequence_length: 2048

From 546873c8e112e512fc3853dfe953d27d38ec14a5 Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Mon, 24 Mar 2025 15:34:54 -0700
Subject: [PATCH 3/4] add tracking issue

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 .github/workflows/_run_test.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml
index 8e8b034dd4..813719503c 100644
--- a/.github/workflows/_run_test.yml
+++ b/.github/workflows/_run_test.yml
@@ -68,6 +68,8 @@ jobs:
         # NOTE: under certain circumstances, the checkout action cannot clean up the workspace properly, so
         # this workaround is needed to ensure that the workspace is clean by removing all files created by root.
         #
+        # Tracking issue: https://github.com/NVIDIA/reinforcer/issues/76
+        #
         # The error observed looked like this from the checkout action:
         #      Run actions/checkout@v4
         #      ...

From 69998d2d661e9764cd5817d737863fa6a08fbf52 Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Mon, 24 Mar 2025 15:38:48 -0700
Subject: [PATCH 4/4] revert some changes so that they can be dealt with later

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 examples/configs/sft.yaml | 2 +-
 tests/functional/sft.sh   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index 7d2efb452f..1282285fc3 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -17,7 +17,7 @@ checkpointing:
   save_period: 10
 
 policy:
-  model_name: "meta-llama/Llama-3.2-1B"
+  model_name: "meta-llama/Meta-Llama-3-8B"
   train_global_batch_size: 128
   train_micro_batch_size: 1
   max_total_sequence_length: 2048
diff --git a/tests/functional/sft.sh b/tests/functional/sft.sh
index ca290a7e90..82d263c9da 100755
--- a/tests/functional/sft.sh
+++ b/tests/functional/sft.sh
@@ -19,6 +19,7 @@ mkdir -p $LOG_DIR
 
 cd $PROJECT_ROOT
 python -u $PROJECT_ROOT/examples/run_sft.py \
+    policy.model_name=meta-llama/Llama-3.2-1B \
     cluster.gpus_per_node=2 \
     sft.max_num_steps=10 \
     sft.val_batches=1 \