Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
c060437
Test GB200 runner
chtruong814 Feb 24, 2026
02084e3
Fix gb200 container build
chtruong814 Feb 24, 2026
bcf8f81
Test updated registry
chtruong814 Feb 26, 2026
e87f2e2
Test gb200
chtruong814 Feb 28, 2026
2435ca5
Merge remote-tracking branch 'origin/main' into chtruong/gb200
chtruong814 Feb 28, 2026
f517e6a
Force gb200 build
chtruong814 Feb 28, 2026
3feb0ca
Fix RL image name
chtruong814 Feb 28, 2026
44a6636
Fix image ref
chtruong814 Feb 28, 2026
99a9236
Move decord import inside of load_media_from_message method
chtruong814 Mar 1, 2026
626b4d9
Revert "Move decord import inside of load_media_from_message method"
chtruong814 Mar 2, 2026
9576824
Replace decord with decord2
chtruong814 Mar 2, 2026
bdace86
Skip eval test in fast functional
chtruong814 Mar 2, 2026
280ae40
Enable full functional test on gb200
chtruong814 Mar 2, 2026
c20713b
Fix test functional
chtruong814 Mar 2, 2026
0166036
Merge remote-tracking branch 'origin/main' into chtruong/gb200
chtruong814 Mar 2, 2026
30959e6
Update copy-pr-bot to not run automatically
chtruong814 Mar 3, 2026
69e8711
Merge remote-tracking branch 'origin/main' into chtruong/gb200
chtruong814 Mar 3, 2026
8681cd2
Run full CI tests with gcp
chtruong814 Mar 3, 2026
cafa08f
Fix CI file
chtruong814 Mar 3, 2026
0cfedc1
Fix default registry
chtruong814 Mar 3, 2026
b59b8cf
Fix pre-flight ref
chtruong814 Mar 3, 2026
2bbe325
Remove Azure login
chtruong814 Mar 3, 2026
570d4f5
Fix registry
chtruong814 Mar 3, 2026
e4f293a
Fix image nmae
chtruong814 Mar 3, 2026
21e5d84
Fix doc test image ref
chtruong814 Mar 3, 2026
a10a3e4
Skip broken megatron lora tests
chtruong814 Mar 4, 2026
66707ac
Skip test_vllm_generation_with_hf_training_colocated
chtruong814 Mar 4, 2026
9866c4d
Fix test skip
chtruong814 Mar 4, 2026
5d6eb10
Skip test
chtruong814 Mar 4, 2026
60d4b5c
Skip fp8 generation for gb200 for now
chtruong814 Mar 4, 2026
08d62fd
Skip fp8 vllm generation tests
chtruong814 Mar 4, 2026
31613ca
Use variable for runner
chtruong814 Mar 4, 2026
3f623a1
Fix lint error in test_vllm_generation
chtruong814 Mar 4, 2026
6b541f4
Use container name variable
chtruong814 Mar 4, 2026
4417675
Use copy-pr-bot
chtruong814 Mar 4, 2026
c9ca7db
Revert changes
chtruong814 Mar 4, 2026
bb72598
Merge remote-tracking branch 'origin/main' into chtruong/gb200
chtruong814 Mar 4, 2026
73e70e8
Update expected eval metrics
chtruong814 Mar 4, 2026
836c8cb
Ensure functional tests wait for unit tests
chtruong814 Mar 4, 2026
7166bce
Update preflight and build container to 0.78.0
chtruong814 Mar 6, 2026
2926902
Move skip of vllm fp8 generation to top of tests
chtruong814 Mar 6, 2026
0c14ffd
Add instructions to run CI
chtruong814 Mar 6, 2026
939e0da
Fix markdown bullets in contribution guide
chtruong814 Mar 6, 2026
98a7634
Merge remote-tracking branch 'origin/main' into chtruong/gb200
chtruong814 Mar 7, 2026
f30ebd6
Fix format in test_vllm_generation.py
chtruong814 Mar 7, 2026
a36f5a9
Fix base and head refs
chtruong814 Mar 7, 2026
99f82b9
Fix pre-flight ref
chtruong814 Mar 7, 2026
f2c3fbb
Fix checkout
chtruong814 Mar 7, 2026
1fbc780
Fix test labels
chtruong814 Mar 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 18 additions & 72 deletions .github/actions/test-template/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ inputs:
description: "Whether this is a pull request from a fork"
required: false
default: "false"
registry:
description: "Registry to use for test"
required: false
test_data_path:
description: "Test data path"
required: false
default: "/mnt/datadrive/TestData"
image-tag:
description: "Override container image tag. If set, infers FAST=1 and prefetches venvs + regenerates fingerprint at startup."
required: false
Expand All @@ -72,73 +79,12 @@ runs:
run: |
curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash

- name: Azure Login
if: ${{ inputs.has-azure-credentials == 'true' }}
uses: azure/login@v2
with:
client-id: ${{ inputs.azure-client-id }}
tenant-id: ${{ inputs.azure-tenant-id }}
subscription-id: ${{ inputs.azure-subscription-id }}

- name: Azure ACR Login
if: ${{ inputs.has-azure-credentials == 'true' }}
shell: bash
run: |
az acr login --name nemoci

- name: Azure Fileshare
if: ${{ inputs.has-azure-credentials == 'true' && inputs.is_unit_test == 'false' && inputs.is_doc_test == 'false' }}
shell: bash
id: azure-fileshare
- name: Install uuidgen
shell: bash -x -e -u -o pipefail {0}
if: ${{ contains(inputs.runner, 'gcp') }}
run: |
sudo apt update
sudo apt install -y cifs-utils

RESOURCE_GROUP_NAME="azure-gpu-vm-runner_group"
STORAGE_ACCOUNT_NAME="nemocistorageaccount2"
FILE_SHARE_NAME="fileshare"

MNT_ROOT="/media"
MNT_PATH="$MNT_ROOT/$STORAGE_ACCOUNT_NAME/$FILE_SHARE_NAME"

echo "MNT_PATH=$MNT_PATH" | tee -a "$GITHUB_OUTPUT"

sudo mkdir -p $MNT_PATH

# Create a folder to store the credentials for this storage account and
# any other that you might set up.
CREDENTIAL_ROOT="/etc/smbcredentials"
sudo mkdir -p "/etc/smbcredentials"

# Get the storage account key for the indicated storage account.
# You must be logged in with az login and your user identity must have
# permissions to list the storage account keys for this command to work.
STORAGE_ACCOUNT_KEY=$(az storage account keys list \
--resource-group $RESOURCE_GROUP_NAME \
--account-name $STORAGE_ACCOUNT_NAME \
--query "[0].value" --output tsv | tr -d '"')

# Create the credential file for this individual storage account
SMB_CREDENTIAL_FILE="$CREDENTIAL_ROOT/$STORAGE_ACCOUNT_NAME.cred"
if [ ! -f $SMB_CREDENTIAL_FILE ]; then
echo "username=$STORAGE_ACCOUNT_NAME" | sudo tee $SMB_CREDENTIAL_FILE > /dev/null
echo "password=$STORAGE_ACCOUNT_KEY" | sudo tee -a $SMB_CREDENTIAL_FILE > /dev/null
else
echo "The credential file $SMB_CREDENTIAL_FILE already exists, and was not modified."
fi

# Change permissions on the credential file so only root can read or modify the password file.
sudo chmod 600 $SMB_CREDENTIAL_FILE

# This command assumes you have logged in with az login
HTTP_ENDPOINT=$(az storage account show --resource-group $RESOURCE_GROUP_NAME --name $STORAGE_ACCOUNT_NAME --query "primaryEndpoints.file" --output tsv | tr -d '"')
SMB_PATH=$(echo $HTTP_ENDPOINT | cut -c7-${#HTTP_ENDPOINT})$FILE_SHARE_NAME

STORAGE_ACCOUNT_KEY=$(az storage account keys list --resource-group $RESOURCE_GROUP_NAME --account-name $STORAGE_ACCOUNT_NAME --query "[0].value" --output tsv | tr -d '"')

sudo mount -t cifs $SMB_PATH $MNT_PATH -o credentials=$SMB_CREDENTIAL_FILE,serverino,nosharesock,actimeo=30,mfsymlinks

ls -al $MNT_PATH/TestData
apt-get update
apt-get install -y uuid-runtime

- name: Docker system cleanup
shell: bash
Expand All @@ -148,7 +94,7 @@ runs:
- name: Docker pull image
shell: bash
run: |
docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ inputs.image-tag || github.run_id }}
docker pull ${{ inputs.registry }}/${{ inputs.image }}:${{ inputs.image-tag || github.run_id }}

- name: Create UUID
id: uuid
Expand Down Expand Up @@ -183,11 +129,11 @@ runs:
${{ inputs.image-tag != '' && '--env FAST=1' || '' }} \
--volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/nemo-rl:/opt/nemo-rl \
--volume $GITHUB_ACTION_DIR:$GITHUB_ACTION_DIR \
--volume /mnt/datadrive/TestData/nemo-rl/datasets:/opt/nemo-rl/datasets:ro \
--volume /mnt/datadrive/TestData/nemo-rl/checkpoints:/home/TestData/nemo-rl/checkpoints:ro \
--volume /mnt/datadrive/TestData/nemo-rl/hf_home/hub:/home/TestData/nemo-rl/hf_home/hub \
--volume /mnt/datadrive/TestData/nemo-rl/hf_datasets_cache:/home/TestData/nemo-rl/hf_datasets_cache \
nemoci.azurecr.io/${{ inputs.image }}:${{ inputs.image-tag || github.run_id }} bash -eux -o pipefail -c '\
--volume ${{ inputs.test_data_path }}/nemo-rl/datasets:/opt/nemo-rl/datasets:ro \
--volume ${{ inputs.test_data_path }}/nemo-rl/checkpoints:/home/TestData/nemo-rl/checkpoints:ro \
--volume ${{ inputs.test_data_path }}/nemo-rl/hf_home/hub:/home/TestData/nemo-rl/hf_home/hub \
--volume ${{ inputs.test_data_path }}/nemo-rl/hf_datasets_cache:/home/TestData/nemo-rl/hf_datasets_cache \
${{ inputs.registry }}/${{ inputs.image }}:${{ inputs.image-tag || github.run_id }} bash -eux -o pipefail -c '\
git config --global --add safe.directory /opt/nemo-rl
# This is needed since we create virtualenvs in the workspace, so this allows it to be cleaned up if necessary
umask 000
Expand Down
140 changes: 102 additions & 38 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,10 @@
name: "CICD NeMo RL"

on:
pull_request:
push:
branches:
- "main"
- "r**"
types: [labeled, opened, synchronize, reopened]
merge_group:
types: [checks_requested]
- main
- "pull-request/[0-9]+"
schedule:
- cron: "0 9 * * *"
workflow_dispatch:
Expand All @@ -40,13 +37,9 @@ on:
description: "Override container image tag (e.g. 'main'). Skips container build."
required: false
default: ""
# TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge
#push:
# branches:
# - 'main'

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
Expand All @@ -55,12 +48,61 @@ jobs:
outputs:
test_level: ${{ steps.evaluate.outputs.test_level }}
image_tag: ${{ steps.evaluate.outputs.image_tag }}
base_ref: ${{ steps.base-head-ref.outputs.base_ref }}
base_sha: ${{ steps.base-head-ref.outputs.base_sha }}
head_ref: ${{ steps.base-head-ref.outputs.head_ref }}
head_sha: ${{ steps.base-head-ref.outputs.head_sha }}
head_label: ${{ steps.base-head-ref.outputs.head_label }}
has_skip_cicd: ${{ steps.base-head-ref.outputs.has_skip_cicd }}
steps:
- name: Get PR info
id: get-pr-info
if: startsWith(github.ref, 'refs/heads/pull-request/')
uses: nv-gha-runners/get-pr-info@main

- name: Checkout
uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Determine base and head references
id: base-head-ref
env:
IS_PULL_REQUEST_REF: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
PR_INFO_JSON: ${{ steps.get-pr-info.outputs.pr-info }}
run: |
if [[ "$IS_PULL_REQUEST_REF" == "true" && -n "$PR_INFO_JSON" ]]; then
base_ref=$(echo "$PR_INFO_JSON" | jq -r '.base.ref')
base_sha=$(echo "$PR_INFO_JSON" | jq -r '.base.sha')
head_ref=$(echo "$PR_INFO_JSON" | jq -r '.head.ref')
head_sha=$(echo "$PR_INFO_JSON" | jq -r '.head.sha')
head_label=$(echo "$PR_INFO_JSON" | jq -r '.head.label // empty')
ci_label=$(echo "$PR_INFO_JSON" | jq -r '[.labels[]? | (if type == "string" then . else .name end) | select(startswith("CI:"))] | first // empty')
has_skip_cicd=$(echo "$PR_INFO_JSON" | jq -r '[.labels[]? | (if type == "string" then . else .name end) | select(. == "Skip CICD")] | length > 0')
else
base_ref="HEAD~1"
base_sha=$(git rev-parse HEAD~1)
head_ref="HEAD"
head_sha="${{ github.sha }}"
head_label="${{ github.ref_name }}"
ci_label=""
has_skip_cicd="false"
fi
[[ "$has_skip_cicd" != "true" ]] && has_skip_cicd="false"
echo "base_ref=$base_ref" >> "$GITHUB_OUTPUT"
echo "base_sha=$base_sha" >> "$GITHUB_OUTPUT"
echo "head_ref=$head_ref" >> "$GITHUB_OUTPUT"
echo "head_sha=$head_sha" >> "$GITHUB_OUTPUT"
echo "head_label=$head_label" >> "$GITHUB_OUTPUT"
echo "ci_label=$ci_label" >> "$GITHUB_OUTPUT"
echo "has_skip_cicd=$has_skip_cicd" >> "$GITHUB_OUTPUT"

- name: Get changed files
id: changed-files
if: github.event_name == 'pull_request'
if: startsWith(github.ref, 'refs/heads/pull-request/')
uses: step-security/changed-files@v45.0.1
with:
base_sha: ${{ steps.base-head-ref.outputs.base_sha }}
files_yaml: |
doc:
- '**.md'
Expand All @@ -75,8 +117,8 @@ jobs:
DOCS_ONLY: ${{ steps.changed-files.outputs.doc_any_changed == 'true' && steps.changed-files.outputs.src_any_changed == 'false' }}
CHANGED_DOCS: ${{ steps.changed-files.outputs.doc_all_changed_files }}
CHANGED_SRC: ${{ steps.changed-files.outputs.src_all_changed_files }}
IS_PULLREQUEST: ${{ github.event_name == 'pull_request' }}
LABEL: ${{ github.event.label.name }}
IS_PULLREQUEST: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
LABEL: ${{ steps.base-head-ref.outputs.ci_label }}
MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
run: |
# Some output that's helpful for debugging
Expand Down Expand Up @@ -128,10 +170,22 @@ jobs:
fi
echo "image_tag=$IMAGE_TAG" | tee -a "$GITHUB_OUTPUT"

org-member-pre-flight:
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.78.0
with:
default_runner_prefix: ${{ vars.DEFAULT_RUNNER_PREFIX }}
non_nvidia_runner_prefix: ${{ vars.NON_NVIDIA_RUNNER_PREFIX }}
default_test_data_path: ${{ vars.DEFAULT_TEST_DATA_PATH }}
non_nvidia_test_data_path: ${{ vars.NON_NVIDIA_TEST_DATA_PATH }}
default_registry: ${{ vars.DEFAULT_CONTAINER_REGISTRY }}
non_nvidia_registry: ${{ vars.NON_NVIDIA_CONTAINER_REGISTRY }}
secrets:
NVIDIA_MANAGEMENT_ORG_PAT: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}

pr-branch-up-to-date-check:
name: Check if PR branch is up to date
needs: [pre-flight]
if: ${{ github.event_name == 'pull_request' }}
if: ${{ startsWith(github.ref, 'refs/heads/pull-request/') }}
runs-on: ubuntu-latest
env:
MAX_COMMITS_BEHIND: 10
Expand All @@ -140,10 +194,10 @@ jobs:
env:
GH_TOKEN: ${{ github.token }}
REPO: ${{ github.repository }}
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
BASE_REF: ${{ github.base_ref }}
HEAD_LABEL: ${{ github.event.pull_request.head.label }}
BASE_SHA: ${{ needs.pre-flight.outputs.base_sha }}
HEAD_SHA: ${{ needs.pre-flight.outputs.head_sha }}
BASE_REF: ${{ needs.pre-flight.outputs.base_ref }}
HEAD_LABEL: ${{ needs.pre-flight.outputs.head_label }}
run: |
echo "Repository: $REPO"
echo "Base branch: $BASE_REF (SHA: $BASE_SHA)"
Expand Down Expand Up @@ -227,14 +281,16 @@ jobs:

build-container:
if: ${{ needs.pre-flight.outputs.test_level != 'none' && needs.pre-flight.outputs.image_tag == '' }}
needs: [pre-flight]
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.52.0
needs: [pre-flight, org-member-pre-flight]
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.78.0
with:
build-ref: ${{ github.sha }}
image-name: nemo_rl_container
image-name: ${{ vars.CI_CONTAINER_NAME }}
dockerfile: docker/Dockerfile
image-label: nemo-rl
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
image-label: ${{ vars.CI_CONTAINER_NAME }}
target: release
registry: ${{ needs.org-member-pre-flight.outputs.registry }}
build-contexts: |
nemo-rl=${{ github.run_id }}/
build-args: |
Expand All @@ -247,8 +303,8 @@ jobs:
matrix:
include:
- script: Docs_Tests
runner: self-hosted-azure
needs: [pre-flight, build-container]
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
needs: [pre-flight, build-container, org-member-pre-flight]
if: ${{ contains('docs L0 L1 L2', needs.pre-flight.outputs.test_level) }}
runs-on: ${{ matrix.runner }}
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
Expand All @@ -260,22 +316,24 @@ jobs:
uses: ./.github/actions/test-template
with:
runner: ${{ runner.name }}
registry: ${{ needs.org-member-pre-flight.outputs.registry }}
image: ${{ vars.CI_CONTAINER_NAME }}
test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }}
script: ${{ matrix.script }}
is_doc_test: "true"
is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

cicd-unit-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L0_Unit_Tests_Generation
runner: self-hosted-azure
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
- script: L0_Unit_Tests_Policy
runner: self-hosted-azure
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
- script: L0_Unit_Tests_Other
runner: self-hosted-azure
needs: [pre-flight, build-container, cicd-doc-tests]
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
needs: [pre-flight, build-container, cicd-doc-tests, org-member-pre-flight]
if: >-
${{
(
Expand All @@ -298,19 +356,21 @@ jobs:
with:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
registry: ${{ needs.org-member-pre-flight.outputs.registry }}
test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }}
image: ${{ vars.CI_CONTAINER_NAME }}
image-tag: ${{ needs.pre-flight.outputs.image_tag }}
is_unit_test: "true"
cpu-only: ${{ matrix.cpu-only || false }}
is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

cicd-functional-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L1_Functional_Tests_GPU
runner: self-hosted-azure
needs: [pre-flight, build-container, cicd-unit-tests]
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
needs: [pre-flight, build-container, cicd-unit-tests, org-member-pre-flight]
runs-on: ${{ matrix.runner }}
if: ${{ contains('L1 L2', needs.pre-flight.outputs.test_level) }}
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
Expand All @@ -324,17 +384,19 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
with:
runner: ${{ runner.name }}
registry: ${{ needs.org-member-pre-flight.outputs.registry }}
image: ${{ vars.CI_CONTAINER_NAME }}
test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }}
script: ${{ matrix.script }}
is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}

cicd-fast-functional-tests:
strategy:
fail-fast: false
matrix:
include:
- script: L1_Functional_Tests_GPU
runner: self-hosted-azure
needs: [pre-flight]
runner: ${{ needs.org-member-pre-flight.outputs.runner_prefix }}-gpu-x2
needs: [pre-flight, build-container, org-member-pre-flight]
if: ${{ needs.pre-flight.outputs.test_level == 'Lfast' }}
runs-on: ${{ matrix.runner }}
name: fast_${{ matrix.script }}
Expand All @@ -350,7 +412,9 @@ jobs:
runner: ${{ runner.name }}
script: ${{ matrix.script }}
image-tag: ${{ needs.pre-flight.outputs.image_tag }}
is_fork_pr: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name != github.event.pull_request.base.repo.full_name }}
registry: ${{ needs.org-member-pre-flight.outputs.registry }}
image: ${{ vars.CI_CONTAINER_NAME }}
test_data_path: ${{ needs.org-member-pre-flight.outputs.test_data_path }}

CI_QA_Gate:
name: "CI quality check${{ needs.pre-flight.outputs.test_level == 'none' && ' (No tests run: Label CI:L*)' || '' }}"
Expand Down Expand Up @@ -389,7 +453,7 @@ jobs:
)
)
}}
CI_SKIP: ${{ github.event.label.name == 'Skip CICD' }}
# Must reference the pre-flight job output by its declared name (has_skip_cicd);
# an undeclared output name evaluates to an empty string, so the skip would never apply.
CI_SKIP: ${{ needs.pre-flight.outputs.has_skip_cicd }}
TEST_LEVEL: ${{ needs.pre-flight.outputs.test_level }}
run: |
SUMMARY=$(echo $JOB_RESULTS | jq 'to_entries[] | .key + ": " + .value.result' | tr -d '"')
Expand Down
Loading
Loading