diff --git a/.github/actions/test-template/action.yml b/.github/actions/test-template/action.yml deleted file mode 100644 index 42011f1bd..000000000 --- a/.github/actions/test-template/action.yml +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -name: "Test Template" -description: "Template for running NeMo tests in a containerized environment" - -inputs: - runner: - description: "Runner to use for test" - required: true - timeout: - description: "Max runtime of test in minutes" - required: false - default: "10" - script: - description: "Test script to execute" - required: true - is_optional: - description: "Failure will cancel all other tests if set to true" - required: false - default: "false" - is_unit_test: - description: "Upload coverage as unit test" - required: false - default: "false" - image: - description: "Image to use for test" - required: false - default: "nemo_gym" - cpu-only: - description: "Run tests on CPU only" - required: false - default: "false" - azure-client-id: - description: "Azure Client ID" - required: true - azure-tenant-id: - description: "Azure Tenant ID" - required: true - azure-subscription-id: - description: "Azure Subscription ID" - required: true - has-azure-credentials: - description: "Has Azure credentials" - required: false - default: "false" - -runs: - using: "composite" - steps: - - name: Install Azure CLI - if: ${{ inputs.has-azure-credentials == 'true' }} - shell: bash - run: | - curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash - - - name: Azure Login - if: ${{ inputs.has-azure-credentials == 'true' }} - uses: azure/login@v2 - with: - client-id: ${{ inputs.azure-client-id }} - tenant-id: ${{ inputs.azure-tenant-id }} - subscription-id: ${{ inputs.azure-subscription-id }} - - - name: Azure ACR Login - if: ${{ inputs.has-azure-credentials == 'true' }} - shell: bash - run: | - az acr login --name nemoci - - - name: Azure Fileshare - if: ${{ inputs.has-azure-credentials == 'true' && inputs.is_unit_test == 'false' }} - shell: bash - id: azure-fileshare - run: | - sudo apt update - sudo apt install -y cifs-utils - - RESOURCE_GROUP_NAME="azure-gpu-vm-runner_group" - STORAGE_ACCOUNT_NAME="nemocistorageaccount2" - FILE_SHARE_NAME="fileshare" - - MNT_ROOT="/media" - MNT_PATH="$MNT_ROOT/$STORAGE_ACCOUNT_NAME/$FILE_SHARE_NAME" - - echo "MNT_PATH=$MNT_PATH" | tee -a "$GITHUB_OUTPUT" - - sudo mkdir -p $MNT_PATH - - # Create a folder to store the credentials for this storage account and - # any other that you might set up. - CREDENTIAL_ROOT="/etc/smbcredentials" - sudo mkdir -p "/etc/smbcredentials" - - # Get the storage account key for the indicated storage account. - # You must be logged in with az login and your user identity must have - # permissions to list the storage account keys for this command to work. - STORAGE_ACCOUNT_KEY=$(az storage account keys list \ - --resource-group $RESOURCE_GROUP_NAME \ - --account-name $STORAGE_ACCOUNT_NAME \ - --query "[0].value" --output tsv | tr -d '"') - - # Create the credential file for this individual storage account - SMB_CREDENTIAL_FILE="$CREDENTIAL_ROOT/$STORAGE_ACCOUNT_NAME.cred" - if [ ! -f $SMB_CREDENTIAL_FILE ]; then - echo "username=$STORAGE_ACCOUNT_NAME" | sudo tee $SMB_CREDENTIAL_FILE > /dev/null - echo "password=$STORAGE_ACCOUNT_KEY" | sudo tee -a $SMB_CREDENTIAL_FILE > /dev/null - else - echo "The credential file $SMB_CREDENTIAL_FILE already exists, and was not modified." - fi - - # Change permissions on the credential file so only root can read or modify the password file. - sudo chmod 600 $SMB_CREDENTIAL_FILE - - # This command assumes you have logged in with az login - HTTP_ENDPOINT=$(az storage account show --resource-group $RESOURCE_GROUP_NAME --name $STORAGE_ACCOUNT_NAME --query "primaryEndpoints.file" --output tsv | tr -d '"') - SMB_PATH=$(echo $HTTP_ENDPOINT | cut -c7-${#HTTP_ENDPOINT})$FILE_SHARE_NAME - - STORAGE_ACCOUNT_KEY=$(az storage account keys list --resource-group $RESOURCE_GROUP_NAME --account-name $STORAGE_ACCOUNT_NAME --query "[0].value" --output tsv | tr -d '"') - - sudo mount -t cifs $SMB_PATH $MNT_PATH -o credentials=$SMB_CREDENTIAL_FILE,serverino,nosharesock,actimeo=30,mfsymlinks - - ls -al $MNT_PATH/TestData - - - name: Docker pull image - shell: bash - run: | - docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} - - - name: Checkout repository - uses: actions/checkout@v2 - with: - path: NeMo-Gym - - - name: Start container - shell: bash - run: | - MNT_PATH=${{ steps.azure-fileshare.outputs.mnt_path }} - - ARG=("") - if [[ "${{ inputs.cpu-only }}" == "false" ]]; then - ARG=("--runtime=nvidia --gpus all") - fi - - cmd=$(cat <&1 | tee err.log - - RUN_TEST_EOF - ) - - echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT" - echo "$cmd" | tee "job.sh" - - - name: Run main script - uses: nick-fields/retry@v3 - with: - timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }} - max_attempts: 3 - shell: bash - retry_on: timeout - command: /bin/bash job.sh - on_retry_command: /bin/bash retry_job.sh - - - name: Check result - id: check - shell: bash - run: | - docker exec nemo_container_${{ github.run_id }} coverage combine || true - docker exec nemo_container_${{ github.run_id }} coverage xml - docker cp nemo_container_${{ github.run_id }}:/workspace/.coverage .coverage - docker cp nemo_container_${{ github.run_id }}:/workspace/coverage.xml coverage.xml - - coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen) - echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT" - - IS_SUCCESS=$(tail -n 1 err.log | grep -q "Finished successfully." && echo "true" || echo "false") - - if [[ "$IS_SUCCESS" == "false" && "{% raw %}${{ inputs.is_optional }}" == "true" ]]; then - echo "::warning:: Test failed, but displayed as successful because it is marked as optional." - IS_SUCCESS=true - fi - - if [[ "$IS_SUCCESS" == "false" ]]; then - echo Test did not finish successfully. - exit 1 - fi - - exit $EXIT_CODE - - - name: Test coverage - shell: bash -x -e -u -o pipefail {0} - run: | - docker exec -t nemo_container_${{ github.run_id }} coverage report -i - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - if: ${{ steps.check.outputs.coverage_report != 'none' }} - with: - name: ${{ steps.check.outputs.coverage_report }} - path: | - coverage.xml - .coverage - include-hidden-files: true diff --git a/.github/workflows/cicd-approve-test-queue.yml b/.github/workflows/cicd-approve-test-queue.yml deleted file mode 100644 index 5b4dc7e57..000000000 --- a/.github/workflows/cicd-approve-test-queue.yml +++ /dev/null @@ -1,153 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -name: Approve Test Queue - -on: - schedule: - - cron: '*/5 * * * *' # Runs every 5 minutes - workflow_dispatch: # Allows manual triggering - -jobs: - approve-queue: - runs-on: ubuntu-latest - environment: main - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install requests - - - name: Approve waiting deployments - env: - GITHUB_TOKEN: ${{ secrets.PAT }} - MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }} - run: | - python - <= MAX_CONCURRENCY: - print("Maximum concurrency reached, no new approvals will be made") - exit(0) - - # Get waiting CI workflows for test environment - print("Fetching deployments...") - pending_workflows = make_request("actions/runs?status=waiting").get("workflow_runs", []) - pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"] - - # Sort deployments by creation date (oldest first) - print("Sorting workflows...") - pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"]) - - # Process each deployment - print("Processing ...") - for workflow in pending_workflows: - if total_workflows >= MAX_CONCURRENCY: - print("Maximum concurrency reached, stopping approvals") - break - - workflow_id = workflow["id"] - workflow_name = workflow["display_title"] - print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}") - - deployment_url = f"actions/runs/{workflow_id}/pending_deployments" - deployment = make_request(deployment_url)[0] - environment_id = deployment["environment"]["id"] - - # Approve the deployment - status_data = { - "environment_ids": [environment_id], - "state": "approved", - "comment": "Automatically approved by queue manager" - } - result = make_request(deployment_url, method="POST", data=status_data) - - if result: - total_workflows += 1 - else: - print(f"Failed to approve deployment {deployment['id']}") - exit(1) - - EOF - notify: - if: failure() - runs-on: ubuntu-latest - needs: [approve-queue] - steps: - - name: Notify - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - SLACK_WEBHOOK_ADMIN: - GITHUB_RUN_ID: ${{ github.run_id }} - GITHUB_REPOSITORY: ${{ github.repository }} - run: | - curl -X POST \ - -H 'Content-type: application/json' \ - --data "{\"text\":\":robot_joy: failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \ - $SLACK_WEBHOOK - diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml deleted file mode 100644 index 6cdc78c8d..000000000 --- a/.github/workflows/cicd-main.yml +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -name: CICD NeMo -on: - schedule: - - cron: 0 0 * * * - push: - branches: - - main - - "pull-request/[0-9]+" - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} - cancel-in-progress: true - -permissions: - id-token: write - contents: read - -jobs: - - cicd-wait-in-queue: - runs-on: ubuntu-latest - environment: test - steps: - - name: Running CI tests - run: | - echo "Running CI tests" - - cicd-container-build: - uses: ./.github/workflows/_build_container.yml - needs: cicd-wait-in-queue - with: - image-name: nemo_gym - dockerfile: docker/Dockerfile.ci - secrets: - AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} - AZURE_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }} - AZURE_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - cicd-unit-tests: - strategy: - fail-fast: false - matrix: - include: - - script: L0_Unit_Tests_CPU - runner: linux-amd64-cpu16 - cpu-only: true - needs: [cicd-container-build] - runs-on: ${{ matrix.runner }} - name: ${{ matrix.script }} - environment: nemo-ci - steps: - - name: Checkout - uses: actions/checkout@v4 - - name: main - uses: ./.github/actions/test-template - with: - runner: ${{ runner.name }} - script: ${{ matrix.script }} - timeout: ${{ matrix.timeout || 10 }} - is_unit_test: "true" - image: nemo_gym - cpu-only: ${{ matrix.cpu-only || false }} - has-azure-credentials: "true" - azure-client-id: ${{ secrets.AZURE_CLIENT_ID }} - azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }} - azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - Nemo_CICD_Test: - needs: - - cicd-container-build - - cicd-unit-tests - if: always() - runs-on: ubuntu-latest - permissions: write-all - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Get workflow result - id: result - env: - GH_TOKEN: ${{ github.token }} - RUN_ID: ${{ github.run_id }} - run: | - # Get workflow run details and check job conclusions - LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last') - NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length') - NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length') - - if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then - RESULT="success" - elif [[ $NUM_CANCELLED -gt 0 ]]; then - RESULT="cancelled" - else - RESULT="failure" - fi - - # Output the final status - echo "code=$RESULT" | tee -a $GITHUB_OUTPUT - - - name: Checkout for GH CLI - uses: actions/checkout@v4 - - - name: Remove label if not cancelled - if: | - steps.result.outputs.code != 'cancelled' - && github.event.label.name == 'Run CICD' - && github.event.pull_request.head.repo.full_name == github.repository - env: - GH_TOKEN: ${{ github.token }} - PR_NUMBER: ${{ github.event.number }} - run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD" - - - name: Pipeline successful, add PR comment - if: | - steps.result.outputs.code == 'success' - && github.event_name == 'pull_request' - && env.SLACK_WEBHOOK != '' - uses: peter-evans/create-or-update-comment@v4 - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - REPOSITORY: ${{ github.repository }} - RUN_ID: ${{ github.run_id }} - with: - issue-number: ${{ github.event.number }} - body: | - [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋, - - We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully. - - So it might be time to merge this PR or get some approvals. - - //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc - - - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary" - if: | - steps.result.outputs.code == 'failure' - && github.event.label.name == 'Run CICD' - && env.SLACK_WEBHOOK != '' - env: - SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - REPOSITORY: ${{ github.repository }} - RUN_ID: ${{ github.run_id }} - PR_NUMBER: ${{ github.event.number }} - SERVER_URL: ${{ github.server_url }} - run: | - set -x - pip install PyGithub - export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}} - - python .github/scripts/notify.py - - - name: Exit - if: ${{ always() }} - env: - RESULT: ${{ steps.result.outputs.code }} - run: | - if [ $RESULT == "success" ]; then - exit 0 - else - exit 1 - fi - - Coverage: - runs-on: ubuntu-latest - needs: [Nemo_CICD_Test] - strategy: - matrix: - flag: [unit-test] - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Download coverage reports of current branch - uses: actions/download-artifact@v4 - with: - pattern: coverage-${{ matrix.flag }}-* - - - name: Get total coverage of current branch - shell: bash -x -e -u -o pipefail {0} - if: always() - run: | - pip install coverage - - ls -al . - ls -al coverage-*/ - coverage combine --keep $(ls coverage-*/.coverage) - coverage report -i - rm -rf coverage-* - ls -al - - # Disabled for new repos initially - # - name: Upload coverage reports to Codecov - # uses: codecov/codecov-action@v5 - # with: - # token: ${{ secrets.CODECOV_TOKEN }} - # verbose: true - # flags: ${{ matrix.flag }} - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - name: coverage-${{ matrix.flag }}-aggregated - path: | - .coverage - include-hidden-files: true diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci deleted file mode 100644 index 255f26ec5..000000000 --- a/docker/Dockerfile.ci +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -FROM nvcr.io/nvidia/pytorch:25.05-py3 - -ENV PIP_CONSTRAINT="" - -# Install uv and python -ARG UV_VERSION=0.7.2 -ENV PATH="/root/.local/bin:$PATH" -RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh - -ENV UV_PROJECT_ENVIRONMENT=/opt/venv -ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH" - -WORKDIR /workspace -RUN --mount=type=bind,source=pyproject.toml,target=/workspace/pyproject.toml \ - --mount=type=bind,source=nemo_gym/__init__.py,target=/workspace/nemo_gym/__init__.py \ - --mount=type=bind,source=nemo_gym/package_info.py,target=/workspace/nemo_gym/package_info.py \ - --mount=type=bind,source=uv.lock,target=/workspace/uv.lock bash -exu <<"EOF" - - # Use the container's torch installation rather than reinstall it - uv venv --system-site-packages ${UV_PROJECT_ENVIRONMENT} - uv sync --link-mode symlink --locked --only-group test --only-group dev -EOF diff --git a/pyproject.toml b/pyproject.toml index 449c0ebb7..737aede80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -182,6 +182,9 @@ dev = [ requires = ["setuptools>=61", "setuptools-scm"] build-backend = "setuptools.build_meta" +[tool.uv] +managed = true + [tool.setuptools.dynamic] version = {attr = "nemo_gym.__version__"} readme = {file = "README.md", content-type = "text/markdown"} diff --git a/tests/unit_tests/L0_Unit_Tests_CPU.sh b/tests/unit_tests/L0_Unit_Tests_CPU.sh deleted file mode 100644 index 58768e086..000000000 --- a/tests/unit_tests/L0_Unit_Tests_CPU.sh +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -CUDA_VISIBLE_DEVICES="" coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/unit_tests -m "not pleasefixme" --cpu --with_downloads diff --git a/tests/unit_tests/L0_Unit_Tests_GPU.sh b/tests/unit_tests/L0_Unit_Tests_GPU.sh deleted file mode 100644 index 54c98b699..000000000 --- a/tests/unit_tests/L0_Unit_Tests_GPU.sh +++ /dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -CUDA_VISIBLE_DEVICES="0,1" coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/unit_tests -m "not pleasefixme" --with_downloads