diff --git a/.github/workflows/add-model-like.yml b/.github/workflows/add-model-like.yml
index 3ea3c89249fe..b5cc5d4dfb67 100644
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@@ -14,7 +14,7 @@ on:
 
 jobs:
   run_tests_templates_like:
     name: "Add new model like template tests"
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       - uses: actions/checkout@v3
 
diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index 03ecf450264d..ff252643a9de 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -3,7 +3,7 @@ name: Build docker images (scheduled)
 on:
   push:
     branches:
-      - docker-image*
+      - build_ci_docker_image*
   repository_dispatch:
   workflow_call:
     inputs:
@@ -11,7 +11,7 @@ on:
       required: true
       type: string
   schedule:
-    - cron: "0 1 * * *"
+    - cron: "17 0 * * *"
 
 concurrency:
   group: docker-images-builds
@@ -20,8 +20,18 @@ concurrency:
 jobs:
   latest-docker:
     name: "Latest PyTorch + TensorFlow [dev]"
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
@@ -41,7 +51,7 @@ jobs:
           context: ./docker/transformers-all-latest-gpu
           build-args: |
             REF=main
-          push: true
+          push: false
           tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
 
   # Push CI images still need to be re-built daily -
@@ -54,42 +64,23 @@ jobs:
           context: ./docker/transformers-all-latest-gpu
           build-args: |
             REF=main
-          push: true
+          push: false
           tags: huggingface/transformers-all-latest-gpu-push-ci
 
-  latest-with-torch-nightly-docker:
-    name: "Nightly PyTorch + Stable TensorFlow"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-latest
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v3
-        with:
-          context: ./docker/transformers-all-latest-gpu
-          build-args: |
-            REF=main
-            PYTORCH=pre
-          push: true
-          tags: huggingface/transformers-all-latest-torch-nightly-gpu
-
   latest-torch-deepspeed-docker:
     name: "Latest PyTorch + DeepSpeed"
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
@@ -109,14 +100,24 @@ jobs:
           context: ./docker/transformers-pytorch-deepspeed-latest-gpu
           build-args: |
             REF=main
-          push: true
+          push: false
           tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
 
   # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
   latest-torch-deepspeed-docker-for-push-ci-daily-build:
     name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
       -
         name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
@@ -140,42 +141,14 @@ jobs:
           context: ./docker/transformers-pytorch-deepspeed-latest-gpu
           build-args: |
             REF=main
-          push: true
+          push: false
           tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
 
-  nightly-torch-deepspeed-docker:
-    name: "Nightly PyTorch + DeepSpeed"
-    # Push CI doesn't need this image
-    if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-latest
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      -
-        name: Check out code
-        uses: actions/checkout@v3
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
-        uses: docker/build-push-action@v3
-        with:
-          context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
-          build-args: |
-            REF=main
-          push: true
-          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
-
   doc-builder:
     name: "Doc builder"
     # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       -
         name: Set up Docker Buildx
@@ -194,14 +167,14 @@ jobs:
         uses: docker/build-push-action@v3
         with:
           context: ./docker/transformers-doc-builder
-          push: true
+          push: false
           tags: huggingface/transformers-doc-builder
 
   latest-pytorch:
     name: "Latest PyTorch [dev]"
     # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       -
         name: Set up Docker Buildx
@@ -222,14 +195,14 @@ jobs:
           context: ./docker/transformers-pytorch-gpu
           build-args: |
             REF=main
-          push: true
+          push: false
           tags: huggingface/transformers-pytorch-gpu
 
   latest-tensorflow:
     name: "Latest TensorFlow [dev]"
     # Push CI doesn't need this image
     if: inputs.image_postfix != '-push-ci'
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       -
         name: Set up Docker Buildx
@@ -250,5 +223,5 @@ jobs:
           context: ./docker/transformers-tensorflow-gpu
           build-args: |
             REF=main
-          push: true
+          push: false
           tags: huggingface/transformers-tensorflow-gpu
diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml
new file mode 100644
index 000000000000..fbda02e547d8
--- /dev/null
+++ b/.github/workflows/build-nightly-ci-docker-images.yml
@@ -0,0 +1,75 @@
+name: Build docker images (Nightly CI)
+
+on:
+  workflow_call:
+  push:
+    branches:
+      - build_nightly_ci_docker_image*
+
+concurrency:
+  group: docker-images-builds
+  cancel-in-progress: false
+
+jobs:
+  latest-with-torch-nightly-docker:
+    name: "Nightly PyTorch + Stable TensorFlow"
+    runs-on: rocm
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+            PYTORCH=pre
+          push: false
+          tags: huggingface/transformers-all-latest-torch-nightly-gpu
+
+  nightly-torch-deepspeed-docker:
+    name: "Nightly PyTorch + DeepSpeed"
+    runs-on: rocm
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      -
+        name: Check out code
+        uses: actions/checkout@v3
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v3
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
+          build-args: |
+            REF=main
+          push: false
+          tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml
index 3a0e1612454c..2593b183e743 100644
--- a/.github/workflows/build-past-ci-docker-images.yml
+++ b/.github/workflows/build-past-ci-docker-images.yml
@@ -3,7 +3,7 @@ name: Build docker images (Past CI)
 on:
   push:
     branches:
-      - past-ci-docker-image*
+      - build_past_ci_docker_image*
 
 concurrency:
   group: docker-images-builds
@@ -15,8 +15,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: ["1.11", "1.10", "1.9", "1.8", "1.7", "1.6", "1.5", "1.4"]
-    runs-on: ubuntu-latest
+        version: ["1.13", "1.12", "1.11", "1.10", "1.9"]
+    runs-on: rocm
     steps:
       -
         name: Set up Docker Buildx
@@ -24,6 +24,17 @@
       -
         name: Check out code
         uses: actions/checkout@v3
+      -
+        id: get-base-image
+        name: Get Base Image
+        env:
+          framework_version: ${{ matrix.version }}
+        run: |
+          echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["pytorch"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT
+      -
+        name: Print Base Image
+        run: |
+          echo ${{ steps.get-base-image.outputs.base_image }}
       -
         name: Login to DockerHub
         uses: docker/login-action@v2
@@ -37,9 +48,10 @@ jobs:
           context: ./docker/transformers-past-gpu
           build-args: |
             REF=main
+            BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }}
             FRAMEWORK=pytorch
             VERSION=${{ matrix.version }}
-          push: true
+          push: false
           tags: huggingface/transformers-pytorch-past-${{ matrix.version }}-gpu
 
   past-tensorflow-docker:
@@ -47,8 +59,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        version: ["2.8", "2.7", "2.6", "2.5"]
-    runs-on: ubuntu-latest
+        version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"]
+    runs-on: rocm
     steps:
       -
         name: Set up Docker Buildx
@@ -57,37 +69,16 @@ jobs:
         name: Check out code
         uses: actions/checkout@v3
       -
-        name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+        id: get-base-image
+        name: Get Base Image
+        env:
+          framework_version: ${{ matrix.version }}
+        run: |
+          echo "base_image=$(python3 -c 'import os; from utils.past_ci_versions import past_versions_testing; base_image = past_versions_testing["tensorflow"][os.environ["framework_version"]]["base_image"]; print(base_image)')" >> $GITHUB_OUTPUT
       -
-        name: Build and push
-        uses: docker/build-push-action@v3
-        with:
-          context: ./docker/transformers-past-gpu
-          build-args: |
-            REF=main
-            FRAMEWORK=tensorflow
-            VERSION=${{ matrix.version }}
-          push: true
-          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
-
-  past-tensorflow-docker-2-4:
-    name: "Past TensorFlow Docker"
-    strategy:
-      fail-fast: false
-      matrix:
-        version: ["2.4"]
-    runs-on: ubuntu-latest
-    steps:
-      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      -
-        name: Check out code
-        uses: actions/checkout@v3
+        name: Print Base Image
+        run: |
+          echo ${{ steps.get-base-image.outputs.base_image }}
       -
         name: Login to DockerHub
         uses: docker/login-action@v2
@@ -101,8 +92,8 @@ jobs:
           context: ./docker/transformers-past-gpu
           build-args: |
             REF=main
-            BASE_DOCKER_IMAGE=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04
+            BASE_DOCKER_IMAGE=${{ steps.get-base-image.outputs.base_image }}
             FRAMEWORK=tensorflow
             VERSION=${{ matrix.version }}
-          push: true
-          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
\ No newline at end of file
+          push: false
+          tags: huggingface/transformers-tensorflow-past-${{ matrix.version }}-gpu
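The `get-base-image` steps above read a `past_versions_testing` mapping from `utils/past_ci_versions.py` in the transformers repo. The inline `python3 -c` lookup only relies on the structure sketched below; the concrete entries and image tags here are illustrative placeholders, not the real contents of that file:

```python
# Hypothetical sketch of utils/past_ci_versions.py -- only the shape that the
# workflow's inline lookup depends on. Real entries live in the repository.
past_versions_testing = {
    "pytorch": {
        "1.13": {"base_image": "rocm/pytorch:rocm5.2_ubuntu20.04_py3.7_pytorch_1.13"},  # illustrative tag
        "1.12": {"base_image": "rocm/pytorch:rocm5.2_ubuntu20.04_py3.7_pytorch_1.12"},  # illustrative tag
    },
    "tensorflow": {
        "2.11": {"base_image": "nvidia/cuda:11.2.2-cudnn8-devel-ubuntu20.04"},  # illustrative tag
    },
}

if __name__ == "__main__":
    import os

    # Mirrors the workflow step: `framework_version` comes from the job matrix,
    # and the printed value is appended to $GITHUB_OUTPUT as `base_image`.
    framework_version = os.environ.get("framework_version", "1.13")
    print(past_versions_testing["pytorch"][framework_version]["base_image"])
```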
diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index 9f29a7d7a7ef..4e59cfeb9d0d 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -15,6 +15,6 @@ jobs:
       commit_sha: ${{ github.sha }}
       package: transformers
       notebook_folder: transformers_doc
-      languages: de en es it ko pt zh
+      languages: de en es fr it ko pt zh
     secrets:
       token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml
index 0c8aa237f36e..640a0cb2f59f 100644
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@@ -14,4 +14,4 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: transformers
-      languages: de en es it ko pt zh
+      languages: de en es fr it ko pt zh
diff --git a/.github/workflows/check_runner_status.yml b/.github/workflows/check_runner_status.yml
index 8912e32c94ee..5d7578e0eae4 100644
--- a/.github/workflows/check_runner_status.yml
+++ b/.github/workflows/check_runner_status.yml
@@ -18,7 +18,7 @@ env:
 jobs:
   check_runner_status:
     name: Check Runner Status
-    runs-on: ubuntu-latest
+    runs-on: rocm
    outputs:
      offline_runners: ${{ steps.set-offline_runners.outputs.offline_runners }}
    steps:
@@ -39,7 +39,7 @@
 
   send_results:
     name: Send results to webhook
-    runs-on: ubuntu-latest
+    runs-on: rocm
     needs: check_runner_status
     if: ${{ failure() }}
     steps:
@@ -57,6 +57,7 @@
           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
           CI_EVENT: runner status check
           RUNNER_STATUS: ${{ needs.check_runner_status.result }}
           OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }}
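The runner-status jobs in this diff call `utils/check_self_hosted_runner.py` with `--target_runners` and `--token`. That script's internals are not shown here; a minimal hypothetical stand-in doing the same kind of check would query GitHub's documented self-hosted-runners REST endpoint (the function name and argument handling below are assumptions, only the endpoint is GitHub's):

```python
# Hypothetical minimal version of a runner-status check; the real script is
# utils/check_self_hosted_runner.py in huggingface/transformers.
from typing import Dict, List

import requests


def get_offline_runners(token: str, target_runners: List[str]) -> List[Dict]:
    # Documented GitHub REST endpoint: list self-hosted runners for a repository.
    url = "https://api.github.com/repos/huggingface/transformers/actions/runners"
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json"}
    runners = requests.get(url, headers=headers, timeout=30).json()["runners"]
    return [r for r in runners if r["name"] in target_runners and r["status"] == "offline"]


if __name__ == "__main__":
    # Placeholder token/runner names; in CI these come from secrets and flags.
    offline = get_offline_runners("ghp_example_token", ["single-gpu-past-ci-runner-docker"])
    print(offline)  # a non-empty list is what `send_results` reports to Slack
```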
diff --git a/.github/workflows/check_tiny_models.yml b/.github/workflows/check_tiny_models.yml
new file mode 100644
index 000000000000..6e4c327f95b9
--- /dev/null
+++ b/.github/workflows/check_tiny_models.yml
@@ -0,0 +1,82 @@
+name: Check Tiny Models
+
+on:
+  push:
+    branches:
+      - check_tiny_models*
+  repository_dispatch:
+  schedule:
+    - cron: "0 2 * * *"
+
+env:
+  TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }}
+
+jobs:
+  check_tiny_models:
+    name: Check tiny models
+    runs-on: rocm
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          # Semantic version range syntax or exact version of a Python version
+          python-version: '3.8'
+          # Optional - x64 or x86 architecture, defaults to x64
+          architecture: 'x64'
+
+      - name: Install
+        run: |
+          sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake
+          pip install --upgrade pip
+          python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu]
+          pip install tensorflow_probability
+          python -m pip install -U natten
+
+      - name: Create all tiny models (locally)
+        run: |
+          python utils/create_dummy_models.py tiny_local_models --all --num_workers 2
+
+      - name: Local tiny model reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_local_model_creation_reports
+          path: tiny_local_models/reports
+
+      # GitHub-hosted runners have 2-core CPUs
+      - name: Run pipeline tests against all new (local) tiny models
+        run: |
+          OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_local_model_creation_reports
+          path: reports/tests_pipelines
+
+      - name: Create + Upload tiny models for new model architecture(s)
+        run: |
+          python utils/update_tiny_models.py --num_workers 2
+
+      - name: Full report
+        run: cat tiny_models/reports/tiny_model_creation_report.json
+
+      - name: Failure report
+        run: cat tiny_models/reports/simple_failed_report.txt
+
+      - name: Summary report
+        run: cat tiny_models/reports/tiny_model_summary.json
+
+      - name: New tiny model creation reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: tiny_model_creation_reports
+          path: tiny_models/reports
diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml
index d65698e2a4f3..cc1efcc591c6 100644
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@@ -6,7 +6,7 @@ on:
     - doctest*
   repository_dispatch:
   schedule:
-    - cron: "0 2 * * *"
+    - cron: "17 2 * * *"
 
 
 env:
@@ -20,32 +20,35 @@ env:
 
 jobs:
   run_doctests:
-    runs-on: [self-hosted, doc-tests-gpu]
+    runs-on: rocm
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
+      - name: uninstall transformers (installed during docker image build)
+        run: python3 -m pip uninstall -y transformers
+
       - uses: actions/checkout@v3
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
+
+      - name: Install transformers in edit mode
+        run: python3 -m pip install -e .
 
       - name: GPU visibility
         run: |
           python3 utils/print_env.py
 
-      - name: Prepare files for doctests
-        run: |
-          python3 utils/prepare_for_doc_test.py src docs
+      - name: Show installed libraries and their versions
+        run: pip freeze
 
       - name: Run doctests
         run: |
           python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
 
-      - name: Clean files after doctests
-        run: |
-          python3 utils/prepare_for_doc_test.py src docs --remove_new_line
-
       - name: Failure short reports
         if: ${{ failure() }}
         continue-on-error: true
@@ -61,7 +64,7 @@
 
   send_results:
     name: Send results to webhook
-    runs-on: ubuntu-latest
+    runs-on: rocm
     if: always()
     needs: [run_doctests]
     steps:
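The doctest job above collects examples with `pytest --doctest-modules` over the files listed in `utils/documentation_tests.txt` (plus `*.mdx` docs). For reference, this is the kind of docstring example that collection mode executes — a generic illustration, not code taken from transformers:

```python
# doctest_demo.py -- the style of example `pytest --doctest-modules` runs.
def add(a: int, b: int) -> int:
    """Add two integers.

    >>> add(2, 3)
    5
    """
    return a + b
```

Running `python3 -m pytest --doctest-modules doctest_demo.py` executes the `>>>` line and fails the test if the printed output differs from `5`; `--doctest-continue-on-failure` keeps going past the first mismatch.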
diff --git a/.github/workflows/model-templates.yml b/.github/workflows/model-templates.yml
index 3830c23fe048..16389349d7bb 100644
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   run_tests_templates:
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       - name: Checkout repository
         uses: actions/checkout@v3
diff --git a/.github/workflows/release-conda.yml b/.github/workflows/release-conda.yml
index 4cc0b662fcc8..3d1656cc7cef 100644
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@@ -12,7 +12,7 @@ env:
 
 jobs:
   build_and_package:
-    runs-on: ubuntu-latest
+    runs-on: rocm
     defaults:
       run:
         shell: bash -l {0}
diff --git a/.github/workflows/self-nightly-past-ci-caller.yml b/.github/workflows/self-nightly-past-ci-caller.yml
new file mode 100644
index 000000000000..843269512dbd
--- /dev/null
+++ b/.github/workflows/self-nightly-past-ci-caller.yml
@@ -0,0 +1,156 @@
+name: Self-hosted runner (nightly-past-ci-caller)
+
+on:
+  schedule:
+    # 2:17 am on each Sunday and Thursday
+
+    - cron: "17 2 * * 0,4"
+  push:
+    branches:
+      - run_nightly_ci*
+      - run_past_ci*
+
+jobs:
+  build_nightly_ci_images:
+    name: Build Nightly CI Docker Images
+    if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
+    uses: ./.github/workflows/build-nightly-ci-docker-images.yml
+    secrets: inherit
+
+  run_nightly_ci:
+    name: Nightly CI
+    needs: [build_nightly_ci_images]
+    uses: ./.github/workflows/self-nightly-scheduled.yml
+    secrets: inherit
+
+  run_past_ci_pytorch_1-13:
+    name: PyTorch 1.13
+    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    needs: [run_nightly_ci]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.13"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_pytorch_1-12:
+    name: PyTorch 1.12
+    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    needs: [run_past_ci_pytorch_1-13]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.12"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_pytorch_1-11:
+    name: PyTorch 1.11
+    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    needs: [run_past_ci_pytorch_1-12]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.11"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_pytorch_1-10:
+    name: PyTorch 1.10
+    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    needs: [run_past_ci_pytorch_1-11]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.10"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_pytorch_1-9:
+    name: PyTorch 1.9
+    if: (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
+    needs: [run_past_ci_pytorch_1-10]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: pytorch
+      version: "1.9"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-11:
+    name: TensorFlow 2.11
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_pytorch_1-9]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.11"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-10:
+    name: TensorFlow 2.10
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_tensorflow_2-11]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.10"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-9:
+    name: TensorFlow 2.9
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_tensorflow_2-10]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.9"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-8:
+    name: TensorFlow 2.8
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_tensorflow_2-9]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.8"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-7:
+    name: TensorFlow 2.7
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_tensorflow_2-8]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.7"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-6:
+    name: TensorFlow 2.6
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_tensorflow_2-7]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.6"
+      sha: ${{ github.sha }}
+    secrets: inherit
+
+  run_past_ci_tensorflow_2-5:
+    name: TensorFlow 2.5
+    if: (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci'))
+    needs: [run_past_ci_tensorflow_2-6]
+    uses: ./.github/workflows/self-past.yml
+    with:
+      framework: tensorflow
+      version: "2.5"
+      sha: ${{ github.sha }}
+    secrets: inherit
diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
index accccf6164bc..7b2977a2443a 100644
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -1,4 +1,4 @@
-name: Self-hosted runner (nightly)
+name: Self-hosted runner (nightly-ci)
 
 # Note that each job's dependencies go into a corresponding docker file.
 #
@@ -8,9 +8,7 @@
 
 on:
   repository_dispatch:
-# Disable temporarily until the test suite can be run under 12 hours.
-#  schedule:
-#    - cron: "0 16 * * *"
+  workflow_call:
 
 env:
   HF_HOME: /mnt/cache
@@ -25,7 +23,7 @@ env:
 jobs:
   check_runner_status:
     name: Check Runner Status
-    runs-on: ubuntu-latest
+    runs-on: [rocm, gfx90a, rocm-org]
     steps:
       - name: Checkout transformers
         uses: actions/checkout@v3
@@ -33,7 +31,7 @@ jobs:
           fetch-depth: 2
 
       - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
 
   check_runners:
     name: Check Runners
@@ -41,14 +39,16 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
   setup:
     name: Setup
@@ -56,10 +56,10 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -85,9 +85,11 @@ jobs:
         run: |
           echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
   run_tests_single_gpu:
     name: Model tests
@@ -96,10 +98,10 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup.outputs.matrix) }}
         machine_type: [single-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
@@ -117,9 +119,11 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -143,7 +147,7 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
           path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
 
   run_tests_multi_gpu:
@@ -153,10 +157,10 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup.outputs.matrix) }}
         machine_type: [multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-all-latest-torch-nightly-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
@@ -174,9 +178,11 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -200,7 +206,7 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly
           path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
 
   run_all_tests_torch_cuda_extensions_gpu:
@@ -209,11 +215,11 @@ jobs:
       fail-fast: false
       matrix:
         machine_type: [single-gpu, multi-gpu]
-    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-nightly-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
         working-directory: /workspace/transformers
@@ -229,11 +235,13 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
           nvidia-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /workspace/transformers
@@ -258,7 +266,7 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
+          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly
           path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
 
   send_results:
@@ -291,7 +299,8 @@ jobs:
           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
-          CI_EVENT: nightly-build
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          CI_EVENT: Nightly CI
           RUNNER_STATUS: ${{ needs.check_runner_status.result }}
           RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
           SETUP_STATUS: ${{ needs.setup.result }}
@@ -301,3 +310,11 @@ jobs:
           pip install slack_sdk
           pip show slack_sdk
           python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
+
+
+      # delete-artifact
+      - uses: geekyeggo/delete-artifact@v2
+        with:
+          name: |
+            single-*
+            multi-*
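The `set-matrix` one-liner used by the `setup` jobs above is dense. Expanded for readability, the same logic looks like this (run from the `tests/` directory of the checkout; the workflow pipes the printed list into `$GITHUB_OUTPUT` as `matrix`):

```python
# Readable version of the workflow's inline matrix builder: per-model test
# folders come first, then the remaining top-level test folders, with the
# `models` parent directory itself removed from the list.
import os

tests = os.getcwd()
model_tests = os.listdir(os.path.join(tests, "models"))
d1 = sorted(list(filter(os.path.isdir, os.listdir(tests))))
d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests])))
d1.remove("models")
d = d2 + d1
print(d)  # e.g. ["models/albert", ..., "generation", "pipelines", ...]
```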
diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
index c59800445bdc..416e8ed9cf47 100644
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -1,4 +1,4 @@
-name: Self-hosted runner (past)
+name: Self-hosted runner (past-ci)
 
 # Note that each job's dependencies go into a corresponding docker file.
 #
@@ -34,7 +34,7 @@
 jobs:
   check_runner_status:
     name: Check Runner Status
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       - name: Checkout transformers
         uses: actions/checkout@v3
@@ -53,11 +53,13 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
   setup:
     name: Setup
@@ -68,7 +70,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -104,7 +106,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
@@ -122,9 +124,17 @@ jobs:
           echo "$matrix_folders"
           echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
+
+      - name: Install
+        if: inputs.framework == 'pytorch'
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
 
       - name: Environment
         working-directory: /transformers
@@ -157,7 +167,7 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
           path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
 
   run_tests_multi_gpu:
@@ -170,7 +180,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
     container:
       image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
@@ -188,9 +198,17 @@ jobs:
           echo "$matrix_folders"
           echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
+
+      - name: Install
+        if: inputs.framework == 'pytorch'
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
 
       - name: Environment
         working-directory: /transformers
@@ -223,14 +241,87 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
           path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
 
+  run_all_tests_torch_cuda_extensions_gpu:
+    name: Torch CUDA extension tests
+    if: inputs.framework == 'pytorch'
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker-past-ci') }}
+    needs: setup
+    container:
+      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Install
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
+      - name: Remove cached torch extensions
+        run: rm -rf /github/home/.cache/torch_extensions/
+
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          rm -rf DeepSpeed
+          git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu
+
   send_results:
     name: Send results to webhook
-    runs-on: ubuntu-latest
+    runs-on: rocm
     if: always()
-    needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu]
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_all_tests_torch_cuda_extensions_gpu
+    ]
     steps:
       - name: Preliminary job status
         shell: bash
@@ -254,6 +345,7 @@ jobs:
           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
           CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
           RUNNER_STATUS: ${{ needs.check_runner_status.result }}
           RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
@@ -271,4 +363,11 @@ jobs:
         uses: actions/upload-artifact@v3
         with:
           name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }}
-          path: test_failure_tables
\ No newline at end of file
+          path: test_failure_tables
+
+      # delete-artifact
+      - uses: geekyeggo/delete-artifact@v2
+        with:
+          name: |
+            single-*
+            multi-*
diff --git a/.github/workflows/self-push-caller.yml b/.github/workflows/self-push-caller.yml
index 994567c5cdbd..83a8d869acf1 100644
--- a/.github/workflows/self-push-caller.yml
+++ b/.github/workflows/self-push-caller.yml
@@ -14,7 +14,7 @@ on:
 jobs:
   check-for-setup:
-    runs-on: ubuntu-latest
+    runs-on: rocm
     name: Check if setup was changed
     outputs:
       changed: ${{ steps.was_changed.outputs.changed }}
@@ -46,9 +46,9 @@
 
   run_push_ci:
     name: Trigger Push CI
-    runs-on: ubuntu-latest
+    runs-on: rocm
     if: ${{ always() }}
     needs: build-docker-containers
     steps:
       - name: Trigger push CI via workflow_run
-        run: echo "Trigger push CI via workflow_run"
\ No newline at end of file
+        run: echo "Trigger push CI via workflow_run"
--global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + send_results: name: Send results to webhook - runs-on: ubuntu-latest + runs-on: rocm if: always() - needs: [check_runner_status, check_runners, setup, run_tests_single_gpu, run_tests_multi_gpu] + needs: [ + check_runner_status, + check_runners, + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_all_tests_torch_cuda_extensions_gpu + ] steps: - name: Preliminary job status shell: bash @@ -254,6 +345,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }} RUNNER_STATUS: ${{ needs.check_runner_status.result }} RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} @@ -271,4 +363,11 @@ jobs: uses: actions/upload-artifact@v3 with: name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} - path: test_failure_tables \ No newline at end of file + path: test_failure_tables + + # delete-artifact + - uses: geekyeggo/delete-artifact@v2 + with: + name: | + single-* + multi-* diff --git a/.github/workflows/self-push-caller.yml b/.github/workflows/self-push-caller.yml index 994567c5cdbd..83a8d869acf1 100644 --- a/.github/workflows/self-push-caller.yml +++ b/.github/workflows/self-push-caller.yml @@ -14,7 +14,7 @@ on: jobs: check-for-setup: - runs-on: ubuntu-latest + runs-on: rocm name: Check if setup was changed outputs: changed: ${{ steps.was_changed.outputs.changed }} @@ -46,9 +46,9 @@ jobs: run_push_ci: name: Trigger Push CI - runs-on: ubuntu-latest + runs-on: rocm if: ${{ always() }} needs: build-docker-containers steps: - name: Trigger push CI via workflow_run - run: echo "Trigger push CI via workflow_run" \ No newline at end of file + run: echo "Trigger push CI via workflow_run" diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index b6c3a70e3eb8..5eb09e48e431 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -29,7 +29,7 @@ env: jobs: check_runner_status: name: Check Runner Status - runs-on: ubuntu-latest + runs-on: rocm steps: - name: Checkout transformers uses: actions/checkout@v3 @@ -48,11 +48,13 @@ jobs: runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: 
huggingface/transformers-all-latest-gpu-push-ci - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: NVIDIA-SMI + - name: ROCM-SMI run: | - nvidia-smi + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" setup: name: Setup @@ -63,7 +65,7 @@ jobs: runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-all-latest-gpu-push-ci - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} test_map: ${{ steps.set-matrix.outputs.test_map }} @@ -161,7 +163,7 @@ jobs: runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-all-latest-gpu-push-ci - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) @@ -207,9 +209,11 @@ jobs: echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - name: NVIDIA-SMI + - name: ROCM-SMI run: | - nvidia-smi + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" - name: Environment working-directory: /transformers @@ -250,7 +254,7 @@ jobs: runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-all-latest-gpu-push-ci - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) @@ -296,9 +300,11 @@ jobs: echo "$matrix_folders" echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - name: NVIDIA-SMI + - name: ROCM-SMI run: | - nvidia-smi + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" - name: Environment working-directory: /transformers @@ -339,7 +345,7 @@ jobs: runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) @@ -381,11 +387,13 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 
DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - - name: NVIDIA-SMI + - name: ROCM-SMI run: | - nvidia-smi + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" - name: Environment working-directory: /workspace/transformers @@ -425,7 +433,7 @@ jobs: runs-on: [self-hosted, docker-gpu, '${{ matrix.machine_type }}'] container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: # Necessary to get the correct branch name and commit SHA for `workflow_run` event # We also take into account the `push` event (we might want to test some changes in a branch) @@ -467,11 +475,13 @@ jobs: working-directory: /workspace run: | python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - - name: NVIDIA-SMI + - name: ROCM-SMI run: | - nvidia-smi + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" - name: Environment working-directory: /workspace/transformers @@ -502,7 +512,7 @@ jobs: send_results: name: Send results to webhook - runs-on: ubuntu-latest + runs-on: rocm if: always() needs: [ check_runner_status, @@ -568,6 +578,7 @@ jobs: CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} CI_EVENT: push CI_TITLE_PUSH: ${{ github.event.head_commit.message }} CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 750f4a956943..7e1adbd9b8e4 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -9,7 +9,10 @@ name: Self-hosted runner (scheduled) on: repository_dispatch: schedule: - - cron: "0 2 * * *" + - cron: "17 2 * * *" + push: + branches: + - run_scheduled_ci* env: HF_HOME: /mnt/cache @@ -24,7 +27,7 @@ env: jobs: check_runner_status: name: Check Runner Status - runs-on: ubuntu-latest + runs-on: rocm steps: - name: Checkout transformers uses: actions/checkout@v3 @@ -43,11 +46,13 @@ jobs: runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} container: image: huggingface/transformers-all-latest-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: - - name: NVIDIA-SMI + - name: ROCM-SMI run: | - nvidia-smi + rocm-smi + - name: ROCM info gfx + run: rocminfo | grep "gfx*" setup: name: Setup @@ -58,7 +63,7 @@ jobs: runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }} container: image: huggingface/transformers-all-latest-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v 
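The `send_results` jobs throughout this diff install `slack_sdk` and run `utils/notification_service.py`, whose internals are not shown here. The core call that script ultimately builds on is the standard Slack client; the sketch below shows only that standard API, with placeholder env-var names for the token and channel (an assumption — the real script's reporting logic, message formatting, and configuration are considerably richer):

```python
# Minimal sketch of the Slack call a CI notification script can make; the
# token and channel values here are placeholders, not the real CI secrets.
import os

from slack_sdk import WebClient

client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
client.chat_postMessage(
    channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
    text="Push CI results: see the attached test report artifacts.",
)
```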
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 750f4a956943..7e1adbd9b8e4 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -9,7 +9,10 @@ name: Self-hosted runner (scheduled)
 on:
   repository_dispatch:
   schedule:
-    - cron: "0 2 * * *"
+    - cron: "17 2 * * *"
+  push:
+    branches:
+      - run_scheduled_ci*
 
 env:
   HF_HOME: /mnt/cache
@@ -24,7 +27,7 @@ env:
 jobs:
   check_runner_status:
     name: Check Runner Status
-    runs-on: ubuntu-latest
+    runs-on: rocm
     steps:
       - name: Checkout transformers
         uses: actions/checkout@v3
@@ -43,11 +46,13 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
   setup:
     name: Setup
@@ -58,7 +63,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
@@ -84,9 +89,11 @@ jobs:
         run: |
           echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
   run_tests_single_gpu:
     name: Model tests
@@ -98,7 +105,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
@@ -116,9 +123,11 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -155,7 +164,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Echo folder ${{ matrix.folders }}
@@ -173,9 +182,11 @@ jobs:
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -211,16 +222,18 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -258,16 +271,18 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-pytorch-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
         working-directory: /transformers
         run: git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -304,7 +319,7 @@ jobs:
     runs-on: ${{ format('{0}-{1}', matrix.machine_type, 'docker') }}
     container:
       image: huggingface/transformers-tensorflow-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     needs: setup
     steps:
       - name: Update clone
@@ -312,9 +327,11 @@ jobs:
         working-directory: /transformers
         run: |
           git fetch && git checkout ${{ github.sha }}
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /transformers
@@ -352,7 +369,7 @@ jobs:
     needs: setup
     container:
       image: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 16G -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: Update clone
         working-directory: /workspace/transformers
@@ -366,11 +383,13 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
-      - name: NVIDIA-SMI
+      - name: ROCM-SMI
         run: |
-          nvidia-smi
+          rocm-smi
+      - name: ROCM info gfx
+        run: rocminfo | grep "gfx*"
 
       - name: Environment
         working-directory: /workspace/transformers
@@ -400,7 +419,7 @@ jobs:
 
   run_extract_warnings:
     name: Extract warnings in CI artifacts
-    runs-on: ubuntu-latest
+    runs-on: rocm
     if: always()
     needs: [
       check_runner_status,
@@ -450,7 +469,7 @@ jobs:
 
   send_results:
     name: Send results to webhook
-    runs-on: ubuntu-latest
+    runs-on: [rocm, gfx90a, rocm-org]
     if: always()
     needs: [
       check_runner_status,
@@ -482,13 +501,25 @@ jobs:
           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
           CI_EVENT: scheduled
+          CI_SHA: ${{ github.sha }}
+          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
           RUNNER_STATUS: ${{ needs.check_runner_status.result }}
           RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
           SETUP_STATUS: ${{ needs.setup.result }}
         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
         run: |
+          sudo apt-get install -y curl
           pip install slack_sdk
           pip show slack_sdk
           python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: test_failure_tables
+          path: test_failure_tables
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
index 9412442a7d0a..a1b547a543c1 100644
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@@ -8,7 +8,7 @@ jobs:
   close_stale_issues:
     name: Close Stale Issues
     if: github.repository == 'huggingface/transformers'
-    runs-on: ubuntu-latest
+    runs-on: rocm
     env:
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml
index f6c9afd15b7e..e7fe4b66b1cf 100644
--- a/.github/workflows/update_metdata.yml
+++ b/.github/workflows/update_metdata.yml
@@ -4,11 +4,11 @@ on:
   push:
     branches:
       - main
-      - update_transformers_metadata
+      - update_transformers_metadata*
 
 jobs:
   build_and_package:
-    runs-on: ubuntu-latest
+    runs-on: rocm
     defaults:
       run:
         shell: bash -l {0}
@@ -16,25 +16,12 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - name: Load cached virtual environment
-        uses: actions/cache@v2
-        id: cache
-        with:
-          path: ~/venv/
-          key: v3-metadata-${{ hashFiles('setup.py') }}
-
-      - name: Create virtual environment on cache miss
-        if: steps.cache.outputs.cache-hit != 'true'
-        run: |
-          python -m venv ~/venv && . ~/venv/bin/activate
-          pip install --upgrade pip
-
       - name: Setup environment
         run: |
-          . ~/venv/bin/activate
-          pip install git+https://github.com/huggingface/transformers#egg=transformers[dev]
+          pip install --upgrade pip
+          pip install datasets pandas
+          pip install .[torch,tf,flax]
 
       - name: Update metadata
         run: |
-          . ~/venv/bin/activate
           python utils/update_metadata.py --token ${{ secrets.SYLVAIN_HF_TOKEN }} --commit_sha ${{ github.sha }}