diff --git a/.github/workflows/check_failed_tests.yml b/.github/workflows/check_failed_tests.yml index aa60275b588f..5f37e7f9541e 100644 --- a/.github/workflows/check_failed_tests.yml +++ b/.github/workflows/check_failed_tests.yml @@ -6,9 +6,6 @@ on: docker: required: true type: string - start_sha: - required: true - type: string job: required: true type: string @@ -24,7 +21,13 @@ on: commit_sha: required: false type: string - + pr_number: + required: false + type: string + outputs: + report: + description: "Content of the report of new failures" + value: ${{ jobs.process_new_failures_with_commit_info.outputs.report }} env: HF_HOME: /mnt/cache @@ -88,27 +91,55 @@ jobs: echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV fi - if [ -f setup_values/other_workflow_run_id.txt ]; then - echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV - else - echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV - fi - - name: Update clone working-directory: /transformers if: ${{ env.process == 'true' }} - run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} + run: | + git fetch origin ${{ inputs.commit_sha || github.sha }} + git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - - name: Get target commit + - name: Get `START_SHA` working-directory: /transformers/utils if: ${{ env.process == 'true' }} + run: | + echo "START_SHA=${{ inputs.commit_sha || github.sha }}" >> $GITHUB_ENV + + # This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified) + - name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR + id: pr_info + if: ${{ env.process == 'true' && inputs.pr_number != '' }} + uses: actions/github-script@v6 + with: + script: | + const { data: pr } = await github.rest.pulls.get({ + owner: context.repo.owner, + repo: context.repo.repo, + pull_number: ${{ inputs.pr_number }} + }); + + const { data: merge_commit } = await 
github.rest.repos.getCommit({ + owner: pr.base.repo.owner.login, + repo: pr.base.repo.name, + ref: pr.merge_commit_sha, + }); + + core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha); + + # Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow. + # (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.) + - name: Get `END_SHA` from previous CI runs of the same workflow + working-directory: /transformers/utils + if: ${{ env.process == 'true' && inputs.pr_number == '' }} run: | echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV - - name: Checkout to `start_sha` - working-directory: /transformers - if: ${{ env.process == 'true' }} - run: git fetch && git checkout ${{ inputs.start_sha }} + # However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the + # parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to + # see if a reported failing test is actually ONLY failing on the `merge_commit`. 
+ - name: Set `END_SHA` + if: ${{ env.process == 'true' && inputs.pr_number != '' }} + run: | + echo "END_SHA=${{ steps.pr_info.outputs.merge_commit_base_sha }}" >> $GITHUB_ENV - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers @@ -138,7 +169,7 @@ jobs: - name: Check failed tests working-directory: /transformers if: ${{ env.process == 'true' }} - run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json + run: python3 utils/check_bad_commit.py --start_commit ${{ env.START_SHA }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json - name: Show results working-directory: /transformers @@ -159,6 +190,8 @@ jobs: if: needs.check_new_failures.outputs.process == 'true' runs-on: group: aws-g5-4xlarge-cache + outputs: + report: ${{ steps.set_output.outputs.report }} container: image: ${{ inputs.docker }} options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -190,18 +223,9 @@ jobs: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - - - name: Process report - shell: bash - working-directory: /transformers - env: - ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} - JOB_NAME: ${{ inputs.job }} - REPORT_REPO_ID: ${{ inputs.report_repo_id }} run: | - python3 utils/process_bad_commit_report.py + git fetch origin ${{ inputs.commit_sha || github.sha }} + git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Process report shell: bash @@ -218,6 +242,29 @@ 
jobs: echo EOF } >> "$GITHUB_ENV" + # The output is useful if a caller needs more processing, for example, we have a chain + # self-comment-ci.yml -> self-scheduled.yml -> this one (check_failed_tests.yml), + # and `self-comment-ci.yml` needs further processing before sending a GitHub comment to the pull request page. + - name: Show results & Set outputs + id: set_output + working-directory: /transformers + run: | + ls -l new_failures_with_bad_commit.json + cat new_failures_with_bad_commit.json + + { + echo 'report<> "$GITHUB_OUTPUT" + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: new_failures_with_bad_commit_${{ inputs.job }} + path: /transformers/new_failures_with_bad_commit.json + - name: Prepare Slack report title working-directory: /transformers run: | diff --git a/.github/workflows/get-pr-info.yml b/.github/workflows/get-pr-info.yml index 989281e5b904..0f60c039349f 100644 --- a/.github/workflows/get-pr-info.yml +++ b/.github/workflows/get-pr-info.yml @@ -39,6 +39,9 @@ on: PR_MERGE_COMMIT_SHA: description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository" value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }} + PR_MERGE_COMMIT_BASE_SHA: + description: "The sha of the parent commit of the the merge commit on the target branch in the base repository" + value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }} PR_HEAD_COMMIT_DATE: description: "The date of the head sha of the pull request branch in the head repository" value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }} @@ -74,6 +77,7 @@ jobs: PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }} PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }} PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }} + PR_MERGE_COMMIT_BASE_SHA: ${{ steps.pr_info.outputs.merge_commit_base_sha }} PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }} PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }} 
PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }} @@ -122,6 +126,7 @@ jobs: core.setOutput('base_ref', pr.base.ref); core.setOutput('head_sha', pr.head.sha); core.setOutput('base_sha', pr.base.sha); + core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha); core.setOutput('merge_commit_sha', pr.merge_commit_sha); core.setOutput('pr', pr); @@ -142,6 +147,10 @@ jobs: date: merge_commit.commit.committer.date }); + console.log('PR Info:', { + pr_info: pr + }); + - name: Convert dates to timestamps id: get_timestamps run: | diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 9c946d7974a1..69c84f22fe8d 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -80,7 +80,9 @@ jobs: - name: Update clone working-directory: /transformers - run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} + run: | + git fetch origin ${{ inputs.commit_sha || github.sha }} + git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Reinstall transformers in edit mode (remove the one installed during docker image build) working-directory: /transformers @@ -174,7 +176,7 @@ jobs: collated_reports: name: Collated Reports - if: ${{ always() }} + if: ${{ always() && inputs.runner_type != '' }} needs: run_models_gpu uses: huggingface/transformers/.github/workflows/collated-reports.yml@main with: diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index cf0dbb162386..60d63851da18 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -153,5 +153,5 @@ jobs: ci_event: push report_repo_id: hf-internal-testing/transformers_ci_push commit_sha: ${{ github.sha }} - models: ${{ needs.get_modified_models.outputs.matrix }} + subdirs: ${{ needs.get_modified_models.outputs.matrix }} secrets: inherit diff --git a/.github/workflows/self-comment-ci.yml 
b/.github/workflows/self-comment-ci.yml index 2f81f4f0fe53..ab0ab412bb59 100644 --- a/.github/workflows/self-comment-ci.yml +++ b/.github/workflows/self-comment-ci.yml @@ -23,62 +23,34 @@ env: TF_FORCE_GPU_ALLOW_GROWTH: true CUDA_VISIBLE_DEVICES: 0,1 + jobs: get-pr-number: - runs-on: ubuntu-22.04 name: Get PR number - # For security: only allow team members to run if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} - outputs: - PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} - steps: - - name: Get PR number - shell: bash - run: | - if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then - echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV - else - echo "PR_NUMBER=" >> $GITHUB_ENV - fi - - - name: Check PR number - shell: bash - run: | - echo "${{ env.PR_NUMBER }}" - - - name: Set PR number - id: set_pr_number - run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT" + uses: ./.github/workflows/get-pr-number.yml - get-sha: - runs-on: ubuntu-22.04 + get-pr-info: + name: Get PR commit SHA needs: get-pr-number if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + uses: ./.github/workflows/get-pr-info.yml + with: + pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }} + + check-timestamps: + name: Check timestamps (security check) + runs-on: ubuntu-22.04 + needs: get-pr-info outputs: - PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }} - PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }} + PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }} + PR_MERGE_SHA: ${{ 
needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }} steps: - - uses: actions/checkout@v4 - with: - fetch-depth: "0" - ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" - - - name: Get SHA (and verify timestamps against the issue comment date) - id: get_sha + - name: Verify `merge_commit` timestamp is older than the issue comment timestamp env: - PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }} COMMENT_DATE: ${{ github.event.comment.created_at }} + PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }} run: | - git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head - git checkout refs/remotes/pull/$PR_NUMBER/head - echo "PR_HEAD_SHA: $(git log -1 --format=%H)" - echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" - git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge - git checkout refs/remotes/pull/$PR_NUMBER/merge - echo "PR_MERGE_SHA: $(git log -1 --format=%H)" - echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" - PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd) - echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP" COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s") echo "COMMENT_DATE: $COMMENT_DATE" echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP" @@ -87,13 +59,10 @@ jobs: exit -1; fi - # use a python script to handle this complex logic - # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model) - # case 2: `run-slow model_1, model_2` + # use a python script to handle this complex logic. 
get-tests: runs-on: ubuntu-22.04 - needs: [get-pr-number, get-sha] - if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + needs: [get-pr-number, check-timestamps] outputs: models: ${{ steps.models_to_run.outputs.models }} quantizations: ${{ steps.models_to_run.outputs.quantizations }} @@ -101,11 +70,11 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: "0" - ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + ref: "refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge" - name: Verify merge commit SHA env: - VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} + VERIFIED_PR_MERGE_SHA: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }} run: | PR_MERGE_SHA=$(git log -1 --format=%H) if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then @@ -119,19 +88,39 @@ jobs: run: | python -m pip install GitPython python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt - echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV + echo 'models=$(tail -n 1 output.txt)' >> $GITHUB_ENV python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt - echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV + echo 'quantizations=$(tail -n 1 output2.txt)' >> $GITHUB_ENV - name: Show models to test id: models_to_run run: | echo "${{ env.models }}" - echo "models=${{ env.models }}" >> $GITHUB_ENV echo "models=${{ env.models }}" >> $GITHUB_OUTPUT echo "${{ env.quantizations }}" echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT + # Report back if we are not able to get the tests (for example, security check is failing) + report_error_earlier: + name: Report error earlier + if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }} + needs: [get-pr-number, get-pr-info, get-tests] + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - name: Reply to the comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: 
https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!" + reply_to_comment: name: Reply to the comment if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }} @@ -143,20 +132,18 @@ jobs: - name: Reply to the comment env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - MODELS: ${{ needs.get-tests.outputs.models }} - BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}" + BODY: '\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}' run: | gh api \ --method POST \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ - -f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..." 
+ -f body="This comment contains \`run-slow\`, running the specified jobs: $(echo -e '${{ env.BODY }}')" create_run: name: Create run - if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }} - needs: [get-sha, get-tests, reply_to_comment] + needs: [check-timestamps, reply_to_comment] permissions: statuses: write runs-on: ubuntu-22.04 @@ -173,243 +160,179 @@ jobs: --method POST \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ - repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + repos/${{ github.repository }}/statuses/${{ needs.check-timestamps.outputs.PR_HEAD_SHA }} \ -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests" - run_models_gpu: - name: Run all tests for the model + model-ci: + name: Model CI if: ${{ needs.get-tests.outputs.models != '[]' }} - needs: [get-pr-number, get-sha, get-tests, create_run] - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.get-tests.outputs.models) }} - machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache] - runs-on: - group: '${{ matrix.machine_type }}' - container: - image: huggingface/transformers-all-latest-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Echo input and matrix info - shell: bash - run: | - echo "${{ matrix.folders }}" - - - name: Echo folder ${{ matrix.folders }} - shell: bash - # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to - # set the artifact folder names (because the character `/` is not allowed). 
- run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'models/'/'models_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Checkout to PR merge commit - working-directory: /transformers - run: | - git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge - git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge - git log -1 --format=%H - - - name: Verify merge commit SHA - env: - VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} - working-directory: /transformers - run: | - PR_MERGE_SHA=$(git log -1 --format=%H) - if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then - echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; - exit -1; - fi - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Set `machine_type` for report and artifact names - working-directory: /transformers - shell: bash - run: | - echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then - machine_type=single-gpu - elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then - machine_type=multi-gpu - else - machine_type=${{ matrix.machine_type }} - fi - echo "$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /transformers - run: | - export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" - echo $CUDA_VISIBLE_DEVICES - python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - - - name: Make sure report directory exists - shell: bash - run: | - mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" - - - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ 
env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports - - run_quantization_torch_gpu: - name: Run all tests for a quantization + uses: ./.github/workflows/self-scheduled.yml + needs: [get-pr-number, check-timestamps, get-tests, create_run] + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-pr" + docker: huggingface/transformers-all-latest-gpu + ci_event: PR Comment CI + report_repo_id: hf-internal-testing/transformers_pr_ci + commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }} + subdirs: ${{ needs.get-tests.outputs.models }} + pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }} + secrets: inherit + + quantization-ci: + name: Quantization CI if: ${{ needs.get-tests.outputs.quantizations != '[]' }} - needs: [get-pr-number, get-sha, get-tests, create_run] - strategy: - fail-fast: false - matrix: - folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }} - machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache] - runs-on: - group: '${{ matrix.machine_type }}' - container: - image: huggingface/transformers-quantization-latest-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + uses: ./.github/workflows/self-scheduled.yml + needs: [get-pr-number, check-timestamps, get-tests, create_run] + with: + job: run_quantization_torch_gpu + slack_report_channel: "#transformers-ci-pr" + docker: huggingface/transformers-quantization-latest-gpu + ci_event: PR Comment CI + report_repo_id: hf-internal-testing/transformers_pr_ci + commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }} + subdirs: ${{ needs.get-tests.outputs.quantizations }} + pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }} + secrets: inherit + + report: + name: Check & Report + needs: [get-pr-number, check-timestamps, create_run, model-ci, quantization-ci] + permissions: + pull-requests: write + statuses: write + if: ${{ always() && needs.create_run.result == 'success' }} + runs-on: ubuntu-22.04 
steps: - - name: Echo folder ${{ matrix.folders }} - shell: bash - run: | - echo "${{ matrix.folders }}" - matrix_folders=${{ matrix.folders }} - matrix_folders=${matrix_folders/'quantization/'/'quantization_'} - echo "$matrix_folders" - echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV - - - name: Checkout to PR merge commit - working-directory: /transformers + - name: Show reports from jobs run: | - git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge - git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge - git log -1 --format=%H + echo "${{ needs.model-ci.outputs.report }}" + echo "${{ needs.quantization-ci.outputs.report }}" - - name: Verify merge commit SHA + - name: Process and filter reports env: - VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} - working-directory: /transformers - run: | - PR_MERGE_SHA=$(git log -1 --format=%H) - if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then - echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; - exit -1; - fi - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
- - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Set `machine_type` for report and artifact names - working-directory: /transformers - shell: bash - run: | - echo "${{ matrix.machine_type }}" - if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then - machine_type=single-gpu - elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then - machine_type=multi-gpu - else - machine_type=${{ matrix.machine_type }} - fi - echo "$machine_type" - echo "machine_type=$machine_type" >> $GITHUB_ENV - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run quantization tests on GPU - working-directory: /transformers + MODEL_REPORT: ${{ needs.model-ci.outputs.report }} + QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }} run: | - python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt - - - name: Make sure report directory exists - shell: bash + # Preprocess with Python + python3 << 'PYTHON_SCRIPT' + import json + import os + + def filter_and_format_report(data): + """ + Filter out entries where commit is `None` (failing tests whose status is not certain) and format as text + """ + lines = [] + + for model, model_result in data.items(): + model_lines = [] + for device, failures in model_result.items(): + + # Filter out None commits and extract just the test names + test_names = [ + failure['test'] + for failure in failures + if isinstance(failure, dict) and failure.get('commit') is not None + ] + + # Add tests to model lines + for idx, test_name in enumerate(test_names): + if 
idx == 0: + job_link = failures[idx]['job_link'] + model_lines.append(f"- [{model}]({job_link}):") + + model_lines.append(f" {test_name}") + + # Only add model section if it has tests + if len(model_lines) > 0: + lines.extend(model_lines) + lines.append("") # Empty line between models + + return "\n".join(lines).strip() + + # Load and filter reports + model_report_str = os.environ.get('MODEL_REPORT', '{}') + quant_report_str = os.environ.get('QUANT_REPORT', '{}') + + model_report = json.loads(model_report_str) if model_report_str else {} + quant_report = json.loads(quant_report_str) if quant_report_str else {} + + formatted_model = filter_and_format_report(model_report) + formatted_quant = filter_and_format_report(quant_report) + + # Write to files + with open('model_ci.txt', 'w') as f: + f.write(formatted_model) + if formatted_model: + f.write('\n') + + with open('quantization_ci.txt', 'w') as f: + f.write(formatted_quant) + if formatted_quant: + f.write('\n') + PYTHON_SCRIPT + + - name: Post results as PR comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | - mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports - echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt - echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports" - - - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v4 - with: - name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports - path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports + { + echo '## CI Results' + echo "[Workflow Run ⚙️]($GITHUB_RUN_URL)" + 
echo '' + + # Check if both jobs were skipped or cancelled + if [[ "${{ needs.model-ci.result }}" == "skipped" || "${{ needs.model-ci.result }}" == "cancelled" ]] && \ + [[ "${{ needs.quantization-ci.result }}" == "skipped" || "${{ needs.quantization-ci.result }}" == "cancelled" ]]; then + echo '⚠️ No test being reported (jobs are skipped or cancelled)!' + echo "STATUS=error" >> $GITHUB_ENV + + # Check if either file has content + elif [ -s model_ci.txt ] || [ -s quantization_ci.txt ]; then + echo "STATUS=failure" >> $GITHUB_ENV + + # Check if model_ci.txt has content + if [ -s model_ci.txt ]; then + echo '### Model CI Report' + echo '' + echo '#### ❌ Failed tests' + echo '' + cat model_ci.txt + echo '' + fi + + # Check if quantization_ci.txt has content + if [ -s quantization_ci.txt ]; then + echo '### Quantization CI Report' + echo '' + echo '#### ❌ Failed tests' + echo '' + cat quantization_ci.txt + echo '' + fi + else + echo "STATUS=success" >> $GITHUB_ENV + echo '✅ No failing test specific to this PR 🎉 !' 
+ fi + } > comment_body.txt - update_run_status: - name: Update Check Run Status - needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu] - permissions: - statuses: write - if: ${{ always() && needs.create_run.result == 'success' }} - runs-on: ubuntu-22.04 - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} - STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }} - steps: - - name: Get `run_models_gpu` job status - run: | - echo "${{ needs.run_models_gpu.result }}" - echo "${{ needs.run_quantization_torch_gpu.result }}" - echo $STATUS_OK - if [ "$STATUS_OK" = "true" ]; then - echo "STATUS=success" >> $GITHUB_ENV - else - echo "STATUS=failure" >> $GITHUB_ENV - fi + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -F body=@comment_body.txt - name: Update PR commit statuses + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | - echo "${{ needs.run_models_gpu.result }}" - echo "${{ env.STATUS }}" gh api \ --method POST \ -H "Accept: application/vnd.github+json" \ -H "X-GitHub-Api-Version: 2022-11-28" \ - repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + repos/${{ github.repository }}/statuses/${{ needs.check-timestamps.outputs.PR_HEAD_SHA }} \ -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests" diff --git a/.github/workflows/self-nightly-caller.yml b/.github/workflows/self-nightly-caller.yml index db808b257f72..d58d927bb59b 100644 --- a/.github/workflows/self-nightly-caller.yml 
+++ b/.github/workflows/self-nightly-caller.yml @@ -51,6 +51,7 @@ jobs: slack_report_channel: "#transformers-ci-past-future" docker: huggingface/transformers-all-latest-torch-nightly-gpu ci_event: Nightly CI + runner_type: "a10" report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }} secrets: inherit diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index d18428fd0d82..d3de9b70e87c 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -34,14 +34,20 @@ on: runner_type: required: false type: string - models: + subdirs: default: "" required: false type: string pytest_marker: required: false type: string - + pr_number: + required: false + type: string + outputs: + report: + description: "Content of the report of new failures" + value: ${{ jobs.check_new_failures.outputs.report }} env: HF_HOME: /mnt/cache @@ -76,6 +82,7 @@ jobs: - name: Update clone working-directory: /transformers run: | + git fetch origin ${{ inputs.commit_sha || github.sha }} git fetch && git checkout ${{ inputs.commit_sha || github.sha }} - name: Cleanup @@ -95,7 +102,7 @@ jobs: working-directory: /transformers/tests run: | if [ "${{ inputs.job }}" = "run_models_gpu" ]; then - echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT + echo "folder_slices=$(python3 ../utils/split_model_tests.py --subdirs '${{ inputs.subdirs }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT @@ -107,7 +114,7 @@ jobs: name: Identify quantization method to test working-directory: /transformers/tests run: | - echo 
"quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT + echo "quantization_matrix=$(python3 -c 'import ast; import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); subdirs = ast.literal_eval(${{ inputs.subdirs || '"None"' }}); quantization_tests = [x.removeprefix("quantization/") for x in subdirs] if subdirs is not None else quantization_tests; d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT - name: NVIDIA-SMI run: | @@ -539,16 +546,17 @@ jobs: secrets: inherit check_new_failures: - if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }} + if: ${{ always() && needs.send_results.result == 'success' }} name: Check new failures needs: send_results uses: ./.github/workflows/check_failed_tests.yml with: docker: ${{ inputs.docker }} - start_sha: ${{ inputs.commit_sha || github.sha }} + commit_sha: ${{ inputs.commit_sha || github.sha }} job: ${{ inputs.job }} slack_report_channel: ${{ inputs.slack_report_channel }} ci_event: ${{ inputs.ci_event }} report_repo_id: ${{ inputs.report_repo_id }} + pr_number: ${{ inputs.pr_number }} secrets: inherit diff --git a/utils/check_bad_commit.py b/utils/check_bad_commit.py index 124aeece0a4c..48bbec64819d 100644 --- a/utils/check_bad_commit.py +++ b/utils/check_bad_commit.py @@ -151,7 +151,7 @@ def find_bad_commit(target_test, start_commit, end_commit): bash = f""" git bisect reset -git bisect start {start_commit} {end_commit} +git bisect start --first-parent {start_commit} {end_commit} git bisect run python3 target_script.py """ diff --git a/utils/notification_service.py b/utils/notification_service.py index 5ef297f7913c..be6f488165c4 100644 --- 
a/utils/notification_service.py +++ b/utils/notification_service.py @@ -1521,6 +1521,16 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any: token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=other_workflow_id, commit_sha=ci_sha ) other_workflow_run_ids.append(other_workflow_run_id) + # triggered via `issue_comment` for CI on pull requests (e.g. using the comment `run-slow:`) + elif os.environ.get("GITHUB_EVENT_NAME") in ["issue_comment"]: + # TODO (ydshieh): Make this flexible once we implement `run-slow` for AMD CI and others. + # The id of the workflow `.github/workflows/self-scheduled-caller.yml` (not of a workflow run of it). + prev_workflow_id = "90575235" + # TODO (ydshieh): It's better to make sure using the last completed scheduled workflow run with the commit being a parent + # of the PR's `merge_commit`. + prev_workflow_run_id = get_last_daily_ci_workflow_run_id( + token=os.environ["ACCESS_REPO_INFO_TOKEN"], workflow_id=prev_workflow_id + ) else: prev_workflow_run_id = os.environ["PREV_WORKFLOW_RUN_ID"] other_workflow_run_id = os.environ["OTHER_WORKFLOW_RUN_ID"] diff --git a/utils/pr_slow_ci_models.py b/utils/pr_slow_ci_models.py index 4f6e80157115..0ac7ceeb70de 100644 --- a/utils/pr_slow_ci_models.py +++ b/utils/pr_slow_ci_models.py @@ -27,6 +27,7 @@ """ import argparse +import json import os.path import re import string @@ -169,4 +170,6 @@ def check_model_names(model_name: str): elif os.path.isdir(f"tests/quantization/{model}"): final_list.append(f"quantization/{model}") - print(sorted(set(final_list))) + # Use `json.dumps` to get the double quotes instead of single quote, e.g. `["model/vit"]`. 
+ # (to avoid some shell expansion issues when this script is called from a Github Actions workflow) + print(json.dumps(sorted(set(final_list)))) diff --git a/utils/process_bad_commit_report.py b/utils/process_bad_commit_report.py index 9bf098250131..43bcecadc082 100644 --- a/utils/process_bad_commit_report.py +++ b/utils/process_bad_commit_report.py @@ -45,6 +45,25 @@ report_repo_id = os.getenv("REPORT_REPO_ID") + with open("new_failures_with_bad_commit.json") as fp: + data = json.load(fp) + + with open(f"ci_results_{job_name}/job_links.json") as fp: + job_links = json.load(fp) + + # Update `new_failures_with_bad_commit.json` with job links information before uploading to Hub repository + # - need to change `single-gpu` to `single` and same for `multi-gpu` to match the keys in `job_link`. + for model, model_result in data.items(): + for device, failed_tests in model_result.items(): + for failed_test in failed_tests: + key = model + if list(job_links.keys()) == [job_name]: + key = job_name + failed_test["job_link"] = job_links[key][device.replace("-gpu", "")] + + with open("new_failures_with_bad_commit.json", "w") as fp: + json.dump(data, fp, indent=4, ensure_ascii=False) + commit_info = api.upload_file( path_or_fileobj="new_failures_with_bad_commit.json", path_in_repo=f"{report_repo_folder}/ci_results_{job_name}/new_failures_with_bad_commit.json", @@ -53,12 +72,6 @@ token=os.environ.get("TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN", None), ) - with open("new_failures_with_bad_commit.json") as fp: - data = json.load(fp) - - with open(f"ci_results_{job_name}/job_links.json") as fp: - job_links = json.load(fp) - # TODO: extend team_members = [ "ArthurZucker", @@ -101,16 +114,7 @@ for author, _data in new_data_full.items(): for model, model_result in _data.items(): for device, failed_tests in model_result.items(): - # prepare job_link and add it to each entry of new failed test information. 
- # need to change from `single-gpu` to `single` and same for `multi-gpu` to match `job_link`. - key = model - if list(job_links.keys()) == [job_name]: - key = job_name - job_link = job_links[key][device.replace("-gpu", "")] - failed_tests = [x for x in failed_tests if x["author"] == author or x["merged_by"] == author] - for x in failed_tests: - x.update({"job_link": job_link}) model_result[device] = failed_tests _data[model] = {k: v for k, v in model_result.items() if len(v) > 0} new_data_full[author] = {k: v for k, v in _data.items() if len(v) > 0} diff --git a/utils/split_model_tests.py b/utils/split_model_tests.py index 6a2aefb293a4..344dc5449f35 100644 --- a/utils/split_model_tests.py +++ b/utils/split_model_tests.py @@ -40,10 +40,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--models", + "--subdirs", type=str, default="", - help="the list of pre-computed model names.", + help="the list of pre-computed model names (directory names under `tests/models`) or directory names under `tests` (except `models`).", ) parser.add_argument( "--num_splits", @@ -60,9 +60,18 @@ d1.remove("models") d = d2 + d1 - if args.models != "": - model_tests = ast.literal_eval(args.models) - d = sorted(filter(os.path.isdir, [f"models/{x}" for x in model_tests])) + if args.subdirs != "": + model_tests = ast.literal_eval(args.subdirs) + # We handle both cases with and without prefix because `push-important-models.yml` returns the list without + # the prefix (i.e. `models`) but `utils/pr_slow_ci_models.py` (called by `self-comment-ci.yml`) returns the + # list with the prefix (`models`) and some directory names under `tests`. + d = [] + for x in model_tests: + if os.path.isdir(x): + d.append(x) + if os.path.isdir(f"models/{x}"): + d.append(f"models/{x}") + d = sorted(d) num_jobs = len(d) num_jobs_per_splits = num_jobs // args.num_splits