diff --git a/.github/workflows/ci-bot-commands.yml b/.github/workflows/ci-bot-commands.yml
new file mode 100644
index 0000000000..768d417111
--- /dev/null
+++ b/.github/workflows/ci-bot-commands.yml
@@ -0,0 +1,240 @@
+# Bot command handler for CI permissions
+# Authorized users (ci-users team) can comment to control CI:
+#   @flashinfer-bot run          - Add run-ci label to trigger CI
+#   @flashinfer-bot rerun        - Cancel and rerun all workflows
+#   @flashinfer-bot rerun failed - Rerun failed and cancelled jobs
+#   @flashinfer-bot stop         - Cancel all in-progress workflows
+
+name: CI Bot Commands
+
+on:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+  actions: write
+
+jobs:
+  handle-command:
+    # Only run on PR comments mentioning @flashinfer-bot
+    if: |
+      github.event.issue.pull_request &&
+      contains(github.event.comment.body, '@flashinfer-bot')
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Check team membership
+        id: check-permission
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_GITHUB_TOKEN }}
+          ORG: ${{ github.repository_owner }}
+          TEAM: ci-users
+          ACTOR: ${{ github.event.comment.user.login }}
+        run: |
+          echo "Checking if $ACTOR is a member of $ORG/$TEAM..."
+
+          # Verify token is set
+          if [[ -z "$GH_TOKEN" ]]; then
+            echo "::error::FLASHINFER_GITHUB_TOKEN secret is not set"
+            echo "authorized=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+
+          # List team members; the commenter must match one login exactly (grep -Fx)
+          MEMBERS=$(gh api \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            --paginate \
+            "/orgs/${ORG}/teams/${TEAM}/members" \
+            --jq '.[].login' 2>&1) || {
+            echo "::error::Failed to get team members: $MEMBERS"
+            echo "authorized=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          }
+
+          if echo "$MEMBERS" | grep -Fqx -- "$ACTOR"; then
+            echo "$ACTOR is a member of $TEAM"
+            echo "authorized=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "$ACTOR is not a member of $TEAM"
+            echo "authorized=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Parse command
+        id: parse
+        env:
+          COMMENT_BODY: ${{ github.event.comment.body }}
+        run: |
+          if echo "$COMMENT_BODY" | grep -qi "@flashinfer-bot rerun failed"; then
+            echo "command=rerun-failed" >> "$GITHUB_OUTPUT"
+          elif echo "$COMMENT_BODY" | grep -qi "@flashinfer-bot rerun"; then
+            echo "command=rerun" >> "$GITHUB_OUTPUT"
+          elif echo "$COMMENT_BODY" | grep -qi "@flashinfer-bot stop"; then
+            echo "command=stop" >> "$GITHUB_OUTPUT"
+          elif echo "$COMMENT_BODY" | grep -qi "@flashinfer-bot run"; then
+            echo "command=run" >> "$GITHUB_OUTPUT"
+          else
+            echo "command=unknown" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Handle @flashinfer-bot run
+        if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'run'
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
+        run: |
+          echo "Adding run-ci label to PR #${{ github.event.issue.number }}"
+
+          # Add run-ci label
+          gh pr edit ${{ github.event.issue.number }} \
+            --repo ${{ github.repository }} \
+            --add-label "run-ci"
+
+          # React with thumbs up
+          gh api \
+            -X POST \
+            "/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
+            -f content='+1'
+
+          echo "Label added successfully"
+
+      - name: Handle @flashinfer-bot rerun
+        if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'rerun'
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
+        run: |
+          echo "Rerunning all jobs for PR #${{ github.event.issue.number }}"
+
+          # Get PR head SHA
+          PR_SHA=$(gh pr view ${{ github.event.issue.number }} \
+            --repo ${{ github.repository }} \
+            --json headRefOid -q '.headRefOid')
+
+          echo "PR HEAD SHA: $PR_SHA"
+
+          # Cancel in-progress and queued runs first
+          echo "Cancelling in-progress runs..."
+          gh run list \
+            --repo ${{ github.repository }} \
+            --commit "$PR_SHA" \
+            --json databaseId,status -q '.[] | select(.status == "in_progress" or .status == "queued") | .databaseId' | \
+          while read -r run_id; do
+            if [ -n "$run_id" ]; then
+              echo "Cancelling workflow $run_id..."
+              gh run cancel "$run_id" --repo ${{ github.repository }} || true
+            fi
+          done
+
+          # Wait for cancellations to complete
+          sleep 2
+
+          # Rerun all workflow runs for this commit
+          echo "Rerunning all workflows..."
+          gh run list \
+            --repo ${{ github.repository }} \
+            --commit "$PR_SHA" \
+            --json databaseId -q '.[].databaseId' | \
+          while read -r run_id; do
+            if [ -n "$run_id" ]; then
+              echo "Rerunning workflow $run_id..."
+              gh run rerun "$run_id" --repo ${{ github.repository }} || true
+            fi
+          done
+
+          # React with thumbs up
+          gh api \
+            -X POST \
+            "/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
+            -f content='+1'
+
+          echo "Rerun triggered successfully"
+
+      - name: Handle @flashinfer-bot rerun failed
+        if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'rerun-failed'
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
+        run: |
+          echo "Rerunning failed/cancelled jobs for PR #${{ github.event.issue.number }}"
+
+          # Get PR head SHA
+          PR_SHA=$(gh pr view ${{ github.event.issue.number }} \
+            --repo ${{ github.repository }} \
+            --json headRefOid -q '.headRefOid')
+
+          echo "PR HEAD SHA: $PR_SHA"
+
+          # Rerun failed and cancelled workflow runs for this commit
+          # (cancelled jobs are common with fail-fast when one job fails)
+          for STATUS in failure cancelled; do
+            gh run list \
+              --repo ${{ github.repository }} \
+              --commit "$PR_SHA" \
+              --status "$STATUS" \
+              --json databaseId -q '.[].databaseId' | \
+            while read -r run_id; do
+              if [ -n "$run_id" ]; then
+                echo "Rerunning $STATUS workflow $run_id..."
+                gh run rerun "$run_id" --repo ${{ github.repository }} --failed || true
+              fi
+            done
+          done
+
+          # React with thumbs up
+          gh api \
+            -X POST \
+            "/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
+            -f content='+1'
+
+          echo "Rerun-failed triggered successfully"
+
+      - name: Handle @flashinfer-bot stop
+        if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'stop'
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
+        run: |
+          echo "Stopping all workflows for PR #${{ github.event.issue.number }}"
+
+          # Get PR head SHA
+          PR_SHA=$(gh pr view ${{ github.event.issue.number }} \
+            --repo ${{ github.repository }} \
+            --json headRefOid -q '.headRefOid')
+
+          echo "PR HEAD SHA: $PR_SHA"
+
+          # Cancel all in-progress and queued runs.
+          # Read the run IDs via process substitution, not a pipe: a piped loop
+          # runs in a subshell, so CANCEL_COUNT would still be 0 afterwards.
+          CANCEL_COUNT=0
+          while read -r run_id; do
+            if [ -n "$run_id" ]; then
+              echo "Cancelling workflow $run_id..."
+              gh run cancel "$run_id" --repo ${{ github.repository }} || true
+              CANCEL_COUNT=$((CANCEL_COUNT + 1))
+            fi
+          done < <(gh run list \
+            --repo ${{ github.repository }} \
+            --commit "$PR_SHA" \
+            --json databaseId,status -q '.[] | select(.status == "in_progress" or .status == "queued") | .databaseId')
+          echo "Requested cancellation of $CANCEL_COUNT workflow run(s)"
+
+          # React with thumbs up
+          gh api \
+            -X POST \
+            "/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
+            -f content='+1'
+
+          echo "Stop triggered successfully"
+
+      - name: Unauthorized user
+        if: steps.check-permission.outputs.authorized != 'true' && steps.parse.outputs.command != 'unknown'
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
+        run: |
+          echo "User ${{ github.event.comment.user.login }} is not authorized"
+
+          # React with confused emoji
+          gh api \
+            -X POST \
+            "/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
+            -f content='confused'
diff --git a/.github/workflows/pr-label-cleanup.yml b/.github/workflows/pr-label-cleanup.yml
new file mode 100644
index 0000000000..8a4bd81aef
--- /dev/null
+++ b/.github/workflows/pr-label-cleanup.yml
@@ -0,0 +1,49 @@
+# Auto-remove run-ci label when new commits are pushed to external PRs
+# This ensures maintainers must re-approve after code changes
+
+name: PR Label Cleanup
+
+on:
+  # pull_request_target, not pull_request: for PRs from forks the pull_request
+  # event gets a read-only GITHUB_TOKEN, so the label could never be removed
+  # from the external PRs this workflow targets. No PR code is checked out.
+  pull_request_target:
+    types: [synchronize]  # New commits pushed
+
+permissions:
+  pull-requests: write
+
+jobs:
+  remove-label:
+    # Only run if PR has run-ci label; the author check is the first step
+    if: contains(github.event.pull_request.labels.*.name, 'run-ci')
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check if external contributor
+        id: check
+        run: |
+          ASSOC="${{ github.event.pull_request.author_association }}"
+          if [[ "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then
+            echo "is_external=false" >> "$GITHUB_OUTPUT"
+            echo "PR author has $ASSOC access, keeping label"
+          else
+            echo "is_external=true" >> "$GITHUB_OUTPUT"
+            echo "PR author is $ASSOC (external), will remove label"
+          fi
+
+      - name: Remove run-ci label
+        if: steps.check.outputs.is_external == 'true'
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "Removing run-ci label from PR #${{ github.event.pull_request.number }}"
+          gh pr edit ${{ github.event.pull_request.number }} \
+            --repo ${{ github.repository }} \
+            --remove-label "run-ci"
+
+          # Post a comment explaining why
+          gh pr comment ${{ github.event.pull_request.number }} \
+            --repo ${{ github.repository }} \
+            --body "New commits detected. The \`run-ci\` label has been removed for security.
+
+          A maintainer can re-approve by commenting \`@flashinfer-bot run\`"
diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 62e4b5921e..ee04965583 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -1,6 +1,12 @@
 # CI workflow using AWS self-hosted runners.
 # Runs AOT build tests and GPU unit tests on push/PR to main.
 # Uses ci/bash.sh for Docker execution (same as Jenkins).
+#
+# Permission Control:
+# - Push to main: Always runs
+# - PR from org members (ci-users team): Runs automatically
+# - PR from external contributors: Requires 'run-ci' label
+#   (added via @flashinfer-bot run command from authorized user)
 
 name: PR Test
 
@@ -9,6 +15,7 @@ on:
     branches: [main]
   pull_request:
     branches: [main]
+    types: [opened, synchronize, reopened, labeled]
   workflow_dispatch:
     inputs:
       skip_aot:
@@ -26,16 +33,94 @@ concurrency:
 
 permissions:
   contents: read
+  pull-requests: write
 
 env:
   EXECUTOR_NUMBER: "0"
 
 jobs:
+  # ---------------------------------------------------------------------------
+  # Gate - Check if PR is authorized to run CI
+  # ---------------------------------------------------------------------------
+  gate:
+    name: Permission Check
+    runs-on: ubuntu-latest
+    outputs:
+      authorized: ${{ steps.check.outputs.authorized }}
+    steps:
+      - name: Check authorization
+        id: check
+        env:
+          GH_TOKEN: ${{ secrets.FLASHINFER_GITHUB_TOKEN }}
+        run: |
+          # Always allow push to main and workflow_dispatch
+          if [[ "${{ github.event_name }}" != "pull_request" ]]; then
+            echo "authorized=true" >> "$GITHUB_OUTPUT"
+            echo "Not a PR, authorized"
+            exit 0
+          fi
+
+          ASSOC="${{ github.event.pull_request.author_association }}"
+
+          # Check if PR has run-ci label. A synchronize payload still carries
+          # the label that approved the previous commit, so a push from an
+          # external author must not inherit it; they need re-approval.
+          if [[ "${{ contains(github.event.pull_request.labels.*.name, 'run-ci') }}" == "true" ]]; then
+            if [[ "${{ github.event.action }}" != "synchronize" || "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then
+              echo "authorized=true" >> "$GITHUB_OUTPUT"
+              echo "PR has run-ci label, authorized"
+              exit 0
+            fi
+            echo "Ignoring run-ci label: it predates this push by $ASSOC author"
+          fi
+
+          # Check if PR author is a member of ci-users team
+          AUTHOR="${{ github.event.pull_request.user.login }}"
+          ORG="${{ github.repository_owner }}"
+          TEAM="ci-users"
+
+          echo "Checking if $AUTHOR is a member of $ORG/$TEAM..."
+
+          if [[ -z "$GH_TOKEN" ]]; then
+            echo "::warning::FLASHINFER_GITHUB_TOKEN not set, falling back to association check"
+            # Fallback: check if author has write access
+            if [[ "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then
+              echo "authorized=true" >> "$GITHUB_OUTPUT"
+              echo "PR author has $ASSOC access, authorized"
+            else
+              echo "authorized=false" >> "$GITHUB_OUTPUT"
+              echo "PR author is $ASSOC, not authorized"
+            fi
+            exit 0
+          fi
+
+          # Check team membership (exact, literal login match via grep -Fx)
+          MEMBERS=$(gh api \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            --paginate \
+            "/orgs/${ORG}/teams/${TEAM}/members" \
+            --jq '.[].login' 2>&1) || {
+            echo "::warning::Failed to get team members: $MEMBERS"
+            echo "authorized=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          }
+
+          if echo "$MEMBERS" | grep -Fqx -- "$AUTHOR"; then
+            echo "authorized=true" >> "$GITHUB_OUTPUT"
+            echo "$AUTHOR is a member of $TEAM, authorized"
+          else
+            echo "authorized=false" >> "$GITHUB_OUTPUT"
+            echo "$AUTHOR is not a member of $TEAM, not authorized"
+          fi
+
   # ---------------------------------------------------------------------------
   # Setup - Read docker tag and check if build should be skipped
   # ---------------------------------------------------------------------------
   setup:
     name: Setup
+    needs: gate
+    if: needs.gate.outputs.authorized == 'true'
     runs-on: ubuntu-latest
     outputs:
       docker_tag: ${{ steps.get-tag.outputs.tag }}
@@ -90,8 +175,11 @@ jobs:
   # ---------------------------------------------------------------------------
   aot-build-import:
     name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
-    needs: setup
-    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_aot != 'true'
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_aot != 'true'
     runs-on:
       - self-hosted
       - Linux
@@ -144,8 +232,11 @@ jobs:
   # ---------------------------------------------------------------------------
   gpu-tests-a10g:
     name: JIT Unittest ${{ matrix.shard }} (A10G)
-    needs: setup
-    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true'
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_gpu != 'true'
     runs-on: [self-hosted, Linux, X64, gpu, sm86]
     timeout-minutes: 360
     strategy:
@@ -194,8 +285,11 @@ jobs:
   # ---------------------------------------------------------------------------
   gpu-tests-t4:
     name: JIT Unittest (T4)
-    needs: setup
-    if: needs.setup.outputs.skip_build != 'true' && github.event.inputs.skip_gpu != 'true'
+    needs: [gate, setup]
+    if: |
+      needs.gate.outputs.authorized == 'true' &&
+      needs.setup.outputs.skip_build != 'true' &&
+      github.event.inputs.skip_gpu != 'true'
     runs-on: [self-hosted, Linux, X64, gpu, sm75]
     timeout-minutes: 360
     env:
@@ -240,13 +334,20 @@ jobs:
   test-results-summary:
     name: Test Results Summary
     if: always()
-    needs: [setup, aot-build-import, gpu-tests-a10g, gpu-tests-t4]
+    needs: [gate, setup, aot-build-import, gpu-tests-a10g, gpu-tests-t4]
     runs-on: ubuntu-latest
     steps:
       - name: Check Results
         run: |
           echo "## Test Results Summary" >> $GITHUB_STEP_SUMMARY
 
+          # Check if CI was skipped due to permissions
+          if [ "${{ needs.gate.outputs.authorized }}" != "true" ]; then
+            echo "CI skipped (pending authorization)" >> $GITHUB_STEP_SUMMARY
+            echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> $GITHUB_STEP_SUMMARY
+            exit 0
+          fi
+
           if [ "${{ needs.setup.outputs.skip_build }}" == "true" ]; then
             echo "Build skipped (docs/config only changes)" >> $GITHUB_STEP_SUMMARY
             exit 0