-
Notifications
You must be signed in to change notification settings - Fork 245
Add gating mechanism to ensure PyTorch wheels pass tests before releasing #1110
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
90043d8
2830983
2db2495
c7ed846
25e4ffe
3c79406
a22814a
4f24452
c7bc598
c40e61e
76a4f93
c47dad2
26acce7
d284128
55e3938
0f31e5c
9495af6
bfb8546
fd5c912
c9d86fa
81654c3
d700c99
731d1d1
6a62bf4
3d114d4
7b0dcb9
cb41903
1bbd27f
2f05501
16fba93
d40c0c9
3cc50f8
746e9ff
2248163
b3ffedc
25bfa39
a88b0fb
db929bf
bdff065
bc63a87
7192390
ee26b7d
f957f64
971d9f9
029ea02
c0ed8f1
8d6b63e
1aa1473
be258df
cb3f485
c4dc878
91e4976
84b034e
d7d4e01
a68c3de
2f8f638
576595c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,10 +17,18 @@ on: | |
| description: S3 subdirectory, not including the GPU-family | ||
| required: true | ||
| type: string | ||
| s3_staging_subdir: | ||
| description: S3 staging subdirectory, not including the GPU-family | ||
| required: true | ||
| type: string | ||
| cloudfront_url: | ||
| description: CloudFront URL pointing to Python index | ||
| required: true | ||
| type: string | ||
| cloudfront_staging_url: | ||
| description: CloudFront base URL pointing to staging Python index | ||
| required: true | ||
| type: string | ||
| rocm_version: | ||
| description: ROCm version to pip install | ||
| type: string | ||
|
|
@@ -55,10 +63,18 @@ on: | |
| description: S3 subdirectory, not including the GPU-family | ||
| type: string | ||
| default: "v2" | ||
| s3_staging_subdir: | ||
| description: S3 staging subdirectory, not including the GPU-family | ||
| type: string | ||
| default: "v2-staging" | ||
| cloudfront_url: | ||
| description: CloudFront base URL pointing to Python index | ||
| type: string | ||
| default: "https://d25kgig7rdsyks.cloudfront.net/v2" | ||
| cloudfront_staging_url: | ||
| description: CloudFront base URL pointing to staging Python index | ||
| type: string | ||
| default: "https://d25kgig7rdsyks.cloudfront.net/v2-staging" | ||
| rocm_version: | ||
| description: ROCm version to pip install | ||
| type: string | ||
|
|
@@ -89,7 +105,11 @@ jobs: | |
| S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" | ||
| optional_build_prod_arguments: "" | ||
| outputs: | ||
| cp_version: ${{ env.cp_version }} | ||
| torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} | ||
| torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} | ||
| torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} | ||
| triton_version: ${{ steps.build-pytorch-wheels.outputs.triton_version }} | ||
| steps: | ||
| - name: Checkout | ||
| uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 | ||
|
|
@@ -167,23 +187,24 @@ jobs: | |
| run: | | ||
| python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }}/ | ||
|
|
||
| - name: Upload wheels to S3 | ||
| - name: Upload wheels to S3 staging | ||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
| run: | | ||
| aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ | ||
| aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ | ||
| --recursive --exclude "*" --include "*.whl" | ||
|
|
||
| - name: (Re-)Generate Python package release index | ||
| - name: (Re-)Generate Python package release index for staging | ||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
| run: | | ||
| pip install boto3 packaging | ||
| python ./build_tools/third_party/s3_management/manage.py ${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }} | ||
| python ./build_tools/third_party/s3_management/manage.py ${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }} | ||
|
|
||
| generate_target_to_run: | ||
| name: Generate target_to_run | ||
| runs-on: ubuntu-24.04 | ||
| outputs: | ||
| test_runs_on: ${{ steps.configure.outputs.test-runs-on }} | ||
| bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} | ||
| steps: | ||
| - name: Checking out repository | ||
| uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 | ||
|
|
@@ -203,7 +224,79 @@ jobs: | |
| with: | ||
| amdgpu_family: ${{ inputs.amdgpu_family }} | ||
| test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} | ||
| cloudfront_url: ${{ inputs.cloudfront_url }} | ||
| cloudfront_url: ${{ inputs.cloudfront_staging_url }} | ||
| python_version: ${{ inputs.python_version }} | ||
| torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} | ||
| pytorch_version: ${{ inputs.pytorch_version }} | ||
|
|
||
| upload_pytorch_wheels: | ||
| name: Release PyTorch Wheels to S3 | ||
| needs: [build_pytorch_wheels, generate_target_to_run, test_pytorch_wheels] | ||
| if: always() | ||
| runs-on: ubuntu-24.04 | ||
| env: | ||
| S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" | ||
| CP_VERSION: "${{ needs.build_pytorch_wheels.outputs.cp_version }}" | ||
| TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" | ||
| TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" | ||
| TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" | ||
| TRITON_VERSION: "${{ needs.build_pytorch_wheels.outputs.triton_version }}" | ||
|
|
||
| steps: | ||
| - name: Checkout | ||
| uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 | ||
|
|
||
| - name: Configure AWS Credentials | ||
| if: always() | ||
| uses: aws-actions/configure-aws-credentials@7474bc4690e29a8392af63c5b98e7449536d5c3a # v4.3.1 | ||
|
Comment on lines
+246
to
+251
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: this
|
||
| with: | ||
| aws-region: us-east-2 | ||
| role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases | ||
|
|
||
|
|
||
- name: Determine upload flag
  env:
    BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }}
    TEST_RESULT: ${{ needs.test_pytorch_wheels.result }}
    TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }}
    BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }}
  run: |
    # Decide whether the staged wheels may be promoted to the release
    # location. The decision is exported as `upload` (true/false) through
    # GITHUB_ENV so that later steps in this job can gate on it.

    # 1) If the build did not succeed -> upload=false
    if [[ "$BUILD_RESULT" != "success" ]]; then
      echo "::warning::Build failed. Skipping upload."
      echo "upload=false" >> "$GITHUB_ENV"

    # 2) Else if there was a test runner AND the tests did not succeed
    #    -> upload=false. This job runs with `if: always()`, so the test
    #    job may also report "cancelled"; anything other than "success"
    #    (failure, skipped, cancelled) must block the release.
    elif [[ -n "$TEST_RUNS_ON" && "$TEST_RESULT" != "success" ]]; then
      echo "::warning::Tests did not succeed (result: ${TEST_RESULT}, runner present). Skipping upload."
      echo "upload=false" >> "$GITHUB_ENV"

    # 3) Else if BYPASS_TESTS_FOR_RELEASES is not set and there was no test runner -> upload=false
    elif [[ -z "$BYPASS_TESTS_FOR_RELEASES" && -z "$TEST_RUNS_ON" ]]; then
      echo "::warning::No test runner and BYPASS_TESTS_FOR_RELEASES not set. Skipping upload."
      echo "upload=false" >> "$GITHUB_ENV"

    # 4) Otherwise (tests passed, or no runner but bypass is set) -> upload=true
    else
      echo "upload=true" >> "$GITHUB_ENV"
    fi
|
|
||
| - name: Copy PyTorch wheels from staging to release S3 | ||
| if: ${{ env.upload == 'true' }} | ||
| run: | | ||
| echo "Copying exact tested wheels to release S3 bucket..." | ||
| aws s3 cp \ | ||
| s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ | ||
| s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ | ||
| --recursive \ | ||
| --exclude "*" \ | ||
| --include "torch-${TORCH_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ | ||
| --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ | ||
| --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ | ||
| --include "pytorch_triton_rocm-${TRITON_VERSION}-${CP_VERSION}-linux_x86_64.whl" | ||
|
|
||
| - name: (Re-)Generate Python package release index | ||
| if: ${{ env.upload == 'true' }} | ||
| run: | | ||
| pip install boto3 packaging | ||
| python ./build_tools/third_party/s3_management/manage.py ${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }} | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (future work) Let's make sure these changes are carried over to the Windows release workflows too. Moving steps into scripts instead of inlined commands in yml will help with that. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,10 @@ on: | |
| description: "Subdirectory to push the Python packages" | ||
| type: string | ||
| default: "v2" | ||
| s3_staging_subdir: | ||
| description: "Staging subdirectory to push the Python packages" | ||
| type: string | ||
| default: "v2-staging" | ||
| # Trigger manually (typically to test the workflow or manually build a release [candidate]) | ||
| workflow_dispatch: | ||
| inputs: | ||
|
|
@@ -27,6 +31,10 @@ on: | |
| description: "Subdirectory to push the Python packages" | ||
| type: string | ||
| default: "v2" | ||
| s3_staging_subdir: | ||
| description: "Staging subdirectory to push the Python packages" | ||
| type: string | ||
| default: "v2-staging" | ||
| families: | ||
| description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`" | ||
| type: string | ||
|
|
@@ -44,6 +52,7 @@ jobs: | |
| runs-on: ubuntu-24.04 | ||
| env: | ||
| S3_SUBDIR: ${{ inputs.s3_subdir || 'v2' }} | ||
| S3_STAGING_SUBDIR: ${{ inputs.s3_staging_subdir || 'v2-staging' }} | ||
| release_type: ${{ inputs.release_type || 'nightly' }} | ||
| outputs: | ||
| version: ${{ steps.release_information.outputs.version }} | ||
|
|
@@ -109,6 +118,7 @@ jobs: | |
| S3_BUCKET_TAR: "therock-${{ needs.setup_metadata.outputs.release_type }}-tarball" | ||
| S3_BUCKET_PY: "therock-${{ needs.setup_metadata.outputs.release_type }}-python" | ||
| S3_SUBDIR: ${{ inputs.s3_subdir || 'v2' }} | ||
| S3_STAGING_SUBDIR: ${{ inputs.s3_staging_subdir || 'v2-staging' }} | ||
|
|
||
| steps: | ||
| - name: "Checking out repository" | ||
|
|
@@ -147,6 +157,11 @@ jobs: | |
| echo "Building ${{ env.DIST_ARCHIVE }}" | ||
| tar cfz "${{ env.DIST_ARCHIVE }}" . | ||
|
|
||
| - name: Setup Python | ||
| uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 | ||
| with: | ||
| python-version: 3.12 | ||
|
|
||
| - name: Build Python Packages | ||
| run: | | ||
| ./build_tools/linux_portable_build.py \ | ||
|
|
@@ -171,6 +186,22 @@ jobs: | |
| aws-region: us-east-2 | ||
| role-to-assume: arn:aws:iam::692859939525:role/therock-${{ env.RELEASE_TYPE }}-releases | ||
|
|
||
| - name: Upload Releases to staging S3 | ||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
| run: | | ||
| aws s3 cp ${{ env.OUTPUT_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ | ||
| --recursive --no-follow-symlinks \ | ||
| --exclude "*" \ | ||
| --include "*.whl" \ | ||
| --include "*.tar.gz" | ||
|
|
||
| - name: (Re-)Generate Python package release index for staging | ||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
|
araravik-psd marked this conversation as resolved.
|
||
| run: | | ||
| pip install boto3 packaging | ||
| python ./build_tools/third_party/s3_management/manage.py ${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }} | ||
|
|
||
| ## TODO: Restrict uploading to the non-staging S3 directory until ROCm sanity checks and all validation tests have successfully passed. | ||
| - name: Upload Releases to S3 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (future work) Out of curiosity, is this "upload from local to cloud" slower or faster than the "copy from cloud to cloud" that the PyTorch wheel release step uses? I think we could script these uploads/copies so we aren't inlining as much code into .yml files, in which case we could have a "copy release from staging to tested" mode on the script that we can use for both pytorch wheels and rocm wheels. |
||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
| run: | | ||
|
|
@@ -181,12 +212,6 @@ jobs: | |
| --include "*.whl" \ | ||
| --include "*.tar.gz" | ||
|
|
||
| - name: Setup Python | ||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
| uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 | ||
| with: | ||
| python-version: 3.12 | ||
|
|
||
| - name: (Re-)Generate Python package release index | ||
| if: ${{ github.repository_owner == 'ROCm' }} | ||
| run: | | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -46,14 +46,50 @@ def get_runner_label(target: str, platform: str) -> str: | |
| if test_runs_on_machine: | ||
| print(f" Found runner: '{test_runs_on_machine}'") | ||
| return test_runs_on_machine | ||
| return "" | ||
|
|
||
|
|
||
def get_upload_label(target: str, platform: str) -> str:
    """Look up the ``bypass_tests_for_releases`` flag for a target and platform.

    The presubmit and postsubmit AMDGPU family matrices are merged and scanned
    in order. The first entry that supports ``platform``, matches ``target``
    and carries a truthy ``bypass_tests_for_releases`` value wins.

    Args:
        target: AMDGPU family name, or (for manually triggered runs) a string
            containing the outer matrix key.
        platform: Platform key to look up inside each matrix entry.

    Returns:
        The truthy ``bypass_tests_for_releases`` value exactly as stored in
        the matrix (presumably ``True`` - confirm against the matrix
        definition), or ``""`` when no matching entry sets the flag.
    """
    print(
        f"Searching for bypass_tests_for_releases flag for target '{target}' on platform '{platform}'"
    )
    # NOTE(review): a reviewer remarked that this merge-and-scan pattern
    # follows existing code and could likely be replaced by a newer helper;
    # left unchanged here - TODO confirm and consolidate.
    amdgpu_family_info_matrix = (
        amdgpu_family_info_matrix_presubmit | amdgpu_family_info_matrix_postsubmit
    )
    for key, info_for_key in amdgpu_family_info_matrix.items():
        print(f"Checking key '{key}' with info:\n {info_for_key}")
        platform_for_key = info_for_key.get(platform)

        if not platform_for_key:
            # Some AMDGPU families are only supported on certain platforms.
            print(f" Skipping since this entry has no platform '{platform}'")
            continue

        # Check against both the inner "family" and the outer "key". If neither
        # match then skip. Workflows are expected to use the inner "family"
        # but manually triggered runs may use the outer "key" instead, so we'll
        # be a bit lenient here.
        # This needs a rework, see https://github.com/ROCm/TheRock/issues/1097.
        family_for_platform = platform_for_key.get("family")
        if target != family_for_platform and key not in target.lower():
            print(
                f" Skipping since the target '{target}' does not match the family '{family_for_platform}'"
            )
            continue

        # Only the flag itself is reported here; this function does not check
        # whether a test machine exists. Entries that match but do not set the
        # flag fall through so that later entries are still considered.
        bypass_tests_for_releases = platform_for_key.get("bypass_tests_for_releases")
        if bypass_tests_for_releases:
            print(" bypass_tests_for_releases: True")
            return bypass_tests_for_releases
    return ""
|
|
||
|
|
||
def main(target: str, platform: str):
    """Publish runner and release-bypass information as workflow outputs.

    Each lookup is run in turn for the given target and platform; an output
    is written only when the lookup returns a truthy value.
    """
    # (output name, lookup function) pairs, evaluated in this order.
    lookups = (
        ("test-runs-on", get_runner_label),
        ("bypass_tests_for_releases", get_upload_label),
    )
    for output_name, lookup in lookups:
        value = lookup(target, platform)
        if value:
            gha_set_output({output_name: value})
|
|
||
|
|
||
| if __name__ == "__main__": | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.