diff --git a/.github/workflows/build_linux_jax_wheels.yml b/.github/workflows/build_linux_jax_wheels.yml new file mode 100644 index 0000000000000..8b4f18ae5d9a7 --- /dev/null +++ b/.github/workflows/build_linux_jax_wheels.yml @@ -0,0 +1,290 @@ +name: Build Portable Linux JAX Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + rocm_version: + description: ROCm version to install + type: string + tar_url: + description: URL to TheRock tarball to build against + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + repository: + description: "Repository to checkout. Defaults to `ROCm/TheRock`." + type: string + default: "ROCm/TheRock" + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + required: true + type: string + default: "3.12" + release_type: + type: choice + description: Type of release to create. All developer-triggered jobs should use "dev"! 
+ options: + - dev + - nightly + - prerelease + default: dev + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + rocm_version: + description: ROCm version to install + type: string + tar_url: + description: URL to TheRock tarball to build against + type: string + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + jax_ref: + description: rocm-jax repository ref/branch to check out + type: string + default: rocm-jaxlib-v0.8.0 + +permissions: + id-token: write + contents: read + +run-name: Build Linux JAX Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.python_version }}, ${{ inputs.release_type }}) + +jobs: + build_jax_wheels: + strategy: + matrix: + jax_ref: [rocm-jaxlib-v0.8.0] + name: Build Linux JAX Wheels | ${{ inputs.amdgpu_family }} | Python ${{ inputs.python_version }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + env: + PACKAGE_DIST_DIR: ${{ github.workspace }}/jax/jax_rocm_plugin/wheelhouse + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + outputs: + cp_version: ${{ env.cp_version }} + jax_version: ${{ steps.extract_jax_version.outputs.jax_version }} + steps: + - name: Checkout TheRock + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Checkout JAX + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + path: jax + repository: rocm/rocm-jax + ref: ${{ matrix.jax_ref }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email 
"therockbot@amd.com" + + - name: "Setting up Python" + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Build JAX Wheels + env: + ROCM_VERSION: ${{ inputs.rocm_version }} + run: | + ls -lah + pushd jax + python3 build/ci_build \ + --compiler=clang \ + --python-versions="${{ inputs.python_version }}" \ + --rocm-version="${ROCM_VERSION}" \ + --therock-path="${{ inputs.tar_url }}" \ + dist_wheels + + - name: Extract JAX version + id: extract_jax_version + run: | + # Extract JAX version from requirements.txt (e.g., "jax==0.8.0") + # Remove all whitespace from requirements.txt to simplify parsing + # Search for lines starting with "jax==" or "jaxlib==" followed by version (excluding comments) + # Extract the version number by splitting on '=' and taking the 3rd field + # [^#]+ matches one or more characters that are NOT '#', ensuring we stop before any inline comments + JAX_VERSION=$(tr -d ' ' < jax/build/requirements.txt \ + | grep -E '^(jax|jaxlib)==[^#]+' | head -n1 | cut -d'=' -f3) + echo "jax_version=$JAX_VERSION" >> "$GITHUB_OUTPUT" + + - name: Install AWS CLI + if: always() + run: bash ./dockerfiles/install_awscli.sh + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Upload wheels to S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index + if: ${{ 
github.repository_owner == 'ROCm' }} + run: | + python3 -m venv .venv + source .venv/bin/activate + pip3 install boto3 packaging + python3 ./build_tools/third_party/s3_management/manage.py ${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_jax_wheels: + name: Test JAX wheels | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + needs: [build_jax_wheels, generate_target_to_run] + permissions: + contents: read + packages: read + uses: ./.github/workflows/test_linux_jax_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + release_type: ${{ inputs.release_type }} + s3_subdir: ${{ inputs.s3_subdir }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + rocm_version: ${{ inputs.rocm_version }} + tar_url: ${{ inputs.tar_url }} + python_version: ${{ inputs.python_version }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + jax_ref: ${{ inputs.jax_ref }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + + upload_jax_wheels: + name: Release JAX Wheels to S3 + needs: [build_jax_wheels, generate_target_to_run, test_jax_wheels] + 
if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + JAX_VERSION: "${{ needs.build_jax_wheels.outputs.jax_version }}" + ROCM_VERSION: "${{ inputs.rocm_version }}" + CP_VERSION: "${{ needs.build_jax_wheels.outputs.cp_version }}" + + steps: + - name: Checkout + uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@00943011d9042930efac3dcd3a170e4273319bc8 # v5.1.0 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_jax_wheels.result }} + TEST_RESULT: ${{ needs.test_jax_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy JAX wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." 
+ aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "jaxlib-${JAX_VERSION}+rocm${ROCM_VERSION}-${CP_VERSION}-manylinux_2_27_x86_64.whl" \ + --include "jax_rocm7_plugin-${JAX_VERSION}+rocm${ROCM_VERSION}-${CP_VERSION}-manylinux_2_28_x86_64.whl" \ + --include "jax_rocm7_pjrt-${JAX_VERSION}+rocm${ROCM_VERSION}-py3-none-manylinux_2_28_x86_64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/build_native_linux_packages.yml b/.github/workflows/build_native_linux_packages.yml new file mode 100644 index 0000000000000..ead640630e25c --- /dev/null +++ b/.github/workflows/build_native_linux_packages.yml @@ -0,0 +1,135 @@ +name: Build Native Linux Packages + +on: + workflow_call: + inputs: + artifact_group: + description: gfx arch group for the s3 server + type: string + default: gfx94X-dcgpu + artifact_run_id: + description: workflow run id to download the artifacts from. + required: true + type: string + rocm_version: + description: ROCm version to append to the package (8.0.0, 8.0.1rc1, ...). + required: true + type: string + native_package_type: + description: Specify whether debian or rpm packages are needed (deb or rpm). + required: true + type: string + package_suffix: + description: The suffix to be added to package name (asan, static or rpath). + required: false + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! 
+ required: false + type: string + workflow_dispatch: + inputs: + artifact_group: + type: string + default: gfx94X-dcgpu + artifact_run_id: + description: workflow run id to download the artifacts from + type: string + rocm_version: + description: ROCm version to append to the package (8.0.0, 8.0.1rc1, ...). + type: string + default: "0.0.1" + native_package_type: + description: Specify whether debian or rpm packages are needed (deb or rpm). + required: true + type: choice + options: + - rpm + - deb + default: "rpm" + package_suffix: + description: The suffix to be added to package name (asan, static or rpath). + type: string + required: false + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + +permissions: + id-token: write + contents: read + +run-name: Build native Linux packages (${{ inputs.artifact_group }}, ${{ inputs.rocm_version }}, ${{ inputs.native_package_type }}, ${{ inputs.package_suffix }}, ${{ inputs.release_type }}) + +jobs: + build_native_packages: + name: Build Linux native Packages + strategy: + fail-fast: false + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + env: + BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + ARTIFACT_RUN_ID: ${{ inputs.artifact_run_id || github.run_id }} + PACKAGE_SUFFIX: ${{ inputs.package_suffix != '' && inputs.package_suffix || '' }} + OUTPUT_DIR: ${{ github.workspace }}/output + ARTIFACTS_DIR: ${{ github.workspace }}/output/artifacts + PACKAGE_DIST_DIR: ${{ github.workspace }}/output/packages + RELEASE_TYPE: ${{ inputs.release_type || '' }} + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + - name: Install Python 
requirements + run: | + pip install pyelftools boto3 jinja2 + + - name: Install System requirements + run: | + # Install the needed tools for creating rpm / deb packages + # Also install tools for creating repo files + sudo apt update + sudo apt install -y llvm + sudo apt install -y rpm debhelper-compat build-essential + sudo apt install -y dpkg-dev createrepo-c + + - name: Fetch Artifacts + run: | + echo "Fetching artifacts for build ${{ inputs.artifact_run_id }}" + python ./build_tools/fetch_artifacts.py \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --run-github-repo="ROCm/TheRock" \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir=${{ env.ARTIFACTS_DIR }} + + - name: Build Packages + id: build-packages + run: | + echo "Building ${{ inputs.native_package_type }} packages for ${{ inputs.artifact_group }} ${{ inputs.artifact_run_id }}" + python ./build_tools/packaging/linux/build_package.py \ + --dest-dir ${{ env.PACKAGE_DIST_DIR }} \ + --rocm-version ${{ inputs.rocm_version }} \ + --target ${{ inputs.artifact_group }} \ + --artifacts-dir ${{ env.ARTIFACTS_DIR }} \ + --pkg-type ${{ inputs.native_package_type }} \ + --version-suffix ${{ env.ARTIFACT_RUN_ID }} + + - name: Install AWS CLI + run: bash ./dockerfiles/install_awscli.sh + + - name: Configure AWS Credentials for non-forked repos + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-artifacts-external + + - name: Upload Package repo to S3 + id: upload-packages + run: | + echo "Uploading to s3 bucket" + python ./build_tools/packaging/linux/upload_package_repo.py \ + --pkg-type ${{ inputs.native_package_type }} \ + --s3-bucket therock-deb-rpm-test \ + --amdgpu-family ${{ inputs.artifact_group }} \ + --artifact-id ${{ env.ARTIFACT_RUN_ID }} diff --git a/.github/workflows/build_portable_linux_artifacts.yml b/.github/workflows/build_portable_linux_artifacts.yml new file 
mode 100644 index 0000000000000..a1f7a87b61af2 --- /dev/null +++ b/.github/workflows/build_portable_linux_artifacts.yml @@ -0,0 +1,220 @@ +name: Build Portable Linux Artifacts + +on: + workflow_dispatch: + inputs: + amdgpu_families: + type: string + default: gfx94X-dcgpu + artifact_group: + type: string + default: gfx94X-dcgpu + build_variant_label: + type: string + description: "A label for the build variant (ex: 'release', 'asan')" + default: "release" + build_variant_suffix: + type: string + description: "The build variant suffix (ex: 'asan' suffix -> 'gfx94X-dcgpu-asan')" + default: "" + build_variant_cmake_preset: + type: string + description: "The name of the cmake preset to use for this build variant, matching an entry in CMakePresets.json (ex: 'linux-release-asan')" + default: "" + package_version: + type: string + default: ADHOCBUILD + expect_failure: + type: boolean + default: false + extra_cmake_options: + type: string + + workflow_call: + inputs: + package_version: + type: string + default: ADHOCBUILD + amdgpu_families: + type: string + artifact_group: + type: string + build_variant_label: + type: string + build_variant_suffix: + type: string + build_variant_cmake_preset: + type: string + expect_failure: + type: boolean + extra_cmake_options: + type: string + +# For details on the permissions required for OIDC, see: +# https://github.com/aws-actions/configure-aws-credentials?tab=readme-ov-file#oidc +permissions: + contents: read + +jobs: + build_portable_linux_artifacts: + name: Build (xfail ${{ inputs.expect_failure }}) + # azure-linux-scale-rocm is used for regular CI builds + # azure-linux-u2404-hx176-cpu-rocm is used for CI builds that require more resources (ex: ASAN builds) + runs-on: ${{ inputs.build_variant_label == 'asan' && 'azure-linux-u2404-hx176-cpu-rocm' || 'azure-linux-scale-rocm' }} + continue-on-error: ${{ inputs.expect_failure }} + timeout-minutes: 720 # 12 hour timeout + permissions: + id-token: write + container: + image: 
ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + CACHE_DIR: ${{ github.workspace }}/.container-cache + # The ccache.conf will be written by setup_ccache.py before this gets used. + CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + TEATIME_FORCE_INTERACTIVE: 0 + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + fetch-depth: 10 + + - name: Install python deps + run: | + pip install -r requirements.txt + + # safe.directory must be set before Runner Health Status + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. 
+ - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Test build_tools + run: | + python -m pytest build_tools/tests build_tools/github_actions/tests + + - name: Fetch sources + timeout-minutes: 30 + run: | + ./build_tools/fetch_sources.py --jobs 12 + + - name: "Checking out repository for llvm-project" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + path: compiler/amd-llvm + + - name: "Checking out repository for spirv-llvm-translator" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/spirv-llvm-translator" + path: compiler/spirv-llvm-translator + ref: ${{ secrets.SPIRV_LLVM_TRANSLATOR_MAINLINE_REF }} + + - name: "Checking out repository for hipify" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/hipify" + path: compiler/hipify + ref: ${{ secrets.HIPIFY_MAINLINE_REF }} + + - name: Apply patches + run: | + cp -v patches/amd-mainline/llvm-project/*.patch compiler/amd-llvm + cd compiler/amd-llvm + git log -10 + git config --global --add safe.directory $PWD + find . -type f -name '*.patch' -exec git apply --check {} \; + find . 
-type f -name '*.patch' -exec git apply {} \; + git log -15 + cd - + + - name: TheRock and llvm SHA + run: | + git config --global --add safe.directory $PWD + git log -1 + cd compiler/amd-llvm/llvm + git log -3 + cd - + + - name: Configure Projects + env: + cmake_preset: ${{ inputs.build_variant_cmake_preset }} + amdgpu_families: ${{ inputs.amdgpu_families }} + package_version: ${{ inputs.package_version }} + extra_cmake_options: ${{ inputs.extra_cmake_options }} + BUILD_DIR: build + run: | + python3 build_tools/github_actions/build_configure.py --manylinux + + - name: Build therock-archives and therock-dist + run: | + cmake --build build --target therock-archives therock-dist -- -k 0 + + - name: Test Packaging + if: ${{ github.event.repository.name == 'TheRock' }} + run: | + ctest --test-dir build --output-on-failure + + - name: Report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "./build" ]; then + echo "Full SDK du:" + echo "------------" + du -h -d 1 build/dist/rocm + echo "Artifact Archives:" + echo "------------------" + ls -lh build/artifacts/*.tar.xz + echo "Artifacts:" + echo "----------" + du -h -d 1 build/artifacts + echo "CCache Stats:" + echo "-------------" + ccache -s -v + tail -v -n +1 .ccache/compiler_check_cache/* > build/logs/ccache_compiler_check_cache.log + else + echo "[ERROR] Build directory ./build does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step." 
+ exit 1 + fi + + # Analyze ninja build log to generate per-component timing report + - name: Analyze Build Times + if: ${{ !cancelled() }} + run: | + python3 build_tools/analyze_build_times.py --build-dir build + + - name: Configure AWS Credentials for non-forked repos + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Post Build Upload + if: always() + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group "${{ inputs.artifact_group }}" \ + --build-dir build \ + --upload diff --git a/.github/workflows/build_portable_linux_python_packages.yml b/.github/workflows/build_portable_linux_python_packages.yml new file mode 100644 index 0000000000000..69390ff9f472f --- /dev/null +++ b/.github/workflows/build_portable_linux_python_packages.yml @@ -0,0 +1,95 @@ +name: Build Portable Linux Python Packages + +on: + workflow_dispatch: + inputs: + artifact_github_repo: + description: GitHub repository for artifact_run_id + type: string + default: ROCm/TheRock + artifact_run_id: + description: Workflow run ID to download artifacts from + type: string + default: "17865324892" # TODO: default to the most recent successful run (using a script) + artifact_group: + description: "The artifact group to build (ex: gfx94X-dcgpu, gfx101X-dgpu, gfx1151, gfx120X-all)" + type: string + package_version: + type: string + workflow_call: + inputs: + artifact_github_repo: + type: string + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + package_version: + type: string + +permissions: + contents: read + +run-name: Build portable Linux Python Packages (${{ inputs.artifact_group }}, ${{ inputs.package_version }}) + +jobs: + build: + name: Build Python | ${{ inputs.artifact_group }} + # Note: 
GitHub-hosted runners run out of disk space for some gpu families + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + env: + BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + ARTIFACTS_DIR: "${{ github.workspace }}/artifacts" + PACKAGES_DIR: "${{ github.workspace }}/packages" + MANYLINUX: 1 + + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + + - name: Install Python requirements + run: pip install boto3 packaging piprepo setuptools + + # Note: we could fetch "all" artifacts if we wanted to include more files + - name: Fetch artifacts + env: + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + run: | + python ./build_tools/fetch_artifacts.py \ + --run-github-repo=${{ inputs.artifact_github_repo }} \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir=${{ env.ARTIFACTS_DIR }} \ + _dev_ _lib_ _run_ + + - name: Build Python packages + run: | + ./build_tools/linux_portable_build.py \ + --image=${{ env.BUILD_IMAGE }} \ + --output-dir=${{ env.PACKAGES_DIR }} \ + --artifact-dir=${{ env.ARTIFACTS_DIR }} \ + --build-python-only \ + -- \ + "--version=${{ inputs.package_version }}" + + - name: Inspect Python packages + run: | + ls -la "${{ env.PACKAGES_DIR }}" + + # TODO(#1559): Sanity check (Linux can't find the directories, maybe Docker issues?) 
+ + # - name: Sanity check Python packages + # run: | + # piprepo build "${{ env.PACKAGES_DIR }}/dist" + # pip install rocm[devel]==${{ inputs.package_version }} \ + # --extra-index-url "${{ env.PACKAGES_DIR }}/dist/simple/" + # rocm-sdk test + + # TODO(#1559): upload packages to artifacts S3 bucket and/or a dedicated Python packages bucket diff --git a/.github/workflows/build_portable_linux_pytorch_wheels.yml b/.github/workflows/build_portable_linux_pytorch_wheels.yml new file mode 100644 index 0000000000000..59a811ee6c0f2 --- /dev/null +++ b/.github/workflows/build_portable_linux_pytorch_wheels.yml @@ -0,0 +1,325 @@ +name: Build Portable Linux PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. 
Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + required: true + type: string + default: "3.12" + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + default: "release/2.7" + pytorch_patchset: + description: Patch directory name from where to apply existing patches. 
+ required: true + type: string + default: "rocm_2.7" + +permissions: + id-token: write + contents: read + +run-name: Build portable Linux PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.python_version }}, ${{ inputs.release_type }}) + +jobs: + build_pytorch_wheels: + name: Build | ${{ inputs.amdgpu_family }} | py ${{ inputs.python_version }} | torch ${{ inputs.pytorch_git_ref }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + env: + OUTPUT_DIR: ${{ github.workspace }}/output + PACKAGE_DIST_DIR: ${{ github.workspace }}/output/packages/dist + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + optional_build_prod_arguments: "" + outputs: + cp_version: ${{ env.cp_version }} + # The following are python package versions produced by the build. The + # exact versions will depend on workflow inputs and the underlying code. + # For example: + # Inputs + # rocm_version : 7.10.0a20251120 + # pytorch_git_ref : release/2.9 + # Outputs + # torch_version : 2.9.1+rocm7.10.0a20251120 + # torchaudio_version : 2.9.0+rocm7.10.0a20251120 + # torchvision_version: 0.24.0+rocm7.10.0a20251120 + # triton_version : 3.5.1+rocm7.10.0a20251120 + # Future jobs can use these version outputs to identify newly built + # packages, for example via `pip install torch==${TORCH_VERSION}`. 
+ torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} + torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} + torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} + triton_version: ${{ steps.build-pytorch-wheels.outputs.triton_version }} + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Add selected Python version to PATH + run: | + python_dir="/opt/python/${{ env.cp_version }}" + if ! [ -x "${python_dir}/bin/python" ]; then + echo "ERROR: Could not find python: ${python_dir}" + exit 1 + fi + echo "${python_dir}/bin" >> "$GITHUB_PATH" + + # Checkout nightly sources from https://github.com/pytorch/pytorch + - name: Checkout PyTorch Source Repos from nightly branch + if: ${{ inputs.pytorch_git_ref == 'nightly' }} + run: | + ./external-builds/pytorch/pytorch_torch_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_audio_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_vision_repo.py checkout --repo-hashtag nightly + ./external-builds/pytorch/pytorch_triton_repo.py checkout --patch --patchset nightly + + # Checkout stable sources from https://github.com/ROCm/pytorch + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ inputs.pytorch_git_ref != 'nightly' }} + run: | + ./external-builds/pytorch/pytorch_torch_repo.py checkout --gitrepo-origin https://github.com/ROCm/pytorch.git --repo-hashtag ${{ inputs.pytorch_git_ref }} --patchset ${{ inputs.pytorch_patchset }} + 
./external-builds/pytorch/pytorch_audio_repo.py checkout --require-related-commit + ./external-builds/pytorch/pytorch_vision_repo.py checkout --require-related-commit + ./external-builds/pytorch/pytorch_triton_repo.py checkout + + - name: Create pip cache directory + run: mkdir -p /tmp/pipcache + + - name: Determine optional arguments passed to `build_prod_wheels.py` + if: ${{ inputs.rocm_version }} + run: | + pip install packaging + python build_tools/github_actions/determine_version.py \ + --rocm-version ${{ inputs.rocm_version }} + + - name: Build PyTorch Wheels + id: build-pytorch-wheels + run: | + echo "Building PyTorch wheels for ${{ inputs.amdgpu_family }}" + ./external-builds/pytorch/build_prod_wheels.py \ + build \ + --install-rocm \ + --pip-cache-dir /tmp/pipcache \ + --index-url "${{ inputs.cloudfront_url }}/${{ inputs.amdgpu_family }}/" \ + --clean \ + --output-dir ${{ env.PACKAGE_DIST_DIR }} ${{ env.optional_build_prod_arguments }} + python ./build_tools/github_actions/write_torch_versions.py --dist-dir ${{ env.PACKAGE_DIST_DIR }} + + - name: Sanity Check Wheel + run: | + python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }}/ + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Upload wheels to S3 staging + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip 
install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_pytorch_wheels: + name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + if: ${{ needs.generate_target_to_run.outputs.test_runs_on != '' }} + needs: [build_pytorch_wheels, generate_target_to_run] + uses: ./.github/workflows/test_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + python_version: ${{ inputs.python_version }} + torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} + pytorch_git_ref: ${{ inputs.pytorch_git_ref }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + upload_pytorch_wheels: + name: Release PyTorch Wheels to S3 + needs: [build_pytorch_wheels, generate_target_to_run, test_pytorch_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + CP_VERSION: "${{ 
needs.build_pytorch_wheels.outputs.cp_version }}" + TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" + TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" + TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" + TRITON_VERSION: "${{ needs.build_pytorch_wheels.outputs.triton_version }}" + + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }} + TEST_RESULT: ${{ needs.test_pytorch_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy PyTorch wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." 
+ aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "torch-${TORCH_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ + --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ + --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-linux_x86_64.whl" \ + --include "triton-${TRITON_VERSION}-${CP_VERSION}-linux_x86_64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/build_windows_artifacts.yml b/.github/workflows/build_windows_artifacts.yml new file mode 100644 index 0000000000000..68ddfa76a1aab --- /dev/null +++ b/.github/workflows/build_windows_artifacts.yml @@ -0,0 +1,230 @@ +name: Build Windows Artifacts + +on: + workflow_dispatch: + inputs: + amdgpu_families: + type: string + default: gfx1151 + artifact_group: + type: string + default: gfx1151 + build_variant_label: + type: string + description: "A label for the build variant (ex: 'release', 'asan')" + default: "release" + build_variant_suffix: + type: string + description: "The build variant suffix (ex: 'asan' suffix -> 'gfx94X-dcgpu-asan')" + default: "" + build_variant_cmake_preset: + type: string + description: "The name of the cmake preset to use for this build variant, matching an entry in CMakePresets.json (ex: 'linux-release-asan')" + default: "" + package_version: + type: string + default: ADHOCBUILD + expect_failure: + type: boolean + extra_cmake_options: + type: string + + workflow_call: + inputs: + package_version: + type: string + default: ADHOCBUILD + amdgpu_families: + type: string + 
artifact_group: + type: string + build_variant_label: + type: string + build_variant_suffix: + type: string + build_variant_cmake_preset: + type: string + expect_failure: + type: boolean + extra_cmake_options: + type: string + +permissions: + contents: read + +jobs: + build_windows_artifacts: + name: Build ${{ inputs.build_variant_label }} (xfail ${{ inputs.expect_failure }}) + runs-on: azure-windows-scale-rocm + continue-on-error: ${{ inputs.expect_failure }} + timeout-minutes: 720 # 12 hour timeout + permissions: + id-token: write + defaults: + run: + shell: bash + strategy: + fail-fast: true + env: + BUILD_DIR: B:\build + CACHE_DIR: "${{github.workspace}}/.cache" + CCACHE_DIR: "${{github.workspace}}/.cache/ccache" + CCACHE_MAXSIZE: "4000M" + TEATIME_FORCE_INTERACTIVE: 0 + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + fetch-depth: 10 + + - name: SHA of TheRock + run: | + git rev-parse HEAD + git log -1 + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Install python deps + run: | + pip install -r requirements.txt + + - name: Install requirements + # The first two lines remove the default community feed and use the internal proxy feed + run: | + choco source disable -n=chocolatey + choco source add -n=internal -s http://10.0.167.96:8081/repository/choco-group/ --priority=1 + choco install --no-progress -y ccache + # ninja pinned due to a bug in the 1.13.0 release: + # https://github.com/ninja-build/ninja/issues/2616 + choco install --no-progress -y ninja --version 1.12.1 + choco install --no-progress -y strawberryperl + echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH + choco install --no-progress -y awscli + choco 
install --no-progress -y pkgconfiglite + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 + with: + version: '3.62.0' + + # After other installs, so MSVC gets priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Runner health status + run: | + ccache --zero-stats + python ./build_tools/health_status.py + + - name: Test build_tools + run: | + python -m pytest build_tools/tests build_tools/github_actions/tests + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. + - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.CACHE_DIR }} + key: windows-build-packages-v4-${{ inputs.amdgpu_families }}-${{ github.sha }} + restore-keys: | + windows-build-packages-v4-${{ inputs.amdgpu_families }}- + + - name: Fetch sources + timeout-minutes: 30 + run: | + git config fetch.parallel 10 + git config --global core.symlinks true + git config --global core.longpaths true + python ./build_tools/fetch_sources.py --jobs 12 + + - name: "Checking out repository for llvm-project" + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + path: compiler/amd-llvm + + - name: Apply patches + run: | + cp -v patches/amd-mainline/llvm-project/*.patch compiler/amd-llvm + cd compiler/amd-llvm + git config --global --add safe.directory /__w/llvm-project/llvm-project + find . -type f -name '*.patch' -exec git apply --check {} \; + find . 
-type f -name '*.patch' -exec git apply {} \; + git log -15 + cd - + + - name: Configure Projects + env: + cmake_preset: ${{ inputs.build_variant_cmake_preset }} + amdgpu_families: ${{ inputs.amdgpu_families }} + package_version: ${{ inputs.package_version }} + extra_cmake_options: ${{ inputs.extra_cmake_options }} + run: | + # clear cache before build and after download + ccache -z + python3 build_tools/github_actions/build_configure.py + + - name: Build therock-archives and therock-dist + run: cmake --build "${{ env.BUILD_DIR }}" --target therock-archives therock-dist -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "${{ env.BUILD_DIR }}" ]; then + echo "Build dir:" + echo "------------" + ls -lh "${{ env.BUILD_DIR }}" + echo "Artifact Archives:" + echo "------------------" + ls -lh "${{ env.BUILD_DIR }}"/artifacts/*.tar.xz + echo "Artifacts:" + echo "----------" + du -h -d 1 "${{ env.BUILD_DIR }}"/artifacts + echo "CCache Stats:" + echo "-------------" + ccache -s + else + echo "[ERROR] Build directory ${{ env.BUILD_DIR }} does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step." 
+ exit 1 + fi + + - name: "Build size report" + if: always() + shell: powershell + run: | + $fs = Get-PSDrive -PSProvider "FileSystem" + $fsout = $fs | Select-Object -Property Name,Used,Free,Root + $fsout | % {$_.Used/=1GB;$_.Free/=1GB;$_} | Write-Host + get-disk | Select-object @{Name="Size(GB)";Expression={$_.Size/1GB}} | Write-Host + + - name: Configure AWS Credentials for non-forked repos + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + special-characters-workaround: true + + - name: Post Build Upload + if: always() + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group ${{ inputs.artifact_group }} \ + --build-dir ${{ env.BUILD_DIR }} \ + --upload + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.CACHE_DIR }} + key: windows-build-packages-v4-${{ inputs.amdgpu_families }}-${{ github.sha }} diff --git a/.github/workflows/build_windows_python_packages.yml b/.github/workflows/build_windows_python_packages.yml new file mode 100644 index 0000000000000..40c3d184a0b8d --- /dev/null +++ b/.github/workflows/build_windows_python_packages.yml @@ -0,0 +1,87 @@ +name: Build Windows Python Packages + +on: + workflow_dispatch: + inputs: + artifact_github_repo: + description: GitHub repository for artifact_run_id + type: string + default: ROCm/TheRock + artifact_run_id: + description: Workflow run ID to download artifacts from + type: string + default: "17865324892" # TODO: default to the most recent successful run (using a script) + artifact_group: + description: "The artifact group to build (ex: gfx94X-dcgpu, gfx101X-dgpu, gfx1151, gfx120X-all)" + type: string + package_version: + type: string + 
workflow_call: + inputs: + artifact_github_repo: + type: string + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + package_version: + type: string + +permissions: + contents: read + +jobs: + build: + name: Build Python | ${{ inputs.artifact_group }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + env: + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + ARTIFACTS_DIR: "${{ github.workspace }}/artifacts" + PACKAGES_DIR: "${{ github.workspace }}/packages" + defaults: + run: + shell: bash + steps: + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: '3.12' + + - name: Install Python requirements + run: pip install boto3 packaging piprepo setuptools + + # Note: we could fetch "all" artifacts if we wanted to include more files + - name: Fetch artifacts + env: + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + run: | + python ./build_tools/fetch_artifacts.py \ + --run-github-repo=${{ inputs.artifact_github_repo }} \ + --run-id=${{ env.ARTIFACT_RUN_ID }} \ + --artifact-group=${{ inputs.artifact_group }} \ + --output-dir="${{ env.ARTIFACTS_DIR }}" \ + _dev_ _lib_ _run_ + + - name: Build Python packages + run: | + python ./build_tools/build_python_packages.py \ + --artifact-dir="${{ env.ARTIFACTS_DIR }}" \ + --dest-dir="${{ env.PACKAGES_DIR }}" \ + --version="${{ inputs.package_version }}" + + - name: Inspect Python packages + run: | + ls -la "${{ env.PACKAGES_DIR }}" + + - name: Sanity check Python packages + run: | + piprepo build "${{ env.PACKAGES_DIR }}/dist" + pip install rocm[libraries,devel]==${{ inputs.package_version }} \ + --extra-index-url "${{ env.PACKAGES_DIR }}/dist/simple/" + rocm-sdk test + + # 
TODO(#1559): upload packages to artifacts S3 bucket and/or a dedicated Python packages bucket diff --git a/.github/workflows/build_windows_pytorch_wheels.yml b/.github/workflows/build_windows_pytorch_wheels.yml new file mode 100644 index 0000000000000..aa1fc5d43a75f --- /dev/null +++ b/.github/workflows/build_windows_pytorch_wheels.yml @@ -0,0 +1,357 @@ +name: Build Windows PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + python_version: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + required: true + type: string + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + required: true + type: string + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+ type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx1151 + python_version: + required: true + type: string + default: "3.12" + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: S3 staging subdirectory, not including the GPU-family + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront base URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + pytorch_git_ref: + description: PyTorch ref to checkout. (typically "nightly", or "release/X.Y") + required: true + type: string + default: "release/2.7" + pytorch_patchset: + description: Patch directory name from where to apply existing patches. + required: true + type: string + default: "rocm_2.7" + +permissions: + id-token: write + contents: read + +jobs: + build_pytorch_wheels: + name: Build | ${{ inputs.amdgpu_family }} | py ${{ inputs.python_version }} | torch ${{ inputs.pytorch_git_ref }} + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + env: + CHECKOUT_ROOT: B:/src + OUTPUT_DIR: ${{ github.workspace }}/output + # Note the \ here instead of /. This should be used from 'cmd' not 'bash'! 
+ PACKAGE_DIST_DIR: ${{ github.workspace }}\output\packages\dist + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + optional_build_prod_arguments: "" + outputs: + cp_version: ${{ env.cp_version }} + # The following are python package versions produced by the build. The + # exact versions will depend on workflow inputs and the underlying code. + # For example: + # Inputs + # rocm_version : 7.10.0a20251120 + # pytorch_git_ref : release/2.9 + # Outputs + # torch_version : 2.9.1+rocm7.10.0a20251120 + # torchaudio_version : 2.9.0+rocm7.10.0a20251120 + # torchvision_version: 0.24.0+rocm7.10.0a20251120 + # Future jobs can use these version outputs to identify newly built + # packages, for example via `pip install torch==${TORCH_VERSION}`. + torch_version: ${{ steps.build-pytorch-wheels.outputs.torch_version }} + torchaudio_version: ${{ steps.build-pytorch-wheels.outputs.torchaudio_version }} + torchvision_version: ${{ steps.build-pytorch-wheels.outputs.torchvision_version }} + + defaults: + run: + # Note: there are mixed uses of 'bash' (this default) and 'cmd' below + shell: bash + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure Git Identity + run: | + git config --global user.name "therockbot" + git config --global user.email "therockbot@amd.com" + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + # TODO(amd-justchen): share with build_windows_artifacts.yml. Include in VM image? Dockerfile? 
+ - name: Install requirements + run: | + choco install --no-progress -y ninja --version 1.13.1 + choco install --no-progress -y awscli + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + # After other installs, so MSVC gets priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + # Checkout nightly sources from https://github.com/pytorch/pytorch + # TODO: switch to 'nightly' to match our Linux workflows? + - name: Checkout PyTorch source repos (nightly branch) + if: ${{ inputs.pytorch_git_ref == 'nightly' }} + run: | + git config --global core.longpaths true + python ./external-builds/pytorch/pytorch_torch_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --repo-hashtag nightly + python ./external-builds/pytorch/pytorch_audio_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/audio \ + --repo-hashtag nightly + python ./external-builds/pytorch/pytorch_vision_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/vision \ + --repo-hashtag nightly + + # Checkout stable sources from https://github.com/ROCm/pytorch + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ inputs.pytorch_git_ref != 'nightly' }} + run: | + git config --global core.longpaths true + python ./external-builds/pytorch/pytorch_torch_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --gitrepo-origin https://github.com/ROCm/pytorch.git \ + --repo-hashtag ${{ inputs.pytorch_git_ref }} \ + --patchset ${{ inputs.pytorch_patchset }} + python ./external-builds/pytorch/pytorch_audio_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/audio \ + --torch-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --require-related-commit + python ./external-builds/pytorch/pytorch_vision_repo.py checkout \ + --checkout-dir ${{ env.CHECKOUT_ROOT }}/vision \ + --torch-dir ${{ env.CHECKOUT_ROOT }}/torch \ + --require-related-commit + + - name: Determine optional 
arguments passed to `build_prod_wheels.py` + if: ${{ inputs.rocm_version }} + run: | + pip install packaging + python build_tools/github_actions/determine_version.py \ + --rocm-version ${{ inputs.rocm_version }} + + - name: Build PyTorch Wheels + id: build-pytorch-wheels + # Using 'cmd' here is load bearing! There are configuration issues when + # run under 'bash': https://github.com/ROCm/TheRock/issues/827#issuecomment-3025858800 + shell: cmd + run: | + echo "Building PyTorch wheels for ${{ inputs.amdgpu_family }}" + python ./external-builds/pytorch/build_prod_wheels.py ^ + build ^ + --install-rocm ^ + --index-url "${{ inputs.cloudfront_url }}/${{ inputs.amdgpu_family }}/" ^ + --pytorch-dir ${{ env.CHECKOUT_ROOT }}/torch ^ + --pytorch-audio-dir ${{ env.CHECKOUT_ROOT }}/audio ^ + --pytorch-vision-dir ${{ env.CHECKOUT_ROOT }}/vision ^ + --enable-pytorch-flash-attention-windows ^ + --clean ^ + --output-dir ${{ env.PACKAGE_DIST_DIR }} ^ + ${{ env.optional_build_prod_arguments }} + python ./build_tools/github_actions/write_torch_versions.py --dist-dir ${{ env.PACKAGE_DIST_DIR }} + + - name: Sanity Check Wheel + shell: cmd + run: | + python external-builds/pytorch/sanity_check_wheel.py ${{ env.PACKAGE_DIST_DIR }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + special-characters-workaround: true + + - name: Upload wheels to S3 staging + if: ${{ github.repository_owner == 'ROCm' }} + # Using 'cmd' here since PACKAGE_DIST_DIR uses \ in paths instead of / + shell: cmd + run: | + aws s3 cp ${{ env.PACKAGE_DIST_DIR }}/ ^ + s3://${{ env.S3_BUCKET_PY }}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ ^ + --recursive --exclude "*" --include "*.whl" + + - name: (Re-)Generate Python package release index for staging + if: ${{ 
github.repository_owner == 'ROCm' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}" + shell: cmd + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + generate_target_to_run: + name: Generate target_to_run + runs-on: ubuntu-24.04 + outputs: + test_runs_on: ${{ steps.configure.outputs.test-runs-on }} + bypass_tests_for_releases: ${{ steps.configure.outputs.bypass_tests_for_releases }} + steps: + - name: Checking out repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Generating target to run + id: configure + env: + TARGET: ${{ inputs.amdgpu_family }} + PLATFORM: "windows" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/configure_target_run.py + + test_pytorch_wheels: + name: Test | ${{ inputs.amdgpu_family }} | ${{ needs.generate_target_to_run.outputs.test_runs_on }} + if: ${{ needs.generate_target_to_run.outputs.test_runs_on != '' }} + needs: [build_pytorch_wheels, generate_target_to_run] + uses: ./.github/workflows/test_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + test_runs_on: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + package_index_url: ${{ inputs.cloudfront_staging_url }} + python_version: ${{ inputs.python_version }} + torch_version: ${{ needs.build_pytorch_wheels.outputs.torch_version }} + pytorch_git_ref: ${{ inputs.pytorch_git_ref }} + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + upload_pytorch_wheels: + name: Release PyTorch Wheels to S3 + needs: [build_pytorch_wheels, 
generate_target_to_run, test_pytorch_wheels] + if: ${{ !cancelled() }} + runs-on: ubuntu-24.04 + env: + S3_BUCKET_PY: "therock-${{ inputs.release_type }}-python" + CP_VERSION: "${{ needs.build_pytorch_wheels.outputs.cp_version }}" + TORCH_VERSION: "${{ needs.build_pytorch_wheels.outputs.torch_version }}" + TORCHAUDIO_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchaudio_version }}" + TORCHVISION_VERSION: "${{ needs.build_pytorch_wheels.outputs.torchvision_version }}" + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Configure AWS Credentials + if: always() + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.release_type }}-releases + special-characters-workaround: true + + - name: Determine upload flag + env: + BUILD_RESULT: ${{ needs.build_pytorch_wheels.result }} + TEST_RESULT: ${{ needs.test_pytorch_wheels.result }} + TEST_RUNS_ON: ${{ needs.generate_target_to_run.outputs.test_runs_on }} + BYPASS_TESTS_FOR_RELEASES: ${{ needs.generate_target_to_run.outputs.bypass_tests_for_releases }} + run: python ./build_tools/github_actions/promote_wheels_based_on_policy.py + + - name: Copy PyTorch wheels from staging to release S3 + if: ${{ env.upload == 'true' }} + run: | + echo "Copying exact tested wheels to release S3 bucket..." 
+ aws s3 cp \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_staging_subdir }}/${{ inputs.amdgpu_family }}/ \ + s3://${S3_BUCKET_PY}/${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive \ + --exclude "*" \ + --include "torch-${TORCH_VERSION}-${CP_VERSION}-win_amd64.whl" \ + --include "torchaudio-${TORCHAUDIO_VERSION}-${CP_VERSION}-win_amd64.whl" \ + --include "torchvision-${TORCHVISION_VERSION}-${CP_VERSION}-win_amd64.whl" + + - name: (Re-)Generate Python package release index + if: ${{ env.upload == 'true' }} + env: + # Environment variables to be set for `manage.py` + CUSTOM_PREFIX: "${{ inputs.s3_subdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000000000..d131226a8d3b8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,138 @@ +# This CI workflow is triggered by: +# - push to main branch +# - pull request +# - workflow dispatch +# +# For pull requests, we run default builds and tests for: +# - Linux: gfx94X gfx110X +# - Windows: gfx110X +# If you want to trigger jobs for additional targets, please add a defined label (ex: gfx120X-linux) to the pull request +# +# For push to main branch, all AMD families will built and tested from `amdgpu_family_matrix.py`. +# +# Note: If a test machine is not available for a specific AMD GPU family in `amdgpu_family_matrix.py`, tests will be skipped. + +name: CI + +on: + push: + branches: + - main + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'. 
ex: test:rocprim, test:hipcub" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:' ex: test:rocprim, test:hipcub" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + pull_request: + types: + - labeled + - opened + - synchronize + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  setup:
+    uses: ./.github/workflows/setup.yml
+    with:
+      build_variant: "release"
+
+  linux_build_and_test:
+    name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }}
+    needs: setup
+    if: >-
+      ${{
+        needs.setup.outputs.linux_variants != '[]' &&
+        needs.setup.outputs.enable_build_jobs == 'true'
+      }}
+    strategy:
+      fail-fast: false
+      matrix:
+        variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }}
+    uses: ./.github/workflows/ci_linux.yml
+    secrets: inherit
+    with:
+      amdgpu_families: ${{ matrix.variant.family }}
+      artifact_group: ${{ matrix.variant.artifact_group }}
+      test_runs_on: ${{ matrix.variant.test-runs-on }}
+      build_variant_label: ${{ matrix.variant.build_variant_label }}
+      build_variant_suffix: ${{ matrix.variant.build_variant_suffix }}
+      build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }}
+      test_labels: ${{ needs.setup.outputs.linux_test_labels }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      expect_failure: ${{ matrix.variant.expect_failure == true }}
+      use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }}
+      rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }}
+      test_type: ${{ needs.setup.outputs.test_type }}
+      sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # build_python_packages:
+  #   name: Build Python Packages
+  #   uses: ./.github/workflows/build_python_packages.yml
+
+  # Aggregate job used as the single required status check for the workflow.
+  ci_summary:
+    name: CI Summary
+    if: always()
+    needs:
+      - setup
+      - linux_build_and_test
+    runs-on: ubuntu-24.04
+    steps:
+      - name: Output failed jobs
+        run: |
+          echo '${{ toJson(needs) }}'
+
+          # Build a list of failed jobs, but ignore those marked continue-on-error.
+          # NOTE(review): job outputs are strings in the `needs` context, and in
+          # jq every non-empty string (including "false") is truthy, so the
+          # previous `(.value.outputs.continue_on_error | not)` test would have
+          # silently ignored failures from any job that set the output to
+          # "false". Compare the string value explicitly instead; `// "false"`
+          # keeps jobs that never set the output in the failed list.
+          FAILED_JOBS="$(echo '${{ toJson(needs) }}' \
+            | jq --raw-output '
+              to_entries
+              | map(select(
+                (.value.result != "success" and .value.result != "skipped")
+                and ((.value.outputs.continue_on_error // "false") != "true")
+              ))
+              | map(.key)
+              | join(",")
+            ' \
+          )"
+
+          if [[ -n "${FAILED_JOBS}" ]]; then
+            echo "The following jobs failed: ${FAILED_JOBS}"
+            exit 1
+          else
+            echo "All required jobs succeeded (continue-on-error jobs ignored)."
+          fi
diff --git a/.github/workflows/ci_asan.yml b/.github/workflows/ci_asan.yml
new file mode 100644
index 0000000000000..4da6ce0b14d11
--- /dev/null
+++ b/.github/workflows/ci_asan.yml
@@ -0,0 +1,67 @@
+name: CI ASAN
+
+on:
+  schedule:
+    - cron: "0 2 * * *" # Runs nightly at 2 AM UTC
+  workflow_dispatch:
+    inputs:
+      linux_amdgpu_families:
+        type: string
+        description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X"
+        default: ""
+      linux_use_prebuilt_artifacts:
+        type: boolean
+        description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests"
+      artifact_run_id:
+        type: string
+        description: "If provided, the tests will run on this artifact ID"
+        default: ""
+
+permissions:
+  contents: read
+
+concurrency:
+  # Group on the PR number when run from a pull request, otherwise on the
+  # commit hash. This workflow only has schedule/workflow_dispatch triggers,
+  # so in practice the group keys on the commit SHA. The workflow name is
+  # prepended to avoid conflicts between different workflows.
+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "asan" + + linux_build_and_test: + name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/ci_linux.yml + secrets: inherit + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml new file mode 100644 index 0000000000000..e9522b323870d --- /dev/null +++ b/.github/workflows/ci_linux.yml @@ -0,0 +1,108 @@ +name: CI - Linux + +on: + workflow_call: + inputs: + artifact_group: + type: string + amdgpu_families: + type: string + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + 
+        type: string
+      test_runs_on:
+        type: string
+      expect_failure:
+        type: boolean
+      use_prebuilt_artifacts:
+        type: string
+      rocm_package_version:
+        type: string
+      test_type:
+        type: string
+      sanity_check_only_for_family:
+        type: boolean
+
+permissions:
+  contents: read
+
+jobs:
+  build_portable_linux_artifacts:
+    name: Build Artifacts
+    if: ${{ inputs.use_prebuilt_artifacts == 'false' }}
+    uses: ./.github/workflows/build_portable_linux_artifacts.yml
+    secrets: inherit
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      package_version: ${{ inputs.rocm_package_version }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      build_variant_label: ${{ inputs.build_variant_label }}
+      build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
+      build_variant_suffix: ${{ inputs.build_variant_suffix }}
+      expect_failure: ${{ inputs.expect_failure }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # TODO: rework "artifact_run_id" and "use_prebuilt_artifacts" here?
+  # I don't want to copy/paste this condition and special case plumbing
+  # through multiple workflows. All the packaging and testing workflows need
+  # to know is what artifact run id to use. That could be the current
+  # (implicit) run id, or it could be an explicit run id.
+  # How about having the "build artifacts" job run as a passthrough?
+
+  test_linux_artifacts:
+    needs: [build_portable_linux_artifacts]
+    name: Test Artifacts
+    # If the dependent job failed/cancelled, this job will not be run.
+    # The use_prebuilt_artifacts "or" statement ensures that tests will run
+    # whether the previous build step ran or was skipped.
+    # If we are expecting a build failure, do not run tests to save machine capacity.
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/test_artifacts.yml
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      test_type: ${{ inputs.test_type }}
+      test_labels: ${{ inputs.test_labels }}
+      sanity_check_only_for_family: ${{ inputs.sanity_check_only_for_family == true }}
+
+  build_portable_linux_python_packages:
+    needs: [build_portable_linux_artifacts]
+    name: Build Python
+    # If the dependent job failed/cancelled, this job will not be run.
+    # The use_prebuilt_artifacts "or" statement ensures that packaging will run
+    # whether the previous build step ran or was skipped.
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/build_portable_linux_python_packages.yml
+    with:
+      artifact_run_id: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      artifact_group: ${{ inputs.artifact_group }}
+      package_version: ${{ inputs.rocm_package_version }}
diff --git a/.github/workflows/ci_nightly.yml b/.github/workflows/ci_nightly.yml
new file mode 100644
index 0000000000000..e15f5e887a077
--- /dev/null
+++ b/.github/workflows/ci_nightly.yml
@@ -0,0 +1,124 @@
+# This CI workflow is triggered by:
+# - scheduled run
+#
+# In the scheduled run, we run all targets from amdgpu_family_matrix.py and amdgpu_family_matrix_xfail.py
+# As some of these builds are xfail, we allow errors to occur with `continue-on-error`, where the job will fail but the workflow is green
+
+name: CI Nightly
+
+on:
+  # For AMD GPU
families that expect_failure, we run builds and tests from this scheduled trigger + schedule: + - cron: "0 2 * * *" # Runs nightly at 2 AM UTC + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:'" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "release" + + linux_build_and_test: + name: Linux::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/ci_linux.yml + secrets: inherit + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write + + windows_build_and_test: + name: Windows::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.windows_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.windows_variants) }} + uses: ./.github/workflows/ci_windows.yml + with: + amdgpu_families: ${{ matrix.variant.family }} + artifact_group: ${{ 
matrix.variant.artifact_group }} + test_runs_on: ${{ matrix.variant.test-runs-on }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.windows_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.windows_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + sanity_check_only_for_family: ${{ matrix.variant.sanity_check_only_for_family == true }} + permissions: + contents: read + id-token: write + + # build_python_packages: + # name: Build Python Packages + # uses: ./.github/workflows/build_python_packages.yml diff --git a/.github/workflows/ci_weekly.yml b/.github/workflows/ci_weekly.yml new file mode 100644 index 0000000000000..9570a74f3f7e1 --- /dev/null +++ b/.github/workflows/ci_weekly.yml @@ -0,0 +1,14 @@ +name: WIP Placeholder CI Weekly + +on: + # For AMD GPU families that expect_failure, we run builds and tests from this scheduled trigger + # schedule: + # - cron: "0 3 * * 0" # Runs weekly at 3 AM UTC Sundays + workflow_dispatch: + + +jobs: + donothing: + runs-on: ubuntu-latest + steps: + - run: echo "Skipped" diff --git a/.github/workflows/ci_windows.yml b/.github/workflows/ci_windows.yml new file mode 100644 index 0000000000000..536463a2c4e43 --- /dev/null +++ b/.github/workflows/ci_windows.yml @@ -0,0 +1,108 @@ +name: CI - Windows + +on: + workflow_call: + inputs: + artifact_group: + type: string + amdgpu_families: + type: string + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + type: string + test_runs_on: + 
+        type: string
+      expect_failure:
+        type: boolean
+      use_prebuilt_artifacts:
+        type: string
+      rocm_package_version:
+        type: string
+      test_type:
+        type: string
+      sanity_check_only_for_family:
+        type: boolean
+
+permissions:
+  contents: read
+
+jobs:
+  build_windows_artifacts:
+    name: Build Artifacts
+    if: ${{ inputs.use_prebuilt_artifacts == 'false' }}
+    uses: ./.github/workflows/build_windows_artifacts.yml
+    secrets: inherit
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      build_variant_label: ${{ inputs.build_variant_label }}
+      build_variant_cmake_preset: ${{ inputs.build_variant_cmake_preset }}
+      build_variant_suffix: ${{ inputs.build_variant_suffix }}
+      package_version: ${{ inputs.rocm_package_version }}
+      expect_failure: ${{ inputs.expect_failure }}
+    permissions:
+      contents: read
+      id-token: write
+
+  # TODO: rework "artifact_run_id" and "use_prebuilt_artifacts" here?
+  # I don't want to copy/paste this condition and special case plumbing
+  # through multiple workflows. All the packaging and testing workflows need
+  # to know is what artifact run id to use. That could be the current
+  # (implicit) run id, or it could be an explicit run id.
+  # How about having the "build artifacts" job run as a passthrough?
+
+  test_windows_artifacts:
+    needs: [build_windows_artifacts]
+    name: Test Artifacts
+    # If the dependent job failed/cancelled, this job will not be run.
+    # The use_prebuilt_artifacts "or" statement ensures that tests will run
+    # whether the previous build step ran or was skipped.
+    # If we are expecting a build failure, do not run tests to save machine capacity.
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/test_artifacts.yml
+    with:
+      artifact_group: ${{ inputs.artifact_group }}
+      amdgpu_families: ${{ inputs.amdgpu_families }}
+      test_runs_on: ${{ inputs.test_runs_on }}
+      artifact_run_id: ${{ inputs.artifact_run_id }}
+      test_type: ${{ inputs.test_type }}
+      test_labels: ${{ inputs.test_labels }}
+      sanity_check_only_for_family: ${{ inputs.sanity_check_only_for_family == true }}
+
+  build_windows_python_packages:
+    needs: [build_windows_artifacts]
+    name: Build Python
+    # If the dependent job failed/cancelled, this job will not be run.
+    # The use_prebuilt_artifacts "or" statement ensures that packaging will run
+    # whether the previous build step ran or was skipped.
+    if: >-
+      ${{
+        !failure() &&
+        !cancelled() &&
+        (
+          inputs.use_prebuilt_artifacts == 'false' ||
+          inputs.use_prebuilt_artifacts == 'true'
+        ) &&
+        inputs.expect_failure == false
+      }}
+    uses: ./.github/workflows/build_windows_python_packages.yml
+    with:
+      artifact_run_id: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}"
+      artifact_group: ${{ inputs.artifact_group }}
+      package_version: ${{ inputs.rocm_package_version }}
diff --git a/.github/workflows/copy_release.yml b/.github/workflows/copy_release.yml
new file mode 100644
index 0000000000000..fd4a49dbe4993
--- /dev/null
+++ b/.github/workflows/copy_release.yml
@@ -0,0 +1,101 @@
+name: Copy release to dev bucket
+
+on:
+  workflow_dispatch:
+    inputs:
+      rocm_version:
+        description: ROCm version to copy, e.g.
7.0.0rc20250912 + type: string + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + python_version: + type: choice + options: + - 3.11 + - 3.12 + - 3.13 + default: 3.12 + include_torch: + type: boolean + default: false + sourcesubdir: + type: choice + options: + - v2 + - v2-staging + destsubdir: + type: string + default: v2 + sourcebucket: + type: choice + options: + - nightly + - dev + default: nightly + destbucket: + type: choice + options: + - dev + - nightly + default: dev +permissions: + contents: read + +jobs: + copy_python_packages: + name: Copy ${{ inputs.sourcebucket }} ${{ inputs.sourcesubdir }} -> ${{ inputs.destbucket }} ${{ inputs.destsubdir }} | ${{ inputs.amdgpu_family }} | rocm ${{ inputs.rocm_version }} | py ${{ inputs.python_version }} + runs-on: ubuntu-24.04 + permissions: + id-token: write + + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install the AWS tool + run: ./dockerfiles/install_awscli.sh + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ inputs.destbucket }}-releases + + - name: Select Python version + run: | + python build_tools/github_actions/python_to_cp_version.py \ + --python-version ${{ inputs.python_version }} + + - name: Copy ROCm packages between S3 buckets + run: | + aws s3 cp \ + s3://therock-${{ inputs.sourcebucket }}-python/${{ inputs.sourcesubdir }}/${{ inputs.amdgpu_family }}/ \ + s3://therock-${{ inputs.destbucket }}-python/${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "rocm*${{ inputs.rocm_version }}*" + + - name: Copy torch wheels between S3 buckets + if: ${{ 
inputs.include_torch }} + run: | + aws s3 cp \ + s3://therock-${{ inputs.sourcebucket }}-python/${{ inputs.sourcesubdir }}/${{ inputs.amdgpu_family }}/ \ + s3://therock-${{ inputs.destbucket }}-python/${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}/ \ + --recursive --exclude "*" --include "*torch*${{ inputs.rocm_version }}*${{ env.cp_version }}*" + + - name: (Re-)Generate Python package release index + env: + S3_BUCKET_PY: "therock-${{ inputs.destbucket }}-python" + CUSTOM_PREFIX: "${{ inputs.destsubdir }}/${{ inputs.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${CUSTOM_PREFIX} diff --git a/.github/workflows/multi_arch_build_portable_linux.yml b/.github/workflows/multi_arch_build_portable_linux.yml new file mode 100644 index 0000000000000..acffe43062f43 --- /dev/null +++ b/.github/workflows/multi_arch_build_portable_linux.yml @@ -0,0 +1,785 @@ +# Multi-Arch Build - Sharded Pipeline for Linux +# +# This workflow builds TheRock in stages: +# 1. foundation (generic) - sysdeps, base +# 2. compiler-runtime (generic) - compiler, runtimes, profiler-core +# 3. math-libs (per-arch) - BLAS, FFT, etc. +# 4. comm-libs (per-arch) - RCCL (parallel to math-libs) +# 5. dctools-core (generic) - RDC (parallel to math-libs) +# 6. profiler-apps (generic) - rocprofiler-systems (parallel to math-libs) +# 7. media (generic) - sysdeps-amd-mesa, rocdecode (todo), rocjpeg (todo) +# +# Artifacts flow between stages via S3 using the artifact_manager.py tool. 
+ +name: Multi-Arch Build (Linux) + +on: + workflow_call: + inputs: + artifact_group: + type: string + matrix_per_family_json: + type: string + description: "JSON array of {amdgpu_family, test-runs-on} objects for per-arch stages" + dist_amdgpu_families: + type: string + description: "Semicolon-separated list of all GPU families for dist targets" + build_variant_label: + type: string + build_variant_cmake_preset: + type: string + build_variant_suffix: + type: string + test_labels: + type: string + artifact_run_id: + type: string + expect_failure: + type: boolean + use_prebuilt_artifacts: + type: string + rocm_package_version: + type: string + test_type: + type: string + +permissions: + contents: read + +env: + CONTAINER_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + CCACHE_CONFIGPATH: ${{ github.workspace }}/.ccache/ccache.conf + CACHE_DIR: ${{ github.workspace }}/.container-cache + TEATIME_FORCE_INTERACTIVE: 0 + +jobs: + # ========================================================================== + # STAGE: foundation (generic) + # ========================================================================== + foundation: + name: Stage - Foundation + # Always run all stages + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: foundation + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + 
./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials + if: ${{ always() && !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: compiler-runtime (generic) + # ========================================================================== + 
compiler-runtime: + name: Stage - Compiler Runtime + needs: foundation + runs-on: azure-linux-scale-rocm + timeout-minutes: 480 # 8 hours (compiler is big) + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: compiler-runtime + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + 
run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . -GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: math-libs (per-arch) + # ========================================================================== + math-libs: + name: Stage - Math Libs (${{ matrix.family_info.amdgpu_family }}) + needs: compiler-runtime + strategy: + fail-fast: false + matrix: + family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} + runs-on: azure-linux-scale-rocm + timeout-minutes: 480 # 8 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: math-libs + AMDGPU_FAMILIES: ${{ matrix.family_info.amdgpu_family }} + steps: + - name: 
Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DTHEROCK_AMDGPU_FAMILIES=${{ matrix.family_info.amdgpu_family }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --build-dir build + + # ========================================================================== + # STAGE: comm-libs (per-arch, parallel to math-libs) + # ========================================================================== + comm-libs: + name: Stage - Comm Libs (${{ matrix.family_info.amdgpu_family }}) + needs: compiler-runtime + strategy: + fail-fast: false + matrix: + family_info: ${{ fromJSON(inputs.matrix_per_family_json) }} + runs-on: azure-linux-scale-rocm + timeout-minutes: 240 # 4 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: comm-libs + AMDGPU_FAMILIES: ${{ 
matrix.family_info.amdgpu_family }} + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DTHEROCK_AMDGPU_FAMILIES=${{ matrix.family_info.amdgpu_family }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --amdgpu-families ${{ matrix.family_info.amdgpu_family }} \ + --build-dir build + + # ========================================================================== + # STAGE: dctools-core (generic, parallel to math-libs) + # ========================================================================== + dctools-core: + name: Stage - DC Tools Core + needs: compiler-runtime + runs-on: azure-linux-scale-rocm + timeout-minutes: 120 # 2 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: dctools-core + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r 
requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: profiler-apps (generic, parallel to math-libs) + # ========================================================================== + profiler-apps: + name: Stage - Profiler Apps + needs: compiler-runtime + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: profiler-apps + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + 
- name: Setup ccache + run: | + ./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials (refresh for push) + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build + + # ========================================================================== + # STAGE: media (generic) + # ========================================================================== + media: + name: Stage - Media + needs: foundation + runs-on: azure-linux-scale-rocm + timeout-minutes: 180 # 3 hours + permissions: + id-token: write + container: + image: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + options: -v /runner/config:/home/awsconfig/ + env: + AWS_SHARED_CREDENTIALS_FILE: /home/awsconfig/credentials.ini + STAGE_NAME: media + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Install python deps + run: pip install -r requirements.txt + + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Setup ccache + run: | + 
./build_tools/setup_ccache.py \ + --config-preset "github-oss-presubmit" \ + --dir "$(dirname $CCACHE_CONFIGPATH)" \ + --local-path "$CACHE_DIR/ccache" + + - name: Runner health status + run: | + ./build_tools/health_status.py + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Fetch inbound artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py fetch --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --output-dir build \ + --bootstrap + + - name: Fetch sources + timeout-minutes: 30 + run: ./build_tools/fetch_sources.py --stage ${STAGE_NAME} --jobs 12 --depth 1 + + - name: Get stage configuration + id: stage_config + run: | + python build_tools/configure_stage.py \ + --stage ${STAGE_NAME} \ + --dist-amdgpu-families "${{ inputs.dist_amdgpu_families }}" \ + --gha-output + + - name: Install stage python deps + if: ${{ steps.stage_config.outputs.pip_install_cmd }} + run: pip install ${{ steps.stage_config.outputs.pip_install_cmd }} + + - name: Configure + run: | + cmake -B build -S . 
-GNinja \ + -DTHEROCK_PACKAGE_VERSION=${{ inputs.rocm_package_version }} \ + -DCMAKE_C_COMPILER_LAUNCHER=ccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + ${{ steps.stage_config.outputs.cmake_args }} + + - name: Build stage + run: | + cmake --build build --target stage-${STAGE_NAME} therock-artifacts -- -k 0 + + - name: Report + if: ${{ !cancelled() }} + run: | + echo "CCache Stats:" + ccache -s -v + echo "Artifacts:" + ls -lh build/artifacts/*.tar.xz 2>/dev/null || echo "No artifacts found" + + - name: Configure AWS Credentials + if: ${{ !github.event.pull_request.head.repo.fork }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-ci + + - name: Push stage artifacts + if: ${{ !github.event.pull_request.head.repo.fork }} + run: | + python build_tools/artifact_manager.py push --run-id ${{ github.run_id }} \ + --stage ${STAGE_NAME} \ + --build-dir build diff --git a/.github/workflows/multi_arch_ci.yml b/.github/workflows/multi_arch_ci.yml new file mode 100644 index 0000000000000..73a6a74b9df2c --- /dev/null +++ b/.github/workflows/multi_arch_ci.yml @@ -0,0 +1,142 @@ +# Multi-Arch CI +# +# This is a staging workflow for the sharded multi-arch build pipeline. +# It mirrors ci.yml but uses multi_arch_build_portable_linux.yml instead of +# ci_linux.yml. Once validated, ci.yml will be updated to use the multi-arch +# sub-workflows directly. + +name: Multi-Arch CI + +on: + push: + branches: + # While we are iterating on testing. + - 'multi_arch/**' + workflow_dispatch: + inputs: + linux_amdgpu_families: + type: string + description: "Insert comma-separated list of Linux GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + linux_test_labels: + type: string + description: "If enabled, reduce test set on Linux to the list of labels prefixed with 'test:'. 
ex: test:rocprim, test:hipcub" + default: "" + linux_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Linux artifacts using artifact_run_id and only run tests" + windows_amdgpu_families: + type: string + description: "Insert comma-separated list of Windows GPU families to build and test. ex: gfx94X, gfx1201X" + default: "" + windows_test_labels: + type: string + description: "If enabled, reduce test set on Windows to the list of labels prefixed with 'test:' ex: test:rocprim, test:hipcub" + default: "" + windows_use_prebuilt_artifacts: + type: boolean + description: "If enabled, the CI will pull Windows artifacts using artifact_run_id and only run tests" + artifact_run_id: + type: string + description: "If provided, the tests will run on this artifact ID" + default: "" + # pull_request: + # types: + # - labeled + # - opened + # - synchronize + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup: + uses: ./.github/workflows/setup.yml + with: + build_variant: "release" + multi_arch: true + + linux_build_and_test: + name: Linux::${{ matrix.variant.build_variant_label }} + needs: setup + if: >- + ${{ + needs.setup.outputs.linux_variants != '[]' && + needs.setup.outputs.enable_build_jobs == 'true' + }} + strategy: + fail-fast: false + matrix: + variant: ${{ fromJSON(needs.setup.outputs.linux_variants) }} + uses: ./.github/workflows/multi_arch_build_portable_linux.yml + secrets: inherit + with: + matrix_per_family_json: ${{ matrix.variant.matrix_per_family_json }} + dist_amdgpu_families: ${{ matrix.variant.dist_amdgpu_families }} + artifact_group: ${{ matrix.variant.artifact_group }} + build_variant_label: ${{ matrix.variant.build_variant_label }} + build_variant_suffix: ${{ matrix.variant.build_variant_suffix }} + build_variant_cmake_preset: ${{ matrix.variant.build_variant_cmake_preset }} + test_labels: ${{ needs.setup.outputs.linux_test_labels }} + artifact_run_id: ${{ inputs.artifact_run_id }} + expect_failure: ${{ matrix.variant.expect_failure == true }} + use_prebuilt_artifacts: ${{ inputs.linux_use_prebuilt_artifacts == true && 'true' || 'false' }} + rocm_package_version: ${{ needs.setup.outputs.rocm_package_version }} + test_type: ${{ needs.setup.outputs.test_type }} + permissions: + contents: read + id-token: write + + # TODO: Add windows_build_and_test when ready + # windows_build_and_test: + # name: Windows::${{ matrix.variant.family }}::${{ matrix.variant.build_variant_label }} + # needs: setup + # if: >- + # ${{ + # needs.setup.outputs.windows_variants != '[]' && + # needs.setup.outputs.enable_build_jobs == 'true' + # }} + # strategy: + # fail-fast: false + # matrix: + # variant: ${{ fromJSON(needs.setup.outputs.windows_variants) }} + # uses: ./.github/workflows/ci_windows.yml + # ... 
+ + ci_summary: + name: CI Summary + if: always() + needs: + - setup + - linux_build_and_test + runs-on: ubuntu-24.04 + steps: + - name: Output failed jobs + run: | + # Build a list of failed jobs, but ignore those marked continue-on-error + FAILED_JOBS="$(echo '${{ toJson(needs) }}' \ + | jq --raw-output ' + to_entries + | map(select( + (.value.result != "success" and .value.result != "skipped") + and (.value.outputs.continue_on_error | not) + )) + | map(.key) + | join(",") + ' \ + )" + + if [[ -n "${FAILED_JOBS}" ]]; then + echo "The following jobs failed: ${FAILED_JOBS}" + exit 1 + else + echo "All required jobs succeeded (continue-on-error jobs ignored)." + fi diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..a129cad3f0c1a --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,21 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [main] + +permissions: + contents: read + +jobs: + pre-commit: + runs-on: ubuntu-24.04 + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + fetch-depth: 10 + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + - uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1 diff --git a/.github/workflows/publish_build_manylinux_rccl_x86_64.yml b/.github/workflows/publish_build_manylinux_rccl_x86_64.yml new file mode 100644 index 0000000000000..5e9c22824da45 --- /dev/null +++ b/.github/workflows/publish_build_manylinux_rccl_x86_64.yml @@ -0,0 +1,21 @@ +name: Publish build_manylinux_rccl_x86_64 images +on: + workflow_dispatch: + push: + branches: + - 'main' + - 'stage/docker/**' + paths: + - dockerfiles/build_manylinux_rccl_x86_64*.Dockerfile + - .github/workflows/publish_build_manylinux_rccl_x86_64.yml + +permissions: + contents: read + packages: write + +jobs: + publish_build_manylinux_x86_64: + 
uses: ./.github/workflows/publish_dockerfile.yml + with: + DOCKER_FILE_NAME: build_manylinux_rccl_x86_64 + DOCKER_IMAGE_NAME: therock_build_manylinux_rccl_x86_64 diff --git a/.github/workflows/publish_build_manylinux_x86_64.yml b/.github/workflows/publish_build_manylinux_x86_64.yml new file mode 100644 index 0000000000000..4501d1fe776db --- /dev/null +++ b/.github/workflows/publish_build_manylinux_x86_64.yml @@ -0,0 +1,21 @@ +name: Publish build_manylinux_x86_64 images +on: + workflow_dispatch: + push: + branches: + - 'main' + - 'stage/docker/**' + paths: + - dockerfiles/build_manylinux_x86_64*.Dockerfile + - .github/workflows/publish_build_manylinux_x86_64.yml + +permissions: + contents: read + packages: write + +jobs: + publish_build_manylinux_x86_64: + uses: ./.github/workflows/publish_dockerfile.yml + with: + DOCKER_FILE_NAME: build_manylinux_x86_64 + DOCKER_IMAGE_NAME: therock_build_manylinux_x86_64 diff --git a/.github/workflows/publish_dockerfile.yml b/.github/workflows/publish_dockerfile.yml new file mode 100644 index 0000000000000..bb725e88a8cd0 --- /dev/null +++ b/.github/workflows/publish_dockerfile.yml @@ -0,0 +1,70 @@ +name: Publish TheRock Docker image +on: + workflow_call: + inputs: + DOCKER_FILE_NAME: + type: string + DOCKER_IMAGE_NAME: + type: string + +jobs: + build-and-push-image: + runs-on: ubuntu-24.04 + env: + REGISTRY: ghcr.io + IMAGE_NAME: ROCm/${{ inputs.DOCKER_IMAGE_NAME }} + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 
+ permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + + - name: Log in to the Container registry + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Sanitization of tag names is done automatically by the metadata-action + - name: Determine Docker tag + id: tag + run: | + ref="${{ github.ref_name }}" + if [[ "$ref" == stage/docker/* ]]; then + suffix="${ref#stage/docker/}" + echo "TAG_SUFFIX=stage-${suffix}" >> "$GITHUB_OUTPUT" + elif [[ "$ref" == "main" ]]; then + echo "TAG_SUFFIX=latest" >> "$GITHUB_OUTPUT" + else + echo "TAG_SUFFIX=${ref}" >> "$GITHUB_OUTPUT" + fi + + # Adds extra tags to the image, with the default tags from https://github.com/docker/metadata-action#tags-input + # The custom tag is for the branches prefixed with `stage/docker/`. + # For the default branch (i.e., main), the default behaviour remains and is labelled `latest`. + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@c299e40c65443455700f0fdfc63efafe5b349051 # v5.10.0 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=schedule + type=ref,event=branch,enable={{is_default_branch}} + type=ref,event=tag + type=ref,event=pr + type=raw,value=${{ steps.tag.outputs.TAG_SUFFIX }} + + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. 
+ # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. + - name: Build and push Docker image + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + context: dockerfiles/ + file: dockerfiles/${{ inputs.DOCKER_FILE_NAME }}.Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/.github/workflows/publish_no_rocm_image_ubuntu24_04.yml b/.github/workflows/publish_no_rocm_image_ubuntu24_04.yml new file mode 100644 index 0000000000000..ca562fc899e62 --- /dev/null +++ b/.github/workflows/publish_no_rocm_image_ubuntu24_04.yml @@ -0,0 +1,21 @@ +name: Publish no_rocm_image_ubuntu24_04 images +on: + workflow_dispatch: + push: + branches: + - 'main' + - 'stage/docker/**' + paths: + - dockerfiles/no_rocm_image_ubuntu24_04*.Dockerfile + - .github/workflows/publish_no_rocm_image_ubuntu24_04.yml + +permissions: + contents: read + packages: write + +jobs: + publish_no_rocm_image_ubuntu24_04: + uses: ./.github/workflows/publish_dockerfile.yml + with: + DOCKER_FILE_NAME: no_rocm_image_ubuntu24_04 + DOCKER_IMAGE_NAME: no_rocm_image_ubuntu24_04 diff --git a/.github/workflows/release_native_linux_packages.yml b/.github/workflows/release_native_linux_packages.yml new file mode 100644 index 0000000000000..50e4dd2972797 --- /dev/null +++ b/.github/workflows/release_native_linux_packages.yml @@ -0,0 +1,67 @@ +name: Release native Linux Packages + +on: + workflow_call: + inputs: + amdgpu_family: + description: gfx arch for creating the s3 bucket url + required: true + type: string + artifact_run_id: + description: workflow run id to download the artifacts from + type: string + rocm_version: + description: ROCm version to append to the package (like 8.0.0 or 8.1.0). + required: true + type: string + package_type: + description: Specify whether debian or rpm packages are needed (deb or rpm). 
+ required: true + type: string + package_suffix: + description: The suffix to be added to package name (build_no or master or rc or combination). + required: true + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: string + default: gfx94X-dcgpu + artifact_run_id: + description: workflow run id to download the artifacts from + type: string + rocm_version: + description: ROCm version to append to the package (like 7.0.0 or 7.1.0) + type: string + default: "0.0.1" + package_type: + description: Specify whether debian or rpm packages are needed (deb or rpm). + required: true + type: choice + options: + - rpm + - deb + default: "rpm" + package_suffix: + description: The suffix to be added to package name (build_no or master or rc or combination). + type: string + default: "test" + +permissions: + id-token: write + contents: read + +run-name: Release native Linux packages (${{ inputs.amdgpu_family }}, ${{ inputs.rocm_version }}, ${{ inputs.package_type }}, ${{ inputs.package_suffix }}) + +jobs: + release: + name: Release Native Linux Package + strategy: + fail-fast: false + uses: ./.github/workflows/build_native_linux_packages.yml + with: + artifact_group: ${{ inputs.amdgpu_family }} + artifact_run_id: ${{ inputs.artifact_run_id }} + rocm_version: ${{ inputs.rocm_version }} + native_package_type: ${{ inputs.package_type }} + package_suffix: ${{ inputs.package_suffix }} diff --git a/.github/workflows/release_portable_linux_packages.yml b/.github/workflows/release_portable_linux_packages.yml new file mode 100644 index 0000000000000..133f7403de1d1 --- /dev/null +++ b/.github/workflows/release_portable_linux_packages.yml @@ -0,0 +1,380 @@ +name: Release portable Linux packages + +on: + # Trigger from another workflow (typically to build dev packages and then test them) + workflow_call: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! 
+ type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + # Trigger manually (typically to test the workflow or manually build a release [candidate]) + workflow_dispatch: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + # Trigger on a schedule to build nightly release candidates. 
+ schedule: + # Runs at 04:00 AM UTC, which is 8:00 PM PST (UTC-8) + - cron: '0 04 * * *' + +permissions: + contents: read + +run-name: Release portable Linux packages (${{ inputs.families || 'default' }}, ${{ inputs.release_type || 'nightly' }}) + +jobs: + setup_metadata: + if: ${{ github.repository_owner == 'ROCm' || github.event_name != 'schedule' }} + runs-on: ubuntu-24.04 + env: + release_type: ${{ inputs.release_type || 'nightly' }} + outputs: + version: ${{ steps.rocm_package_version.outputs.rocm_package_version }} + rpm_version: ${{ steps.rocm_native_package_version.outputs.rocm_rpm_package_version }} + deb_version: ${{ steps.rocm_native_package_version.outputs.rocm_deb_package_version }} + release_type: ${{ env.release_type }} + package_targets: ${{ steps.configure.outputs.package_targets }} + cloudfront_url: ${{ steps.release_information.outputs.cloudfront_url }} + cloudfront_staging_url: ${{ steps.release_information.outputs.cloudfront_staging_url }} + s3_subdir_tar: ${{ steps.release_information.outputs.s3_subdir_tar }} + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Setup Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Compute package version + id: rocm_package_version + run: | + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} + + - name: Compute native package version + id: rocm_native_package_version + run: | + # Compute rpm package version + # This sets the 'rocm_rpm_package_version' output + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} \ + --package-type="rpm" + # Compute debian 
package version + # This sets the 'rocm_deb_package_version' output + python ./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} \ + --package-type="deb" + + - name: Set variables for nightly release + if: ${{ env.release_type == 'nightly' }} + run: | + echo "tmp_cloudfront_url=https://rocm.nightlies.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.nightlies.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Set variables for prerelease + if: ${{ env.release_type == 'prerelease' }} + run: | + echo "tmp_cloudfront_url=https://rocm.prereleases.amd.com/whl" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.prereleases.amd.com/whl-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=v3/tarball/" >> $GITHUB_ENV + + - name: Set variables for development release + if: ${{ env.release_type == 'dev' }} + run: | + echo "tmp_cloudfront_url=https://rocm.devreleases.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.devreleases.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Generate release information + id: release_information + run: | + echo "cloudfront_url=${tmp_cloudfront_url}" >> $GITHUB_OUTPUT + echo "cloudfront_staging_url=${tmp_cloudfront_staging_url}" >> $GITHUB_OUTPUT + echo "s3_subdir_tar=${tmp_s3_subdir_tar}" >> $GITHUB_OUTPUT + + - name: Generating package target matrix + id: configure + env: + AMDGPU_FAMILIES: ${{ inputs.families }} + THEROCK_PACKAGE_PLATFORM: "linux" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/fetch_package_targets.py + + portable_linux_packages: + name: ${{ matrix.target_bundle.amdgpu_family }}::Build Portable Linux + runs-on: ${{ 
github.repository_owner == 'ROCm' && 'azure-linux-scale-rocm' || 'ubuntu-24.04' }} + continue-on-error: ${{ matrix.target_bundle.expect_failure == true }} # for GPU families that are flaky, we mark as xfail + timeout-minutes: 720 # 12 hour timeout + needs: [setup_metadata] + permissions: + contents: write + actions: write # Added permission to trigger workflows + id-token: write # Added permission for AWS S3 upload + strategy: + fail-fast: false + matrix: + target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }} + env: + TEATIME_LABEL_GH_GROUP: 1 + OUTPUT_DIR: ${{ github.workspace }}/output + BUILD_IMAGE: ghcr.io/rocm/therock_build_manylinux_x86_64@sha256:583d473f263a289222c48d4b493e2956b2354a45796f09dee6f2c8ecd4504ab6 + DIST_ARCHIVE: "${{ github.workspace }}/output/therock-dist-linux-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz" + FILE_NAME: "therock-dist-linux-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz" + RELEASE_TYPE: "${{ needs.setup_metadata.outputs.release_type }}" + S3_BUCKET_TAR: "therock-${{ needs.setup_metadata.outputs.release_type }}-tarball" + S3_SUBDIR_TAR: ${{ needs.setup_metadata.outputs.s3_subdir_tar }} + S3_BUCKET_PY: "therock-${{ needs.setup_metadata.outputs.release_type }}-python" + S3_SUBDIR: ${{ inputs.s3_subdir || 'v2' }} + S3_STAGING_SUBDIR: ${{ inputs.s3_staging_subdir || 'v2-staging' }} + MANYLINUX: 1 + + steps: + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. 
+ - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.OUTPUT_DIR }}/caches + key: portable-linux-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} + restore-keys: | + portable-linux-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}- + + - name: Install the AWS tool + run: ./dockerfiles/install_awscli.sh + + - name: Fetch sources + timeout-minutes: 30 + run: | + # Prefetch docker container in background. + docker pull ${{ env.BUILD_IMAGE }} & + ./build_tools/fetch_sources.py --jobs 10 + wait + + - name: Build Projects + run: | + ./build_tools/linux_portable_build.py \ + --image=${{ env.BUILD_IMAGE }} \ + --output-dir=${{ env.OUTPUT_DIR }} \ + -- \ + "-DTHEROCK_AMDGPU_FAMILIES=${{ matrix.target_bundle.amdgpu_family }}" + cd ${{ env.OUTPUT_DIR }}/build/dist/rocm + echo "Building ${{ env.DIST_ARCHIVE }}" + tar cfz "${{ env.DIST_ARCHIVE }}" . + + - name: Build Python Packages + run: | + ./build_tools/linux_portable_build.py \ + --image=${{ env.BUILD_IMAGE }} \ + --output-dir=${{ env.OUTPUT_DIR }}/packages \ + --build-python-only \ + --artifact-dir=${{ env.OUTPUT_DIR }}/build/artifacts \ + -- \ + "--version=${{ needs.setup_metadata.outputs.version }}" + + - name: Grant ownership over output directory + if: ${{ !cancelled() }} + run: | + sudo chown -R $(whoami) ${{ env.OUTPUT_DIR }} + + - name: Build Report + if: ${{ !cancelled() }} + run: | + echo "Full SDK du:" + echo "------------" + du -h -d 1 ${{ env.OUTPUT_DIR }}/build/dist/rocm + + # Analyze ninja build log to generate per-component timing report + - name: Analyze Build Times + if: ${{ !cancelled() }} + run: | + python3 build_tools/analyze_build_times.py --build-dir ${{ env.OUTPUT_DIR }}/build + + - name: Configure AWS Credentials + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + 
aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ env.RELEASE_TYPE }} + + - name: Post Build Upload + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group "${{ matrix.target_bundle.amdgpu_family }}" \ + --build-dir ${{ env.OUTPUT_DIR }}/build \ + --upload \ + --job-status ${{ job.status }} + + - name: Upload Releases to staging S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.OUTPUT_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + - name: (Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + ## TODO: Restrict uploading to the non-staging S3 directory until ROCm sanity checks and all validation tests have successfully passed. 
+ - name: Upload Releases to S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.DIST_ARCHIVE }} s3://${{ env.S3_BUCKET_TAR }}/${{ env.S3_SUBDIR_TAR }} + aws s3 cp ${{ env.OUTPUT_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + - name: (Re-)Generate release index pages + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + python ./build_tools/index_generation_s3_tar.py \ + --bucket ${{ env.S3_BUCKET_TAR }} \ + --directory ${{ env.S3_SUBDIR_TAR }} \ + --upload + + - name: Trigger building PyTorch wheels + if: ${{ github.repository_owner == 'ROCm' && matrix.target_bundle.expect_pytorch_failure == false }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: release_portable_linux_pytorch_wheels.yml + inputs: | + { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "s3_subdir": "${{ env.S3_SUBDIR }}", + "s3_staging_subdir": "${{ env.S3_STAGING_SUBDIR }}", + "cloudfront_url": "${{ needs.setup_metadata.outputs.cloudfront_url }}", + "cloudfront_staging_url": "${{ needs.setup_metadata.outputs.cloudfront_staging_url }}", + "rocm_version": "${{ needs.setup_metadata.outputs.version }}", + "ref": "${{ inputs.ref || '' }}" + } + + - name: URL-encode .tar URL + # TODO: Enable JAX wheels for prereleases + if: ${{ env.RELEASE_TYPE != 'prerelease' }} + id: url-encode-tar + run: python -c "from urllib.parse import quote; print('tar_url=https://therock-${{ env.RELEASE_TYPE }}-tarball.s3.amazonaws.com/' + 
quote('therock-dist-linux-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz'))" >> ${GITHUB_OUTPUT} + + - name: Trigger build JAX wheels + # TODO: Enable JAX wheels for prereleases + if: ${{ env.RELEASE_TYPE != 'prerelease' && github.repository_owner == 'ROCm' }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: build_linux_jax_wheels.yml + inputs: | + { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}", + "python_version": "3.12", + "release_type": "${{ env.RELEASE_TYPE }}", + "s3_subdir": "${{ env.S3_STAGING_SUBDIR }}", + "rocm_version": "${{ needs.setup_metadata.outputs.version }}", + "tar_url": "${{ steps.url-encode-tar.outputs.tar_url }}" + } + + - name: Trigger build native rpm package + if: ${{ github.repository_owner == 'ROCm' }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: build_native_linux_packages.yml + inputs: | + { "artifact_group": "${{ matrix.target_bundle.amdgpu_family }}", + "rocm_version": "${{ needs.setup_metadata.outputs.rpm_version }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "artifact_run_id": "${{ github.run_id }}", + "native_package_type": "rpm" + } + + - name: Trigger build native debian package + if: ${{ github.repository_owner == 'ROCm' }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: build_native_linux_packages.yml + inputs: | + { "artifact_group": "${{ matrix.target_bundle.amdgpu_family }}", + "rocm_version": "${{ needs.setup_metadata.outputs.deb_version }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "artifact_run_id": "${{ github.run_id }}", + "native_package_type": "deb" + } + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.OUTPUT_DIR }}/caches + key: 
portable-linux-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} diff --git a/.github/workflows/release_portable_linux_pytorch_wheels.yml b/.github/workflows/release_portable_linux_pytorch_wheels.yml new file mode 100644 index 0000000000000..87b52de133899 --- /dev/null +++ b/.github/workflows/release_portable_linux_pytorch_wheels.yml @@ -0,0 +1,114 @@ +name: Release portable Linux PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: Staging subdirectory to push the wheels for test + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + required: true + type: string + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! 
+ type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the wheels for test" + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + default: '' + +permissions: + id-token: write + contents: read + +run-name: Release portable Linux PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.release_type }}, ${{ inputs.rocm_version }}) + +jobs: + release: + name: Release | ${{ inputs.amdgpu_family }} | py ${{ matrix.python_version }} | torch ${{ matrix.pytorch_git_ref }} + strategy: + fail-fast: false + matrix: + python_version: ["3.11", "3.12", "3.13"] + pytorch_git_ref: ["release/2.7", "release/2.8", "release/2.9", "nightly"] + include: + - pytorch_git_ref: release/2.7 + pytorch_patchset: rocm_2.7 + - pytorch_git_ref: release/2.8 + pytorch_patchset: rocm_2.8 + - pytorch_git_ref: release/2.9 + pytorch_patchset: rocm_2.9 + - pytorch_git_ref: nightly + pytorch_patchset: nightly + + uses: ./.github/workflows/build_portable_linux_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + python_version: ${{ matrix.python_version }} + release_type: ${{ inputs.release_type }} + s3_subdir: ${{ inputs.s3_subdir }} + s3_staging_subdir: ${{ inputs.s3_staging_subdir }} + cloudfront_url: ${{ inputs.cloudfront_url }} + cloudfront_staging_url: ${{ inputs.cloudfront_staging_url }} + rocm_version: ${{ 
inputs.rocm_version }} + pytorch_git_ref: ${{ matrix.pytorch_git_ref }} + pytorch_patchset: ${{ matrix.pytorch_patchset }} + ref: ${{ inputs.ref || '' }} diff --git a/.github/workflows/release_windows_packages.yml b/.github/workflows/release_windows_packages.yml new file mode 100644 index 0000000000000..4c456b4d6489d --- /dev/null +++ b/.github/workflows/release_windows_packages.yml @@ -0,0 +1,360 @@ +name: Release Windows packages + +on: + # Trigger from another workflow (typically to build dev packages and then test them) + workflow_call: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the Python packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "Comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`, or empty for the default list" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + # Trigger manually (typically to test the workflow or manually build a release [candidate]) + workflow_dispatch: + inputs: + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! 
+ type: string + default: "dev" + package_suffix: + type: string + s3_subdir: + description: "Subdirectory to push the Python packages" + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the packages" + type: string + default: "v2-staging" + families: + description: "A comma separated list of AMD GPU families, e.g. `gfx94X,gfx103x`, or empty for the default list" + type: string + prerelease_version: + description: "(Optional) Number of the prerelease" + type: string + extra_cmake_options: + description: "Extra options to pass to the CMake configure command" + type: string + + # Trigger on a schedule to build nightly release candidates. + schedule: + # Runs at 04:00 AM UTC, which is 8:00 PM PST (UTC-8) + - cron: '0 04 * * *' + +permissions: + contents: read + +run-name: Release Windows packages (${{ inputs.families || 'default' }}, ${{ inputs.release_type || 'nightly' }}) + +jobs: + setup_metadata: + if: ${{ github.repository_owner == 'ROCm' || github.event_name != 'schedule' }} + runs-on: ubuntu-24.04 + env: + release_type: ${{ inputs.release_type || 'nightly' }} + outputs: + version: ${{ steps.rocm_package_version.outputs.rocm_package_version }} + release_type: ${{ env.release_type }} + package_targets: ${{ steps.configure.outputs.package_targets }} + cloudfront_url: ${{ steps.release_information.outputs.cloudfront_url }} + cloudfront_staging_url: ${{ steps.release_information.outputs.cloudfront_staging_url }} + s3_subdir_tar: ${{ steps.release_information.outputs.s3_subdir_tar }} + steps: + - name: Checkout repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Setup Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Compute package version + id: rocm_package_version + run: | + python 
./build_tools/compute_rocm_package_version.py \ + --release-type=${{ env.release_type }} \ + --prerelease-version=${{ inputs.prerelease_version }} + + - name: Set variables for nightly release + if: ${{ env.release_type == 'nightly' }} + run: | + echo "tmp_cloudfront_url=https://rocm.nightlies.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.nightlies.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Set variables for prerelease + if: ${{ env.release_type == 'prerelease' }} + run: | + echo "tmp_cloudfront_url=https://rocm.prereleases.amd.com/whl" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.prereleases.amd.com/whl-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=v3/tarball/" >> $GITHUB_ENV + + - name: Set variables for development release + if: ${{ env.release_type == 'dev' }} + run: | + echo "tmp_cloudfront_url=https://rocm.devreleases.amd.com/v2" >> $GITHUB_ENV + echo "tmp_cloudfront_staging_url=https://rocm.devreleases.amd.com/v2-staging" >> $GITHUB_ENV + echo "tmp_s3_subdir_tar=''" >> $GITHUB_ENV + + - name: Generate release information + id: release_information + run: | + echo "cloudfront_url=${tmp_cloudfront_url}" >> $GITHUB_OUTPUT + echo "cloudfront_staging_url=${tmp_cloudfront_staging_url}" >> $GITHUB_OUTPUT + echo "s3_subdir_tar=${tmp_s3_subdir_tar}" >> $GITHUB_OUTPUT + + - name: Generating package target matrix + id: configure + env: + AMDGPU_FAMILIES: ${{ inputs.families }} + THEROCK_PACKAGE_PLATFORM: "windows" + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/fetch_package_targets.py + + windows_packages: + name: ${{ matrix.target_bundle.amdgpu_family }}::Build Windows + runs-on: ${{ github.repository_owner == 'ROCm' && 'azure-windows-scale-rocm' || 'windows-2022' }} + continue-on-error: ${{ 
matrix.target_bundle.expect_failure == true }} # for GPU families that are flaky, we mark as xfail + timeout-minutes: 720 # 12 hour timeout + needs: [setup_metadata] + permissions: + contents: write + actions: write # Added permission to trigger workflows + id-token: write # Added permission for AWS S3 upload + defaults: + run: + shell: bash + strategy: + fail-fast: false + matrix: + target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }} + env: + TEATIME_LABEL_GH_GROUP: 1 + BUILD_DIR: B:\build + CACHE_DIR: "${{github.workspace}}/.cache" + CCACHE_DIR: "${{github.workspace}}/.cache/ccache" + CCACHE_MAXSIZE: "4000M" + DIST_ARCHIVE: "B:/build/artifacts/therock-dist-windows-${{ matrix.target_bundle.amdgpu_family }}${{ inputs.package_suffix }}-${{ needs.setup_metadata.outputs.version }}.tar.gz" + RELEASE_TYPE: "${{ needs.setup_metadata.outputs.release_type }}" + S3_BUCKET_TAR: "therock-${{ needs.setup_metadata.outputs.release_type }}-tarball" + S3_SUBDIR_TAR: ${{ needs.setup_metadata.outputs.s3_subdir_tar }} + S3_BUCKET_PY: "therock-${{ needs.setup_metadata.outputs.release_type }}-python" + S3_SUBDIR: ${{ inputs.s3_subdir || 'v2' }} + S3_STAGING_SUBDIR: ${{ inputs.s3_staging_subdir || 'v2-staging' }} + + steps: + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Install python deps + run: | + pip install -r requirements.txt + + # TODO(amd-justchen): share with build_windows_artifacts.yml. Include in VM image? Dockerfile? 
+ - name: Install requirements + run: | + choco install --no-progress -y ccache + # ninja pinned due to a bug in the 1.13.0 release: + # https://github.com/ninja-build/ninja/issues/2616 + choco install --no-progress -y ninja --version 1.12.1 + choco install --no-progress -y strawberryperl + echo "$PATH;C:\Strawberry\c\bin" >> $GITHUB_PATH + choco install --no-progress -y awscli + choco install --no-progress -y pkgconfiglite + echo "$PATH;C:\Program Files\Amazon\AWSCLIV2" >> $GITHUB_PATH + + - uses: iterative/setup-dvc@4bdfd2b0f6f1ad7e08afadb03b1a895c352a5239 # v2.0.0 + with: + version: '3.62.0' + + # After other installs, so MSVC get priority in the PATH. + - name: Configure MSVC + uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756 # v1.13.0 + + - name: Runner health status + run: | + ccache --zero-stats + python ./build_tools/health_status.py + + # TODO: We shouldn't be using a cache on actual release branches, but it + # really helps for iteration time. + - name: Enable cache + uses: actions/cache/restore@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + with: + path: ${{ env.CACHE_DIR }} + key: windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} + restore-keys: | + windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}- + + - name: Fetch sources + timeout-minutes: 30 + run: | + git config fetch.parallel 10 + git config --global core.symlinks true + git config --global core.longpaths true + python ./build_tools/fetch_sources.py --jobs 12 + + - name: Configure Projects + env: + amdgpu_families: ${{ matrix.target_bundle.amdgpu_family }} + package_version: "ADHOCBUILD" + extra_cmake_options: ${{ inputs.extra_cmake_options }} + run: | + # clear cache before build and after download + ccache -z + + python3 build_tools/github_actions/build_configure.py + + - name: Build therock-dist + run: cmake --build "${{ env.BUILD_DIR }}" --target therock-dist + + - name: Build therock-archives + run: cmake --build 
"${{ env.BUILD_DIR }}" --target therock-archives + + - name: Compress dist folder + run: | + cd ${{ env.BUILD_DIR }}/dist/rocm + echo "Compressing ${{ env.DIST_ARCHIVE }}" + tar cfz "${{ env.DIST_ARCHIVE }}" --force-local . + + - name: Build Python Packages + run: | + python ./build_tools/build_python_packages.py \ + --artifact-dir=${{ env.BUILD_DIR }}/artifacts \ + --dest-dir=${{ env.BUILD_DIR }}/packages \ + --version=${{ needs.setup_metadata.outputs.version }} + + - name: Build report + if: ${{ !cancelled() }} + shell: bash + run: | + if [ -d "${{ env.BUILD_DIR }}" ]; then + echo "Build dir:" + echo "------------" + ls -lh "${{ env.BUILD_DIR }}" + echo "CCache Stats:" + echo "-------------" + ccache -s + else + echo "[ERROR] Build directory ${{ env.BUILD_DIR }} does not exist. Skipping report!" + echo " This should only happen if the CI is cancelled before the build step." + exit 1 # Stop the CI as build did not happen + fi + + - name: Configure AWS Credentials + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708 # v5.1.1 + with: + aws-region: us-east-2 + role-to-assume: arn:aws:iam::692859939525:role/therock-${{ env.RELEASE_TYPE }} + special-characters-workaround: true + + - name: Post Build Upload + if: ${{ github.repository_owner == 'ROCm' && !cancelled() }} + run: | + python3 build_tools/github_actions/post_build_upload.py \ + --run-id ${{ github.run_id }} \ + --artifact-group "${{ matrix.target_bundle.amdgpu_family }}" \ + --build-dir ${{ env.BUILD_DIR }} \ + --upload \ + --job-status ${{ job.status }} + + - name: Upload Releases to staging S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.BUILD_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + - name: 
(Re-)Generate Python package release index for staging + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_STAGING_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + + ## TODO: Restrict uploading to the non-staging S3 directory until sanity checks and all validation tests have successfully passed. + - name: Upload Releases to S3 + if: ${{ github.repository_owner == 'ROCm' }} + run: | + aws s3 cp ${{ env.DIST_ARCHIVE }} s3://${{ env.S3_BUCKET_TAR }}/${{ env.S3_SUBDIR_TAR }} + aws s3 cp ${{ env.BUILD_DIR }}/packages/dist/ s3://${{ env.S3_BUCKET_PY }}/${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}/ \ + --recursive --no-follow-symlinks \ + --exclude "*" \ + --include "*.whl" \ + --include "*.tar.gz" + + # TODO(marbre): guard against race conditions where multiple workflows update the index at the same time? 
+ # Moving the index computation server-side could help + - name: (Re-)Generate release index pages + if: ${{ github.repository_owner == 'ROCm' }} + env: + # Environment variable to be set for `manage.py` + CUSTOM_PREFIX: "${{ env.S3_SUBDIR }}/${{ matrix.target_bundle.amdgpu_family }}" + run: | + pip install boto3 packaging + python ./build_tools/third_party/s3_management/manage.py ${{ env.CUSTOM_PREFIX }} + python ./build_tools/index_generation_s3_tar.py \ + --bucket ${{ env.S3_BUCKET_TAR }} \ + --directory ${{ env.S3_SUBDIR_TAR }} \ + --upload + + - name: Trigger building PyTorch wheels + if: ${{ github.repository_owner == 'ROCm' && matrix.target_bundle.expect_pytorch_failure == false }} + uses: benc-uk/workflow-dispatch@e2e5e9a103e331dad343f381a29e654aea3cf8fc # v1.2.4 + with: + workflow: release_windows_pytorch_wheels.yml + inputs: | + { "amdgpu_family": "${{ matrix.target_bundle.amdgpu_family }}", + "release_type": "${{ env.RELEASE_TYPE }}", + "s3_subdir": "${{ env.S3_SUBDIR }}", + "s3_staging_subdir": "${{ env.S3_STAGING_SUBDIR }}", + "cloudfront_url": "${{ needs.setup_metadata.outputs.cloudfront_url }}", + "cloudfront_staging_url": "${{ needs.setup_metadata.outputs.cloudfront_staging_url }}", + "rocm_version": "${{ needs.setup_metadata.outputs.version }}", + "ref": "${{ inputs.ref || '' }}" + } + + - name: Save cache + uses: actions/cache/save@0057852bfaa89a56745cba8c7296529d2fc39830 # v4.3.0 + if: ${{ !cancelled() }} + with: + path: ${{ env.CACHE_DIR }} + key: windows-package-matrix-v1-${{ matrix.target_bundle.amdgpu_family }}-${{ github.sha }} diff --git a/.github/workflows/release_windows_pytorch_wheels.yml b/.github/workflows/release_windows_pytorch_wheels.yml new file mode 100644 index 0000000000000..85e0f6b88da81 --- /dev/null +++ b/.github/workflows/release_windows_pytorch_wheels.yml @@ -0,0 +1,110 @@ +name: Release Windows PyTorch Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + release_type: + description: 
The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: Staging subdirectory to push the wheels for test + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + required: true + type: string + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." + type: string + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx1151 + release_type: + description: The type of release to build ("dev", "nightly", or "prerelease"). All developer-triggered jobs should use "dev"! + type: string + default: "dev" + s3_subdir: + description: S3 subdirectory, not including the GPU-family + type: string + default: "v2" + s3_staging_subdir: + description: "Staging subdirectory to push the wheels for test" + type: string + default: "v2-staging" + cloudfront_url: + description: CloudFront URL pointing to Python index + type: string + default: "https://rocm.devreleases.amd.com/v2" + cloudfront_staging_url: + description: CloudFront base URL pointing to staging Python index + type: string + default: "https://rocm.devreleases.amd.com/v2-staging" + rocm_version: + description: ROCm version to pip install (e.g. "7.10.0a20251124") + type: string + ref: + description: "Branch, tag or SHA to checkout. 
Defaults to the reference or SHA that triggered the workflow." + type: string + default: '' + +permissions: + id-token: write + contents: read + +run-name: Release Windows PyTorch Wheels (${{ inputs.amdgpu_family }}, ${{ inputs.release_type }}, ${{ inputs.rocm_version }}) + +jobs: + release: + name: Release | ${{ inputs.amdgpu_family }} | py ${{ matrix.python_version }} | torch ${{ matrix.pytorch_git_ref }} + strategy: + fail-fast: false + matrix: + python_version: ["3.11", "3.12", "3.13"] + pytorch_git_ref: ["release/2.9", "nightly"] + include: + - pytorch_git_ref: release/2.9 + pytorch_patchset: rocm_2.9 + - pytorch_git_ref: nightly + pytorch_patchset: nightly + + uses: ./.github/workflows/build_windows_pytorch_wheels.yml + with: + amdgpu_family: ${{ inputs.amdgpu_family }} + python_version: ${{ matrix.python_version }} + release_type: ${{ inputs.release_type }} + s3_subdir: ${{ inputs.s3_subdir }} + s3_staging_subdir: ${{ inputs.s3_staging_subdir }} + cloudfront_url: ${{ inputs.cloudfront_url }} + cloudfront_staging_url: ${{ inputs.cloudfront_staging_url }} + rocm_version: ${{ inputs.rocm_version }} + pytorch_git_ref: ${{ matrix.pytorch_git_ref }} + pytorch_patchset: ${{ matrix.pytorch_patchset }} + ref: ${{ inputs.ref || '' }} diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml new file mode 100644 index 0000000000000..c0af83e89731d --- /dev/null +++ b/.github/workflows/setup.yml @@ -0,0 +1,93 @@ +name: Setup + +on: + workflow_call: + inputs: + build_variant: + type: string + default: "release" + multi_arch: + type: boolean + default: false + description: "If true, group all families into one entry per build_variant instead of expanding cross-product" + outputs: + enable_build_jobs: + description: Whether to enable build jobs. 
+ value: ${{ jobs.setup.outputs.enable_build_jobs }} + linux_variants: + description: Matrix variants to run on Linux + value: ${{ jobs.setup.outputs.linux_variants }} + linux_test_labels: + description: ROCm projects to run Linux tests on. Optional filter. + value: ${{ jobs.setup.outputs.linux_test_labels }} + windows_variants: + description: Matrix variants to run on Windows. + value: ${{ jobs.setup.outputs.windows_variants }} + test_type: + description: The test type to run for component tests (i.e. smoke, full) + value: ${{ jobs.setup.outputs.test_type }} + windows_test_labels: + description: ROCm projects to run Windows tests on. Optional filter. + value: ${{ jobs.setup.outputs.windows_test_labels }} + rocm_package_version: + description: ROCm package version (primarily for Python packages). + value: ${{ jobs.setup.outputs.rocm_package_version }} + +permissions: + contents: read + +jobs: + setup: + runs-on: ubuntu-24.04 + env: + # The commit being checked out is the merge commit for a PR. Its first + # parent will be the tip of the base branch. 
+ BASE_REF: HEAD^ + outputs: + enable_build_jobs: ${{ steps.configure.outputs.enable_build_jobs }} + linux_variants: ${{ steps.configure.outputs.linux_variants }} + linux_test_labels: ${{ steps.configure.outputs.linux_test_labels }} + windows_variants: ${{ steps.configure.outputs.windows_variants }} + test_type: ${{ steps.configure.outputs.test_type }} + windows_test_labels: ${{ steps.configure.outputs.windows_test_labels }} + rocm_package_version: ${{ steps.rocm_package_version.outputs.rocm_package_version }} + steps: + - name: Checkout TheRock repository + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + fetch-depth: 10 + - name: SHA of TheRock + run: | + git rev-parse HEAD + git log -1 + - name: Set PR_LABELS variable with labels assigned to pull request + if: ${{ github.event.pull_request }} # only set PR labels var if this is a pull request + env: + GITHUB_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.number }} + run: | + echo "PR_LABELS=$(gh pr view ${PR_NUMBER} --repo ROCm/llvm-project --json labels)" >> $GITHUB_ENV + + - name: Configuring CI options + id: configure + env: + #INPUT_LINUX_AMDGPU_FAMILIES: ${{ github.event.inputs.linux_amdgpu_families }} + INPUT_LINUX_AMDGPU_FAMILIES: "gfx94X" + LINUX_TEST_LABELS: ${{ github.event.inputs.linux_test_labels }} + LINUX_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.linux_use_prebuilt_artifacts }} + #INPUT_WINDOWS_AMDGPU_FAMILIES: ${{ github.event.inputs.windows_amdgpu_families }} + INPUT_WINDOWS_AMDGPU_FAMILIES: "gfx1151" + WINDOWS_TEST_LABELS: ${{ github.event.inputs.windows_test_labels }} + WINDOWS_USE_PREBUILT_ARTIFACTS: ${{ github.event.inputs.windows_use_prebuilt_artifacts }} + BUILD_VARIANT: ${{ inputs.build_variant }} + MULTI_ARCH: ${{ inputs.multi_arch }} + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ 
vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: ./build_tools/github_actions/configure_ci.py + + - name: Compute package version + id: rocm_package_version + run: python ./build_tools/compute_rocm_package_version.py --release-type=dev diff --git a/.github/workflows/test_artifacts.yml b/.github/workflows/test_artifacts.yml new file mode 100644 index 0000000000000..53a1e2442571d --- /dev/null +++ b/.github/workflows/test_artifacts.yml @@ -0,0 +1,122 @@ +name: Test Artifacts + +on: + workflow_dispatch: + inputs: + artifact_group: + type: string + artifact_run_id: + type: string + default: "" + amdgpu_families: + type: string + test_runs_on: + type: string + sanity_check_only_for_family: + type: boolean + default: false + test_type: + type: string + test_labels: + type: string + workflow_call: + inputs: + artifact_group: + type: string + artifact_run_id: + type: string + default: "" + amdgpu_families: + type: string + test_runs_on: + type: string + sanity_check_only_for_family: + type: boolean + default: false + test_type: + type: string + test_labels: + type: string + push: + branches: + - ADHOCBUILD + +permissions: + contents: read + +jobs: + configure_test_matrix: + name: "Configure test matrix" + # if there is a test machine available + if: ${{ inputs.test_runs_on != '' }} + runs-on: ${{ inputs.test_runs_on }} + outputs: + components: ${{ steps.configure.outputs.components }} + platform: ${{ steps.configure.outputs.platform }} + shard_arr: ${{ steps.configure.outputs.shard_arr }} + steps: + - name: "Fetch 'build_tools' from repository" + if: ${{ runner.os == 'Windows' }} + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + sparse-checkout: build_tools + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + path: "prejob" + + # Checkout failure is possible on Windows, as it's the first job on a GPU test runner. 
+ # Post-job cleanup isn't necessary since no executables are launched in this job. + - name: Pre-job cleanup processes on Windows + if: ${{ runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' + + - name: "Checking out repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + + + - name: Setting up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: "Configuring CI options" + id: configure + env: + ARTIFACT_GROUP: ${{ inputs.artifact_group }} + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + TEST_TYPE: ${{ inputs.test_type }} + TEST_LABELS: ${{ inputs.test_labels }} + run: python ./build_tools/github_actions/fetch_test_configurations.py + + test_sanity_check: + name: 'Test Sanity Check' + needs: configure_test_matrix + uses: './.github/workflows/test_sanity_check.yml' + with: + artifact_group: ${{ inputs.artifact_group }} + artifact_run_id: ${{ inputs.artifact_run_id }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + platform: ${{ needs.configure_test_matrix.outputs.platform }} + + test_components: + name: 'Test ${{ matrix.components.job_name }}' + needs: [test_sanity_check, configure_test_matrix] + # skip tests if no test matrix to run and sanity check only requested + if: ${{ needs.configure_test_matrix.outputs.components != '[]' && !inputs.sanity_check_only_for_family }} + strategy: + fail-fast: false + matrix: + components: ${{ fromJSON(needs.configure_test_matrix.outputs.components) }} + uses: './.github/workflows/test_component.yml' + with: + artifact_run_id: ${{ inputs.artifact_run_id }} + artifact_group: ${{ inputs.artifact_group }} + amdgpu_families: ${{ inputs.amdgpu_families }} + test_runs_on: ${{ inputs.test_runs_on }} + platform: ${{ 
needs.configure_test_matrix.outputs.platform }} + component: ${{ toJSON(matrix.components) }} diff --git a/.github/workflows/test_component.yml b/.github/workflows/test_component.yml new file mode 100644 index 0000000000000..7475e96436e9d --- /dev/null +++ b/.github/workflows/test_component.yml @@ -0,0 +1,110 @@ +name: Test component + +on: + workflow_call: + inputs: + artifact_run_id: + type: string + default: "" + artifact_group: + type: string + amdgpu_families: + type: string + test_runs_on: + type: string + platform: + type: string + component: + type: string + +permissions: + contents: read + +jobs: + test_component: + name: 'Test ${{ fromJSON(inputs.component).job_name }} (shard ${{ matrix.shard }} of ${{ fromJSON(inputs.component).total_shards }})' + runs-on: ${{ inputs.test_runs_on }} + timeout-minutes: 210 + container: + image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' || null }} + options: --ipc host + --group-add video + --device /dev/kfd + --device /dev/dri + --group-add 110 + --env-file /etc/podinfo/gha-gpu-isolation-settings + --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user + strategy: + fail-fast: false + matrix: + # The shard array is based on "total_shards" from "fetch_test_configurations.py" + # The test executable will shard based on the array. 
(ex: [1, 2, 3, 4] = four test shards) + shard: ${{ fromJSON(inputs.component).shard_arr }} + defaults: + run: + shell: bash + env: + VENV_DIR: ${{ github.workspace }}/.venv + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + OUTPUT_ARTIFACTS_DIR: "./build" + THEROCK_BIN_DIR: "./build/bin" + AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }} + steps: + - name: "Fetch 'build_tools' from repository" + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + sparse-checkout: build_tools + path: "prejob" + + - name: Pre-job cleanup processes on Windows + if: ${{ runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' + + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + + - name: Run setup test environment workflow + uses: './.github/actions/setup_test_environment' + with: + ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} + ARTIFACT_GROUP: ${{ inputs.artifact_group }} + OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} + VENV_DIR: ${{ env.VENV_DIR }} + FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }} + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + + # safe.directory must be set before Runner Health Status + - name: Adjust git config + run: | + git config --global --add safe.directory $PWD + git config fetch.parallel 10 + + - name: Runner health status + run: | + python ./build_tools/health_status.py + + - name: Driver / GPU sanity check + run: | + python ./build_tools/print_driver_gpu_info.py + + - name: Test + timeout-minutes: ${{ fromJSON(inputs.component).timeout_minutes }} + env: + SHARD_INDEX: ${{ matrix.shard }} + TOTAL_SHARDS: ${{ 
fromJSON(inputs.component).total_shards }} + TEST_TYPE: ${{ fromJSON(inputs.component).test_type }} + run: | + ${{ fromJSON(inputs.component).test_script }} + + # GitHub's 'Complete job' step is unaware of launched executables + # and will fail to clean up orphan processes. + - name: Post-job cleanup processes on Windows + if: ${{ always() && runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1' diff --git a/.github/workflows/test_jax_dockerfile.yml b/.github/workflows/test_jax_dockerfile.yml new file mode 100644 index 0000000000000..a577dbe5e4ef0 --- /dev/null +++ b/.github/workflows/test_jax_dockerfile.yml @@ -0,0 +1,54 @@ +name: Test JAX Wheels + +on: + workflow_dispatch: + inputs: + test_runs_on: + required: true + type: string + default: "linux-mi325-1gpu-ossci-rocm-frac" + image_name: + required: true + description: JAX docker image to run tests with + type: string + jax_version: + description: Version of JAX to install + required: false + type: string + jax_plugin_branch: + required: true + description: JAX plugin branch to checkout + type: string + default: "rocm-jaxlib-v0.6.0" + + workflow_call: + inputs: + test_runs_on: + required: true + type: string + image_name: + required: true + description: JAX docker image to run tests with + type: string + jax_version: + description: Version of JAX to install instead of the one on the docker image + required: false + type: string + jax_plugin_branch: + description: JAX plugin branch to checkout to use for test scripts + type: string + default: "rocm-jaxlib-v0.8.0" + +permissions: + contents: read + +jobs: + test_wheels: + name: Test + runs-on: ${{ inputs.test_runs_on }} + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: rocm/rocm-jax + # TODO: Add steps for creating the JAX docker image with an install of TheRock and then running JAX tests on the container diff --git 
a/.github/workflows/test_linux_jax_wheels.yml b/.github/workflows/test_linux_jax_wheels.yml new file mode 100644 index 0000000000000..00823960f1b0d --- /dev/null +++ b/.github/workflows/test_linux_jax_wheels.yml @@ -0,0 +1,203 @@ +name: Test Linux JAX Wheels + +on: + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + release_type: + required: true + type: string + s3_subdir: + required: true + type: string + package_index_url: + description: Base CloudFront URL for the Python package index + required: true + type: string + rocm_version: + description: ROCm version (optional, informational) + required: false + type: string + tar_url: + description: URL to TheRock tarball to configure ROCm + required: true + type: string + python_version: + description: Python version(s) to test (e.g., "3.12") + required: true + type: string + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + jax_ref: + description: rocm-jax repository ref/branch to check out + required: false + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+ type: string + test_runs_on: + required: true + type: string + + workflow_dispatch: + inputs: + amdgpu_family: + type: choice + options: + - gfx101X-dgpu + - gfx103X-dgpu + - gfx110X-all + - gfx1150 + - gfx1151 + - gfx120X-all + - gfx90X-dcgpu + - gfx94X-dcgpu + - gfx950-dcgpu + default: gfx94X-dcgpu + release_type: + description: The type of release ("nightly" or "dev") + required: true + type: string + default: dev + s3_subdir: + description: S3 subdirectory, not including the GPU-family + required: true + type: string + default: v2 + package_index_url: + description: Base CloudFront URL for the Python package index + required: true + type: string + default: https://rocm.nightlies.amd.com/v2-staging/ + rocm_version: + description: ROCm version + required: false + type: string + tar_url: + description: URL to TheRock tarball to configure ROCm + required: true + type: string + python_version: + description: Python version(s) to test (e.g., "3.12") + required: true + type: string + default: "3.12" + jax_ref: + description: rocm-jax repository ref/branch to check out + required: false + type: string + test_runs_on: + description: Runner label to use. The selected runner should have a GPU supported by amdgpu_family + required: true + type: string + default: "linux-mi325-1gpu-ossci-rocm-frac" + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+ type: string + +permissions: + contents: read + packages: read + +jobs: + test_jax_wheels: + name: Test JAX Wheels | ${{ inputs.amdgpu_family }} + runs-on: ${{ inputs.test_runs_on }} + container: + image: ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26 + options: >- + --device /dev/kfd + --device /dev/dri + --group-add render + --group-add video + --user root + --env-file /etc/podinfo/gha-gpu-isolation-settings + defaults: + run: + shell: bash + env: + VIRTUAL_ENV: ${{ github.workspace }}/.venv + AMDGPU_FAMILY: ${{ inputs.amdgpu_family }} + THEROCK_TAR_URL: ${{ inputs.tar_url }} + PYTHON_VERSION: ${{ inputs.python_version }} + WHEEL_INDEX_URL: ${{ inputs.package_index_url }}/${{ inputs.amdgpu_family }} + + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Checkout rocm-jax (plugin + build scripts) + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + path: jax + repository: rocm/rocm-jax + ref: ${{ inputs.jax_ref }} + + - name: Checkout JAX extended tests repo + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: rocm/jax + ref: ${{ inputs.jax_ref }} + path: jax/jax_tests + + - name: Set up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + check-latest: true + + - name: System deps, venv configure + run: | + python3 -m venv "${VIRTUAL_ENV}" + echo "PATH=${VIRTUAL_ENV}/bin:${PATH}" >> "$GITHUB_ENV" + python3 build_tools/setup_venv.py "${VIRTUAL_ENV}" --activate-in-future-github-actions-steps + + - name: Install base JAX test requirements + run: | + # This script sets up the venv and activates it across steps; keep it consistent + pip install -r 
external-builds/jax/requirements-jax.txt + + - name: Configure ROCm from TheRock tarball + env: + ROCM_VERSION: ${{ inputs.rocm_version }} + AMDGPU_FAMILY: ${{ inputs.amdgpu_family }} + run: | + DEST="/opt/rocm-${{ inputs.rocm_version }}" + # Install directly from TheRock release buckets (nightly/dev) using the provided version + python build_tools/install_rocm_from_artifacts.py \ + --release "${{ inputs.rocm_version }}" \ + --artifact-group "${{ inputs.amdgpu_family }}" \ + --output-dir "${DEST}" + + - name: Extract JAX version and set to GITHUB_ENV + run: | + # Extract JAX version from requirements.txt (e.g., "jax==0.8.0") + # Remove all whitespace from requirements.txt to simplify parsing + # Search for lines starting with "jax==" or "jaxlib==" followed by version (excluding comments) + # Extract the version number by splitting on '=' and taking the 3rd field + # [^#]+ matches one or more characters that are NOT '#', ensuring we stop before any inline comments + JAX_VERSION=$(tr -d ' ' < jax/build/requirements.txt \ + | grep -E '^(jax|jaxlib)==[^#]+' | head -n1 | cut -d'=' -f3) + echo "JAX_VERSION=$JAX_VERSION" >> "$GITHUB_ENV" + + - name: Install JAX wheels from package index + run: | + # Install jaxlib/plugin/pjrt from the GPU-family index; install jax from PyPI to match the version + pip install --index-url "${{ env.WHEEL_INDEX_URL }}" \ + "jaxlib==${JAX_VERSION}+rocm${{ inputs.rocm_version }}" \ + "jax-rocm7-plugin==${JAX_VERSION}+rocm${{ inputs.rocm_version }}" \ + "jax-rocm7-pjrt==${JAX_VERSION}+rocm${{ inputs.rocm_version }}" + pip install --extra-index-url https://pypi.org/simple "jax==${JAX_VERSION}" + + - name: Run JAX tests + run: | + pytest jax/jax_tests/tests/multi_device_test.py -q --log-cli-level=INFO + pytest jax/jax_tests/tests/core_test.py -q --log-cli-level=INFO + pytest jax/jax_tests/tests/util_test.py -q --log-cli-level=INFO + pytest jax/jax_tests/tests/scipy_stats_test.py -q --log-cli-level=INFO diff --git 
a/.github/workflows/test_pytorch_wheels.yml b/.github/workflows/test_pytorch_wheels.yml new file mode 100644 index 0000000000000..93fe73a704412 --- /dev/null +++ b/.github/workflows/test_pytorch_wheels.yml @@ -0,0 +1,190 @@ +name: Test PyTorch Wheels + +on: + workflow_dispatch: + inputs: + amdgpu_family: + description: GPU family to test + required: true + type: string + default: "gfx94X-dcgpu" + test_runs_on: + description: Runner label to use. The selected runner should have a GPU supported by amdgpu_family + required: true + type: string + default: "linux-mi325-1gpu-ossci-rocm-frac" + package_index_url: + description: Base Python package index URL to test, typically nightly/dev URL with a "v2" or "v2-staging" subdir (without a GPU family subdir) + required: true + type: string + default: "https://rocm.nightlies.amd.com/v2" + python_version: + required: true + type: string + default: "3.12" + torch_version: + description: torch package version to install. (e.g. "2.7.1+rocm7.10.0a20251120") + required: true + type: string + pytorch_git_ref: + description: PyTorch ref to checkout test sources from. (e.g. "nightly", or "release/2.7") + type: string + default: "release/2.7" + + workflow_call: + inputs: + amdgpu_family: + required: true + type: string + test_runs_on: + required: true + type: string + package_index_url: + required: true + type: string + python_version: + required: true + type: string + torch_version: + required: true + type: string + pytorch_git_ref: + type: string + default: "release/2.7" + repository: + description: "Repository to checkout. Otherwise, defaults to `github.repository`." + type: string + ref: + description: "Branch, tag or SHA to checkout. Defaults to the reference or SHA that triggered the workflow." 
+ type: string + +permissions: + contents: read + +run-name: Test PyTorch (${{ inputs.amdgpu_family }}, ${{ inputs.torch_version}}, ${{ inputs.test_runs_on }}) + +jobs: + test_wheels: + name: Test PyTorch | ${{ inputs.amdgpu_family }} + runs-on: ${{ inputs.test_runs_on }} + container: + image: ${{ contains(inputs.test_runs_on, 'linux') && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' || null }} + options: --ipc host + --group-add video + --device /dev/kfd + --device /dev/dri + --group-add 110 + --env-file /etc/podinfo/gha-gpu-isolation-settings + --user 0:0 # Running as root, by recommendation of GitHub: https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user + defaults: + run: + shell: bash + env: + VENV_DIR: ${{ github.workspace }}/.venv + AMDGPU_FAMILY: ${{ inputs.amdgpu_family }} + + steps: + - name: Checkout + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: ${{ inputs.repository || github.repository }} + ref: ${{ inputs.ref || '' }} + + - name: Set up Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: ${{ inputs.python_version }} + + # TODO: also upload and reference test report together with this logging? + - name: Summarize workflow inputs + run: | + python build_tools/github_actions/summarize_test_pytorch_workflow.py \ + --torch-version=${{ inputs.torch_version }} \ + --pytorch-git-ref=${{ inputs.pytorch_git_ref }} \ + --index-url=${{ inputs.package_index_url }} \ + --index-subdir=${{ inputs.amdgpu_family }} + + - name: Set git options + run: | + git config --global core.longpaths true + + # Here we checkout the same version of PyTorch that wheels were built from + # so we have the right set of test source files. We _probably_ don't need + # to run HIPIFY or apply any patches, so we skip those steps to save time. 
+ - name: Checkout PyTorch Source Repos from nightly branch + if: ${{ (inputs.pytorch_git_ref == 'nightly') }} + run: | + python external-builds/pytorch/pytorch_torch_repo.py checkout \ + --gitrepo-origin https://github.com/pytorch/pytorch.git \ + --repo-hashtag nightly \ + --no-hipify --no-patch + + - name: Checkout PyTorch Source Repos from stable branch + if: ${{ (inputs.pytorch_git_ref != 'nightly') }} + run: | + python external-builds/pytorch/pytorch_torch_repo.py checkout \ + --gitrepo-origin https://github.com/ROCm/pytorch.git \ + --repo-hashtag ${{ inputs.pytorch_git_ref }} \ + --no-hipify --no-patch + + - name: Set up virtual environment + run: | + python build_tools/setup_venv.py ${VENV_DIR} \ + --packages torch==${{ inputs.torch_version }} \ + --index-url=${{ inputs.package_index_url }} \ + --index-subdir=${{ inputs.amdgpu_family }} \ + --activate-in-future-github-actions-steps + + - name: Install test requirements + run: | + python -m pip install -r external-builds/pytorch/requirements-test.txt + pip freeze + + - name: Run rocm-sdk sanity tests + run: | + rocm-sdk test + + - name: Run PyTorch smoketests + run: | + python ./external-builds/pytorch/run_pytorch_smoke_tests.py -- \ + --log-cli-level=INFO \ + -v + + - name: (Linux) Run PyTorch tests + if: ${{ runner.os == 'Linux' }} + run: | + python ./external-builds/pytorch/run_pytorch_tests.py -- \ + --continue-on-collection-errors \ + --import-mode=importlib \ + -v + + # Windows testing is a recent addition and is being enabled incrementally. + # See https://github.com/ROCm/TheRock/issues/2258. + # + # Many tests are failing on torch 2.10+ so we limit testing to 2.9. + # (Obviously that's not ideal, but we need to start somewhere) + # + # HACK: The test process does not terminate on its own gracefully, + # so we write to run_pytorch_tests_exit_code.txt and then kill the process. + # After killing the process we read the return code to signal it normally. 
+ # See https://github.com/ROCm/TheRock/issues/999. + - name: (Windows) Run PyTorch tests + if: ${{ runner.os == 'Windows' && contains(inputs.torch_version, '2.9') }} + continue-on-error: true + run: | + python ./external-builds/pytorch/run_pytorch_tests.py -- \ + --continue-on-collection-errors \ + --import-mode=importlib \ + -v + + - name: (Windows) Read and propagate exit code + if: ${{ runner.os == 'Windows' && contains(inputs.torch_version, '2.9') }} + run: | + if [ -f run_pytorch_tests_exit_code.txt ]; then + EXIT_CODE=$(cat run_pytorch_tests_exit_code.txt) + echo "Exit code from file: ${EXIT_CODE}" + exit ${EXIT_CODE} + else + echo "No run_pytorch_tests_exit_code.txt found" + exit 1 + fi diff --git a/.github/workflows/test_sanity_check.yml b/.github/workflows/test_sanity_check.yml new file mode 100644 index 0000000000000..830e6beae8b40 --- /dev/null +++ b/.github/workflows/test_sanity_check.yml @@ -0,0 +1,118 @@ +name: TheRock Sanity Check + +on: + workflow_dispatch: + inputs: + artifact_group: + type: string + artifact_run_id: + type: string + default: "" + amdgpu_families: + type: string + default: "" + test_runs_on: + type: string + platform: + type: string + workflow_call: + inputs: + artifact_group: + type: string + artifact_run_id: + type: string + default: "" + amdgpu_families: + type: string + default: "" + test_runs_on: + type: string + platform: + type: string + push: + branches: + - ADHOCBUILD + +permissions: + contents: read + +jobs: + test_sanity_check: + name: "Sanity ROCM Test" + runs-on: ${{ inputs.test_runs_on }} + container: + image: ${{ inputs.platform == 'linux' && 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:405945a40deaff9db90b9839c0f41d4cba4a383c1a7459b28627047bf6302a26' || null }} + options: --ipc host + --group-add video + --device /dev/kfd + --device /dev/dri + --group-add 110 + --env-file /etc/podinfo/gha-gpu-isolation-settings + --user 0:0 # Running as root, by recommendation of GitHub: 
https://docs.github.com/en/actions/reference/workflows-and-actions/dockerfile-support#user + defaults: + run: + shell: bash + env: + VENV_DIR: ${{ github.workspace }}/.venv + ARTIFACT_RUN_ID: "${{ inputs.artifact_run_id != '' && inputs.artifact_run_id || github.run_id }}" + OUTPUT_ARTIFACTS_DIR: ${{ github.workspace }}/build + THEROCK_BIN_DIR: ${{ github.workspace }}/build/bin + steps: + - name: "Fetch 'build_tools' from repository" + if: ${{ runner.os == 'Windows' }} + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + sparse-checkout: build_tools + path: prejob + + - name: Pre-job cleanup processes on Windows + if: ${{ runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\prejob\build_tools\github_actions\cleanup_processes.ps1' + + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + ref: ${{ secrets.THEROCK_MAINLINE_REF }} + + - name: Pre-job cleanup Docker containers on Linux + if: ${{ runner.os == 'Linux' }} + shell: bash + run: | + # Remove any stopped containers + docker container prune -f || true + # Remove dangling networks + docker network prune -f || true + + - name: Run setup test environment workflow + uses: './.github/actions/setup_test_environment' + with: + ARTIFACT_GROUP: ${{ inputs.artifact_group }} + ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }} + OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }} + VENV_DIR: ${{ env.VENV_DIR }} + FETCH_ARTIFACT_ARGS: "--base-only" + IS_PR_FROM_FORK: ${{ github.event.pull_request.head.repo.fork }} + + - name: Set HIP_CLANG_PATH for windows + if: ${{ runner.os == 'Windows' }} + run: echo "HIP_CLANG_PATH=${OUTPUT_ARTIFACTS_DIR}\lib\llvm\bin" >> $GITHUB_ENV + + - name: Driver / GPU sanity check + run: | + python ./build_tools/print_driver_gpu_info.py + + - name: Run ROCm Sanity Tests + timeout-minutes: 5 + env: + # Enable verbose logging, see + # 
https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/debugging.html + AMD_LOG_LEVEL: 4 + run: | + pytest tests/ --log-cli-level=info --timeout=60 + + - name: Post-job cleanup processes on Windows + if: ${{ always() && runner.os == 'Windows' }} + shell: powershell + run: . '${{ github.workspace }}\build_tools\github_actions\cleanup_processes.ps1' diff --git a/.github/workflows/therock_test_harness.yml b/.github/workflows/therock_test_harness.yml new file mode 100644 index 0000000000000..1699af369a140 --- /dev/null +++ b/.github/workflows/therock_test_harness.yml @@ -0,0 +1,101 @@ +name: TheRock Test Harness + +on: + workflow_dispatch: + inputs: + families: + type: string + description: 'The AMD GPU family to test. ex: gfx94X, gfx120X' + default: 'gfx94X' + release_version: + type: string + description: 'TheRock release version. (ex: nightly-tarball (X.Y.ZrcYYYYMMDD) or dev-tarball (X.Y.Z.dev0+{hash}))' + default: '7.9.0rc20251008' + tests_to_run: + type: string + description: 'The list of tests to run with "or" expression. (ex: "hipcub or rocprim")' + default: 'hipcub or rocprim or rocrand or rocthrust' + +permissions: + contents: read + +concurrency: + # A PR number if a pull request and otherwise the commit hash. This cancels + # queued and in-progress runs for the same PR (presubmit) or commit + # (postsubmit). The workflow name is prepended to avoid conflicts between + # different workflows. 
+ group: ${{ github.workflow }}-${{ github.event.number || github.sha }} + cancel-in-progress: true + +jobs: + setup_metadata: + runs-on: ubuntu-24.04 + outputs: + package_targets: ${{ steps.configure.outputs.package_targets }} + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + + - name: Setup Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Generating package target matrix + id: configure + env: + AMDGPU_FAMILIES: ${{ inputs.families }} + THEROCK_PACKAGE_PLATFORM: "linux" + TEST_HARNESS_TARGET_FETCH: true + # Variable comes from ROCm organization variable 'ROCM_THEROCK_TEST_RUNNERS' + ROCM_THEROCK_TEST_RUNNERS: ${{ vars.ROCM_THEROCK_TEST_RUNNERS }} + LOAD_TEST_RUNNERS_FROM_VAR: false + run: python ./build_tools/github_actions/fetch_package_targets.py + + + therock_test_harness_linux: + name: TheRock Tests Sharded Linux Nightly + needs: [setup_metadata] + runs-on: ${{ matrix.target_bundle.test_machine }} + container: + image: 'ghcr.io/rocm/no_rocm_image_ubuntu24_04@sha256:4150afe4759d14822f0e3f8930e1124f26e11f68b5c7b91ec9a02b20b1ebbb98' + options: --ipc host + --group-add video + --device /dev/kfd + --device /dev/dri + --group-add 110 + --env-file /etc/podinfo/gha-gpu-isolation-settings + strategy: + fail-fast: false + matrix: + target_bundle: ${{ fromJSON(needs.setup_metadata.outputs.package_targets) }} + defaults: + run: + shell: bash + steps: + - name: Checkout Repository + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 + with: + repository: "ROCm/TheRock" + + - name: Setup Python + uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 + with: + python-version: 3.12 + + - name: Install TheRock + env: + release_version: ${{ inputs.release_version }} + run: | + pip install -r requirements-test.txt + python3 
build_tools/install_rocm_from_artifacts.py --tests --amdgpu-family ${{ matrix.target_bundle.amdgpu_family }} --release ${{ env.release_version }} + + # TODO: add parallelism + - name: Running test harness + # TESTING + run: | + python3 -m pytest -s -v --tb=short --therock-path=./therock-build tests/harness/tests*.py -k "${{ inputs.tests_to_run }}" + +# TODO: Add windows tests diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp index 1e3ac2e31870f..8250247a0204a 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.cpp +++ b/clang/lib/CodeGen/ABIInfoImpl.cpp @@ -301,39 +301,6 @@ bool CodeGen::isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, return true; } -bool CodeGen::isEmptyFieldForLayout(const ASTContext &Context, - const FieldDecl *FD) { - if (FD->isZeroLengthBitField()) - return true; - - if (FD->isUnnamedBitField()) - return false; - - return isEmptyRecordForLayout(Context, FD->getType()); -} - -bool CodeGen::isEmptyRecordForLayout(const ASTContext &Context, QualType T) { - const auto *RD = T->getAsRecordDecl(); - if (!RD) - return false; - - // If this is a C++ record, check the bases first. 
- if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { - if (CXXRD->isDynamicClass()) - return false; - - for (const auto &I : CXXRD->bases()) - if (!isEmptyRecordForLayout(Context, I.getType())) - return false; - } - - for (const auto *I : RD->fields()) - if (!isEmptyFieldForLayout(Context, I)) - return false; - - return true; -} - const Type *CodeGen::isSingleElementStruct(QualType T, ASTContext &Context) { const auto *RD = T->getAsRecordDecl(); if (!RD) diff --git a/clang/lib/CodeGen/ABIInfoImpl.h b/clang/lib/CodeGen/ABIInfoImpl.h index d9d79c6a55ddb..f0276be8cb97f 100644 --- a/clang/lib/CodeGen/ABIInfoImpl.h +++ b/clang/lib/CodeGen/ABIInfoImpl.h @@ -120,16 +120,6 @@ bool isEmptyField(ASTContext &Context, const FieldDecl *FD, bool AllowArrays, bool isEmptyRecord(ASTContext &Context, QualType T, bool AllowArrays, bool AsIfNoUniqueAddr = false); -/// isEmptyFieldForLayout - Return true iff the field is "empty", that is, -/// either a zero-width bit-field or an \ref isEmptyRecordForLayout. -bool isEmptyFieldForLayout(const ASTContext &Context, const FieldDecl *FD); - -/// isEmptyRecordForLayout - Return true iff a structure contains only empty -/// base classes (per \ref isEmptyRecordForLayout) and fields (per -/// \ref isEmptyFieldForLayout). Note, C++ record fields are considered empty -/// if the [[no_unique_address]] attribute would have made them empty. -bool isEmptyRecordForLayout(const ASTContext &Context, QualType T); - /// isSingleElementStruct - Determine if a structure is a "single /// element struct", i.e. 
it has exactly one non-empty field or /// exactly one field which is itself a single element diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 62f5d2f789326..b292efea94861 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "ABIInfoImpl.h" #include "CGBlocks.h" #include "CGCXXABI.h" #include "CGDebugInfo.h" @@ -927,7 +926,7 @@ namespace { } void addMemcpyableField(FieldDecl *F) { - if (isEmptyFieldForLayout(CGF.getContext(), F)) + if (F->isZeroSize(CGF.getContext())) return; if (!FirstField) addInitialField(F); @@ -1884,7 +1883,7 @@ namespace { const CXXDestructorDecl *DD) : Context(Context), EHStack(EHStack), DD(DD), StartIndex(std::nullopt) {} void PushCleanupForField(const FieldDecl *Field) { - if (isEmptyFieldForLayout(Context, Field)) + if (Field->isZeroSize(Context)) return; unsigned FieldIndex = Field->getFieldIndex(); if (FieldHasTrivialDestructorBody(Context, Field)) { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index d80c2d20f3f19..7cd663f97a9ed 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5245,7 +5245,7 @@ static Address emitAddrOfZeroSizeField(CodeGenFunction &CGF, Address Base, /// The resulting address doesn't necessarily have the right type. 
static Address emitAddrOfFieldStorage(CodeGenFunction &CGF, Address base, const FieldDecl *field, bool IsInBounds) { - if (isEmptyFieldForLayout(CGF.getContext(), field)) + if (field->isZeroSize(CGF.getContext())) return emitAddrOfZeroSizeField(CGF, base, field, IsInBounds); const RecordDecl *rec = field->getParent(); diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index 6407afc3d9447..9dc74d5b78ea9 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "ABIInfoImpl.h" #include "CGCXXABI.h" #include "CGObjCRuntime.h" #include "CGRecordLayout.h" @@ -758,7 +757,7 @@ bool ConstStructBuilder::Build(const InitListExpr *ILE, bool AllowOverwrite) { // Zero-sized fields are not emitted, but their initializers may still // prevent emission of this struct as a constant. - if (isEmptyFieldForLayout(CGM.getContext(), Field)) { + if (Field->isZeroSize(CGM.getContext())) { if (Init && Init->HasSideEffects(CGM.getContext())) return false; continue; @@ -893,8 +892,7 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD, continue; // Don't emit anonymous bitfields or zero-sized fields. - if (Field->isUnnamedBitField() || - isEmptyFieldForLayout(CGM.getContext(), *Field)) + if (Field->isUnnamedBitField() || Field->isZeroSize(CGM.getContext())) continue; // Emit the value of the initializer. @@ -2642,10 +2640,8 @@ static llvm::Constant *EmitNullConstant(CodeGenModule &CGM, const auto *base = I.getType()->castAsCXXRecordDecl(); // Ignore empty bases. 
- if (isEmptyRecordForLayout(CGM.getContext(), I.getType()) || - CGM.getContext() - .getASTRecordLayout(base) - .getNonVirtualSize() + if (base->isEmpty() || + CGM.getContext().getASTRecordLayout(base).getNonVirtualSize() .isZero()) continue; @@ -2659,8 +2655,7 @@ static llvm::Constant *EmitNullConstant(CodeGenModule &CGM, for (const auto *Field : record->fields()) { // Fill in non-bitfields. (Bitfields always use a zero pattern, which we // will fill in later.) - if (!Field->isBitField() && - !isEmptyFieldForLayout(CGM.getContext(), Field)) { + if (!Field->isBitField() && !Field->isZeroSize(CGM.getContext())) { unsigned fieldIndex = layout.getLLVMFieldNo(Field); elements[fieldIndex] = CGM.EmitNullConstant(Field->getType()); } @@ -2680,7 +2675,7 @@ static llvm::Constant *EmitNullConstant(CodeGenModule &CGM, for (const auto &I : CXXR->vbases()) { const auto *base = I.getType()->castAsCXXRecordDecl(); // Ignore empty bases. - if (isEmptyRecordForLayout(CGM.getContext(), I.getType())) + if (base->isEmpty()) continue; unsigned fieldIndex = layout.getVirtualBaseIndex(base); diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 5ceaaf30b8d24..75d7718562654 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "CGOpenMPRuntime.h" -#include "ABIInfoImpl.h" #include "CGCXXABI.h" #include "CGCleanup.h" #include "CGDebugInfo.h" @@ -8472,15 +8471,12 @@ class MappableExprsHandler { for (const auto &I : RD->bases()) { if (I.isVirtual()) continue; - - QualType BaseTy = I.getType(); - const auto *Base = BaseTy->getAsCXXRecordDecl(); + const auto *Base = I.getType()->getAsCXXRecordDecl(); // Ignore empty bases. 
- if (isEmptyRecordForLayout(CGF.getContext(), BaseTy) || - CGF.getContext() - .getASTRecordLayout(Base) - .getNonVirtualSize() - .isZero()) + if (Base->isEmpty() || CGF.getContext() + .getASTRecordLayout(Base) + .getNonVirtualSize() + .isZero()) continue; unsigned FieldIndex = RL.getNonVirtualBaseLLVMFieldNo(Base); @@ -8488,12 +8484,10 @@ class MappableExprsHandler { } // Fill in virtual bases. for (const auto &I : RD->vbases()) { - QualType BaseTy = I.getType(); + const auto *Base = I.getType()->getAsCXXRecordDecl(); // Ignore empty bases. - if (isEmptyRecordForLayout(CGF.getContext(), BaseTy)) + if (Base->isEmpty()) continue; - - const auto *Base = BaseTy->getAsCXXRecordDecl(); unsigned FieldIndex = RL.getVirtualBaseIndex(Base); if (RecordLayout[FieldIndex]) continue; @@ -8504,8 +8498,7 @@ class MappableExprsHandler { for (const auto *Field : RD->fields()) { // Fill in non-bitfields. (Bitfields always use a zero pattern, which we // will fill in later.) - if (!Field->isBitField() && - !isEmptyFieldForLayout(CGF.getContext(), Field)) { + if (!Field->isBitField() && !Field->isZeroSize(CGF.getContext())) { unsigned FieldIndex = RL.getLLVMFieldNo(Field); RecordLayout[FieldIndex] = Field; } diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index e9205c68c2812..5580cee1f49f6 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -10,9 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "ABIInfoImpl.h" -#include "CGCXXABI.h" #include "CGRecordLayout.h" +#include "CGCXXABI.h" #include "CodeGenTypes.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" @@ -385,7 +384,7 @@ void CGRecordLowering::accumulateFields(bool isNonVirtualBaseType) { Field = accumulateBitFields(isNonVirtualBaseType, Field, FieldEnd); assert((Field == FieldEnd || !Field->isBitField()) && "Failed to accumulate all the bitfields"); - 
} else if (isEmptyFieldForLayout(Context, *Field)) { + } else if (Field->isZeroSize(Context)) { // Empty fields have no storage. ++Field; } else { @@ -634,7 +633,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // non-reusable tail padding. CharUnits LimitOffset; for (auto Probe = Field; Probe != FieldEnd; ++Probe) - if (!isEmptyFieldForLayout(Context, *Probe)) { + if (!Probe->isZeroSize(Context)) { // A member with storage sets the limit. assert((getFieldBitOffset(*Probe) % CharBits) == 0 && "Next storage is not byte-aligned"); @@ -732,7 +731,7 @@ void CGRecordLowering::accumulateBases() { // Bases can be zero-sized even if not technically empty if they // contain only a trailing array member. const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); - if (!isEmptyRecordForLayout(Context, Base.getType()) && + if (!BaseDecl->isEmpty() && !Context.getASTRecordLayout(BaseDecl).getNonVirtualSize().isZero()) Members.push_back(MemberInfo(Layout.getBaseClassOffset(BaseDecl), MemberInfo::Base, getStorageType(BaseDecl), BaseDecl)); @@ -880,7 +879,7 @@ CGRecordLowering::calculateTailClippingOffset(bool isNonVirtualBaseType) const { if (!isNonVirtualBaseType && isOverlappingVBaseABI()) for (const auto &Base : RD->vbases()) { const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); - if (isEmptyRecordForLayout(Context, Base.getType())) + if (BaseDecl->isEmpty()) continue; // If the vbase is a primary virtual base of some base, then it doesn't // get its own storage location but instead lives inside of that base. 
@@ -896,7 +895,7 @@ CGRecordLowering::calculateTailClippingOffset(bool isNonVirtualBaseType) const { void CGRecordLowering::accumulateVBases() { for (const auto &Base : RD->vbases()) { const CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); - if (isEmptyRecordForLayout(Context, Base.getType())) + if (BaseDecl->isEmpty()) continue; CharUnits Offset = Layout.getVBaseClassOffset(BaseDecl); // If the vbase is a primary virtual base of some base, then it doesn't @@ -1157,7 +1156,7 @@ CodeGenTypes::ComputeRecordLayout(const RecordDecl *D, llvm::StructType *Ty) { const FieldDecl *FD = *it; // Ignore zero-sized fields. - if (isEmptyFieldForLayout(getContext(), FD)) + if (FD->isZeroSize(getContext())) continue; // For non-bit-fields, just check that the LLVM struct offset matches the diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp index cd08f3ec397a0..d9eabf2c76989 100644 --- a/clang/lib/CodeGen/CodeGenTBAA.cpp +++ b/clang/lib/CodeGen/CodeGenTBAA.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "CodeGenTBAA.h" -#include "ABIInfoImpl.h" #include "CGCXXABI.h" #include "CGRecordLayout.h" #include "CodeGenTypes.h" @@ -448,7 +447,7 @@ CodeGenTBAA::CollectFields(uint64_t BaseOffset, unsigned idx = 0; for (RecordDecl::field_iterator i = RD->field_begin(), e = RD->field_end(); i != e; ++i, ++idx) { - if (isEmptyFieldForLayout(Context, *i)) + if ((*i)->isZeroSize(Context)) continue; uint64_t Offset = diff --git a/clang/test/CodeGen/2009-06-14-anonymous-union-init.c b/clang/test/CodeGen/2009-06-14-anonymous-union-init.c index a4375d7868f01..3f4493deea79e 100644 --- a/clang/test/CodeGen/2009-06-14-anonymous-union-init.c +++ b/clang/test/CodeGen/2009-06-14-anonymous-union-init.c @@ -1,19 +1,8 @@ -// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-linux-gnu -o - | FileCheck %s --check-prefixes=CHECK,EMPTY -// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-windows-msvc -o - | 
FileCheck %s --check-prefixes=CHECK,EMPTY-MSVC +// RUN: %clang_cc1 -emit-llvm < %s | grep "zeroinitializer, i16 16877" // PR4390 struct sysfs_dirent { - union { struct sysfs_elem_dir { int x; } s_dir; }; + union { struct sysfs_elem_dir {} s_dir; }; unsigned short s_mode; }; struct sysfs_dirent sysfs_root = { {}, 16877 }; -// CHECK: @sysfs_root = {{.*}}global { %union.anon, i16, [2 x i8] } { %union.anon zeroinitializer, i16 16877, [2 x i8] zeroinitializer } - -struct Foo { - union { struct empty {} x; }; - unsigned short s_mode; -}; -struct Foo foo = { {}, 16877 }; - -// EMPTY: @foo = {{.*}}global %struct.Foo { i16 16877 } -// EMPTY-MSVC: @foo = {{.*}}global %struct.Foo { [4 x i8] zeroinitializer, i16 16877 } diff --git a/clang/test/CodeGen/X86/x86_64-vaarg.c b/clang/test/CodeGen/X86/x86_64-vaarg.c index 450dfe5d15020..19802eedb02b7 100644 --- a/clang/test/CodeGen/X86/x86_64-vaarg.c +++ b/clang/test/CodeGen/X86/x86_64-vaarg.c @@ -56,8 +56,7 @@ typedef struct { // CHECK: vaarg.end: // CHECK-NEXT: [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP1]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ] // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[VAARG_ADDR]], i64 8, i1 false) -// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[COERCE_DIVE]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[RETVAL]], align 8 // CHECK-NEXT: ret double [[TMP3]] // s1 f(int z, ...) 
{ diff --git a/clang/test/CodeGen/paren-list-agg-init.cpp b/clang/test/CodeGen/paren-list-agg-init.cpp index 235352382332a..5c1c598dcd466 100644 --- a/clang/test/CodeGen/paren-list-agg-init.cpp +++ b/clang/test/CodeGen/paren-list-agg-init.cpp @@ -48,13 +48,14 @@ struct E { ~E() {}; }; +// CHECK-DAG: [[STRUCT_F:%.*]] = type { i8 } struct F { F (int i = 1); F (const F &f) = delete; F (F &&f) = default; }; -// CHECK-DAG: [[STRUCT_G:%.*]] = type <{ i32, [4 x i8] }> +// CHECK-DAG: [[STRUCT_G:%.*]] = type <{ i32, [[STRUCT_F]], [3 x i8] }> struct G { int a; F f; @@ -77,12 +78,12 @@ namespace gh61145 { ~Vec(); }; - // CHECK-DAG: [[STRUCT_S1:%.*]] = type { i8 } + // CHECK-DAG: [[STRUCT_S1:%.*]] = type { [[STRUCT_VEC]] } struct S1 { Vec v; }; - // CHECK-DAG: [[STRUCT_S2:%.*]] = type { i8, i8 } + // CHECK-DAG: [[STRUCT_S2:%.*]] = type { [[STRUCT_VEC]], i8 } struct S2 { Vec v; char c; @@ -376,7 +377,7 @@ void foo18() { // CHECK-NEXT: [[G:%.*g.*]] = alloca [[STRUCT_G]], align 4 // CHECK-NEXT: [[A:%.*a.*]] = getelementptr inbounds nuw [[STRUCT_G]], ptr [[G]], i32 0, i32 0 // CHECK-NEXT: store i32 2, ptr [[A]], align 4 -// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds i8, ptr [[G]], i64 4 +// CHECK-NEXT: [[F:%.*f.*]] = getelementptr inbounds nuw [[STRUCT_G]], ptr [[G]], i32 0, i32 1 // CHECk-NEXT: call void @{{.*F.*}}(ptr noundef nonnull align 1 dereferenceable(1)) [[F]], ie32 noundef 1) // CHECK: ret void void foo19() { @@ -391,8 +392,9 @@ namespace gh61145 { // CHECK-NEXT: [[AGG_TMP_ENSURED:%.*agg.tmp.ensured.*]] = alloca [[STRUCT_S1]], align 1 // a.k.a. Vec::Vec() // CHECK-NEXT: call void @_ZN7gh611453VecC1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[V]]) + // CHECK-NEXT: [[V1:%.*v1.*]] = getelementptr inbounds nuw [[STRUCT_S1]], ptr [[AGG_TMP_ENSURED]], i32 0, i32 0 // a.k.a. 
Vec::Vec(Vec&&) - // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_TMP_ENSURED]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) + // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[V1]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) // a.k.a. S1::~S1() // CHECK-NEXT: call void @_ZN7gh611452S1D1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_TMP_ENSURED]]) // a.k.a.Vec::~Vec() @@ -411,8 +413,9 @@ namespace gh61145 { // CHECK-NEXT: [[AGG_TMP_ENSURED:%.*agg.tmp.ensured.*]] = alloca [[STRUCT_S2]], align 1 // a.k.a. Vec::Vec() // CHECK-NEXT: call void @_ZN7gh611453VecC1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[V]]) + // CHECK-NEXT: [[V1:%.*v1.*]] = getelementptr inbounds nuw [[STRUCT_S2]], ptr [[AGG_TMP_ENSURED]], i32 0, i32 0 // a.k.a. Vec::Vec(Vec&&) - // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_TMP_ENSURED]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) + // CHECK-NEXT: call void @_ZN7gh611453VecC1EOS0_(ptr noundef nonnull align 1 dereferenceable(1) [[V1]], ptr noundef nonnull align 1 dereferenceable(1) [[V]]) // CHECK-NEXT: [[C:%.*c.*]] = getelementptr inbounds nuw [[STRUCT_S2]], ptr [[AGG_TMP_ENSURED]], i32 0, i32 // CHECK-NEXT: store i8 0, ptr [[C]], align 1 // a.k.a. 
S2::~S2() diff --git a/clang/test/CodeGen/union-init2.c b/clang/test/CodeGen/union-init2.c index ee35e78a4f301..6e039e7e27d53 100644 --- a/clang/test/CodeGen/union-init2.c +++ b/clang/test/CodeGen/union-init2.c @@ -13,7 +13,7 @@ union z { }; union z y = {}; -// CHECK: @foo = {{.*}}global %union.Foo undef, align 1 +// CHECK: @foo = {{.*}}global %union.Foo zeroinitializer, align 1 // CHECK-CXX: @foo = {{.*}}global %union.Foo undef, align 1 union Foo { struct Empty {} val; diff --git a/clang/test/CodeGen/voidptr-vaarg.c b/clang/test/CodeGen/voidptr-vaarg.c index a0211642bd82f..9551418fe9258 100644 --- a/clang/test/CodeGen/voidptr-vaarg.c +++ b/clang/test/CodeGen/voidptr-vaarg.c @@ -245,8 +245,7 @@ typedef struct { // CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 // CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 4, i1 false) -// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_EMPTY_INT_T]], ptr [[RETVAL]], i32 0, i32 0 -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4 // CHECK-NEXT: ret i32 [[TMP0]] // empty_int_t empty_int(__builtin_va_list list) { diff --git a/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp b/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp index 3efb8c449c8fa..8922591f8e6f1 100644 --- a/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp +++ b/clang/test/CodeGenCXX/2011-12-19-init-list-ctor.cpp @@ -19,8 +19,8 @@ struct S { }; // CHECK: store i32 0, ptr @arr -// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (i8, ptr @arr, i64 4), ptr noundef @.str) +// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds nuw (%struct.S, ptr @arr, i32 0, i32 1), ptr noundef @.str) // CHECK: store i32 1, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 1) -// CHECK: 
call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (i8, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 1), i64 4), ptr noundef @.str.1) +// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds nuw (%struct.S, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 1), i32 0, i32 1), ptr noundef @.str.1) // CHECK: store i32 2, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 2) -// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds (i8, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 2), i64 4), ptr noundef @.str.2) +// CHECK: call void @_ZN1AC1EPKc(ptr {{[^,]*}} getelementptr inbounds nuw (%struct.S, ptr getelementptr inbounds (%struct.S, ptr @arr, i64 2), i32 0, i32 1), ptr noundef @.str.2) diff --git a/clang/test/CodeGenCXX/bitfield-access-empty.cpp b/clang/test/CodeGenCXX/bitfield-access-empty.cpp index d1ae12e202cda..a06f62b3eb05a 100644 --- a/clang/test/CodeGenCXX/bitfield-access-empty.cpp +++ b/clang/test/CodeGenCXX/bitfield-access-empty.cpp @@ -84,8 +84,8 @@ struct P3 { unsigned b : 16; } p3; // CHECK-LABEL: LLVMType:%struct.P3 = -// LAYOUT-SAME: type { i16, [2 x i8], i16, [2 x i8] } -// LAYOUT-DWN32-SAME: type <{ i16, i8, i16 }> +// LAYOUT-SAME: type { i16, %struct.Empty, i16, [2 x i8] } +// LAYOUT-DWN32-SAME: type <{ i16, %struct.Empty, i16 }> // CHECK-NEXT: NonVirtualBaseLLVMType:%struct.P3 = // CHECK: BitFields:[ // LAYOUT-NEXT: diff --git a/clang/test/CodeGenCXX/class-layout.cpp b/clang/test/CodeGenCXX/class-layout.cpp index 90617d25b254e..84b0f887876ac 100644 --- a/clang/test/CodeGenCXX/class-layout.cpp +++ b/clang/test/CodeGenCXX/class-layout.cpp @@ -83,7 +83,7 @@ namespace Test6 { namespace Test7 { #pragma pack (1) class A {}; - // CHECK: %"class.Test7::B" = type <{ ptr, i8 }> + // CHECK: %"class.Test7::B" = type <{ ptr, %"class.Test7::A" }> class B { virtual ~B(); A a; diff --git a/clang/test/CodeGenCXX/compound-literals.cpp b/clang/test/CodeGenCXX/compound-literals.cpp index 
1b4a1d4445123..fcec2d19e2def 100644 --- a/clang/test/CodeGenCXX/compound-literals.cpp +++ b/clang/test/CodeGenCXX/compound-literals.cpp @@ -20,7 +20,7 @@ int f() { // CHECK: [[LVALUE:%[a-z0-9.]+]] = alloca // CHECK-NEXT: [[I:%[a-z0-9]+]] = getelementptr inbounds {{.*}}, ptr [[LVALUE]], i32 0, i32 0 // CHECK-NEXT: store i32 17, ptr [[I]] - // CHECK-NEXT: [[X:%[a-z0-9]+]] = getelementptr inbounds {{.*}} [[LVALUE]], i32 4 + // CHECK-NEXT: [[X:%[a-z0-9]+]] = getelementptr inbounds {{.*}} [[LVALUE]], i32 0, i32 1 // CHECK-NEXT: call noundef ptr @_ZN1XC1EPKc({{.*}}[[X]] // CHECK-NEXT: [[I:%[a-z0-9]+]] = getelementptr inbounds {{.*}} [[LVALUE]], i32 0, i32 0 // CHECK-NEXT: [[RESULT:%[a-z0-9]+]] = load i32, ptr diff --git a/clang/test/CodeGenCXX/exceptions.cpp b/clang/test/CodeGenCXX/exceptions.cpp index 61cffd1023b88..9875740c09b41 100644 --- a/clang/test/CodeGenCXX/exceptions.cpp +++ b/clang/test/CodeGenCXX/exceptions.cpp @@ -513,7 +513,8 @@ namespace test11 { // CHECK-LABEL: define{{.*}} void @_ZN6test111CC2Ev( // CHECK: [[THIS:%.*]] = load ptr, ptr {{%.*}} // Construct single. - // CHECK-NEXT: call void @_ZN6test111AC1Ev(ptr {{[^,]*}} [[THIS]]) + // CHECK-NEXT: [[SINGLE:%.*]] = getelementptr inbounds nuw [[C:%.*]], ptr [[THIS]], i32 0, i32 0 + // CHECK-NEXT: call void @_ZN6test111AC1Ev(ptr {{[^,]*}} [[SINGLE]]) // Construct array. // CHECK-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw [[C:%.*]], ptr [[THIS]], i32 0, i32 1 // CHECK-NEXT: [[ARRAYBEGIN:%.*]] = getelementptr inbounds [2 x [3 x [[A:%.*]]]], ptr [[ARRAY]], i32 0, i32 0, i32 0 @@ -559,8 +560,8 @@ namespace test11 { // CHECK: br label // Finally, the cleanup for single. 
- // CHECK98: invoke void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[THIS]]) - // CHECK11: call void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[THIS]]) + // CHECK98: invoke void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[SINGLE]]) + // CHECK11: call void @_ZN6test111AD1Ev(ptr {{[^,]*}} [[SINGLE]]) // CHECK: br label // CHECK: resume diff --git a/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp b/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp index ab44f43720832..ef3847d0c1e93 100644 --- a/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp +++ b/clang/test/CodeGenCXX/lambda-deterministic-captures.cpp @@ -16,7 +16,8 @@ void foo() { } // CHECK: define{{.*}} void @_Z3foov -// CHECK: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 1 +// CHECK: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 0 +// CHECK-NEXT: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 1 // CHECK-NEXT: store float 0.000 // CHECK-NEXT: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 2 // CHECK-NEXT: store float 1.000 @@ -26,6 +27,7 @@ void foo() { // The lambda body. Reverse iteration when the captures aren't deterministic // causes these to be laid out differently in the lambda. 
// CHECK: define internal void +// CHECK: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 1 // CHECK: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 2 // CHECK: getelementptr inbounds nuw %{{.+}}, ptr %{{.+}}, i32 0, i32 3 diff --git a/clang/test/CodeGenCXX/partial-destruction.cpp b/clang/test/CodeGenCXX/partial-destruction.cpp index 548a9f154be9e..5412e1ddd6274 100644 --- a/clang/test/CodeGenCXX/partial-destruction.cpp +++ b/clang/test/CodeGenCXX/partial-destruction.cpp @@ -107,12 +107,13 @@ namespace test1 { // CHECK: [[V:%.*]] = alloca [[B:%.*]], align 4 // CHECK-NEXT: alloca ptr // CHECK-NEXT: alloca i32 - // CHECK-NEXT: call void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[V]], i32 noundef 5) - // CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 1 + // CHECK-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[B]], ptr [[V]], i32 0, i32 0 + // CHECK-NEXT: call void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[X]], i32 noundef 5) + // CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[B]], ptr [[V]], i32 0, i32 1 // CHECK-NEXT: invoke void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[Y]], i32 noundef 6) - // CHECK: [[Z:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 2 + // CHECK: [[Z:%.*]] = getelementptr inbounds nuw [[B]], ptr [[V]], i32 0, i32 2 // CHECK-NEXT: invoke void @_ZN5test11AC1Ei(ptr {{[^,]*}} [[Z]], i32 noundef 7) - // CHECK: [[W:%.*]] = getelementptr inbounds nuw [[B]], ptr [[V]], i32 0, i32 1 + // CHECK: [[W:%.*]] = getelementptr inbounds nuw [[B]], ptr [[V]], i32 0, i32 3 // CHECK-NEXT: store i32 8, ptr [[W]], align 4 // CHECK-NEXT: call void @_ZN5test11BD1Ev(ptr {{[^,]*}} [[V]]) // CHECK-NEXT: ret void @@ -123,9 +124,9 @@ namespace test1 { // CHECK: landingpad { ptr, i32 } // CHECK-NEXT: cleanup // CHECKv03: invoke void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[Y]]) - // CHECKv03: invoke void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[V]]) + // CHECKv03: invoke void @_ZN5test11AD1Ev(ptr 
{{[^,]*}} [[X]]) // CHECKv11: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[Y]]) - // CHECKv11: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[V]]) + // CHECKv11: call void @_ZN5test11AD1Ev(ptr {{[^,]*}} [[X]]) } namespace test2 { diff --git a/clang/test/CodeGenCXX/pod-member-memcpys.cpp b/clang/test/CodeGenCXX/pod-member-memcpys.cpp index 8efec6184a3da..16d3d45a8179b 100644 --- a/clang/test/CodeGenCXX/pod-member-memcpys.cpp +++ b/clang/test/CodeGenCXX/pod-member-memcpys.cpp @@ -1,8 +1,6 @@ // RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-apple-darwin10 -emit-llvm -std=c++03 -fexceptions -fcxx-exceptions -o - %s | FileCheck %s // RUN: %clang_cc1 -no-enable-noundef-analysis -triple i386-apple-darwin10 -emit-llvm -std=c++03 -o - %s | FileCheck --check-prefix=CHECK-2 %s -struct Empty {}; - struct POD { int w, x, y, z; }; @@ -108,20 +106,6 @@ struct __attribute__((packed)) PackedMembers { int w, x, y, z; }; -struct WithEmptyField { - int a; - Empty e; - NonPOD np; - int b; -}; - -struct WithEmptyNUAField { - int a; - [[no_unique_address]] Empty e; - NonPOD np; - int b; -}; - // COPY-ASSIGNMENT OPERATORS: // Assignment operators are output in the order they're encountered. 
@@ -137,8 +121,6 @@ CALL_AO(VolatileMember) CALL_AO(BitfieldMember) CALL_AO(InnerClassMember) CALL_AO(PackedMembers) -CALL_AO(WithEmptyField) -CALL_AO(WithEmptyNUAField) // Basic copy-assignment: // CHECK-LABEL: define linkonce_odr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN5BasicaSERKS_(ptr {{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) @@ -203,18 +185,6 @@ CALL_AO(WithEmptyNUAField) // CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 1 {{.*}} align 1 {{.*}}i64 16, i1 {{.*}}) // CHECK: ret ptr -// WithEmptyField copy-assignment: -// CHECK-LABEL: define linkonce_odr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN14WithEmptyFieldaSERKS_ -// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) -// CHECK: call nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN6NonPODaSERKS_ -// CHECK: ret ptr - -// WithEmptyNUAField copy-assignment: -// CHECK-LABEL: define linkonce_odr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN17WithEmptyNUAFieldaSERKS_ -// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) -// CHECK: call nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN6NonPODaSERKS_ -// CHECK: ret ptr - // COPY-CONSTRUCTORS: // Clang outputs copy-constructors in the reverse of the order that @@ -310,15 +280,3 @@ CALL_CC(Basic) // CHECK: call void @_ZN6NonPODC1ERKS_ // CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 16, i1 {{.*}}) // CHECK: ret void - -CALL_CC(WithEmptyField) -// WithEmptyField copy-constructor: -// CHECK-LABEL: define linkonce_odr void @_ZN14WithEmptyFieldC2ERKS_ -// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) -// CHECK: call void @_ZN6NonPODC1ERKS_ - -CALL_CC(WithEmptyNUAField) -// WithEmptyNUAField copy-constructor: -// CHECK-LABEL: define linkonce_odr void @_ZN17WithEmptyNUAFieldC2ERKS_(ptr 
{{[^,]*}} %this, ptr nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %0) -// CHECK: call void @llvm.memcpy.p0.p0.i64({{.*}} align 4 {{.*}} align 4 {{.*}}i64 4, i1 {{.*}}) -// CHECK: call void @_ZN6NonPODC1ERKS_ diff --git a/clang/test/CodeGenCXX/pr18962.cpp b/clang/test/CodeGenCXX/pr18962.cpp index 9ac87003c94c5..b564a7b9a73af 100644 --- a/clang/test/CodeGenCXX/pr18962.cpp +++ b/clang/test/CodeGenCXX/pr18962.cpp @@ -23,6 +23,7 @@ D p3; // We end up using an opaque type for 'append' to avoid circular references. // CHECK: %class.A = type { ptr } -// CHECK: %class.C = type <{ ptr, [4 x i8] }> +// CHECK: %class.C = type <{ ptr, %class.B, [3 x i8] }> +// CHECK: %class.B = type { i8 } // CHECK: %class.D = type { %class.C.base, [3 x i8] } -// CHECK: %class.C.base = type <{ ptr, i8 }> +// CHECK: %class.C.base = type <{ ptr, %class.B }> diff --git a/clang/test/CodeGenCXX/references.cpp b/clang/test/CodeGenCXX/references.cpp index b84cb788d161c..0fca5e76659c2 100644 --- a/clang/test/CodeGenCXX/references.cpp +++ b/clang/test/CodeGenCXX/references.cpp @@ -191,6 +191,7 @@ namespace N2 { // CHECK-LABEL: define{{.*}} void @_ZN2N21fEi // CHECK: call void @_ZN2N24getPEv + // CHECK: getelementptr inbounds // CHECK: store i32 17 // CHECK: call void @_ZN2N21PD1Ev void f(int i) { @@ -219,7 +220,8 @@ namespace N2 { // CHECK-LABEL: define{{.*}} void @_ZN2N21gEi // CHECK: call void @_ZN2N24getZEv - // CHECK: {{getelementptr inbounds.*i64 16}} + // CHECK: {{getelementptr inbounds.*i32 0, i32 0}} + // CHECK: {{getelementptr inbounds.*i32 0, i32 0}} // CHECK: store i32 19 // CHECK: call void @_ZN2N21ZD1Ev // CHECK: ret void diff --git a/clang/test/CodeGenCXX/temporaries.cpp b/clang/test/CodeGenCXX/temporaries.cpp index 44978dd403ad9..36ab0e89f7d50 100644 --- a/clang/test/CodeGenCXX/temporaries.cpp +++ b/clang/test/CodeGenCXX/temporaries.cpp @@ -714,7 +714,7 @@ namespace MultipleExtension { // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1AD1Ev, {{.*}} @[[TEMPA]] // 
CHECK: store {{.*}} @[[TEMPA]], {{.*}} @[[TEMPE:_ZGRN17MultipleExtension2e1E.*]], - // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds ({{.*}} @[[TEMPE]], i64 8)) + // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds nuw ({{.*}} @[[TEMPE]], i32 0, i32 1)) // CHECK: call void @_ZN17MultipleExtension1DC1Ev({{.*}} @[[TEMPD:_ZGRN17MultipleExtension2e1E.*]]) // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1DD1Ev, {{.*}} @[[TEMPD]] @@ -728,7 +728,7 @@ namespace MultipleExtension { // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1AD1Ev, {{.*}} @[[TEMPA]] // CHECK: store {{.*}} @[[TEMPA]], {{.*}} @[[E:_ZN17MultipleExtension2e2E]] - // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds ({{.*}} @[[E]], i64 8)) + // CHECK: call void @_ZN17MultipleExtension1BC1Ev({{.*}} getelementptr inbounds nuw ({{.*}} @[[E]], i32 0, i32 1)) // CHECK: call void @_ZN17MultipleExtension1DC1Ev({{.*}} @[[TEMPD:_ZGRN17MultipleExtension2e2E.*]]) // CHECK: call i32 @__cxa_atexit({{.*}} @_ZN17MultipleExtension1DD1Ev, {{.*}} @[[TEMPD]] @@ -743,11 +743,11 @@ namespace MultipleExtension { // CHECK: %[[TEMPE1_A:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1:.*]], i32 0, i32 0 // CHECK: call void @[[NS]]1AC1Ev({{.*}} %[[TEMPA1:.*]]) // CHECK: store {{.*}} %[[TEMPA1]], {{.*}} %[[TEMPE1_A]] - // CHECK: %[[TEMPE1_B:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1]], i64 8 + // CHECK: %[[TEMPE1_B:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1]], i32 0, i32 1 // CHECK: call void @[[NS]]1BC1Ev({{.*}} %[[TEMPE1_B]]) // CHECK: %[[TEMPE1_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPE1]], i32 0, i32 2 // CHECK: call void @[[NS]]1DC1Ev({{.*}} %[[TEMPD1:.*]]) - // CHECK: %[[TEMPD1_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD1]], i64 4 + // CHECK: %[[TEMPD1_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD1]], i32 0, i32 1 // CHECK: store {{.*}} %[[TEMPD1_C]], {{.*}} %[[TEMPE1_C]] // CHECK: store {{.*}} 
%[[TEMPE1]], {{.*}} %[[E1:.*]] @@ -758,11 +758,11 @@ namespace MultipleExtension { // CHECK: %[[TEMPE2_A:.*]] = getelementptr inbounds {{.*}} %[[E2:.*]], i32 0, i32 0 // CHECK: call void @[[NS]]1AC1Ev({{.*}} %[[TEMPA2:.*]]) // CHECK: store {{.*}} %[[TEMPA2]], {{.*}} %[[TEMPE2_A]] - // CHECK: %[[TEMPE2_B:.*]] = getelementptr inbounds {{.*}} %[[E2]], i64 8 + // CHECK: %[[TEMPE2_B:.*]] = getelementptr inbounds {{.*}} %[[E2]], i32 0, i32 1 // CHECK: call void @[[NS]]1BC1Ev({{.*}} %[[TEMPE2_B]]) // CHECK: %[[TEMPE2_C:.*]] = getelementptr inbounds {{.*}} %[[E2]], i32 0, i32 2 // CHECK: call void @[[NS]]1DC1Ev({{.*}} %[[TEMPD2:.*]]) - // CHECK: %[[TEMPD2_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD2]], i64 4 + // CHECK: %[[TEMPD2_C:.*]] = getelementptr inbounds {{.*}} %[[TEMPD2]], i32 0, i32 1 // CHECK: store {{.*}} %[[TEMPD2_C]], ptr %[[TEMPE2_C]] g(); diff --git a/clang/test/CodeGenObjCXX/lambda-to-block.mm b/clang/test/CodeGenObjCXX/lambda-to-block.mm index a8657ca711f7c..b1e1338c6ac1e 100644 --- a/clang/test/CodeGenObjCXX/lambda-to-block.mm +++ b/clang/test/CodeGenObjCXX/lambda-to-block.mm @@ -2,10 +2,11 @@ // Shouldn't crash! 
-// CHECK: %[[CLASS_ANON:.*]] = type { i8 } -// CHECK: %[[CLASS_ANON_0:.*]] = type { i8 } -// CHECK: %[[CLASS_ANON_1:.*]] = type { i8 } -// CHECK: %[[CLASS_ANON_2:.*]] = type { i8 } +// CHECK: %[[CLASS_ANON:.*]] = type { %[[STRUCT_COPYABLE:.*]] } +// CHECK: %[[STRUCT_COPYABLE]] = type { i8 } +// CHECK: %[[CLASS_ANON_0:.*]] = type { %[[STRUCT_COPYABLE]] } +// CHECK: %[[CLASS_ANON_1:.*]] = type { %[[STRUCT_COPYABLE]] } +// CHECK: %[[CLASS_ANON_2:.*]] = type { %[[STRUCT_COPYABLE]] } // CHECK: @[[BLOCK_DESC0:.*]] = internal constant { i64, i64, ptr, ptr, ptr, ptr } { i64 0, i64 33, ptr @[[COPY_HELPER0:.*__copy_helper_block_.*]], ptr @__destroy_helper_block{{.*}}, {{.*}}}, align 8 // CHECK: @[[BLOCK_DESC1:.*]] = internal constant { i64, i64, ptr, ptr, ptr, ptr } { i64 0, i64 33, ptr @[[COPY_HELPER1:.*__copy_helper_block_.*]], ptr @__destroy_helper_block{{.*}}, {{.*}}}, align 8 diff --git a/clang/test/OpenMP/amdgcn_sret_ctor.cpp b/clang/test/OpenMP/amdgcn_sret_ctor.cpp index fc6f7c15eb5e6..81d0cce5190e7 100644 --- a/clang/test/OpenMP/amdgcn_sret_ctor.cpp +++ b/clang/test/OpenMP/amdgcn_sret_ctor.cpp @@ -19,8 +19,9 @@ E::E() noexcept : foo(s()) {} // CHECK-NEXT: [[THIS_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[THIS_ADDR]] to ptr // CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR_ASCAST]], align 8 -// CHECK-NEXT: [[THIS1_ASCAST:%.*]] = addrspacecast ptr [[THIS1]] to ptr addrspace(5) -// CHECK-NEXT: call void @_Z1sv(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_S:%.*]]) align 1 [[THIS1_ASCAST]]) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: [[FOO:%.*]] = getelementptr inbounds nuw [[STRUCT_E:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[FOO_ASCAST:%.*]] = addrspacecast ptr [[FOO]] to ptr addrspace(5) +// CHECK-NEXT: call void @_Z1sv(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_S:%.*]]) align 1 [[FOO_ASCAST]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // // diff --git 
a/clang/test/OpenMP/irbuilder_for_iterator.cpp b/clang/test/OpenMP/irbuilder_for_iterator.cpp index ec1c3af744b49..e1e8ff66cd8aa 100644 --- a/clang/test/OpenMP/irbuilder_for_iterator.cpp +++ b/clang/test/OpenMP/irbuilder_for_iterator.cpp @@ -48,48 +48,49 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: call void @_ZN10MyIteratorC1Ej(ptr noundef nonnull align 1 dereferenceable(1) [[IT]], i32 noundef 7) // CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr [[AGG_CAPTURED]], i32 0, i32 0 // CHECK-NEXT: store ptr [[IT]], ptr [[TMP0]], align 8 -// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_CAPTURED1]], ptr noundef nonnull align 1 dereferenceable(1) [[IT]]) +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0]], ptr [[AGG_CAPTURED1]], i32 0, i32 0 +// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP1]], ptr noundef nonnull align 1 dereferenceable(1) [[IT]]) // CHECK-NEXT: call void @__captured_stmt(ptr [[DOTCOUNT_ADDR]], ptr [[AGG_CAPTURED]]) // CHECK-NEXT: [[DOTCOUNT:%.*]] = load i64, ptr [[DOTCOUNT_ADDR]], align 8 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] // CHECK: omp_loop.preheader: // CHECK-NEXT: store i64 0, ptr [[P_LOWERBOUND]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DOTCOUNT]], 1 -// CHECK-NEXT: store i64 [[TMP1]], ptr [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DOTCOUNT]], 1 +// CHECK-NEXT: store i64 [[TMP2]], ptr [[P_UPPERBOUND]], align 8 // CHECK-NEXT: store i64 1, ptr [[P_STRIDE]], align 8 // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK-NEXT: call void @__kmpc_for_static_init_8u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, ptr [[P_LASTITER]], ptr [[P_LOWERBOUND]], ptr [[P_UPPERBOUND]], ptr [[P_STRIDE]], i64 1, i64 0) -// CHECK-NEXT: [[TMP2:%.*]] = load 
i64, ptr [[P_LOWERBOUND]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[P_UPPERBOUND]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[TMP2]] -// CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[P_LOWERBOUND]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP4]], [[TMP3]] +// CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 1 // CHECK-NEXT: br label [[OMP_LOOP_HEADER:%.*]] // CHECK: omp_loop.header: // CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] // CHECK-NEXT: br label [[OMP_LOOP_COND:%.*]] // CHECK: omp_loop.cond: -// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP5]] +// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP6]] // CHECK-NEXT: br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]] // CHECK: omp_loop.body: -// CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP2]] -// CHECK-NEXT: call void @__captured_stmt.1(ptr [[IT]], i64 [[TMP6]], ptr [[AGG_CAPTURED1]]) +// CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP3]] +// CHECK-NEXT: call void @__captured_stmt.1(ptr [[IT]], i64 [[TMP7]], ptr [[AGG_CAPTURED1]]) // CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZNK10MyIteratordeEv(ptr noundef nonnull align 1 dereferenceable(1) [[IT]]) // CHECK-NEXT: store i32 [[CALL]], ptr [[I]], align 4 -// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[I]], align 4 -// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP8]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP7]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 -// 
CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM2]] -// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP9]], [[TMP12]] -// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 -// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP12]] to i64 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP10]], [[TMP13]] +// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: @@ -154,11 +155,12 @@ extern "C" void workshareloop_iterator(float *a, float *b, float *c) { // CHECK-NEXT: store i64 [[LOGICAL]], ptr [[LOGICAL_ADDR]], align 8 // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], 
align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 -// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP1]] +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP2]] // CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[MUL]] to i32 -// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]], i32 noundef [[CONV]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 1 dereferenceable(1) ptr @_ZN10MyIteratoraSERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP2]], ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) +// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP1]], i32 noundef [[CONV]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 1 dereferenceable(1) ptr @_ZN10MyIteratoraSERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP3]], ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/irbuilder_for_rangefor.cpp b/clang/test/OpenMP/irbuilder_for_rangefor.cpp index 86a043e638bc3..635382f737f18 100644 --- a/clang/test/OpenMP/irbuilder_for_rangefor.cpp +++ b/clang/test/OpenMP/irbuilder_for_rangefor.cpp @@ -66,46 +66,47 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: store ptr [[__BEGIN2]], ptr [[TMP2]], align 8 // CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON]], ptr 
[[AGG_CAPTURED]], i32 0, i32 1 // CHECK-NEXT: store ptr [[__END2]], ptr [[TMP3]], align 8 -// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_CAPTURED1]], ptr noundef nonnull align 1 dereferenceable(1) [[__BEGIN2]]) +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0]], ptr [[AGG_CAPTURED1]], i32 0, i32 0 +// CHECK-NEXT: call void @_ZN10MyIteratorC1ERKS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP4]], ptr noundef nonnull align 1 dereferenceable(1) [[__BEGIN2]]) // CHECK-NEXT: call void @__captured_stmt(ptr [[DOTCOUNT_ADDR]], ptr [[AGG_CAPTURED]]) // CHECK-NEXT: [[DOTCOUNT:%.*]] = load i64, ptr [[DOTCOUNT_ADDR]], align 8 // CHECK-NEXT: br label [[OMP_LOOP_PREHEADER:%.*]] // CHECK: omp_loop.preheader: // CHECK-NEXT: store i64 0, ptr [[P_LOWERBOUND]], align 8 -// CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[DOTCOUNT]], 1 -// CHECK-NEXT: store i64 [[TMP4]], ptr [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[DOTCOUNT]], 1 +// CHECK-NEXT: store i64 [[TMP5]], ptr [[P_UPPERBOUND]], align 8 // CHECK-NEXT: store i64 1, ptr [[P_STRIDE]], align 8 // CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK-NEXT: call void @__kmpc_for_static_init_8u(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 34, ptr [[P_LASTITER]], ptr [[P_LOWERBOUND]], ptr [[P_UPPERBOUND]], ptr [[P_STRIDE]], i64 1, i64 0) -// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[P_LOWERBOUND]], align 8 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P_UPPERBOUND]], align 8 -// CHECK-NEXT: [[TMP7:%.*]] = sub i64 [[TMP6]], [[TMP5]] -// CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[P_LOWERBOUND]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr [[P_UPPERBOUND]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], [[TMP6]] +// CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 1 // CHECK-NEXT: br label 
[[OMP_LOOP_HEADER:%.*]] // CHECK: omp_loop.header: // CHECK-NEXT: [[OMP_LOOP_IV:%.*]] = phi i64 [ 0, [[OMP_LOOP_PREHEADER]] ], [ [[OMP_LOOP_NEXT:%.*]], [[OMP_LOOP_INC:%.*]] ] // CHECK-NEXT: br label [[OMP_LOOP_COND:%.*]] // CHECK: omp_loop.cond: -// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP8]] +// CHECK-NEXT: [[OMP_LOOP_CMP:%.*]] = icmp ult i64 [[OMP_LOOP_IV]], [[TMP9]] // CHECK-NEXT: br i1 [[OMP_LOOP_CMP]], label [[OMP_LOOP_BODY:%.*]], label [[OMP_LOOP_EXIT:%.*]] // CHECK: omp_loop.body: -// CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP5]] -// CHECK-NEXT: call void @__captured_stmt.1(ptr [[I]], i64 [[TMP9]], ptr [[AGG_CAPTURED1]]) -// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[B_ADDR]], align 8 -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[I]], align 4 -// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP11]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP10]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[C_ADDR]], align 8 -// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 -// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP14]] to i64 -// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP13]], i64 [[IDXPROM2]] -// CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 -// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP12]], [[TMP15]] -// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[A_ADDR]], align 8 -// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4 -// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP17]] to i64 -// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP16]], i64 [[IDXPROM4]] +// CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OMP_LOOP_IV]], [[TMP6]] +// CHECK-NEXT: call void @__captured_stmt.1(ptr [[I]], i64 [[TMP10]], ptr [[AGG_CAPTURED1]]) +// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: 
[[TMP12:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[TMP12]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[TMP15]] to i64 +// CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP14]], i64 [[IDXPROM2]] +// CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX3]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP13]], [[TMP16]] +// CHECK-NEXT: [[TMP17:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP18]] to i64 +// CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP17]], i64 [[IDXPROM4]] // CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX5]], align 4 // CHECK-NEXT: br label [[OMP_LOOP_INC]] // CHECK: omp_loop.inc: @@ -172,12 +173,13 @@ extern "C" void workshareloop_rangefor(float *a, float *b, float *c) { // CHECK-NEXT: store i64 [[LOGICAL]], ptr [[LOGICAL_ADDR]], align 8 // CHECK-NEXT: store ptr [[__CONTEXT]], ptr [[__CONTEXT_ADDR]], align 8 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__CONTEXT_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 -// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP1]] +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_ANON_0:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[LOGICAL_ADDR]], align 8 +// CHECK-NEXT: [[MUL:%.*]] = mul i64 1, [[TMP2]] // CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[MUL]] to i32 -// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) 
[[TMP0]], i32 noundef [[CONV]]) +// CHECK-NEXT: call void @_ZNK10MyIteratorplEj(ptr dead_on_unwind writable sret([[STRUCT_MYITERATOR]]) align 1 [[REF_TMP]], ptr noundef nonnull align 1 dereferenceable(1) [[TMP1]], i32 noundef [[CONV]]) // CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZNK10MyIteratordeEv(ptr noundef nonnull align 1 dereferenceable(1) [[REF_TMP]]) -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 -// CHECK-NEXT: store i32 [[CALL]], ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[LOOPVAR_ADDR]], align 8 +// CHECK-NEXT: store i32 [[CALL]], ptr [[TMP3]], align 4 // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/task_member_call_codegen.cpp b/clang/test/OpenMP/task_member_call_codegen.cpp index a6ae29c1f9f6d..8f7d2d15d0e26 100644 --- a/clang/test/OpenMP/task_member_call_codegen.cpp +++ b/clang/test/OpenMP/task_member_call_codegen.cpp @@ -32,8 +32,9 @@ void c() { // CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]]) // CHECK1-NEXT: [[TMP1:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i64 48, i64 1, ptr @.omp_task_entry.) 
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i32 0, i32 1 +// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP3]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) // CHECK1-NEXT: ret void // // @@ -45,8 +46,9 @@ void c() { // CHECK1-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP3]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP2]], i32 0, i32 0 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP4]], align 8 // CHECK1-NEXT: ret void // // @@ -70,7 +72,7 @@ void c() { // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 // CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 40 +// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata 
[[META6:![0-9]+]]) // CHECK1-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) @@ -98,9 +100,10 @@ void c() { // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) // CHECK3-NEXT: [[TMP0:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1:[0-9]+]], i32 [[OMP_GLOBAL_THREAD_NUM]], i32 1, i64 48, i64 1, ptr @.omp_task_entry.) // CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP0]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40 +// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP0]], i32 0, i32 1 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP2]], i32 0, i32 0 // CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) -// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], ptr [[TMP0]]) +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM1]], ptr [[TMP0]]) // CHECK3-NEXT: ret void // // @@ -112,8 +115,9 @@ void c() { // CHECK3-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 // CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 -// CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP3]], align 8 +// CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[STRUCT__KMP_PRIVATES_T:%.*]], ptr [[TMP2]], i32 0, i32 0 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK3-NEXT: store ptr [[TMP3]], ptr [[TMP4]], align 8 // CHECK3-NEXT: ret void // // @@ -137,7 +141,7 @@ void c() { // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP4]], i32 0, i32 2 // 
CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T]], ptr [[TMP4]], i32 0, i32 0 // CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 -// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 40 +// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP3]], i32 0, i32 1 // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) // CHECK3-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]]) diff --git a/compiler-rt/lib/asan/asan_allocator.cpp b/compiler-rt/lib/asan/asan_allocator.cpp index 06c827c41eacc..5bcae291428f6 100644 --- a/compiler-rt/lib/asan/asan_allocator.cpp +++ b/compiler-rt/lib/asan/asan_allocator.cpp @@ -1399,7 +1399,10 @@ DECLARE_REAL(hsa_status_t, hsa_amd_ipc_memory_attach, DECLARE_REAL(hsa_status_t, hsa_amd_ipc_memory_detach, void *mapped_ptr) DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_reserve_align, void** ptr, size_t size, uint64_t address, uint64_t alignment, uint64_t flags) -DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size); +DECLARE_REAL(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size) +DECLARE_REAL(hsa_status_t, hsa_amd_pointer_info, const void* ptr, + hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_agents_accessible, hsa_agent_t** accessible) namespace __asan { @@ -1452,18 +1455,22 @@ static struct AP64 AP_; static struct AP32 AP_; #endif -hsa_status_t asan_hsa_amd_ipc_memory_create(void *ptr, size_t len, - hsa_amd_ipc_memory_t * handle) { - void *ptr_; - size_t len_ = get_allocator().GetActuallyAllocatedSize(ptr); - if (len_) { +hsa_status_t asan_hsa_amd_ipc_memory_create(void* ptr, size_t len, + hsa_amd_ipc_memory_t* handle) { + void* ptr_ = get_allocator().GetBlockBegin(ptr); + AsanChunk* m = ptr_ + ? 
instance.GetAsanChunkByAddr(reinterpret_cast<uptr>(ptr_)) + : nullptr; + if (ptr_ && m) { static_assert(AP_.kMetadataSize == 0, "Expression below requires this"); - ptr_ = reinterpret_cast<void*>(reinterpret_cast<uptr>(ptr) - kPageSize_); - } else { - ptr_ = ptr; - len_ = len; + uptr p = reinterpret_cast<uptr>(ptr); + uptr p_ = reinterpret_cast<uptr>(ptr_); + if (p == p_ + kPageSize_ && len == m->UsedSize()) { + size_t len_ = get_allocator().GetActuallyAllocatedSize(ptr_); + return REAL(hsa_amd_ipc_memory_create)(ptr_, len_, handle); + } } - return REAL(hsa_amd_ipc_memory_create)(ptr_, len_, handle); + return REAL(hsa_amd_ipc_memory_create)(ptr, len, handle); } hsa_status_t asan_hsa_amd_ipc_memory_attach(const hsa_amd_ipc_memory_t *handle, @@ -1540,5 +1547,36 @@ hsa_status_t asan_hsa_amd_vmem_address_free(void* ptr, size_t size, } return REAL(hsa_amd_vmem_address_free)(ptr, size); } + +hsa_status_t asan_hsa_amd_pointer_info(const void* ptr, + hsa_amd_pointer_info_t* info, + void* (*alloc)(size_t), + uint32_t* num_agents_accessible, + hsa_agent_t** accessible) { + void* ptr_ = get_allocator().GetBlockBegin(ptr); + AsanChunk* m = ptr_ + ? instance.GetAsanChunkByAddr(reinterpret_cast<uptr>(ptr_)) + : nullptr; + if (ptr_ && m) { + hsa_status_t status = REAL(hsa_amd_pointer_info)( + ptr_, info, alloc, num_agents_accessible, accessible); + if (status == HSA_STATUS_SUCCESS && info) { + static_assert(AP_.kMetadataSize == 0, "Expression below requires this"); + // Adjust base address of agent,host and sizeInBytes so as to return + // the actual pointer information of user allocation rather than asan + // allocation.
 Asan allocation pointer info can be acquired using internal + 'GetPointerInfo' + info->agentBaseAddress = reinterpret_cast<void*>( + reinterpret_cast<uptr>(info->agentBaseAddress) + kPageSize_); + info->hostBaseAddress = reinterpret_cast<void*>( + reinterpret_cast<uptr>(info->hostBaseAddress) + kPageSize_); + info->sizeInBytes = m->UsedSize(); + } + return status; + } + return REAL(hsa_amd_pointer_info)(ptr, info, alloc, num_agents_accessible, + accessible); +} + } // namespace __asan #endif diff --git a/compiler-rt/lib/asan/asan_allocator.h b/compiler-rt/lib/asan/asan_allocator.h index ced10f62b7a58..f33e8d3b2819e 100644 --- a/compiler-rt/lib/asan/asan_allocator.h +++ b/compiler-rt/lib/asan/asan_allocator.h @@ -341,6 +341,11 @@ hsa_status_t asan_hsa_amd_vmem_address_reserve_align(void** ptr, size_t size, BufferedStackTrace* stack); hsa_status_t asan_hsa_amd_vmem_address_free(void* ptr, size_t size, BufferedStackTrace* stack); +hsa_status_t asan_hsa_amd_pointer_info(const void* ptr, + hsa_amd_pointer_info_t* info, + void* (*alloc)(size_t), + uint32_t* num_agents_accessible, + hsa_agent_t** accessible); } // namespace __asan #endif diff --git a/compiler-rt/lib/asan/asan_interceptors.cpp b/compiler-rt/lib/asan/asan_interceptors.cpp index 0951a77b1b93e..c04d532f909b1 100644 --- a/compiler-rt/lib/asan/asan_interceptors.cpp +++ b/compiler-rt/lib/asan/asan_interceptors.cpp @@ -948,6 +948,15 @@ INTERCEPTOR(hsa_status_t, hsa_amd_vmem_address_free, void* ptr, size_t size) { return asan_hsa_amd_vmem_address_free(ptr, size, &stack); } +INTERCEPTOR(hsa_status_t, hsa_amd_pointer_info, const void* ptr, + hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), + uint32_t* num_agents_accessible, hsa_agent_t** accessible) { + AsanInitFromRtl(); + ENSURE_HSA_INITED(); + return asan_hsa_amd_pointer_info(ptr, info, alloc, num_agents_accessible, + accessible); +} + void InitializeAmdgpuInterceptors() { ASAN_INTERCEPT_FUNC(hsa_memory_copy); ASAN_INTERCEPT_FUNC(hsa_amd_memory_pool_allocate); @@ -962,6 +971,7 @@ 
void InitializeAmdgpuInterceptors() { ASAN_INTERCEPT_FUNC(hsa_amd_ipc_memory_detach); ASAN_INTERCEPT_FUNC(hsa_amd_vmem_address_reserve_align); ASAN_INTERCEPT_FUNC(hsa_amd_vmem_address_free); + ASAN_INTERCEPT_FUNC(hsa_amd_pointer_info); } void ENSURE_HSA_INITED() { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 96a715a2cb0f0..0289d19a44c49 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -914,12 +914,8 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { // If DWARF address space value is other than None, add it. The IR // verifier checks that DWARF address space only exists for pointer // or reference types. - if (auto AS = DTy->getDWARFAddressSpace()) { - // TODO: Drop address_class once the debugger adopts address_space - for (auto ASTag : - {dwarf::DW_AT_address_class, dwarf::DW_AT_LLVM_address_space}) - addUInt(Buffer, ASTag, dwarf::DW_FORM_data4, *AS); - } + if (auto AS = DTy->getDWARFAddressSpace()) + addUInt(Buffer, dwarf::DW_AT_LLVM_address_space, dwarf::DW_FORM_data4, *AS); // Add template alias template parameters. 
if (Tag == dwarf::DW_TAG_template_alias) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 0bf460ab53a0c..e7241e460ccaa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -39,10 +39,9 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - UNKNOWN_INTRINSIC = 0, + NOT_IMPLICIT_INPUT = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, - NOT_IMPLICIT_INPUT + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -117,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return UNKNOWN_INTRINSIC; + return NOT_IMPLICIT_INPUT; } } @@ -536,21 +535,6 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); - - if (AttrMask == UNKNOWN_INTRINSIC) { - // Assume not-nocallback intrinsics may invoke a function which accesses - // implicit arguments. - // - // FIXME: This isn't really the correct check. We want to ensure it - // isn't calling any function that may use implicit arguments regardless - // of whether it's internal to the module or not. - // - // TODO: Ignoring callsite attributes. - if (!Callee->hasFnAttribute(Attribute::NoCallback)) - return indicatePessimisticFixpoint(); - continue; - } - if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1374,10 +1358,7 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - - // Assume !nocallback intrinsics may call a function which requires - // AGPRs. 
- return CB.hasFnAttr(Attribute::NoCallback); + return true; } // TODO: Handle callsite attributes diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 8f17f9c2760ef..5bf9b3a822f36 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -649,7 +649,8 @@ class AMDGPULowerModuleLDS { ModuleScopeVariables.insert(GV); } else if (K.second.size() == 1) { KernelAccessVariables.insert(GV); - } else if (K.second == HybridModuleRootKernels) { + } else if (K.second == HybridModuleRootKernels && + set_is_subset(K.second, HybridModuleRootKernels)) { ModuleScopeVariables.insert(GV); } else { TableLookupVariables.insert(GV); diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index b7a92a0a1d634..0d206aba33543 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -523,6 +523,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_HW_ID1 = 23, ID_HW_ID2 = 24, ID_POPS_PACKER = 25, + ID_SCHED_MODE = 26, ID_PERF_SNAPSHOT_DATA_gfx11 = 27, ID_IB_STS2 = 28, ID_SHADER_CYCLES = 29, diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index 6489e63d4f6b8..ce782b025464e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -211,6 +211,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus}, {{"HW_REG_SQ_PERF_SNAPSHOT_PC_HI"}, ID_SQ_PERF_SNAPSHOT_PC_HI, isGFX940}, {{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10}, + {{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus}, {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11}, {{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250}, {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11}, diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp 
index 491685f9a032b..65abd97c6d642 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -83,6 +83,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include @@ -5280,6 +5281,34 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // FIXME: We might want to defer PHI speculation until after here. // FIXME: return nullptr; } else { + // AMDGPU: If the target is AMDGPU and the chosen SliceTy is a HIP vector + // struct of 2 or 4 identical elements, canonicalize it to an IR vector. + // This helps SROA treat it as a single value and unlock vector ld/st. + // We pattern-match struct names starting with "struct.HIP_vector". + if (Function *F = AI.getFunction()) { + Triple TT(F->getParent()->getTargetTriple()); + if (TT.isAMDGPU()) { + if (auto *STy = dyn_cast<StructType>(SliceTy)) { + StringRef Name = STy->hasName() ? STy->getName() : StringRef(); + if (Name.starts_with("struct.HIP_vector")) { + unsigned NumElts = STy->getNumElements(); + if ((NumElts == 2 || NumElts == 4) && NumElts > 0) { + Type *EltTy = STy->getElementType(0); + bool AllSame = true; + for (unsigned I = 1; I < NumElts; ++I) + if (STy->getElementType(I) != EltTy) { + AllSame = false; + break; + } + if (AllSame && VectorType::isValidElementType(EltTy)) { + SliceTy = FixedVectorType::get(EltTy, NumElts); + } + } + } + } + } + } + // Make sure the alignment is compatible with P.beginOffset().
const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset()); // If we will get at least this much alignment from the type alone, leave diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index ac5bfc5a4f27a..ccf599140fd5d 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -585,7 +585,8 @@ if(build_runtimes) INSTALL_COMMAND "" CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_BINARY_DIR}/lib/cmake -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_NEW=${ROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC} - -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn) + -DROCM_DEVICE_LIBS_BITCODE_INSTALL_LOC_OLD=amdgcn + ${extra_cmake_args}) endif() endif() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll deleted file mode 100644 index d7d623ac89146..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll +++ /dev/null @@ -1,31 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s - -; Make sure we do not infer anything about implicit inputs through an -; intrinsic call which is not nocallback. - -declare zeroext i32 @return_i32() - -define i32 @test_i32_return() gc "statepoint-example" { -; CHECK-LABEL: define i32 @test_i32_return( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) -; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) -; CHECK-NEXT: ret i32 [[CALL1]] -; -entry: - %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) - %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) - ret i32 %call1 -} - -declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) -declare i32 @llvm.experimental.gc.result.i32(token) #0 - -attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } -;. -; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } -;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll deleted file mode 100644 index 71c509afa8e64..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll +++ /dev/null @@ -1,74 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s - -; Make sure we infer no inputs are used through some intrinsics - -define void @use_fake_use(i32 %arg) { -; CHECK-LABEL: define void @use_fake_use( -; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) -; CHECK-NEXT: ret void -; - call void (...) 
@llvm.fake.use(i32 %arg) - ret void -} - -define void @use_donothing() { -; CHECK-LABEL: define void @use_donothing( -; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: ret void -; - call void @llvm.donothing() - ret void -} - -define void @use_assume(i1 %arg) { -; CHECK-LABEL: define void @use_assume( -; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) -; CHECK-NEXT: ret void -; - call void @llvm.assume(i1 %arg) - ret void -} - -define void @use_trap() { -; CHECK-LABEL: define void @use_trap( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: call void @llvm.trap() -; CHECK-NEXT: ret void -; - call void @llvm.trap() - ret void -} - -define void @use_debugtrap() { -; CHECK-LABEL: define void @use_debugtrap( -; CHECK-SAME: ) #[[ATTR1]] { -; CHECK-NEXT: call void @llvm.debugtrap() -; CHECK-NEXT: ret void -; - call void @llvm.debugtrap() - ret void -} - -define void @use_ubsantrap() { -; CHECK-LABEL: define void @use_ubsantrap( -; CHECK-SAME: ) #[[ATTR1]] { -; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) -; CHECK-NEXT: ret void -; - call void @llvm.ubsantrap(i8 0) - ret void -} - -;. 
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } -; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } -;. 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll index bd29e9e5855ff..8fec92ca8cfd9 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll @@ -138,4 +138,3 @@ define amdgpu_kernel void @kern_block_direct_allocation() { ; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -;. diff --git a/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll b/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll index d9d143d4823b9..05d3583197f77 100644 --- a/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll +++ b/llvm/test/DebugInfo/AMDGPU/heterogeneous-dwarf-diop-diexpression-address-spaces.ll @@ -123,7 +123,6 @@ attributes #0 = { "frame-pointer"="all" } ; CHECK: [[PTR_AS_3]]: DW_TAG_pointer_type ; CHECK-NEXT: DW_AT_type -; CHECK-NEXT: DW_AT_address_class (0x00000003) ; CHECK-NEXT: DW_AT_LLVM_address_space (0x00000003 "DW_ASPACE_LLVM_AMDGPU_local") ; CHECK: [[PTR_AS_NONE]]: DW_TAG_pointer_type @@ -132,7 +131,6 @@ attributes #0 = { "frame-pointer"="all" } ; CHECK: [[PTR_AS_5]]: DW_TAG_pointer_type ; CHECK-NEXT: DW_AT_type -; CHECK-NEXT: DW_AT_address_class (0x00000005) ; CHECK-NEXT: DW_AT_LLVM_address_space (0x00000005 "DW_ASPACE_LLVM_AMDGPU_private_lane") !llvm.dbg.cu = !{!0} diff --git a/llvm/test/DebugInfo/AMDGPU/pointer-address-space.ll b/llvm/test/DebugInfo/AMDGPU/pointer-address-space.ll index 60df8365e321e..3e8e80e442e5b 100644 --- a/llvm/test/DebugInfo/AMDGPU/pointer-address-space.ll +++ 
b/llvm/test/DebugInfo/AMDGPU/pointer-address-space.ll @@ -50,13 +50,11 @@ ; CHECK: 0x[[LOCAL]]: DW_TAG_pointer_type ; CHECK-NEXT: DW_AT_type -; CHECK-NEXT: DW_AT_address_class [DW_FORM_data4] (0x00000002) ; CHECK-NEXT: DW_AT_LLVM_address_space [DW_FORM_data4] (0x00000002 "DW_ASPACE_LLVM_AMDGPU_region") ; CHECK-NEXT: DW_AT_LLVM_memory_space [DW_FORM_data4] (DW_MSPACE_LLVM_group) ; CHECK: 0x[[PRIVATE]]: DW_TAG_pointer_type ; CHECK-NEXT: DW_AT_type -; CHECK-NEXT: DW_AT_address_class [DW_FORM_data4] (0x00000001) ; CHECK-NEXT: DW_AT_LLVM_address_space [DW_FORM_data4] (0x00000001 "DW_ASPACE_LLVM_AMDGPU_generic") ; CHECK-NEXT: DW_AT_LLVM_memory_space [DW_FORM_data4] (DW_MSPACE_LLVM_private) diff --git a/llvm/test/DebugInfo/Generic/address_space_rvalue.ll b/llvm/test/DebugInfo/Generic/address_space_rvalue.ll index 38798c11b5667..b16ac7e6ce987 100644 --- a/llvm/test/DebugInfo/Generic/address_space_rvalue.ll +++ b/llvm/test/DebugInfo/Generic/address_space_rvalue.ll @@ -6,7 +6,8 @@ ; CHECK: DW_TAG_rvalue_reference_type ; CHECK-NOT: DW_TAG -; CHECK: DW_AT_address_class (0x00000001) +; CHECK-NOT: DW_AT_address_class +; CHECK: DW_AT_LLVM_address_space (0x00000001) @y = global ptr null, align 8, !dbg !0 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s index 819ecb866c5ae..ba5159482df50 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopk.s @@ -258,3 +258,12 @@ s_getreg_b32 s0, hwreg(HW_REG_SHADER_CYCLES_LO) s_getreg_b32 s0, hwreg(HW_REG_SHADER_CYCLES_HI) // GFX12: encoding: [0x1e,0xf8,0x80,0xb8] + +s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCHED_MODE) +// GFX12: encoding: [0x1a,0xf8,0x80,0xb8] + +s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), s2 +// GFX12: encoding: [0x1a,0x08,0x02,0xb9] + +s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE), 0x2 +// GFX12: encoding: [0x1a,0xf8,0x80,0xb9,0x02,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt index 41c5724a596f9..63ad07acee36f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopk.txt @@ -276,3 +276,12 @@ # GFX12: s_getreg_b32 s0, hwreg(HW_REG_SHADER_CYCLES_HI) ; encoding: [0x1e,0xf8,0x80,0xb8] 0x1e,0xf8,0x80,0xb8 + +# GFX12: s_getreg_b32 s0, hwreg(HW_REG_WAVE_SCHED_MODE) ; encoding: [0x1a,0xf8,0x80,0xb8] +0x1a,0xf8,0x80,0xb8 + +# GFX12: s_setreg_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), s2 ; encoding: [0x1a,0x08,0x02,0xb9] +0x1a,0x08,0x02,0xb9 + +# GFX12: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE), 2 ; encoding: [0x1a,0xf8,0x80,0xb9,0x02,0x00,0x00,0x00] +0x1a,0xf8,0x80,0xb9,0x02,0x00,0x00,0x00 diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt new file mode 100644 index 0000000000000..f6b0d87ba563d --- /dev/null +++ b/offload/DeviceRTL/CMakeLists.txt @@ -0,0 +1,232 @@ +set(LIBOMPTARGET_BUILD_DEVICERTL_BCLIB TRUE CACHE BOOL + "Can be set to false to disable building this library.") + +if (NOT LIBOMPTARGET_BUILD_DEVICERTL_BCLIB) + message(STATUS "Not building DeviceRTL: Disabled by LIBOMPTARGET_BUILD_DEVICERTL_BCLIB") + return() +endif() + +# Check to ensure the host system is a supported host architecture. +if(NOT ${CMAKE_SIZEOF_VOID_P} EQUAL "8") + message(STATUS "Not building DeviceRTL: Runtime does not support 32-bit hosts") + return() +endif() + +if (LLVM_DIR) + # Builds that use pre-installed LLVM have LLVM_DIR set. + # A standalone or LLVM_ENABLE_RUNTIMES=openmp build takes this route + find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) +elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD) + # LLVM in-tree builds may use CMake target names to discover the tools. + # A LLVM_ENABLE_PROJECTS=openmp build takes this route + set(CLANG_TOOL $<TARGET_FILE:clang>) +else() + message(STATUS "Not building DeviceRTL.
No appropriate clang found") + return() +endif() + +set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) +set(include_directory ${devicertl_base_directory}/include) +set(source_directory ${devicertl_base_directory}/src) + +if(OFFLOAD_ENABLE_EMISSARY_APIS) + set(emissary_includes ${include_directory}/EmissaryIds.h) + set(emissary_sources src/EmissaryFortrt.cpp src/EmissaryPrint.cpp) +endif() + +set(include_files + ${include_directory}/Allocator.h + ${include_directory}/Configuration.h + ${include_directory}/Platform.h + ${include_directory}/Debug.h + ${include_directory}/Interface.h + ${include_directory}/LibC.h + ${include_directory}/Mapping.h + ${include_directory}/Profiling.h + ${include_directory}/State.h + ${include_directory}/Synchronization.h + ${include_directory}/DeviceTypes.h + ${include_directory}/DeviceUtils.h + ${include_directory}/Xteamr.h + ${include_directory}/Xteams.h + ${include_directory}/Workshare.h + ${emissary_includes} +) + +set(src_files + ${source_directory}/Allocator.cpp + ${source_directory}/Configuration.cpp + ${source_directory}/Debug.cpp + ${source_directory}/Kernel.cpp + ${source_directory}/LibC.cpp + ${source_directory}/LibM.cpp + ${source_directory}/Mapping.cpp + ${source_directory}/Misc.cpp + ${source_directory}/Parallelism.cpp + ${source_directory}/Profiling.cpp + ${source_directory}/Reduction.cpp + ${source_directory}/State.cpp + ${source_directory}/Synchronization.cpp + ${source_directory}/Tasking.cpp + ${source_directory}/DeviceUtils.cpp + ${source_directory}/Workshare.cpp + ${source_directory}/ExtraMapping.cpp + ${source_directory}/Xteamr.cpp + ${source_directory}/Memory.cpp + ${source_directory}/Xteams.cpp + ${emissary_sources} +) + +# We disable the slp vectorizer during the runtime optimization to avoid +# vectorized accesses to the shared state. Generally, those are "good" but +# the optimizer pipeline (esp. 
Attributor) does not fully support vectorized +# instructions yet and we end up missing out on way more important constant +# propagation. That said, we will run the vectorizer again after the runtime +# has been linked into the user program. +set(clang_opt_flags -O3 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=512 -mllvm -vectorize-slp=false ) + +# If the user built with the GPU C library enabled we will use that instead. +if(${LIBOMPTARGET_GPU_LIBC_SUPPORT}) + list(APPEND clang_opt_flags -DOMPTARGET_HAS_LIBC) +endif() + +# Set flags for LLVM Bitcode compilation. +set(bc_flags -c -flto -std=c++17 -fvisibility=hidden + ${clang_opt_flags} -nogpulib -nostdlibinc + -fno-rtti -fno-exceptions -fconvergent-functions + -Wno-unknown-cuda-version + -DOMPTARGET_DEVICE_RUNTIME + -I${include_directory} + -I${devicertl_base_directory}/../include + -I${devicertl_base_directory}/../../libc +) + +if(${LIBOMPTARGET_DEVICE_DEBUG}) + list(APPEND bc_flags -DOMPTARGET_DEBUG=-1) +else() + list(APPEND bc_flags -DOMPTARGET_DEBUG=0) +endif() + +# first create an object target +add_library(omptarget.devicertl.all_objs OBJECT IMPORTED) +function(compileDeviceRTLLibrary target_name target_triple) + set(target_bc_flags ${ARGN}) + + if(${target_name} MATCHES "amdgpu") + find_package(AMDDeviceLibs REQUIRED CONFIG + HINTS ${CMAKE_BINARY_DIR}/../../tools/rocm-device-libs + ${CMAKE_BINARY_DIR}/../rocm-device-libs-prefix/src/rocm-device-libs-build + ${CMAKE_INSTALL_PREFIX} + ) + get_target_property(_ocml_bc ocml IMPORTED_LOCATION) + get_target_property(_ockl_bc ockl IMPORTED_LOCATION) + if(NOT _ockl_bc) + message(FATAL_ERROR "Could not find ockl.bc") + endif() + if(NOT _ocml_bc) + message(FATAL_ERROR "Could not find ocml.bc") + endif() + list(APPEND target_bc_flags -Xclang -mlink-builtin-bitcode -Xclang ${_ockl_bc}) + list(APPEND target_bc_flags -Xclang -mlink-builtin-bitcode -Xclang ${_ocml_bc}) + endif() + + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + 
get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-${target_name}.o") + set(depfile "${outfile}.d") + + # Passing an empty CPU to -march= suppressed target specific metadata. + add_custom_command(OUTPUT ${outfile} + COMMAND ${CLANG_TOOL} + ${bc_flags} + --target=${target_triple} + ${target_bc_flags} + -MD -MF ${depfile} + ${infile} -o ${outfile} + DEPENDS ${infile} + DEPFILE ${depfile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + if(TARGET clang) + # Add a file-level dependency to ensure that clang is up-to-date. + # By default, add_custom_command only builds clang if the + # executable is missing. + add_custom_command(OUTPUT ${outfile} + DEPENDS clang + APPEND + ) + endif() + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND obj_files ${CMAKE_CURRENT_BINARY_DIR}/${outfile}) + endforeach() + # Trick to combine these into a bitcode file via the linker's LTO pass. This + # is used to provide the legacy `libomptarget-.bc` files. Hack this + # through as an executable to get it to use the relocatable link. 
+ add_executable(libomptarget-${target_name}) + target_sources(libomptarget-${target_name} PRIVATE ${obj_files}) + set_target_properties(libomptarget-${target_name} PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR} + LINKER_LANGUAGE CXX + BUILD_RPATH "" + INSTALL_RPATH "" + RUNTIME_OUTPUT_NAME libomptarget-${target_name}.bc) + target_compile_options(libomptarget-${target_name} PRIVATE + "--target=${target_triple}" "-fuse-ld=lld" "-march=" "-mcpu=" + "-Wno-unused-command-line-argument") + target_link_options(libomptarget-${target_name} PRIVATE + "--target=${target_triple}" "-r" "-nostdlib" "-flto" "-Wl,--lto-emit-llvm" + "-Wl,--lto-newpm-passes=default" "-Wl,-plugin-opt=-openmp-opt-disable" + "-Wl,-plugin-opt=-attributor-enable=module" + "-Wl,-plugin-opt=-vectorize-slp=false" "-fuse-ld=lld" "-march=" "-mcpu=") + install(TARGETS libomptarget-${target_name} + PERMISSIONS OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ + DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}") + + add_library(omptarget.${target_name}.all_objs OBJECT IMPORTED) + set_property(TARGET omptarget.${target_name}.all_objs APPEND PROPERTY IMPORTED_OBJECTS + ${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/libomptarget-${target_name}.bc) + add_dependencies(omptarget.${target_name}.all_objs libomptarget-${target_name}) + + # Archive all the object files generated above into a static library + add_library(omptarget.${target_name} STATIC) + set_target_properties(omptarget.${target_name} PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${LIBOMPTARGET_LLVM_LIBRARY_INTDIR}/${target_triple}" + ARCHIVE_OUTPUT_NAME ompdevice + LINKER_LANGUAGE CXX + ) + add_dependencies(omptarget.${target_name} libomptarget-${target_name}) + target_link_libraries(omptarget.${target_name} PRIVATE omptarget.${target_name}.all_objs) + target_link_options(omptarget.${target_name} PRIVATE "--target=${target_triple}" + "-Wno-unused-command-line-argument" "-r" "-nostdlib" "-flto" + "-Wl,--lto-emit-llvm" "-fuse-ld=lld" "-march=" 
"-mcpu=") + + install(TARGETS omptarget.${target_name} + ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}/${target_triple}") + + if (CMAKE_EXPORT_COMPILE_COMMANDS) + set(ide_target_name omptarget-ide-${target_name}) + add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files}) + target_compile_options(${ide_target_name} PRIVATE + -fvisibility=hidden --target=${target_triple} + -nogpulib -nostdlibinc -Wno-unknown-cuda-version + ) + target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512) + target_include_directories(${ide_target_name} PRIVATE + ${include_directory} + ${devicertl_base_directory}/../../libc + ${devicertl_base_directory}/../include + ) + install(TARGETS ${ide_target_name} EXCLUDE_FROM_ALL) + endif() +endfunction() + +if(NOT LLVM_TARGETS_TO_BUILD OR "AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD) + compileDeviceRTLLibrary(amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none) +endif() + +if(NOT LLVM_TARGETS_TO_BUILD OR "NVPTX" IN_LIST LLVM_TARGETS_TO_BUILD) + compileDeviceRTLLibrary(nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx63) +endif() diff --git a/offload/liboffload/CMakeLists.txt b/offload/liboffload/CMakeLists.txt index efb800f2495f9..613c6373d0e4d 100644 --- a/offload/liboffload/CMakeLists.txt +++ b/offload/liboffload/CMakeLists.txt @@ -39,10 +39,18 @@ target_compile_definitions(LLVMOffload PRIVATE DEBUG_PREFIX="Liboffload" ) -set_target_properties(LLVMOffload PROPERTIES - POSITION_INDEPENDENT_CODE ON - INSTALL_RPATH "$ORIGIN" - BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") +# Don't override an externally defined RPATH +if(NOT DEFINED CMAKE_INSTALL_RPATH) + set_target_properties(LLVMOffload PROPERTIES + POSITION_INDEPENDENT_CODE ON + INSTALL_RPATH "$ORIGIN:$ORIGIN/../lib:$ORIGIN/../../lib" + BUILD_RPATH "$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") +else() + set_target_properties(LLVMOffload PROPERTIES + POSITION_INDEPENDENT_CODE ON + INSTALL_RPATH ${CMAKE_INSTALL_RPATH} + BUILD_RPATH 
"$ORIGIN:${CMAKE_CURRENT_BINARY_DIR}/..") +endif() install(TARGETS LLVMOffload LIBRARY COMPONENT LLVMOffload DESTINATION "${OFFLOAD_INSTALL_LIBDIR}") install(FILES ${CMAKE_CURRENT_BINARY_DIR}/API/OffloadAPI.h DESTINATION ${CMAKE_INSTALL_PREFIX}/include/offload) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 0388bbba4ee28..6bf8aac70fd4c 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -5135,7 +5135,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { .OMPX_XTeamReductionOccupancyBasedOpt = false, .OMPX_AdjustNumTeamsForXteamRedSmallBlockSize=1}}, // Default config for unknown devices. - {"DEFAULT", {.OMPX_UseMultipleSdmaEngines = true, + {"DEFAULT", {.OMPX_UseMultipleSdmaEngines = false, .OMPX_XteamBlockSize = 512, .OMPX_XTeamReductionOccupancyBasedOpt = false, .OMPX_AdjustNumTeamsForXteamRedSmallBlockSize=1}}};