-
Notifications
You must be signed in to change notification settings - Fork 308
ci(release): add manylinux2_28+ROCm builder path with auditwheel gate (fixes #2843 long-term) #2851
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -63,6 +63,10 @@ on: | |
| description: 'Add date stamp to version (+yyyymmdd)' | ||
| type: boolean | ||
| default: false | ||
| use_manylinux: | ||
| description: 'Use pytorch/manylinux2_28-builder ROCm image (AlmaLinux 8 + devtoolset, glibc 2.28). Produces wheels ABI-compatible with vLLM/Ubuntu 22 containers.' | ||
| type: boolean | ||
| default: false | ||
| workflow_call: | ||
| inputs: | ||
| release_type: | ||
|
|
@@ -102,6 +106,11 @@ on: | |
| type: boolean | ||
| required: false | ||
| default: false | ||
| use_manylinux: | ||
| description: 'Use pytorch/manylinux2_28-builder ROCm image for ABI-compatible wheels' | ||
| type: boolean | ||
| required: false | ||
| default: false | ||
| outputs: | ||
| wheel_names: | ||
| description: 'Space-separated list of built wheel filenames' | ||
|
|
@@ -135,6 +144,7 @@ jobs: | |
| GPU_ARCHS: ${{ inputs.gpu_archs || github.event.inputs.gpu_archs }} | ||
| RELEASE_TYPE: ${{ inputs.release_type || github.event.inputs.release_type }} | ||
| ADD_DATE_STAMP: ${{ inputs.add_date_stamp || github.event.inputs.add_date_stamp }} | ||
| USE_MANYLINUX: ${{ inputs.use_manylinux || github.event.inputs.use_manylinux || (startsWith(matrix.docker_image, 'pytorch/manylinux') && 'true') || 'false' }} | ||
|
|
||
| steps: | ||
| - name: Checkout aiter repo | ||
|
|
@@ -184,16 +194,22 @@ jobs: | |
| run: | | ||
| set -ex | ||
| echo "Starting container: aiter_build_${{ matrix.python_version }}" | ||
| # The manylinux2_28-builder image has no ENTRYPOINT but a default | ||
| # CMD of /bin/bash, which exits immediately for `docker run -d`. | ||
| # Force a long-lived `sleep infinity` so subsequent `docker exec` | ||
| # invocations succeed for both legacy and manylinux images. | ||
| docker run -dt \ | ||
| --shm-size=16G \ | ||
| --network=host \ | ||
| --entrypoint="" \ | ||
| -v "${{ github.workspace }}:/workspace" \ | ||
| -w /workspace \ | ||
| --name aiter_build_${{ matrix.python_version }} \ | ||
| ${{ env.BUILD_DOCKER_IMAGE }} | ||
| ${{ env.BUILD_DOCKER_IMAGE }} \ | ||
| sleep infinity | ||
|
|
||
| - name: Install Dependencies | ||
| if: ${{ matrix.build_enabled }} | ||
| - name: Install Dependencies (legacy rocm/pytorch image) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }} | ||
| run: | | ||
| set -e | ||
| echo "Install Dependencies" | ||
|
|
@@ -202,17 +218,17 @@ jobs: | |
| aiter_build_${{ matrix.python_version }} \ | ||
| pip install --timeout=60 --retries=10 -r requirements.txt | ||
|
|
||
| - name: Pin setuptools_scm | ||
| if: ${{ matrix.build_enabled }} | ||
| - name: Pin setuptools_scm (legacy) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }} | ||
| run: | | ||
| set -e | ||
| docker exec \ | ||
| -w /workspace \ | ||
| aiter_build_${{ matrix.python_version }} \ | ||
| pip install --timeout=60 --retries=10 "setuptools_scm<10" | ||
|
|
||
| - name: Install ninja | ||
| if: ${{ matrix.build_enabled }} | ||
| - name: Install ninja (legacy) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }} | ||
| run: | | ||
| set -e | ||
| echo "Install ninja" | ||
|
|
@@ -221,17 +237,17 @@ jobs: | |
| aiter_build_${{ matrix.python_version }} \ | ||
| pip install --timeout=60 --retries=10 ninja | ||
|
|
||
| - name: Pin setuptools-scm | ||
| if: ${{ matrix.build_enabled }} | ||
| - name: Pin setuptools-scm (legacy) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }} | ||
| run: | | ||
| set -e | ||
| docker exec \ | ||
| -w /workspace \ | ||
| aiter_build_${{ matrix.python_version }} \ | ||
| pip install --timeout=60 --retries=10 'setuptools-scm<10' | ||
|
|
||
| - name: Build Aiter | ||
| if: ${{ matrix.build_enabled }} | ||
| - name: Build Aiter (legacy) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }} | ||
| run: | | ||
| set -e | ||
| echo "Building aiter whl packages for Python ${{ matrix.python_version }}..." | ||
|
|
@@ -254,6 +270,206 @@ jobs: | |
| bash -c "python3 setup.py bdist_wheel && ls dist/*.whl" | ||
| fi | ||
|
|
||
| # ============================================================ | ||
| # manylinux2_28 build path (pytorch/manylinux2_28-builder:rocmX) | ||
| # ============================================================ | ||
| # Produces wheels that obey the manylinux_2_28 ABI contract | ||
| # (glibc 2.28, libstdc++ from system / no GLIBCXX > 3.4.25 in | ||
| # devtoolset-static path), making them dlopen-safe inside | ||
| # Ubuntu 22 based vLLM containers (libstdc++ tops out at | ||
| # GLIBCXX_3.4.30 there). See issue #2843. | ||
| # ============================================================ | ||
|
|
||
| - name: Resolve manylinux Python path | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }} | ||
| id: ml_py | ||
| run: | | ||
| set -e | ||
| # Drop the first dot of the interpreter version to form the cpXY tag used in the /opt/python path (3.12 -> cp312). | ||
| abi_tag="cp${PYTHON_VERSION/./}" | ||
| bin_dir="/opt/python/${abi_tag}-${abi_tag}/bin" | ||
| # Expose the interpreter bin directory to later steps as the `pybin` step output. | ||
| printf 'pybin=%s\n' "${bin_dir}" >> "$GITHUB_OUTPUT" | ||
| printf 'Using python: %s/python\n' "${bin_dir}" | ||
|
|
||
| - name: Install Dependencies (manylinux) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }} | ||
| env: | ||
| PYBIN: ${{ steps.ml_py.outputs.pybin }} | ||
| run: | | ||
| set -e | ||
| # Derive the PyTorch wheel index from the builder image tag. | ||
| # Expected form: pytorch/manylinux2_28-builder:rocm7.2 ; a trailing | ||
| # -<suffix> on the tag is tolerated and ignored for the index URL. | ||
| IMG="${BUILD_DOCKER_IMAGE}" | ||
| ROCM_TAG="${IMG##*:}" # rocm7.2 / rocm7.1 / rocm7.0 | ||
| ROCM_NUM="${ROCM_TAG#rocm}" # 7.2 | ||
| ROCM_NUM="${ROCM_NUM%%-*}" # strip any -suffix after the version | ||
| # Fail early with a clear message instead of handing pip a bogus | ||
| # index URL when the image has no rocmX.Y tag (untagged, digest, ...). | ||
| if [[ "${ROCM_TAG}" != rocm* || ! "${ROCM_NUM}" =~ ^[0-9]+(\.[0-9]+)+$ ]]; then | ||
| echo "ERROR: cannot derive a ROCm version from image '${IMG}' (tag '${ROCM_TAG}')" >&2 | ||
| exit 1 | ||
| fi | ||
| TORCH_INDEX="https://download.pytorch.org/whl/rocm${ROCM_NUM}" | ||
| echo "Torch index: ${TORCH_INDEX}" | ||
| docker exec \ | ||
| -w /workspace \ | ||
| -e PYBIN="${PYBIN}" \ | ||
| -e TORCH_INDEX="${TORCH_INDEX}" \ | ||
| aiter_build_${{ matrix.python_version }} \ | ||
| bash -c ' | ||
| set -e | ||
| ${PYBIN}/pip install --upgrade --timeout=60 --retries=10 pip | ||
| ${PYBIN}/pip install --timeout=60 --retries=10 --index-url "${TORCH_INDEX}" torch | ||
| # flydsl publishes only manylinux_2_35 wheels which cannot install | ||
| # on AlmaLinux 8 (glibc 2.28). FlyDSL AOT pre-compilation in | ||
| # setup.py is wrapped in try/except and is skipped gracefully when | ||
| # flydsl is missing, so we drop it here. The resulting wheel still | ||
| # contains all CK-free + Triton kernels; only the optional | ||
| # FlyDSL AOT cache is omitted (kernels JIT on first use). | ||
| grep -v "^flydsl" requirements.txt > /tmp/requirements-no-flydsl.txt | ||
| ${PYBIN}/pip install --timeout=60 --retries=10 -r /tmp/requirements-no-flydsl.txt | ||
| ${PYBIN}/pip install --timeout=60 --retries=10 "setuptools_scm<10" ninja auditwheel patchelf | ||
| ' | ||
|
|
||
| - name: Build Aiter (manylinux) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }} | ||
| env: | ||
| PYBIN: ${{ steps.ml_py.outputs.pybin }} | ||
| run: | | ||
| set -e | ||
| echo "Building aiter whl in pytorch/manylinux2_28-builder for Python ${PYTHON_VERSION}..." | ||
|
|
||
| # Compose SETUPTOOLS_SCM_PRETEND_VERSION | ||
| if [ "${ADD_DATE_STAMP}" = "true" ] && [ -n "${{ steps.version.outputs.nightly_version }}" ]; then | ||
| BASE_VER="${{ steps.version.outputs.nightly_version }}" | ||
| else | ||
| # A pipeline reports the status of its last command (sed) unless | ||
| # pipefail is set, so an `|| echo 0.1.0` fallback after the pipe | ||
| # would not fire when git describe fails. Tolerate the failure and | ||
| # apply the default explicitly; this works with or without pipefail. | ||
| BASE_VER="$(git describe --tags --abbrev=0 2>/dev/null | sed 's/^v//' || true)" | ||
| BASE_VER="${BASE_VER:-0.1.0}" | ||
| fi | ||
| # Tag the wheel so it is unambiguously the manylinux build | ||
| IMG="${BUILD_DOCKER_IMAGE}" | ||
| ROCM_TAG="${IMG##*:}" | ||
| PRETEND_VER="${BASE_VER}+${ROCM_TAG}.manylinux_2_28" | ||
| echo "SETUPTOOLS_SCM_PRETEND_VERSION=${PRETEND_VER}" | ||
|
|
||
| docker exec \ | ||
| -e SETUPTOOLS_SCM_PRETEND_VERSION="${PRETEND_VER}" \ | ||
| -e PREBUILD_KERNELS=1 \ | ||
| -e GPU_ARCHS="${GPU_ARCHS}" \ | ||
| -e PYBIN="${PYBIN}" \ | ||
| -w /workspace \ | ||
| aiter_build_${{ matrix.python_version }} \ | ||
| bash -c ' | ||
| set -e | ||
| # gcc-toolset-13 is the default manylinux_2_28 toolchain | ||
| source /opt/rh/gcc-toolset-13/enable 2>/dev/null || true | ||
| # Put the selected CPython first on PATH so plain `python` resolves to it | ||
| export PATH="${PYBIN}:${PATH}" | ||
| python --version | ||
| which python gcc | ||
| gcc --version | head -1 | ||
| python setup.py bdist_wheel | ||
| ls dist/*.whl | ||
| ' | ||
|
|
||
| - name: Repair wheel for manylinux compliance | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }} | ||
| env: | ||
| PYBIN: ${{ steps.ml_py.outputs.pybin }} | ||
| run: | | ||
| set -e | ||
| docker exec \ | ||
| -w /workspace \ | ||
| -e PYBIN="${PYBIN}" \ | ||
| aiter_build_${{ matrix.python_version }} \ | ||
| bash -c ' | ||
| set -e | ||
| # Libraries that must not be bundled into the wheel: the ROCm | ||
| # runtime ships in the consumer container and the torch libraries | ||
| # come from the user-installed PyTorch wheel. | ||
| SKIP_LIBS=( | ||
| libamdhip64.so.7 | ||
| libamdhip64.so | ||
| libhsa-runtime64.so.1 | ||
| libhsa-runtime64.so | ||
| librocblas.so | ||
| librocsparse.so | ||
| librocsolver.so | ||
| libhipblas.so | ||
| libhipblaslt.so | ||
| librccl.so.1 | ||
| librccl.so | ||
| libMIOpen.so.1 | ||
| libMIOpen.so | ||
| libtorch.so | ||
| libtorch_cpu.so | ||
| libtorch_hip.so | ||
| libtorch_python.so | ||
| libc10.so | ||
| libc10_hip.so | ||
| ) | ||
| # Expand the list into repeated --exclude <lib> arguments. | ||
| EXCLUDES=() | ||
| for lib in "${SKIP_LIBS[@]}"; do | ||
| EXCLUDES+=(--exclude "${lib}") | ||
| done | ||
| mkdir -p dist-repaired | ||
| for wheel in dist/*.whl; do | ||
| echo "=== auditwheel show (raw) ===" | ||
| ${PYBIN}/auditwheel show "${wheel}" || true | ||
| echo "=== auditwheel repair ===" | ||
| ${PYBIN}/auditwheel repair "${wheel}" "${EXCLUDES[@]}" \ | ||
| --plat manylinux_2_28_x86_64 -w dist-repaired/ | ||
| done | ||
| echo "=== Repaired wheels ===" | ||
| ls -lh dist-repaired/ | ||
| # Swap the raw wheels for the repaired ones in dist/ so the | ||
| # existing downstream upload logic needs no change. | ||
| rm -f dist/*.whl | ||
| mv dist-repaired/*.whl dist/ | ||
| ls -lh dist/ | ||
| ' | ||
|
|
||
| - name: Verify wheel symbol floor (auditwheel + objdump) | ||
| if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }} | ||
| env: | ||
| PYBIN: ${{ steps.ml_py.outputs.pybin }} | ||
| run: | | ||
| set -e | ||
| # Hard ceilings: | ||
| # GLIBCXX <= 3.4.29 (Ubuntu 22 / vLLM container ships up to 3.4.30) | ||
| # GLIBC <= 2.34 (Ubuntu 22 ships 2.35; 2.34 is a safety margin) | ||
| docker exec \ | ||
| -e PYBIN="${PYBIN}" \ | ||
| -w /workspace \ | ||
| aiter_build_${{ matrix.python_version }} \ | ||
| bash -c ' | ||
| set -e | ||
| MAX_GLIBCXX_MINOR=29 | ||
| MAX_GLIBC_MAJOR=2 | ||
| MAX_GLIBC_MINOR=34 | ||
| fail=0 | ||
| for whl in dist/*.whl; do | ||
| echo "=============================================" | ||
| echo "Wheel: ${whl}" | ||
| echo "=============================================" | ||
| ${PYBIN}/auditwheel show "${whl}" || true | ||
| tmpd=$(mktemp -d) | ||
| ${PYBIN}/python -m zipfile -e "${whl}" "${tmpd}" | ||
| # Check every shared object extracted from the wheel | ||
| while IFS= read -r so; do | ||
| # Collect the distinct GLIBC_2.x / GLIBCXX_3.4.x version references of this object | ||
| syms=$(objdump -p "${so}" 2>/dev/null | grep -oE "GLIBC(_2|XX_3\.4)\.[0-9]+" | sort -V | uniq) | ||
|
Comment on lines
+443
to
+446
|
||
| max_gxx=$(echo "${syms}" | grep -oE "GLIBCXX_3\.4\.[0-9]+" | awk -F. "{print \$3}" | sort -n | tail -1) | ||
| max_gxx=${max_gxx:-0} | ||
| max_gc=$(echo "${syms}" | grep -oE "GLIBC_2\.[0-9]+" | awk -F. "{print \$2}" | sort -n | tail -1) | ||
| max_gc=${max_gc:-0} | ||
| # Accumulate failure reasons; the status is OK only when none was recorded | ||
| status="" | ||
| if [ "${max_gxx}" -gt "${MAX_GLIBCXX_MINOR}" ]; then | ||
| status="FAIL(GLIBCXX_3.4.${max_gxx} > 3.4.${MAX_GLIBCXX_MINOR})" | ||
| fail=1 | ||
| fi | ||
| if [ "${max_gc}" -gt "${MAX_GLIBC_MINOR}" ]; then | ||
| status="${status:+${status} }FAIL(GLIBC_${MAX_GLIBC_MAJOR}.${max_gc} > ${MAX_GLIBC_MAJOR}.${MAX_GLIBC_MINOR})" | ||
| fail=1 | ||
| fi | ||
| status="${status:-OK}" | ||
| printf " %-80s GLIBCXX=3.4.%-3s GLIBC=2.%-3s %s\n" "${so#${tmpd}/}" "${max_gxx}" "${max_gc}" "${status}" | ||
| # Group the name tests: -o binds looser than the implicit AND, so | ||
| # without the parentheses -type f would apply to "*.so" only. | ||
| done < <(find "${tmpd}" -type f \( -name "*.so" -o -name "*.so.*" \)) | ||
| rm -rf "${tmpd}" | ||
| done | ||
| if [ "${fail}" -ne 0 ]; then | ||
| echo "" | ||
| echo "ERROR: one or more shared objects exceed the manylinux_2_28 / vLLM compat symbol floor" | ||
| exit 1 | ||
| fi | ||
| echo "" | ||
| echo "All shared objects pass GLIBCXX <= 3.4.${MAX_GLIBCXX_MINOR} and GLIBC <= 2.${MAX_GLIBC_MINOR}" | ||
| ' | ||
|
|
||
| - name: Upload whl file as artifact | ||
| if: ${{ matrix.build_enabled }} | ||
| uses: actions/upload-artifact@v4 | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -219,8 +219,6 @@ void gated_rmsnorm_fp8_group_quant_launcher_impl( | |||||||
| num_heads, | ||||||||
| head_dim | ||||||||
| ); | ||||||||
|
||||||||
| ); | |
| ); | |
| HIP_CALL_LAUNCH(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The PR description claims legacy behavior is “byte-identical to before”, but this step now clears the image ENTRYPOINT and forces `sleep infinity` for all images (including the legacy `rocm/pytorch:*` path).
If preserving legacy behavior is important, consider gating `--entrypoint=""` / `sleep infinity` to only the manylinux path, or update the PR description to reflect this behavior change.