diff --git a/.github/workflows/aiter-release.yaml b/.github/workflows/aiter-release.yaml
index 5c9468e7a2..cd1fcab09e 100644
--- a/.github/workflows/aiter-release.yaml
+++ b/.github/workflows/aiter-release.yaml
@@ -63,6 +63,10 @@ on:
         description: 'Add date stamp to version (+yyyymmdd)'
         type: boolean
         default: false
+      use_manylinux:
+        description: 'Use pytorch/manylinux2_28-builder ROCm image (AlmaLinux 8 + devtoolset, glibc 2.28). Produces wheels ABI-compatible with vLLM/Ubuntu 22 containers.'
+        type: boolean
+        default: false
   workflow_call:
     inputs:
       release_type:
@@ -102,6 +106,11 @@ on:
         type: boolean
         required: false
         default: false
+      use_manylinux:
+        description: 'Use pytorch/manylinux2_28-builder ROCm image for ABI-compatible wheels'
+        type: boolean
+        required: false
+        default: false
     outputs:
       wheel_names:
         description: 'Space-separated list of built wheel filenames'
@@ -135,6 +144,7 @@ jobs:
       GPU_ARCHS: ${{ inputs.gpu_archs || github.event.inputs.gpu_archs }}
       RELEASE_TYPE: ${{ inputs.release_type || github.event.inputs.release_type }}
       ADD_DATE_STAMP: ${{ inputs.add_date_stamp || github.event.inputs.add_date_stamp }}
+      USE_MANYLINUX: ${{ (inputs.use_manylinux || github.event.inputs.use_manylinux == 'true' || startsWith(matrix.docker_image, 'pytorch/manylinux')) && 'true' || 'false' }}  # event inputs are strings and 'false' is truthy, so compare explicitly
 
     steps:
       - name: Checkout aiter repo
@@ -184,16 +194,22 @@ jobs:
         run: |
           set -ex
           echo "Starting container: aiter_build_${{ matrix.python_version }}"
+          # The manylinux2_28-builder image has no ENTRYPOINT but a default
+          # CMD of /bin/bash, which exits immediately for `docker run -d`.
+          # Force a long-lived `sleep infinity` so subsequent `docker exec`
+          # invocations succeed for both legacy and manylinux images.
           docker run -dt \
             --shm-size=16G \
             --network=host \
+            --entrypoint="" \
             -v "${{ github.workspace }}:/workspace" \
             -w /workspace \
             --name aiter_build_${{ matrix.python_version }} \
-            ${{ env.BUILD_DOCKER_IMAGE }}
+            ${{ env.BUILD_DOCKER_IMAGE }} \
+            sleep infinity
 
-      - name: Install Dependencies
-        if: ${{ matrix.build_enabled }}
+      - name: Install Dependencies (legacy rocm/pytorch image)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }}
         run: |
           set -e
           echo "Install Dependencies"
@@ -202,8 +218,8 @@ jobs:
             aiter_build_${{ matrix.python_version }} \
             pip install --timeout=60 --retries=10 -r requirements.txt
 
-      - name: Pin setuptools_scm
-        if: ${{ matrix.build_enabled }}
+      - name: Pin setuptools_scm (legacy)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }}
         run: |
           set -e
           docker exec \
@@ -211,8 +227,8 @@ jobs:
             aiter_build_${{ matrix.python_version }} \
             pip install --timeout=60 --retries=10 "setuptools_scm<10"
 
-      - name: Install ninja
-        if: ${{ matrix.build_enabled }}
+      - name: Install ninja (legacy)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }}
         run: |
           set -e
           echo "Install ninja"
@@ -221,8 +237,8 @@ jobs:
             aiter_build_${{ matrix.python_version }} \
             pip install --timeout=60 --retries=10 ninja
 
-      - name: Pin setuptools-scm
-        if: ${{ matrix.build_enabled }}
+      - name: Pin setuptools-scm (legacy)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }}
         run: |
           set -e
           docker exec \
@@ -230,8 +246,8 @@ jobs:
             aiter_build_${{ matrix.python_version }} \
             pip install --timeout=60 --retries=10 'setuptools-scm<10'
 
-      - name: Build Aiter
-        if: ${{ matrix.build_enabled }}
+      - name: Build Aiter (legacy)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX != 'true' }}
         run: |
           set -e
           echo "Building aiter whl packages for Python ${{ matrix.python_version }}..."
@@ -254,6 +270,206 @@ jobs:
               bash -c "python3 setup.py bdist_wheel && ls dist/*.whl"
           fi
 
+      # ============================================================
+      # manylinux2_28 build path (pytorch/manylinux2_28-builder:rocmX)
+      # ============================================================
+      # Produces wheels that obey the manylinux_2_28 ABI contract
+      # (glibc 2.28, libstdc++ from system / no GLIBCXX > 3.4.25 in
+      # devtoolset-static path), making them dlopen-safe inside
+      # Ubuntu 22 based vLLM containers (libstdc++ tops out at
+      # GLIBCXX_3.4.30 there). See issue #2843.
+      # ============================================================
+
+      - name: Resolve manylinux Python path
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }}
+        id: ml_py
+        run: |
+          set -e
+          PY="${PYTHON_VERSION/./}"  # 3.12 -> 312
+          PYBIN="/opt/python/cp${PY}-cp${PY}/bin"
+          echo "pybin=${PYBIN}" >> "$GITHUB_OUTPUT"
+          echo "Using python: ${PYBIN}/python"
+
+      - name: Install Dependencies (manylinux)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }}
+        env:
+          PYBIN: ${{ steps.ml_py.outputs.pybin }}
+        run: |
+          set -e
+          # Detect ROCm flavor of the builder image to pick a torch index
+          # Image tag looks like pytorch/manylinux2_28-builder:rocm7.2
+          IMG="${BUILD_DOCKER_IMAGE}"
+          ROCM_TAG="${IMG##*:}"  # rocm7.2 / rocm7.1 / rocm7.0
+          ROCM_NUM="${ROCM_TAG#rocm}"  # 7.2
+          TORCH_INDEX="https://download.pytorch.org/whl/rocm${ROCM_NUM}"
+          echo "Torch index: ${TORCH_INDEX}"
+          docker exec \
+            -w /workspace \
+            -e PYBIN="${PYBIN}" \
+            -e TORCH_INDEX="${TORCH_INDEX}" \
+            aiter_build_${{ matrix.python_version }} \
+            bash -c '
+              set -e
+              ${PYBIN}/pip install --upgrade --timeout=60 --retries=10 pip
+              ${PYBIN}/pip install --timeout=60 --retries=10 --index-url "${TORCH_INDEX}" torch
+              # flydsl publishes only manylinux_2_35 wheels which cannot install
+              # on AlmaLinux 8 (glibc 2.28). FlyDSL AOT pre-compilation in
+              # setup.py is wrapped in try/except and is skipped gracefully when
+              # flydsl is missing, so we drop it here. The resulting wheel still
+              # contains all CK-free + Triton kernels; only the optional
+              # FlyDSL AOT cache is omitted (kernels JIT on first use).
+              grep -v "^flydsl" requirements.txt > /tmp/requirements-no-flydsl.txt
+              ${PYBIN}/pip install --timeout=60 --retries=10 -r /tmp/requirements-no-flydsl.txt
+              ${PYBIN}/pip install --timeout=60 --retries=10 "setuptools_scm<10" ninja auditwheel patchelf
+            '
+
+      - name: Build Aiter (manylinux)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }}
+        env:
+          PYBIN: ${{ steps.ml_py.outputs.pybin }}
+        run: |
+          set -e
+          echo "Building aiter whl in pytorch/manylinux2_28-builder for Python ${PYTHON_VERSION}..."
+
+          # Compose SETUPTOOLS_SCM_PRETEND_VERSION (the 0.1.0 fallback sits inside the pipeline so a failed git describe is not masked by sed)
+          if [ "${ADD_DATE_STAMP}" = "true" ] && [ -n "${{ steps.version.outputs.nightly_version }}" ]; then
+            BASE_VER="${{ steps.version.outputs.nightly_version }}"
+          else
+            BASE_VER="$( (git describe --tags --abbrev=0 2>/dev/null || echo 0.1.0) | sed 's/^v//')"
+          fi
+          # Tag the wheel so it is unambiguously the manylinux build
+          IMG="${BUILD_DOCKER_IMAGE}"
+          ROCM_TAG="${IMG##*:}"
+          PRETEND_VER="${BASE_VER}+${ROCM_TAG}.manylinux_2_28"
+          echo "SETUPTOOLS_SCM_PRETEND_VERSION=${PRETEND_VER}"
+
+          docker exec \
+            -e SETUPTOOLS_SCM_PRETEND_VERSION="${PRETEND_VER}" \
+            -e PREBUILD_KERNELS=1 \
+            -e GPU_ARCHS="${GPU_ARCHS}" \
+            -e PYBIN="${PYBIN}" \
+            -w /workspace \
+            aiter_build_${{ matrix.python_version }} \
+            bash -c '
+              set -e
+              # gcc-toolset-13 is the default manylinux_2_28 toolchain
+              source /opt/rh/gcc-toolset-13/enable 2>/dev/null || true
+              export PATH="${PYBIN}:${PATH}"
+              python --version
+              which python gcc
+              gcc --version | head -1
+              python setup.py bdist_wheel
+              ls dist/*.whl
+            '
+
+      - name: Repair wheel for manylinux compliance
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }}
+        env:
+          PYBIN: ${{ steps.ml_py.outputs.pybin }}
+        run: |
+          set -e
+          docker exec \
+            -e PYBIN="${PYBIN}" \
+            -w /workspace \
+            aiter_build_${{ matrix.python_version }} \
+            bash -c '
+              set -e
+              mkdir -p dist-repaired
+              # We exclude the ROCm and torch shared libs from bundling.
+              # ROCm libs ship in the consumer container; torch libs are
+              # provided by the user-installed PyTorch wheel.
+              EXCLUDES=(
+                --exclude libamdhip64.so.7
+                --exclude libamdhip64.so
+                --exclude libhsa-runtime64.so.1
+                --exclude libhsa-runtime64.so
+                --exclude librocblas.so
+                --exclude librocsparse.so
+                --exclude librocsolver.so
+                --exclude libhipblas.so
+                --exclude libhipblaslt.so
+                --exclude librccl.so.1
+                --exclude librccl.so
+                --exclude libMIOpen.so.1
+                --exclude libMIOpen.so
+                --exclude libtorch.so
+                --exclude libtorch_cpu.so
+                --exclude libtorch_hip.so
+                --exclude libtorch_python.so
+                --exclude libc10.so
+                --exclude libc10_hip.so
+              )
+              for whl in dist/*.whl; do
+                echo "=== auditwheel show (raw) ==="
+                ${PYBIN}/auditwheel show "${whl}" || true
+                echo "=== auditwheel repair ==="
+                ${PYBIN}/auditwheel repair "${whl}" "${EXCLUDES[@]}" \
+                  --plat manylinux_2_28_x86_64 -w dist-repaired/
+              done
+              echo "=== Repaired wheels ==="
+              ls -lh dist-repaired/
+              # Replace original dist with repaired copies (keeps existing
+              # downstream upload logic untouched).
+              rm -f dist/*.whl
+              mv dist-repaired/*.whl dist/
+              ls -lh dist/
+            '
+
+      - name: Verify wheel symbol floor (auditwheel + objdump)
+        if: ${{ matrix.build_enabled && env.USE_MANYLINUX == 'true' }}
+        env:
+          PYBIN: ${{ steps.ml_py.outputs.pybin }}
+        run: |
+          set -e
+          # Hard ceilings:
+          #   GLIBCXX <= 3.4.29 (Ubuntu 22 / vLLM container ships up to 3.4.30)
+          #   GLIBC <= 2.34 (Ubuntu 22 ships 2.35; 2.34 is a safety margin)
+          docker exec \
+            -e PYBIN="${PYBIN}" \
+            -w /workspace \
+            aiter_build_${{ matrix.python_version }} \
+            bash -c '
+              set -e
+              MAX_GLIBCXX_MINOR=29
+              MAX_GLIBC_MAJOR=2
+              MAX_GLIBC_MINOR=34
+              fail=0
+              for whl in dist/*.whl; do
+                echo "============================================="
+                echo "Wheel: ${whl}"
+                echo "============================================="
+                ${PYBIN}/auditwheel show "${whl}" || true
+                tmpd=$(mktemp -d)
+                ${PYBIN}/python -m zipfile -e "${whl}" "${tmpd}"
+                # Check every shared object in the extracted wheel; the -name tests are grouped so -type f applies to both patterns
+                while IFS= read -r so; do
+                  syms=$(objdump -p "${so}" 2>/dev/null | grep -oE "GLIBC(_2|XX_3\.4)\.[0-9]+" | sort -V | uniq)
+                  max_gxx=$(echo "${syms}" | grep -oE "GLIBCXX_3\.4\.[0-9]+" | awk -F. "{print \$3}" | sort -n | tail -1)
+                  max_gxx=${max_gxx:-0}
+                  max_gc=$(echo "${syms}" | grep -oE "GLIBC_2\.[0-9]+" | awk -F. "{print \$2}" | sort -n | tail -1)
+                  max_gc=${max_gc:-0}
+                  status=OK
+                  if [ "${max_gxx}" -gt "${MAX_GLIBCXX_MINOR}" ]; then
+                    status="FAIL(GLIBCXX_3.4.${max_gxx} > 3.4.${MAX_GLIBCXX_MINOR})"
+                    fail=1
+                  fi
+                  if [ "${max_gc}" -gt "${MAX_GLIBC_MINOR}" ]; then
+                    status="${status#OK} FAIL(GLIBC_${MAX_GLIBC_MAJOR}.${max_gc} > 2.${MAX_GLIBC_MINOR})"
+                    fail=1
+                  fi
+                  printf " %-80s GLIBCXX=3.4.%-3s GLIBC=2.%-3s %s\n" "${so#${tmpd}/}" "${max_gxx}" "${max_gc}" "${status}"
+                done < <(find "${tmpd}" -type f \( -name "*.so" -o -name "*.so.*" \))
+                rm -rf "${tmpd}"
+              done
+              if [ "${fail}" -ne 0 ]; then
+                echo ""
+                echo "ERROR: one or more shared objects exceed the manylinux_2_28 / vLLM compat symbol floor"
+                exit 1
+              fi
+              echo ""
+              echo "All shared objects pass GLIBCXX <= 3.4.${MAX_GLIBCXX_MINOR} and GLIBC <= 2.${MAX_GLIBC_MINOR}"
+            '
+
       - name: Upload whl file as artifact
         if: ${{ matrix.build_enabled }}
         uses: actions/upload-artifact@v4
diff --git a/csrc/kernels/gated_rmsnorm_quant_kernels.cu b/csrc/kernels/gated_rmsnorm_quant_kernels.cu
index 7fa28e72ff..31423f36a9 100644
--- a/csrc/kernels/gated_rmsnorm_quant_kernels.cu
+++ b/csrc/kernels/gated_rmsnorm_quant_kernels.cu
@@ -219,8 +219,6 @@ void gated_rmsnorm_fp8_group_quant_launcher_impl(
         num_heads,
         head_dim
     );
-
-    C10_HIP_KERNEL_LAUNCH_CHECK();
 }
 
 template