From f7a4b7e48b06a870a3e31c440cfdeed22777b0f1 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 07:39:25 +0000 Subject: [PATCH 1/7] detect aiter and rebuild --- scripts/ci/amd_ci_install_dependency.sh | 109 ++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/scripts/ci/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh index f458eddec2f3..3245fc307c79 100755 --- a/scripts/ci/amd_ci_install_dependency.sh +++ b/scripts/ci/amd_ci_install_dependency.sh @@ -80,6 +80,115 @@ docker cp ./dummy-grok ci_sglang:/ docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache huggingface_hub[hf_xet] docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest +# Detect AITER version +############################################# +# Detect correct AITER_COMMIT for this runner +# + Check mismatch +# + Rebuild AITER if needed +############################################# + +echo "[CI] === AITER VERSION CHECK START ===" + +DOCKERFILE="docker/rocm.Dockerfile" + +# GPU_ARCH +GPU_ARCH="${GPU_ARCH:-mi30x}" +echo "[CI] Runner GPU_ARCH=${GPU_ARCH}" + +############################################# +# 1. Extract AITER_COMMIT from correct Dockerfile block +############################################# +if [[ "${GPU_ARCH}" == "mi35x" ]]; then + echo "[CI] Using gfx950 block from Dockerfile..." + REPO_AITER_COMMIT=$(awk ' + $0 ~ /FROM.*BASE_IMAGE_950/ {in_block=1} + in_block && $0 ~ /AITER_COMMIT=/ { + match($0, /AITER_COMMIT="([^"]+)"/, arr) + print arr[1]; exit + } + ' "$DOCKERFILE") +else + echo "[CI] Using gfx942-rocm700 block from Dockerfile..." + REPO_AITER_COMMIT=$(awk ' + $0 ~ /FROM.*BASE_IMAGE_942_ROCM700/ {in_block=1} + in_block && $0 ~ /AITER_COMMIT=/ { + match($0, /AITER_COMMIT="([^"]+)"/, arr) + print arr[1]; exit + } + ' "$DOCKERFILE") +fi + +if [[ -z "${REPO_AITER_COMMIT}" ]]; then + echo "[CI] ERROR: Failed to extract AITER_COMMIT from Dockerfile." 
+ exit 1 +fi + +echo "[CI] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}" + +############################################# +# 2. Check container pre-installed AITER version +############################################# +IMAGE_AITER_VERSION=$(docker exec ci_sglang bash -c "pip show aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" || echo "none") + +echo "[CI] AITER version inside CI image: ${IMAGE_AITER_VERSION}" + +############################################# +# 3. Decide rebuild +############################################# +NEED_REBUILD="false" + +if [[ "${IMAGE_AITER_VERSION}" == "none" ]]; then + echo "[CI] No AITER found in image → rebuild required." + NEED_REBUILD="true" +elif [[ "${IMAGE_AITER_VERSION}" != "${REPO_AITER_COMMIT}" ]]; then + echo "[CI] Version mismatch:" + echo " Image: ${IMAGE_AITER_VERSION}" + echo " Repo : ${REPO_AITER_COMMIT}" + NEED_REBUILD="true" +else + echo "[CI] AITER version matches → using image's version." +fi + + +############################################# +# 4. 
Rebuild AITER if needed +############################################# +if [[ "${NEED_REBUILD}" == "true" ]]; then + echo "[CI] === AITER REBUILD START ===" + + # uninstall existing aiter + docker exec ci_sglang pip uninstall -y aiter || true + + # delete old aiter directory + docker exec ci_sglang rm -rf /sgl-workspace/aiter + + # clone a fresh copy to /sgl-workspace/aiter + docker exec ci_sglang git clone https://github.com/ROCm/aiter.git /sgl-workspace/aiter + + # checkout correct version + docker exec ci_sglang bash -c " + cd /sgl-workspace/aiter && \ + git fetch --all && \ + git checkout ${REPO_AITER_COMMIT} && \ + git submodule update --init --recursive + " + + # detect GPU ARCH list inside container + GPU_ARCH_LIST=$(docker exec ci_sglang rocminfo | grep -oE 'gfx[0-9]+' | sort -u | tr '\n' ',' | sed 's/,$//') + echo "[CI] GPU_ARCH_LIST=${GPU_ARCH_LIST}" + + # build AITER + docker exec ci_sglang bash -c " + cd /sgl-workspace/aiter && \ + GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop + " + + echo "[CI] === AITER REBUILD COMPLETE ===" +fi + +echo "[CI] === AITER VERSION CHECK END ===" + + # Clear pre-built AITER kernels from Docker image to avoid segfaults # The Docker image may contain pre-compiled kernels incompatible with the current environment echo "Clearing pre-built AITER kernels from Docker image..." 
From a4392916c489dcab1006695a3ee0a0b9e891c226 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 07:41:49 +0000 Subject: [PATCH 2/7] detect aiter and rebuild --- scripts/ci/amd_ci_install_dependency.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/ci/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh index 3245fc307c79..241479e9cb71 100755 --- a/scripts/ci/amd_ci_install_dependency.sh +++ b/scripts/ci/amd_ci_install_dependency.sh @@ -138,8 +138,7 @@ echo "[CI] AITER version inside CI image: ${IMAGE_AITER_VERSION}" NEED_REBUILD="false" if [[ "${IMAGE_AITER_VERSION}" == "none" ]]; then - echo "[CI] No AITER found in image → rebuild required." - NEED_REBUILD="true" + echo "[CI] No AITER found in image" elif [[ "${IMAGE_AITER_VERSION}" != "${REPO_AITER_COMMIT}" ]]; then echo "[CI] Version mismatch:" echo " Image: ${IMAGE_AITER_VERSION}" From 05a9206a572870d8d650a6c0ed62ce6d2aa50b5f Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 09:03:02 +0000 Subject: [PATCH 3/7] fix gpu_arc_list --- scripts/ci/amd_ci_install_dependency.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/ci/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh index 241479e9cb71..b19f47b80c00 100755 --- a/scripts/ci/amd_ci_install_dependency.sh +++ b/scripts/ci/amd_ci_install_dependency.sh @@ -172,8 +172,11 @@ if [[ "${NEED_REBUILD}" == "true" ]]; then git submodule update --init --recursive " - # detect GPU ARCH list inside container - GPU_ARCH_LIST=$(docker exec ci_sglang rocminfo | grep -oE 'gfx[0-9]+' | sort -u | tr '\n' ',' | sed 's/,$//') + if [[ "${GPU_ARCH}" == "mi35x" ]]; then + GPU_ARCH_LIST="gfx950" + else + GPU_ARCH_LIST="gfx942" + fi echo "[CI] GPU_ARCH_LIST=${GPU_ARCH_LIST}" # build AITER From 92134220c181fb21fe9aa0ec0e5797cefc840cb3 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 10:24:06 +0000 Subject: [PATCH 4/7] temporary commit to test 
aiter version bump --- docker/rocm.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index 1862828128bf..7b91a03a0e05 100644 --- a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -31,7 +31,7 @@ ENV BUILD_TRITON="0" ENV BUILD_LLVM="0" ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" -ENV AITER_COMMIT="v0.1.7.post5" +ENV AITER_COMMIT="v0.1.8" ENV NO_DEPS_FLAG="" # =============================== From 4aa069dc6529a7d8063b5df28f0c9be52fbc01fe Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 11:25:11 +0000 Subject: [PATCH 5/7] try to fix unexpected syntax error --- scripts/ci/amd_ci_install_dependency.sh | 52 ++++++++++++------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/scripts/ci/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh index b19f47b80c00..9200101a21b4 100755 --- a/scripts/ci/amd_ci_install_dependency.sh +++ b/scripts/ci/amd_ci_install_dependency.sh @@ -87,50 +87,45 @@ docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest # + Rebuild AITER if needed ############################################# -echo "[CI] === AITER VERSION CHECK START ===" +echo "[CI-AITER-CHECK] === AITER VERSION CHECK START ===" DOCKERFILE="docker/rocm.Dockerfile" # GPU_ARCH GPU_ARCH="${GPU_ARCH:-mi30x}" -echo "[CI] Runner GPU_ARCH=${GPU_ARCH}" +echo "[CI-AITER-CHECK] Runner GPU_ARCH=${GPU_ARCH}" ############################################# # 1. Extract AITER_COMMIT from correct Dockerfile block ############################################# if [[ "${GPU_ARCH}" == "mi35x" ]]; then - echo "[CI] Using gfx950 block from Dockerfile..." - REPO_AITER_COMMIT=$(awk ' - $0 ~ /FROM.*BASE_IMAGE_950/ {in_block=1} - in_block && $0 ~ /AITER_COMMIT=/ { - match($0, /AITER_COMMIT="([^"]+)"/, arr) - print arr[1]; exit - } - ' "$DOCKERFILE") + echo "[CI-AITER-CHECK] Using gfx950 block from Dockerfile..." 
+ REPO_AITER_COMMIT=$(grep -F -A20 'FROM $BASE_IMAGE_950 AS gfx950' docker/rocm.Dockerfile \ + | grep 'AITER_COMMIT=' \ + | head -n1 \ + | sed 's/.*AITER_COMMIT="\([^"]*\)".*/\1/') else - echo "[CI] Using gfx942-rocm700 block from Dockerfile..." - REPO_AITER_COMMIT=$(awk ' - $0 ~ /FROM.*BASE_IMAGE_942_ROCM700/ {in_block=1} - in_block && $0 ~ /AITER_COMMIT=/ { - match($0, /AITER_COMMIT="([^"]+)"/, arr) - print arr[1]; exit - } - ' "$DOCKERFILE") + echo "[CI-AITER-CHECK] Using gfx942-rocm700 block from Dockerfile..." + REPO_AITER_COMMIT=$(grep -F -A20 'FROM $BASE_IMAGE_942_ROCM700 AS gfx942-rocm700' docker/rocm.Dockerfile \ + | grep 'AITER_COMMIT=' \ + | head -n1 \ + | sed 's/.*AITER_COMMIT="\([^"]*\)".*/\1/') fi + if [[ -z "${REPO_AITER_COMMIT}" ]]; then - echo "[CI] ERROR: Failed to extract AITER_COMMIT from Dockerfile." + echo "[CI-AITER-CHECK] ERROR: Failed to extract AITER_COMMIT from Dockerfile." exit 1 fi -echo "[CI] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}" +echo "[CI-AITER-CHECK] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}" ############################################# # 2. Check container pre-installed AITER version ############################################# IMAGE_AITER_VERSION=$(docker exec ci_sglang bash -c "pip show aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" || echo "none") -echo "[CI] AITER version inside CI image: ${IMAGE_AITER_VERSION}" +echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}" ############################################# # 3. 
Decide rebuild @@ -138,14 +133,15 @@ echo "[CI] AITER version inside CI image: ${IMAGE_AITER_VERSION}" NEED_REBUILD="false" if [[ "${IMAGE_AITER_VERSION}" == "none" ]]; then - echo "[CI] No AITER found in image" + echo "[CI-AITER-CHECK] No AITER found in image" + NEED_REBUILD="true" elif [[ "${IMAGE_AITER_VERSION}" != "${REPO_AITER_COMMIT}" ]]; then - echo "[CI] Version mismatch:" + echo "[CI-AITER-CHECK] Version mismatch:" echo " Image: ${IMAGE_AITER_VERSION}" echo " Repo : ${REPO_AITER_COMMIT}" NEED_REBUILD="true" else - echo "[CI] AITER version matches → using image's version." + echo "[CI-AITER-CHECK] AITER version matches → using image's version." fi @@ -153,7 +149,7 @@ fi # 4. Rebuild AITER if needed ############################################# if [[ "${NEED_REBUILD}" == "true" ]]; then - echo "[CI] === AITER REBUILD START ===" + echo "[CI-AITER-CHECK] === AITER REBUILD START ===" # uninstall existing aiter docker exec ci_sglang pip uninstall -y aiter || true @@ -177,7 +173,7 @@ if [[ "${NEED_REBUILD}" == "true" ]]; then else GPU_ARCH_LIST="gfx942" fi - echo "[CI] GPU_ARCH_LIST=${GPU_ARCH_LIST}" + echo "[CI-AITER-CHECK] GPU_ARCH_LIST=${GPU_ARCH_LIST}" # build AITER docker exec ci_sglang bash -c " @@ -185,10 +181,10 @@ if [[ "${NEED_REBUILD}" == "true" ]]; then GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop " - echo "[CI] === AITER REBUILD COMPLETE ===" + echo "[CI-AITER-CHECK] === AITER REBUILD COMPLETE ===" fi -echo "[CI] === AITER VERSION CHECK END ===" +echo "[CI-AITER-CHECK] === AITER VERSION CHECK END ===" # Clear pre-built AITER kernels from Docker image to avoid segfaults From d4e55b5a64cf29824580c543e21587c383192101 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 16:24:47 +0000 Subject: [PATCH 6/7] remove the temporary version bump for test --- docker/rocm.Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/rocm.Dockerfile b/docker/rocm.Dockerfile index 7b91a03a0e05..1862828128bf 100644 --- 
a/docker/rocm.Dockerfile +++ b/docker/rocm.Dockerfile @@ -31,7 +31,7 @@ ENV BUILD_TRITON="0" ENV BUILD_LLVM="0" ENV BUILD_AITER_ALL="1" ENV BUILD_MOONCAKE="1" -ENV AITER_COMMIT="v0.1.8" +ENV AITER_COMMIT="v0.1.7.post5" ENV NO_DEPS_FLAG="" # =============================== From 5ce8965ea58eafd749af453c906d169280c49d28 Mon Sep 17 00:00:00 2001 From: yctseng0211 Date: Fri, 19 Dec 2025 16:37:07 +0000 Subject: [PATCH 7/7] add prefix v to IMAGE_AITER_VERSION --- scripts/ci/amd_ci_install_dependency.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/amd_ci_install_dependency.sh b/scripts/ci/amd_ci_install_dependency.sh index 9200101a21b4..8b47cc87390f 100755 --- a/scripts/ci/amd_ci_install_dependency.sh +++ b/scripts/ci/amd_ci_install_dependency.sh @@ -124,7 +124,7 @@ echo "[CI-AITER-CHECK] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}" # 2. Check container pre-installed AITER version ############################################# IMAGE_AITER_VERSION=$(docker exec ci_sglang bash -c "pip show aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" || echo "none") - +if [[ -z "${IMAGE_AITER_VERSION}" || "${IMAGE_AITER_VERSION}" == "none" ]]; then IMAGE_AITER_VERSION="none"; else IMAGE_AITER_VERSION="v${IMAGE_AITER_VERSION}"; fi echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}" #############################################