-
Notifications
You must be signed in to change notification settings - Fork 5.3k
[AMD] Merge Dockerfiles for ROCm #19203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,20 @@ | ||
| # Usage (to build SGLang ROCm docker image): | ||
| # docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx942 -t v0.5.8-rocm700-mi30x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx950 -t v0.5.8-rocm700-mi35x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942 -t v0.5.9-rocm700-mi30x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942-rocm720 -t v0.5.9-rocm720-mi30x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950 -t v0.5.9-rocm700-mi35x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950-rocm720 -t v0.5.9-rocm720-mi35x -f rocm.Dockerfile . | ||
|
|
||
| # Usage (to build SGLang ROCm + Mori docker image): | ||
| # docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8-rocm700-mi30x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8-rocm700-mi35x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm700-mi30x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942-rocm720 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm720-mi30x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm700-mi35x -f rocm.Dockerfile . | ||
| # docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950-rocm720 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm720-mi35x -f rocm.Dockerfile . | ||
|
|
||
| # Default base images | ||
| ARG BASE_IMAGE_942="rocm/sgl-dev:rocm7-vllm-20250904" | ||
| ARG BASE_IMAGE_942_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" | ||
| ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904" | ||
| ARG BASE_IMAGE_950_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1" | ||
|
|
||
| # Declared before the first FROM so that ${GPU_ARCH} can be referenced by the final "FROM ${GPU_ARCH}" stage selection | ||
| ARG GPU_ARCH=gfx950 | ||
|
|
@@ -23,6 +29,16 @@ ENV BUILD_AITER_ALL="1" | |
| ENV BUILD_MOONCAKE="1" | ||
| ENV AITER_COMMIT="v0.1.10.post3" | ||
|
|
||
# ===============================
# Stage selected by GPU_ARCH=gfx942-rocm720:
# the ROCm 7.2 base image for gfx942 plus the build switches for this flavor.
FROM $BASE_IMAGE_942_ROCM720 AS gfx942-rocm720
ENV BUILD_VLLM="0" \
    BUILD_TRITON="1" \
    BUILD_LLVM="0" \
    BUILD_AITER_ALL="1" \
    BUILD_MOONCAKE="1" \
    AITER_COMMIT="v0.1.10.post3"
|
|
||
| # =============================== | ||
| # Base image 950 and args | ||
| FROM $BASE_IMAGE_950 AS gfx950 | ||
|
|
@@ -32,13 +48,25 @@ ENV BUILD_LLVM="0" | |
| ENV BUILD_AITER_ALL="1" | ||
| ENV BUILD_MOONCAKE="1" | ||
| ENV AITER_COMMIT="v0.1.10.post3" | ||
|
|
||
# ===============================
# Stage selected by GPU_ARCH=gfx950-rocm720:
# the ROCm 7.2 base image for gfx950 plus the build switches for this flavor.
FROM $BASE_IMAGE_950_ROCM720 AS gfx950-rocm720
ENV BUILD_VLLM="0" \
    BUILD_TRITON="1" \
    BUILD_LLVM="0" \
    BUILD_AITER_ALL="1" \
    BUILD_MOONCAKE="1" \
    AITER_COMMIT="v0.1.10.post3"
|
|
||
| # =============================== | ||
| # Chosen arch and args | ||
| FROM ${GPU_ARCH} | ||
|
|
||
| # Re-declared inside the stage: an ARG defined before FROM is not visible after FROM unless it is declared again | ||
| ARG GPU_ARCH=gfx950 | ||
| ENV GPU_ARCH_LIST=${GPU_ARCH%-*} | ||
| ENV PYTORCH_ROCM_ARCH=gfx942;gfx950 | ||
|
|
||
| ARG SGL_REPO="https://github.com/sgl-project/sglang.git" | ||
| ARG SGL_DEFAULT="main" | ||
|
|
@@ -47,8 +75,8 @@ ARG SGL_BRANCH=${SGL_DEFAULT} | |
| # Version override for setuptools_scm (used in nightly builds) | ||
| ARG SETUPTOOLS_SCM_PRETEND_VERSION="" | ||
|
|
||
| ARG TRITON_REPO="https://github.com/ROCm/triton.git" | ||
| ARG TRITON_COMMIT="improve_fa_decode_3.0.0" | ||
| ARG TRITON_REPO="https://github.com/triton-lang/triton.git" | ||
| ARG TRITON_COMMIT="42270451990532c67e69d753fbd026f28fcc4840" | ||
|
|
||
| ARG AITER_REPO="https://github.com/ROCm/aiter.git" | ||
|
|
||
|
|
@@ -81,6 +109,20 @@ USER root | |
| RUN python -m pip install --upgrade pip && pip install setuptools_scm | ||
| RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)" | ||
|
|
||
# AMD SMI Python package, installed from the copy under /opt/rocm.
# Only done for the ROCm 7.2 flavors (GPU_ARCH containing "rocm720"), because the
# rocm/pytorch base image used for ROCm 7.2 does not pre-install this package.
RUN set -eux; \
    if [ "${GPU_ARCH#*rocm720}" != "${GPU_ARCH}" ]; then \
        echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
        cd /opt/rocm/share/amd_smi \
        && python3 -m pip install --no-cache-dir .; \
    else \
        echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip amdsmi installation"; \
    fi
|
|
||
| WORKDIR /sgl-workspace | ||
|
|
||
| # ----------------------- | ||
|
|
@@ -102,11 +144,28 @@ RUN if [ "$BUILD_LLVM" = "1" ]; then \ | |
| # (SETUPTOOLS_SCM_PRETEND_VERSION is set later for SGLang nightly builds and would otherwise | ||
| # leak into AITER's version when AITER uses setuptools_scm) | ||
| ENV SETUPTOOLS_SCM_PRETEND_VERSION= | ||
| RUN pip uninstall -y aiter | ||
| RUN pip uninstall -y aiter \ | ||
| && pip install psutil pybind11 # Required by AITER setup.py | ||
| RUN git clone ${AITER_REPO} \ | ||
| && cd aiter \ | ||
| && git checkout ${AITER_COMMIT} \ | ||
| && git submodule update --init --recursive | ||
|
|
||
# Hot patch for AITER v0.1.10.post3, applied to the ROCm 7.2 flavors only
# (needed because that flavor was rebased from the vllm image to rocm/pytorch).
# The sed rewrites the `if ...:` condition on line 459 of pa_mqa_logits.py to
# `if False:`; the line number is tied to the pinned AITER_COMMIT.
RUN set -eux; \
    if [ "${GPU_ARCH#*rocm720}" != "${GPU_ARCH}" ]; then \
        echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
        cd aiter \
        && sed -i '459 s/if.*:/if False:/' aiter/ops/triton/attention/pa_mqa_logits.py; \
    else \
        echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \
    fi
|
|
||
HaiShaw marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| RUN cd aiter \ | ||
| && echo "[AITER] GPU_ARCH=${GPU_ARCH}" \ | ||
| && if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \ | ||
|
|
@@ -115,31 +174,8 @@ RUN cd aiter \ | |
| sh -c "PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ | ||
| else \ | ||
| sh -c "GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \ | ||
| fi | ||
|
|
||
| # ----------------------- | ||
| # Triton (TODO: remove this after Triton is no longer needed) | ||
| RUN if [ "$BUILD_TRITON" = "1" ]; then \ | ||
| pip uninstall -y triton \ | ||
| && git clone ${TRITON_REPO} \ | ||
| && cd triton \ | ||
| && git checkout ${TRITON_COMMIT} \ | ||
| && cd python \ | ||
| && python setup.py install; \ | ||
| fi | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

This should be kept for ROCm 7.0.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

It was just moved to the bottom, not removed. |
||
| # ----------------------- | ||
| # Build vLLM | ||
| ARG VLLM_REPO="https://github.com/ROCm/vllm.git" | ||
| ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c" | ||
| RUN if [ "$BUILD_VLLM" = "1" ]; then \ | ||
| git clone ${VLLM_REPO} \ | ||
| && cd vllm \ | ||
| && git checkout ${VLLM_BRANCH} \ | ||
| && python -m pip install -r requirements/rocm.txt \ | ||
| && python setup.py clean --all \ | ||
| && python setup.py develop; \ | ||
| fi | ||
| fi \ | ||
| && echo "export PYTHONPATH=/sgl-workspace/aiter:\${PYTHONPATH}" >> /etc/bash.bashrc | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Lost
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We (YC, BingXu) discussed yesterday that the line wasn't needed.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is needed so
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry but can you elaborate? If it is the case, why doesn't ROCm 7.0 image need it in the first place? |
||
| # ----------------------- | ||
| # Build Mooncake | ||
|
|
@@ -234,7 +270,7 @@ RUN /bin/bash -lc 'set -euo pipefail; \ | |
| libgtest-dev libgmock-dev \ | ||
| libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \ | ||
| python3 python3-dev python3-setuptools python3-pip python3-apt \ | ||
| gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \ | ||
| gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev vim \ | ||
| cmake ninja-build pkg-config libstdc++6 software-properties-common \ | ||
| && rm -rf /var/lib/apt/lists/*; \ | ||
| \ | ||
|
|
@@ -306,7 +342,8 @@ RUN /bin/bash -lc 'set -euo pipefail; \ | |
| # Python tools | ||
| RUN python3 -m pip install --no-cache-dir \ | ||
| py-spy \ | ||
| pre-commit | ||
| pre-commit \ | ||
| tabulate | ||
|
|
||
| # ----------------------- | ||
| # MORI (optional) | ||
|
|
@@ -375,9 +412,95 @@ RUN /bin/bash -lc 'set -euo pipefail; \ | |
| echo "export PYTHONPATH=/sgl-workspace/mori:\${PYTHONPATH}" >> /etc/bash.bashrc; \ | ||
| echo "[MORI] Done."' | ||
|
|
||
| # ----------------------- | ||
| # Hot patch: torch-ROCm | ||
| # The artifact hardcoded the supported triton version to be 3.5.1. | ||
| # Rewrite the restriction directly. | ||
| ARG TORCH_ROCM_FILE="torch-2.9.1+rocm7.2.0.lw.git7e1940d4-cp310-cp310-linux_x86_64.whl" | ||
| RUN mkdir /tmp/whl && cd /tmp/whl \ | ||
| && export TORCH_ROCM_FILE="${TORCH_ROCM_FILE}" \ | ||
| && cat > hack.py <<"PY" | ||
"""Rebuild the ROCm torch wheel with a relaxed triton requirement.

The wheel's METADATA pins ``triton==3.5.1<suffix>``; this script rewrites that
pin to ``triton>=3.5.1`` and writes a new wheel next to the original.

When run as a script it reads the wheel file name from the TORCH_ROCM_FILE
environment variable, takes the input wheel from ``/``, unpacks it in
``/tmp/whl`` and writes the patched wheel to ``/tmp``.
"""
import csv
import os
import re
import zipfile
from pathlib import Path

# The exact pin to relax. Dots are escaped so that only the literal version
# "3.5.1" matches; any local-version suffix up to the ';' is consumed too.
TRITON_PIN = re.compile(r"^Requires-Dist:\s*triton==3\.5\.1[^\s]*;", re.MULTILINE)


def patch_wheel(in_whl, out_whl, work, exclude=()):
    """Unpack ``in_whl`` into ``work``, relax the triton pin, and re-zip to ``out_whl``.

    in_whl  -- path of the wheel to read.
    out_whl -- path of the wheel to write.
    work    -- existing directory used as the extraction area.
    exclude -- paths inside ``work`` that must not be packed into the new wheel
               (for example this script itself when it lives in ``work``).

    Raises SystemExit if METADATA does not contain the expected pin.
    """
    in_whl, out_whl, work = Path(in_whl), Path(out_whl), Path(work)

    # 1) Extract
    with zipfile.ZipFile(in_whl, "r") as z:
        z.extractall(work)

    # 2) Locate dist-info and patch METADATA.
    dist_info = next(work.glob("*.dist-info"))
    meta = dist_info / "METADATA"
    txt = meta.read_text(encoding="utf-8")
    # Keep the "Requires-Dist:" field name: replacing the whole match with a
    # bare requirement would leave a line that is not a valid metadata field.
    txt2, n = TRITON_PIN.subn("Requires-Dist: triton>=3.5.1;", txt)
    if n == 0:
        raise SystemExit("Did not find expected Requires-Dist line to replace in METADATA")
    meta.write_text(txt2, encoding="utf-8")

    # 3) Hacky step: blank hash/size columns in RECORD, since the METADATA
    #    contents no longer match the recorded hash and size.
    record = dist_info / "RECORD"
    rows = []
    with record.open(newline="", encoding="utf-8") as f:
        for r in csv.reader(f):
            if not r:
                continue
            # keep filename, blank out hash and size
            rows.append([r[0], "", ""])
    with record.open("w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)

    # 4) Re-zip as a wheel, leaving out helper files that are not wheel content
    #    (and the output file itself, should it be placed inside ``work``).
    skip = {Path(p).resolve() for p in exclude}
    skip.add(out_whl.resolve())
    with zipfile.ZipFile(out_whl, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in work.rglob("*"):
            if p.is_file() and p.resolve() not in skip:
                z.write(p, p.relative_to(work).as_posix())


def main():
    """Patch the wheel named by TORCH_ROCM_FILE using the fixed build paths."""
    fname = os.environ["TORCH_ROCM_FILE"]
    out_whl = Path("/tmp") / fname
    # This script is written into /tmp/whl, the same directory that is
    # re-zipped, so it has to be excluded from the rebuilt wheel.
    patch_wheel(Path("/") / fname, out_whl, Path("/tmp/whl"), exclude=[__file__])
    print("Wrote", out_whl)


if __name__ == "__main__":
    main()
| PY | ||
HaiShaw marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
# Run the wheel-patching script and reinstall torch from the patched wheel.
# Only the ROCm 7.2 flavors (GPU_ARCH containing "rocm720") are patched; the
# work directory and the patched wheel are removed after a successful install.
RUN cd /tmp/whl \
    && if [ "${GPU_ARCH#*rocm720}" != "${GPU_ARCH}" ]; then \
        echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
        python hack.py \
        && python3 -m pip install --force --no-deps /tmp/${TORCH_ROCM_FILE} \
        && rm -fr /tmp/whl /tmp/${TORCH_ROCM_FILE}; \
    else \
        echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \
    fi
|
|
||
|
|
||
# -----------------------
# Hot patch: Triton
# Builds Triton from ${TRITON_REPO} at ${TRITON_COMMIT} as an editable install
# when BUILD_TRITON=1.
# For ROCm 7.2, this custom build breaks pip dependency management,
# so future `pip install` will break the ROCm stack.
# A workaround for this is to reinstall the default triton
# wheel with the `rocm/pytorch` image in the root directory.
#
# The package index is refreshed in this same layer because an earlier layer
# removes /var/lib/apt/lists; apt-get is used instead of apt, whose command-line
# interface is not meant for scripts.
RUN if [ "$BUILD_TRITON" = "1" ]; then \
        pip uninstall -y triton \
        && apt-get update \
        && apt-get install -y cmake \
        && rm -rf /var/lib/apt/lists/* \
        && git clone "${TRITON_REPO}" triton-custom \
        && cd triton-custom \
        && git checkout "${TRITON_COMMIT}" \
        && pip install -r python/requirements.txt \
        && pip install -e .; \
    fi
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

Please keep L426-L509 identical to the ROCm 7.2 Dockerfile, and apply it only to the ROCm 7.2 build.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No. I believe Currently ROCm 7.2 images gives |
||
| # ----------------------- | ||
| # Performance environment variable. | ||
|
|
||
# The CuDNN compatibility check is skipped: it does not apply to ROCm, which uses MIOpen instead.
ENV SGLANG_DISABLE_CUDNN_CHECK=1 \
    HIP_FORCE_DEV_KERNARG=1 \
    HSA_NO_SCRATCH_RECLAIM=1 \
    SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
|
|
||
Uh oh!
There was an error while loading. Please reload this page.