Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/pr-test-amd-rocm720.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ on:
# - "test/**"
# - "sgl-kernel/**"
# - ".github/workflows/pr-test-amd-rocm720.yml"
# - "docker/rocm720.Dockerfile"
# - "docker/rocm.Dockerfile"
# pull_request:
# branches: [ main ]
# paths:
Expand All @@ -24,7 +24,7 @@ on:
# - "test/**"
# - "sgl-kernel/**"
# - ".github/workflows/pr-test-amd-rocm720.yml"
# - "docker/rocm720.Dockerfile"
# - "docker/rocm.Dockerfile"
workflow_dispatch:
inputs:
target_stage:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release-docker-amd-rocm720-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,5 +78,5 @@ jobs:

tag=v${version}-${rocm_tag}

docker build . -f docker/rocm720.Dockerfile --build-arg SGL_BRANCH=${{ github.ref_name }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic --build-arg SETUPTOOLS_SCM_PRETEND_VERSION=${pretend_version} -t rocm/sgl-dev:${tag}-${{ env.DATE }} --no-cache
docker build . -f docker/rocm.Dockerfile --build-arg SGL_BRANCH=${{ github.ref_name }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic --build-arg SETUPTOOLS_SCM_PRETEND_VERSION=${pretend_version} -t rocm/sgl-dev:${tag}-${{ env.DATE }} --no-cache
docker push rocm/sgl-dev:${tag}-${{ env.DATE }}
5 changes: 1 addition & 4 deletions .github/workflows/release-docker-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,8 @@ jobs:
version=${{ steps.version.outputs.version }}
echo "Version: ${version}"

dockerfile=""
gpu_arch_suffix=""
if [ "${{ matrix.rocm_version }}" = "rocm700" ]; then
dockerfile="docker/rocm.Dockerfile"
if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then
rocm_tag="rocm700-mi30x"
elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then
Expand All @@ -70,7 +68,6 @@ jobs:
fi
elif [ "${{ matrix.rocm_version }}" = "rocm720" ]; then
gpu_arch_suffix="-${{ matrix.rocm_version }}"
dockerfile="docker/rocm720.Dockerfile"
if [ "${{ matrix.gpu_arch }}" = "gfx942" ]; then
rocm_tag="rocm720-mi30x"
elif [ "${{ matrix.gpu_arch }}" = "gfx950" ]; then
Expand All @@ -87,5 +84,5 @@ jobs:
tag=v${version}-${rocm_tag}

# rocm.Dockerfile expects SGL_BRANCH with 'v' prefix for git tag checkout
docker build . -f ${dockerfile} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }}${gpu_arch_suffix} --build-arg SGL_BRANCH=v${version} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t lmsysorg/sglang:${tag} --no-cache
docker build . -f docker/rocm.Dockerfile --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GPU_ARCH=${{ matrix.gpu_arch }}${gpu_arch_suffix} --build-arg SGL_BRANCH=v${version} --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t lmsysorg/sglang:${tag} --no-cache
docker push lmsysorg/sglang:${tag}
191 changes: 157 additions & 34 deletions docker/rocm.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
# Usage (to build SGLang ROCm docker image):
# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx942 -t v0.5.8-rocm700-mi30x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx950 -t v0.5.8-rocm700-mi35x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942 -t v0.5.9-rocm700-mi30x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942-rocm720 -t v0.5.9-rocm720-mi30x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950 -t v0.5.9-rocm700-mi35x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950-rocm720 -t v0.5.9-rocm720-mi35x -f rocm.Dockerfile .

# Usage (to build SGLang ROCm + Mori docker image):
# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8-rocm700-mi30x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.8 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.8-rocm700-mi35x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm700-mi30x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx942-rocm720 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm720-mi30x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm700-mi35x -f rocm.Dockerfile .
# docker build --build-arg SGL_BRANCH=v0.5.9 --build-arg GPU_ARCH=gfx950-rocm720 --build-arg ENABLE_MORI=1 --build-arg NIC_BACKEND=ainic -t v0.5.9-rocm720-mi35x -f rocm.Dockerfile .

# Default base images
ARG BASE_IMAGE_942="rocm/sgl-dev:rocm7-vllm-20250904"
ARG BASE_IMAGE_942_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1"
ARG BASE_IMAGE_950="rocm/sgl-dev:rocm7-vllm-20250904"
ARG BASE_IMAGE_950_ROCM720="rocm/pytorch:rocm7.2_ubuntu22.04_py3.10_pytorch_release_2.9.1"

# This is necessary for scope purpose
ARG GPU_ARCH=gfx950
Expand All @@ -23,6 +29,16 @@ ENV BUILD_AITER_ALL="1"
ENV BUILD_MOONCAKE="1"
ENV AITER_COMMIT="v0.1.10.post3"

# ===============================
# Base image 942 with rocm720 and args
FROM $BASE_IMAGE_942_ROCM720 AS gfx942-rocm720
ENV BUILD_VLLM="0"
ENV BUILD_TRITON="1"
ENV BUILD_LLVM="0"
ENV BUILD_AITER_ALL="1"
ENV BUILD_MOONCAKE="1"
ENV AITER_COMMIT="v0.1.10.post3"

# ===============================
# Base image 950 and args
FROM $BASE_IMAGE_950 AS gfx950
Expand All @@ -32,13 +48,25 @@ ENV BUILD_LLVM="0"
ENV BUILD_AITER_ALL="1"
ENV BUILD_MOONCAKE="1"
ENV AITER_COMMIT="v0.1.10.post3"

# ===============================
# Base image 950 with rocm720 and args
FROM $BASE_IMAGE_950_ROCM720 AS gfx950-rocm720
ENV BUILD_VLLM="0"
ENV BUILD_TRITON="1"
ENV BUILD_LLVM="0"
ENV BUILD_AITER_ALL="1"
ENV BUILD_MOONCAKE="1"
ENV AITER_COMMIT="v0.1.10.post3"

# ===============================
# Chosen arch and args
FROM ${GPU_ARCH}

# This is necessary for scope purpose, again
ARG GPU_ARCH=gfx950
ENV GPU_ARCH_LIST=${GPU_ARCH%-*}
ENV PYTORCH_ROCM_ARCH=gfx942;gfx950

ARG SGL_REPO="https://github.com/sgl-project/sglang.git"
ARG SGL_DEFAULT="main"
Expand All @@ -47,8 +75,8 @@ ARG SGL_BRANCH=${SGL_DEFAULT}
# Version override for setuptools_scm (used in nightly builds)
ARG SETUPTOOLS_SCM_PRETEND_VERSION=""

ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG TRITON_COMMIT="improve_fa_decode_3.0.0"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG TRITON_COMMIT="42270451990532c67e69d753fbd026f28fcc4840"

ARG AITER_REPO="https://github.com/ROCm/aiter.git"

Expand Down Expand Up @@ -81,6 +109,20 @@ USER root
RUN python -m pip install --upgrade pip && pip install setuptools_scm
RUN apt-get purge -y sccache; python -m pip uninstall -y sccache; rm -f "$(which sccache)"

# Install AMD SMI Python package from ROCm distribution.
# The ROCm 7.2 base image (rocm/pytorch) does not pre-install this package.
# The ROCm 7.2 flavor is detected by matching "rocm720" anywhere in GPU_ARCH
# (e.g. gfx942-rocm720); any other GPU_ARCH value only prints a skip message.
# The package is installed with pip from the /opt/rocm/share/amd_smi source directory.
RUN set -eux; \
case "${GPU_ARCH}" in \
*rocm720*) \
echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
cd /opt/rocm/share/amd_smi \
&& python3 -m pip install --no-cache-dir . \
;; \
*) \
echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip amdsmi installation"; \
;; \
esac

WORKDIR /sgl-workspace

# -----------------------
Expand All @@ -102,11 +144,28 @@ RUN if [ "$BUILD_LLVM" = "1" ]; then \
# (SETUPTOOLS_SCM_PRETEND_VERSION is set later for SGLang nightly builds and would otherwise
# leak into AITER's version when AITER uses setuptools_scm)
ENV SETUPTOOLS_SCM_PRETEND_VERSION=
RUN pip uninstall -y aiter
RUN pip uninstall -y aiter \
&& pip install psutil pybind11 # Required by AITER setup.py
RUN git clone ${AITER_REPO} \
&& cd aiter \
&& git checkout ${AITER_COMMIT} \
&& git submodule update --init --recursive

# Hot patches for AITER in v0.1.10.post3
# This is for ROCm 7.2 only, because of the image rebase from vllm
# to rocm/pytorch.
# The patch rewrites the `if ...:` text on line 459 of
# aiter/ops/triton/attention/pa_mqa_logits.py to `if False:`.
# NOTE(review): the sed address is a hard-coded line number, and sed exits 0
# even when nothing is substituted, so bumping AITER_COMMIT could silently turn
# this into a no-op or patch a different `if` -- confirm line 459 on every bump.
RUN set -eux; \
case "${GPU_ARCH}" in \
*rocm720*) \
echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
cd aiter \
&& sed -i '459 s/if.*:/if False:/' aiter/ops/triton/attention/pa_mqa_logits.py; \
;; \
*) \
echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \
;; \
esac

RUN cd aiter \
&& echo "[AITER] GPU_ARCH=${GPU_ARCH}" \
&& if [ "$BUILD_AITER_ALL" = "1" ] && [ "$BUILD_LLVM" = "1" ]; then \
Expand All @@ -115,31 +174,8 @@ RUN cd aiter \
sh -c "PREBUILD_KERNELS=1 GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \
else \
sh -c "GPU_ARCHS=$GPU_ARCH_LIST python setup.py develop"; \
fi

# -----------------------
# Triton (TODO: remove this after Triton is no longer needed)
RUN if [ "$BUILD_TRITON" = "1" ]; then \
pip uninstall -y triton \
&& git clone ${TRITON_REPO} \
&& cd triton \
&& git checkout ${TRITON_COMMIT} \
&& cd python \
&& python setup.py install; \
fi

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should keep this for ROCm 7.0

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was just moved to the bottom, not removed.

# -----------------------
# Build vLLM
ARG VLLM_REPO="https://github.com/ROCm/vllm.git"
ARG VLLM_BRANCH="9f6b92db47c3444b7a7d67451ba0c3a2d6af4c2c"
RUN if [ "$BUILD_VLLM" = "1" ]; then \
git clone ${VLLM_REPO} \
&& cd vllm \
&& git checkout ${VLLM_BRANCH} \
&& python -m pip install -r requirements/rocm.txt \
&& python setup.py clean --all \
&& python setup.py develop; \
fi
fi \
&& echo "export PYTHONPATH=/sgl-workspace/aiter:\${PYTHONPATH}" >> /etc/bash.bashrc

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line `&& echo "export PYTHONPATH=/sgl-workspace/aiter:\${PYTHONPATH}" >> /etc/bash.bashrc` was lost for the ROCm 7.2 case.
Please be careful when merging the two Dockerfiles.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We (YC, BingXu) discussed yesterday that the line wasn't needed.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed so sys.path on ROCm 7.2 is set properly to include /sgl-workspace/aiter

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry but can you elaborate? If it is the case, why doesn't ROCm 7.0 image need it in the first place?
CC. @yctseng0211 @bingxche

# -----------------------
# Build Mooncake
Expand Down Expand Up @@ -234,7 +270,7 @@ RUN /bin/bash -lc 'set -euo pipefail; \
libgtest-dev libgmock-dev \
libprotobuf-dev protobuf-compiler libgflags-dev libsqlite3-dev \
python3 python3-dev python3-setuptools python3-pip python3-apt \
gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev \
gcc libtinfo-dev zlib1g-dev libedit-dev libxml2-dev vim \
cmake ninja-build pkg-config libstdc++6 software-properties-common \
&& rm -rf /var/lib/apt/lists/*; \
\
Expand Down Expand Up @@ -306,7 +342,8 @@ RUN /bin/bash -lc 'set -euo pipefail; \
# Python tools
RUN python3 -m pip install --no-cache-dir \
py-spy \
pre-commit
pre-commit \
tabulate

# -----------------------
# MORI (optional)
Expand Down Expand Up @@ -375,9 +412,95 @@ RUN /bin/bash -lc 'set -euo pipefail; \
echo "export PYTHONPATH=/sgl-workspace/mori:\${PYTHONPATH}" >> /etc/bash.bashrc; \
echo "[MORI] Done."'

# -----------------------
# Hot patch: torch-ROCm
# The artifact hardcoded the supported triton version to be 3.5.1.
# Rewrite the restriction directly.
ARG TORCH_ROCM_FILE="torch-2.9.1+rocm7.2.0.lw.git7e1940d4-cp310-cp310-linux_x86_64.whl"
RUN mkdir /tmp/whl && cd /tmp/whl \
&& export TORCH_ROCM_FILE="${TORCH_ROCM_FILE}" \
&& cat > hack.py <<"PY"
"""Relax the exact triton pin in the ROCm torch wheel and repack the wheel.

Reads the wheel named by $TORCH_ROCM_FILE from /, unpacks it into /tmp/whl,
rewrites the `Requires-Dist: triton==3.5.1...` requirement in METADATA to
`triton>=3.5.1`, blanks the hash/size columns of RECORD, and writes the
repacked wheel to /tmp under the same file name.
"""
import csv
import os
import re
import zipfile
from pathlib import Path

WORK_DIR = Path("/tmp/whl")

# Group 1 captures the "Requires-Dist:" header (and its spacing) so the
# rewritten line stays a valid metadata field instead of a bare requirement.
TRITON_PIN = re.compile(r"^(Requires-Dist:\s*)triton==3\.5\.1[^\s]*;", re.MULTILINE)


def relax_triton_pin(metadata_text):
    """Replace the exact triton==3.5.1 pin with triton>=3.5.1.

    Returns a (new_text, replacements) tuple; replacements is 0 when no
    matching Requires-Dist line was found. Any environment marker after the
    ';' is left untouched.
    """
    return TRITON_PIN.subn(r"\g<1>triton>=3.5.1;", metadata_text)


def blank_record(record_path):
    """Keep each RECORD file name but blank its hash and size columns.

    The patched METADATA no longer matches its recorded hash, so all
    hash/size entries are emptied rather than recomputed.
    """
    with record_path.open(newline="", encoding="utf-8") as f:
        rows = [[r[0], "", ""] for r in csv.reader(f) if r]
    with record_path.open("w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(rows)


def main():
    fname = os.environ["TORCH_ROCM_FILE"]
    in_whl = Path("/") / fname
    out_whl = Path("/tmp") / fname
    # This script lives inside WORK_DIR; remember it so it is not packed into the wheel.
    this_script = Path(__file__).resolve()

    # 1) Extract
    with zipfile.ZipFile(in_whl, "r") as z:
        z.extractall(WORK_DIR)

    # 2) Locate dist-info and patch METADATA
    dist_info = next(WORK_DIR.glob("*.dist-info"), None)
    if dist_info is None:
        raise SystemExit("Did not find a *.dist-info directory in the extracted wheel")
    meta = dist_info / "METADATA"
    patched, count = relax_triton_pin(meta.read_text(encoding="utf-8"))
    if count == 0:
        raise SystemExit("Did not find expected Requires-Dist line to replace in METADATA")
    meta.write_text(patched, encoding="utf-8")

    # 3) Hacky step: blank hash/size columns in RECORD
    blank_record(dist_info / "RECORD")

    # 4) Re-zip as a wheel
    with zipfile.ZipFile(out_whl, "w", compression=zipfile.ZIP_DEFLATED) as z:
        for p in WORK_DIR.rglob("*"):
            if p.is_file() and p.resolve() != this_script:
                z.write(p, p.relative_to(WORK_DIR).as_posix())

    print("Wrote", out_whl)


if __name__ == "__main__":
    main()
PY

# Apply the torch wheel patch on ROCm 7.2 flavors only (GPU_ARCH containing "rocm720"):
# run hack.py, reinstall the wheel it writes to /tmp/${TORCH_ROCM_FILE} with
# `--force --no-deps`, then delete the scratch directory and the patched wheel.
# Other flavors only print a skip message; the cleanup `rm` is not run for them.
RUN cd /tmp/whl \
&& case "${GPU_ARCH}" in \
*rocm720*) \
echo "ROCm 7.2 flavor detected from GPU_ARCH=${GPU_ARCH}"; \
python hack.py \
&& python3 -m pip install --force --no-deps /tmp/${TORCH_ROCM_FILE} \
&& rm -fr /tmp/whl /tmp/${TORCH_ROCM_FILE} \
;; \
*) \
echo "Not rocm720 (GPU_ARCH=${GPU_ARCH}), skip patch"; \
;; \
esac


# -----------------------
# Hot patch: Triton
# For ROCm 7.2, this custom build breaks pip dependency management,
# so future `pip install` will break the ROCm stack.
# A workaround for this is to reinstall the default triton
# wheel with the `rocm/pytorch` image in the root directory.
# The step runs only when BUILD_TRITON=1: it uninstalls the existing triton,
# clones TRITON_REPO into ./triton-custom, checks out TRITON_COMMIT, installs
# python/requirements.txt, and installs the checkout in editable mode.
RUN if [ "$BUILD_TRITON" = "1" ]; then \
pip uninstall -y triton \
&& apt install -y cmake \
&& git clone ${TRITON_REPO} triton-custom \
&& cd triton-custom \
&& git checkout ${TRITON_COMMIT} \
&& pip install -r python/requirements.txt \
&& pip install -e .; \
fi

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L426-L509: please keep this identical to the ROCm 7.2 Dockerfile, and apply it only to the ROCm 7.2 build.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. I believe BUILD_TRITON is already a better flag for this block and serves its purpose correctly.

Currently the ROCm 7.2 images set BUILD_TRITON=1 while the ROCm 7.0 images set BUILD_TRITON=0. The flag has a finer granularity that can serve us well. For example, if we don't want to update gfx942-rocm720 for some reason, it can stay as is even after we have fixed the triton dependency issue for gfx950-rocm720.

# -----------------------
# Performance environment variable.

# Skip CuDNN compatibility check - not applicable for ROCm (uses MIOpen instead)
ENV SGLANG_DISABLE_CUDNN_CHECK=1
ENV HIP_FORCE_DEV_KERNARG=1
ENV HSA_NO_SCRATCH_RECLAIM=1
ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
Expand Down
Loading
Loading