Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-and-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ on:
default: ""

env:
IMAGE_REPO: kyuz0/vllm-therock-gfx1151
IMAGE_REPO: lafunamor/vllm-therock-gfx1151
DOCKER_BUILDKIT: "1"

jobs:
Expand Down
143 changes: 101 additions & 42 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,33 @@
FROM registry.fedoraproject.org/fedora:43

# 1. System Base & Build Tools
# Added 'gperftools-libs' for tcmalloc (fixes double-free)
##############################################################################
# Stage 1 – builder
# Contains the full ROCm SDK + compiler toolchain needed to compile vLLM,
# flash-attention, and bitsandbytes. Only /opt/rocm, /opt/venv, and
# /etc/profile.d/rocm-sdk.sh are carried forward from this stage.
##############################################################################
FROM registry.fedoraproject.org/fedora:43 AS builder

# System build tools + runtime deps (gperftools-libs for tcmalloc)
COPY scripts/install_deps.sh /tmp/install_deps.sh
RUN sh /tmp/install_deps.sh

# 2. Install "TheRock" ROCm SDK (Tarball Method)
# TheRock ROCm SDK (full SDK including LLVM/Clang needed as CC/CXX)
WORKDIR /tmp
ARG ROCM_MAJOR_VER=7
ARG GFX=gfx1151
# We pass ARGs to the script via ENV or rely on defaults.
# But let's be explicit and export them for the RUN command.
COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
RUN chmod +x /tmp/install_rocm_sdk.sh && \
export ROCM_MAJOR_VER=$ROCM_MAJOR_VER && \
export GFX=$GFX && \
/tmp/install_rocm_sdk.sh

# 4. Python Venv Setup
# Python venv
RUN /usr/bin/python3.12 -m venv /opt/venv
ENV VIRTUAL_ENV=/opt/venv
ENV PATH=/opt/venv/bin:$PATH
ENV PIP_NO_CACHE_DIR=1
RUN printf 'source /opt/venv/bin/activate\n' > /etc/profile.d/venv.sh
RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"

# 5. Install PyTorch (TheRock Nightly)
# PyTorch (TheRock Nightly) — slow and stable, keep early for cache
RUN python -m pip install \
--index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
--pre torch torchaudio torchvision && \
Expand All @@ -38,55 +40,49 @@ WORKDIR /opt
ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
ENV LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH"

RUN git clone https://github.com/ROCm/flash-attention.git &&\
RUN git clone https://github.com/ROCm/flash-attention.git &&\
cd flash-attention &&\
git checkout main_perf &&\
python setup.py install && \
cd /opt && rm -rf /opt/flash-attention

# 6. Clone vLLM
# Clone vLLM
RUN git clone https://github.com/vllm-project/vllm.git /opt/vllm
WORKDIR /opt/vllm

# --- PATCHING ---
# Patch for Strix Halo (gfx1151)
COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
RUN python /opt/vllm/patch_strix.py && \
sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt
sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt

# 7. Build vLLM (Wheel Method) with CLANG Host Compiler
# Build vLLM wheel with ROCm Clang as host compiler
RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
ENV ROCM_HOME="/opt/rocm"
ENV HIP_PATH="/opt/rocm"
ENV VLLM_TARGET_DEVICE="rocm"
ENV PYTORCH_ROCM_ARCH="gfx1151"
ENV HIP_ARCHITECTURES="gfx1151"
ENV AMDGPU_TARGETS="gfx1151"
ENV HIP_ARCHITECTURES="gfx1151"
ENV AMDGPU_TARGETS="gfx1151"
ENV MAX_JOBS="4"

# --- CRITICAL FIX FOR SEGFAULT ---
# We force the Host Compiler (CC/CXX) to be the ROCm Clang, not Fedora GCC.
# This aligns the ABI of the compiled vLLM extensions with PyTorch.
# Force ROCm Clang as CC/CXX to align ABI with PyTorch (prevents segfault)
ENV CC="/opt/rocm/llvm/bin/clang"
ENV CXX="/opt/rocm/llvm/bin/clang++"

RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
python -m pip install /tmp/dist/*.whl

RUN python -m pip install ray

# --- bitsandbytes (ROCm) ---
# bitsandbytes (ROCm)
WORKDIR /opt
RUN git clone -b rocm_enabled_multi_backend https://github.com/ROCm/bitsandbytes.git
WORKDIR /opt/bitsandbytes

# Explicitly set HIP_PLATFORM (Docker ENV, not /etc/profile)
ENV HIP_PLATFORM="amd"
ENV CMAKE_PREFIX_PATH="/opt/rocm"

# Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
RUN cmake -S . \
-DGPU_TARGETS="gfx1151" \
-DBNB_ROCM_ARCH="gfx1151" \
Expand All @@ -97,13 +93,85 @@ RUN cmake -S . \
make -j$(nproc) && \
python -m pip install --no-cache-dir . --no-build-isolation --no-deps

# 8. Final Cleanup & Runtime
# Strip debug symbols and remove bytecode caches from venv
WORKDIR /opt
RUN chmod -R a+rwX /opt && \
find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
rm -rf /root/.cache/pip || true && \
dnf clean all && rm -rf /var/cache/dnf/*
RUN find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} +

##############################################################################
# Stage 2 – rocm-pruned
# Throwaway stage layered on top of builder. It deletes the parts of /opt/rocm
# that are not needed at runtime so that the runtime stage can COPY an already
# pruned tree. Pruning after the COPY (inside the runtime stage) would not
# shrink the image: the deleted files would still be stored in the COPY layer.
##############################################################################
FROM builder AS rocm-pruned

RUN \
    # Test/benchmark client suites
    rm -rf /opt/rocm/clients /opt/rocm/tests \
    # Profiler and video decode tooling (irrelevant for LLM inference)
        /opt/rocm/share/rocprofiler-systems \
        /opt/rocm/share/rocprofiler-sdk \
        /opt/rocm/share/rocdecode \
        /opt/rocm/lib/rdc \
        /opt/rocm/lib/rocprofiler-systems \
    # Static libraries — build-time only, never loaded at runtime
        /opt/rocm/lib/libdevice_conv_operations.a \
        /opt/rocm/lib/libdevice_reduction_operations.a \
        /opt/rocm/lib/libdevice_contraction_operations.a \
        /opt/rocm/lib/librocshmem.a \
    # LLVM static libs — same, build-time only
    && find /opt/rocm/lib/llvm/lib -name '*.a' -delete \
    # Test/bench/validate binaries and gtest data files in bin/
    && find /opt/rocm/bin \( \
        -name '*-test' -o -name '*_test' -o \
        -name '*-bench' -o \
        -name '*-validate' -o \
        -name '*_gtest.data' -o \
        -name '*-gtest' \
    \) -delete \
    # Specific build/tuning tools not needed at runtime
    && rm -f \
        /opt/rocm/bin/hipify-clang \
        /opt/rocm/bin/rocblas-gemm-tune \
        /opt/rocm/bin/rocshmem_functional_tests

##############################################################################
# Stage 3 – runtime
# Clean Fedora base with only runtime packages. Receives the pruned /opt/rocm
# from rocm-pruned and /opt/venv from builder. No distro build tools, no
# source trees, no build caches.
##############################################################################
FROM registry.fedoraproject.org/fedora:43

# Runtime-only packages — no gcc, cmake, ninja, git, aria2c
# glibc-devel + python3.12-devel required: Triton JIT-compiles Python extension
# modules at runtime via ROCm Clang and needs stdlib.h and Python.h
RUN dnf -y install --setopt=install_weak_deps=False --nodocs \
        python3.12 python3.12-devel libatomic bash ca-certificates curl rsync \
        ffmpeg-free \
        vim nano dialog \
        libdrm numactl-libs gperftools-libs glibc-devel \
        iproute libibverbs-utils procps-ng \
        perftest ping iperf3 perfquery \
    && dnf clean all && rm -rf /var/cache/dnf/*

# ROCm runtime — copied from the pruned stage, so the files removed there are
# never written into any layer of the final image
COPY --chmod=755 --from=rocm-pruned /opt/rocm /opt/rocm

# Python venv with all compiled packages (vLLM, PyTorch, flash-attn, bnb, ray)
COPY --chmod=755 --from=builder /opt/venv /opt/venv

# Profile script taken from the builder stage; per the build scripts it is
# generated by install_rocm_sdk.sh (sets ROCM_PATH, LD_LIBRARY_PATH etc.)
COPY --from=builder /etc/profile.d/rocm-sdk.sh /etc/profile.d/rocm-sdk.sh

# Venv setup: put the venv first on PATH for non-login processes and activate
# it from /etc/profile.d for login shells
ENV VIRTUAL_ENV=/opt/venv
ENV PATH=/opt/venv/bin:$PATH
ENV PIP_NO_CACHE_DIR=1
RUN printf 'source /opt/venv/bin/activate\n' > /etc/profile.d/venv.sh

# Runtime environment
ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
# Deliberately no trailing ":$LD_LIBRARY_PATH": nothing earlier in this stage
# defines the variable, so the expansion would leave an empty trailing entry,
# which the dynamic loader interprets as the current working directory.
ENV LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64"
ENV ROCM_HOME="/opt/rocm" \
    HIP_PATH="/opt/rocm" \
    HIP_PLATFORM="amd"
ENV VLLM_TARGET_DEVICE="rocm"
ENV PYTORCH_ROCM_ARCH="gfx1151" \
    HIP_ARCHITECTURES="gfx1151" \
    AMDGPU_TARGETS="gfx1151"

COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
Expand All @@ -130,22 +198,13 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
RUN chmod 0644 /etc/profile.d/*.sh
RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh

# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
# Custom RCCL (gfx1151) — replaces stock library with RDMA-capable build
COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
RUN echo "Installing Custom RCCL..." && \
gzip -d /tmp/librccl.so.1.gz && \
chmod 755 /tmp/librccl.so.1 && \
# Replace /opt/rocm library strictly as managed_rccl_install.sh does
cp -fv /tmp/librccl.so.1 /opt/rocm/lib/librccl.so.1.0 && \
# Replace /opt/venv library
find /opt/venv -name "librccl.so.1" -exec cp -fv /tmp/librccl.so.1 {} + && \
rm /tmp/librccl.so.1

# 10. Force Upgrade Transformers (User Override)
# Required for GLM Flash, Qwen 3.5 etc.... vLLM reports incompatibility with transformers >= 5,
# but this version (5.3.0) has been tested and confirmed working.
RUN python -m pip install transformers==5.3.0

RUN chmod -R a+rwX /opt

CMD ["/bin/bash"]