kyuz0 · Lafunamor · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.github/workflows/build-and-publish.yml b/.github/workflows/build-and-publish.yml
@@ -13,7 +13,7 @@ on:
         default: ""
 
 env:
-  IMAGE_REPO: kyuz0/vllm-therock-gfx1151
+  IMAGE_REPO: lafunamor/vllm-therock-gfx1151
   DOCKER_BUILDKIT: "1"
 
 jobs:

diff --git a/Dockerfile b/Dockerfile
@@ -1,31 +1,33 @@
-FROM registry.fedoraproject.org/fedora:43
-
-# 1. System Base & Build Tools
-# Added 'gperftools-libs' for tcmalloc (fixes double-free)
+##############################################################################
+# Stage 1 – builder
+# Contains the full ROCm SDK + compiler toolchain needed to compile vLLM,
+# flash-attention, and bitsandbytes. Only /opt/rocm, /opt/venv and
+# /etc/profile.d/rocm-sdk.sh are carried forward from this stage.
+##############################################################################
+FROM registry.fedoraproject.org/fedora:43 AS builder
+
+# System build tools + runtime deps (gperftools-libs for tcmalloc)
 COPY scripts/install_deps.sh /tmp/install_deps.sh
 RUN sh /tmp/install_deps.sh
 
-# 2. Install "TheRock" ROCm SDK (Tarball Method)
+# TheRock ROCm SDK (full SDK including LLVM/Clang needed as CC/CXX)
 WORKDIR /tmp
 ARG ROCM_MAJOR_VER=7
 ARG GFX=gfx1151
-# We pass ARGs to the script via ENV or rely on defaults. 
-# But let's be explicit and export them for the RUN command.
 COPY scripts/install_rocm_sdk.sh /tmp/install_rocm_sdk.sh
 RUN chmod +x /tmp/install_rocm_sdk.sh && \
   export ROCM_MAJOR_VER=$ROCM_MAJOR_VER && \
   export GFX=$GFX && \
   /tmp/install_rocm_sdk.sh
 
-# 4. Python Venv Setup
+# Python venv
 RUN /usr/bin/python3.12 -m venv /opt/venv
 ENV VIRTUAL_ENV=/opt/venv
 ENV PATH=/opt/venv/bin:$PATH
 ENV PIP_NO_CACHE_DIR=1
-RUN printf 'source /opt/venv/bin/activate\n' > /etc/profile.d/venv.sh
 RUN python -m pip install --upgrade pip wheel packaging "setuptools<80.0.0"
 
-# 5. Install PyTorch (TheRock Nightly)
+# PyTorch (TheRock Nightly) — slow step that rarely needs editing; keep it early so its layer stays cached
 RUN python -m pip install \
   --index-url https://rocm.nightlies.amd.com/v2-staging/gfx1151/ \
   --pre torch torchaudio torchvision && \
@@ -38,55 +40,49 @@ WORKDIR /opt
 ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
 ENV LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64:$LD_LIBRARY_PATH"
 
-RUN git clone https://github.com/ROCm/flash-attention.git &&\ 
+RUN git clone https://github.com/ROCm/flash-attention.git &&\
   cd flash-attention &&\
   git checkout main_perf &&\
   python setup.py install && \
   cd /opt && rm -rf /opt/flash-attention
 
-# 6. Clone vLLM
+# Clone vLLM
 RUN git clone https://github.com/vllm-project/vllm.git /opt/vllm
 WORKDIR /opt/vllm
 
-# --- PATCHING ---
+# Patch for Strix Halo (gfx1151)
 COPY scripts/patch_strix.py /opt/vllm/patch_strix.py
 RUN python /opt/vllm/patch_strix.py && \
-  sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt  
+  sed -i 's/gfx1200;gfx1201/gfx1151/' CMakeLists.txt
 
-# 7. Build vLLM (Wheel Method) with CLANG Host Compiler
+# Build vLLM wheel with ROCm Clang as host compiler
 RUN python -m pip install --upgrade cmake ninja packaging wheel numpy "setuptools-scm>=8" "setuptools<80.0.0" scikit-build-core pybind11
 ENV ROCM_HOME="/opt/rocm"
 ENV HIP_PATH="/opt/rocm"
 ENV VLLM_TARGET_DEVICE="rocm"
 ENV PYTORCH_ROCM_ARCH="gfx1151"
-ENV HIP_ARCHITECTURES="gfx1151"          
-ENV AMDGPU_TARGETS="gfx1151"              
+ENV HIP_ARCHITECTURES="gfx1151"
+ENV AMDGPU_TARGETS="gfx1151"
 ENV MAX_JOBS="4"
 
-# --- CRITICAL FIX FOR SEGFAULT ---
-# We force the Host Compiler (CC/CXX) to be the ROCm Clang, not Fedora GCC.
-# This aligns the ABI of the compiled vLLM extensions with PyTorch.
+# Force ROCm Clang as CC/CXX to align ABI with PyTorch (prevents segfault)
 ENV CC="/opt/rocm/llvm/bin/clang"
 ENV CXX="/opt/rocm/llvm/bin/clang++"
 
 RUN export HIP_DEVICE_LIB_PATH=$(find /opt/rocm -type d -name bitcode -print -quit) && \
   echo "Compiling with Bitcode: $HIP_DEVICE_LIB_PATH" && \
-  export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \   
+  export CMAKE_ARGS="-DROCM_PATH=/opt/rocm -DHIP_PATH=/opt/rocm -DAMDGPU_TARGETS=gfx1151 -DHIP_ARCHITECTURES=gfx1151" && \
   python -m pip wheel --no-build-isolation --no-deps -w /tmp/dist -v . && \
   python -m pip install /tmp/dist/*.whl
 
 RUN python -m pip install ray
 
-# --- bitsandbytes (ROCm) ---
+# bitsandbytes (ROCm)
 WORKDIR /opt
 RUN git clone -b rocm_enabled_multi_backend https://github.com/ROCm/bitsandbytes.git
 WORKDIR /opt/bitsandbytes
-
-# Explicitly set HIP_PLATFORM (Docker ENV, not /etc/profile)
 ENV HIP_PLATFORM="amd"
 ENV CMAKE_PREFIX_PATH="/opt/rocm"
-
-# Force CMake to use the System ROCm Compiler (/opt/rocm/llvm/bin/clang++)
 RUN cmake -S . \
   -DGPU_TARGETS="gfx1151" \
   -DBNB_ROCM_ARCH="gfx1151" \
@@ -97,13 +93,85 @@ RUN cmake -S . \
   make -j$(nproc) && \
   python -m pip install --no-cache-dir . --no-build-isolation --no-deps
 
-# 8. Final Cleanup & Runtime
+# Strip debug symbols and remove bytecode caches from venv
 WORKDIR /opt
-RUN chmod -R a+rwX /opt && \
-  find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
-  find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} + && \
-  rm -rf /root/.cache/pip || true && \
-  dnf clean all && rm -rf /var/cache/dnf/*
+RUN find /opt/venv -type f -name "*.so" -exec strip -s {} + 2>/dev/null || true && \
+  find /opt/venv -type d -name "__pycache__" -prune -exec rm -rf {} +
+
+# Prune /opt/rocm while still in the builder. Files removed by a RUN that
+# follows COPY --from stay in the COPY layer, so pruning in the runtime stage
+# would not make the final image any smaller.
+RUN \
+  # Test/benchmark client suites
+  rm -rf /opt/rocm/clients /opt/rocm/tests \
+  # Profiler and video decode tooling (irrelevant for LLM inference)
+  /opt/rocm/share/rocprofiler-systems \
+  /opt/rocm/share/rocprofiler-sdk \
+  /opt/rocm/share/rocdecode \
+  /opt/rocm/lib/rdc \
+  /opt/rocm/lib/rocprofiler-systems \
+  # Static libraries — build-time only, never loaded at runtime
+  /opt/rocm/lib/libdevice_conv_operations.a \
+  /opt/rocm/lib/libdevice_reduction_operations.a \
+  /opt/rocm/lib/libdevice_contraction_operations.a \
+  /opt/rocm/lib/librocshmem.a \
+  # LLVM static libs — same, build-time only
+  && find /opt/rocm/lib/llvm/lib -name '*.a' -delete \
+  # Test/bench/validate binaries and gtest data files in bin/
+  && find /opt/rocm/bin \( \
+    -name '*-test' -o -name '*_test' -o -name '*-bench' -o \
+    -name '*-validate' -o -name '*_gtest.data' -o -name '*-gtest' \
+  \) -delete \
+  # Specific build/tuning tools not needed at runtime
+  && rm -f /opt/rocm/bin/hipify-clang /opt/rocm/bin/rocblas-gemm-tune \
+    /opt/rocm/bin/rocshmem_functional_tests
+
+##############################################################################
+# Stage 2 – runtime
+# Clean Fedora base with only runtime packages. Receives the already-pruned
+# /opt/rocm and /opt/venv from builder. No system compiler toolchain (gcc,
+# cmake, ninja), no source trees, no build caches.
+##############################################################################
+FROM registry.fedoraproject.org/fedora:43
+
+# Runtime-only packages — no gcc, cmake, ninja, git, aria2c
+# glibc-devel + python3.12-devel required: Triton JIT-compiles Python extension
+# modules at runtime via ROCm Clang and needs stdlib.h and Python.h
+RUN dnf -y install --setopt=install_weak_deps=False --nodocs \
+  python3.12 python3.12-devel libatomic bash ca-certificates curl rsync \
+  ffmpeg-free \
+  vim nano dialog \
+  libdrm numactl-libs gperftools-libs glibc-devel \
+  iproute libibverbs-utils procps-ng \
+  perftest ping iperf3 perfquery \
+  && dnf clean all && rm -rf /var/cache/dnf/*
+
+# ROCm runtime — the tree was pruned at the end of the builder stage, so this
+# single layer contains only the files that are kept.
+COPY --chmod=755 --from=builder /opt/rocm /opt/rocm
+
+# Python venv with all compiled packages (vLLM, PyTorch, flash-attn, bnb, ray)
+COPY --chmod=755 --from=builder /opt/venv /opt/venv
+
+# Profile script generated by install_rocm_sdk.sh (sets ROCM_PATH, LD_LIBRARY_PATH etc.)
+COPY --from=builder /etc/profile.d/rocm-sdk.sh /etc/profile.d/rocm-sdk.sh
+
+# Venv setup
+ENV VIRTUAL_ENV=/opt/venv
+ENV PATH=/opt/venv/bin:$PATH
+ENV PIP_NO_CACHE_DIR=1
+RUN printf 'source /opt/venv/bin/activate\n' > /etc/profile.d/venv.sh
+
+# Runtime environment
+ENV FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+# No trailing ":$LD_LIBRARY_PATH": the variable is unset in this stage, and the
+# empty entry it leaves behind makes the loader search the current directory.
+ENV LD_LIBRARY_PATH="/opt/rocm/lib:/opt/rocm/lib64"
+ENV ROCM_HOME="/opt/rocm" HIP_PATH="/opt/rocm"
+ENV VLLM_TARGET_DEVICE="rocm" HIP_PLATFORM="amd"
+ENV PYTORCH_ROCM_ARCH="gfx1151"
+ENV HIP_ARCHITECTURES="gfx1151"
+ENV AMDGPU_TARGETS="gfx1151"
 
 COPY scripts/01-rocm-env-for-triton.sh /etc/profile.d/01-rocm-env-for-triton.sh
 COPY scripts/99-toolbox-banner.sh /etc/profile.d/99-toolbox-banner.sh
@@ -130,22 +198,14 @@ RUN chmod +x /opt/start-vllm /opt/start-vllm-cluster /opt/vllm_cluster_bench.py
 RUN chmod 0644 /etc/profile.d/*.sh
 RUN printf 'ulimit -S -c 0\n' > /etc/profile.d/90-nocoredump.sh && chmod 0644 /etc/profile.d/90-nocoredump.sh
 
-# 9. Install Custom RCCL (gfx1151) - Replaces standard library with manually built one
+# Custom RCCL (gfx1151) — replaces stock library with RDMA-capable build
 COPY custom_libs/librccl.so.1.gz /tmp/librccl.so.1.gz
 RUN echo "Installing Custom RCCL..." && \
   gzip -d /tmp/librccl.so.1.gz && \
   chmod 755 /tmp/librccl.so.1 && \
-  # Replace /opt/rocm library strictly as managed_rccl_install.sh does
   cp -fv /tmp/librccl.so.1 /opt/rocm/lib/librccl.so.1.0 && \
-  # Replace /opt/venv library
-  find /opt/venv -name "librccl.so.1" -exec cp -fv /tmp/librccl.so.1 {} + && \
+  # One cp per match: "{} +" would batch several matches into "cp SRC a b", which fails
+  find /opt/venv -name "librccl.so.1" -exec cp -fv /tmp/librccl.so.1 {} \; && \
   rm /tmp/librccl.so.1
 
-# 10. Force Upgrade Transformers (User Override)
-# Required for GLM Flash, Qwen 3.5 etc.... vLLM reports incompatibility with transformers >= 5, 
-# but this version (5.3.0) has been tested and confirmed working.
-RUN python -m pip install transformers==5.3.0
-
-RUN chmod -R a+rwX /opt
-
 CMD ["/bin/bash"]