Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
117 changes: 75 additions & 42 deletions docker/Dockerfile.deepep
Original file line number Diff line number Diff line change
@@ -1,67 +1,100 @@
# The base image is chosen by the caller through the BASE_IMAGE build argument.
# No default value is declared here, so it has to be passed on every build.
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
# Deps
# System packages needed by the later build steps. Each package is listed once,
# one per line, in alphabetical order so that additions and removals diff cleanly.
# The apt package lists are removed in the same layer to keep them out of the image.
# NOTE(review): --no-install-recommends is intentionally not used here. Later
# steps in this file invoke git and wget, which are not named in this list;
# confirm the base image provides them before turning recommends off.
RUN apt-get update && apt-get install -y \
        build-essential \
        ccache \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        dkms \
        fakeroot \
        ibverbs-providers \
        infiniband-diags \
        kmod \
        libboost-all-dev \
        libcurl4-openssl-dev \
        libczmq-dev \
        libczmq4 \
        libfabric-dev \
        libgoogle-glog-dev \
        libgrpc++-dev \
        libgrpc-dev \
        libgtest-dev \
        libhiredis-dev \
        libibumad3 \
        libibverbs-dev \
        libibverbs1 \
        libjsoncpp-dev \
        libnl-3-200 \
        libnl-3-dev \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnuma-dev \
        libopenmpi-dev \
        libprotobuf-dev \
        libpython3-dev \
        librdmacm1 \
        libssl-dev \
        libsubunit-dev \
        libsubunit0 \
        libunwind-dev \
        netcat-openbsd \
        nvidia-dkms-535 \
        openssh-server \
        patchelf \
        perftest \
        pkg-config \
        protobuf-compiler-grpc \
        pybind11-dev \
        python3 \
        python3-pip \
        rdma-core \
    && rm -rf /var/lib/apt/lists/* \
    # -f replaces an existing /usr/bin/python instead of failing the build.
    && ln -sf /usr/bin/python3 /usr/bin/python
Comment on lines +4 to +57
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The dependency installation layer can be optimized:

  1. Add --no-install-recommends to reduce image size.
  2. Remove duplicate packages.
  3. Consider sorting packages alphabetically for maintainability.
# De-duplicated dependency layer without recommended packages.
# build-essential is kept (once): removing duplicates must not drop it entirely.
# git and wget are listed explicitly because later steps in this file call them
# and --no-install-recommends no longer pulls in optional packages.
# NOTE(review): ca-certificates is added on the assumption that the HTTPS
# downloads need it and the base image may not ship it - confirm.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        ccache \
        check \
        cmake \
        curl \
        debhelper \
        devscripts \
        dkms \
        fakeroot \
        git \
        ibverbs-providers \
        infiniband-diags \
        kmod \
        libboost-all-dev \
        libcurl4-openssl-dev \
        libczmq-dev \
        libczmq4 \
        libfabric-dev \
        libgoogle-glog-dev \
        libgrpc++-dev \
        libgrpc-dev \
        libgtest-dev \
        libhiredis-dev \
        libibumad3 \
        libibverbs-dev \
        libibverbs1 \
        libjsoncpp-dev \
        libnl-3-200 \
        libnl-3-dev \
        libnl-route-3-200 \
        libnl-route-3-dev \
        libnuma-dev \
        libopenmpi-dev \
        libprotobuf-dev \
        libpython3-dev \
        librdmacm1 \
        libssl-dev \
        libsubunit-dev \
        libsubunit0 \
        libunwind-dev \
        netcat-openbsd \
        nvidia-dkms-535 \
        openssh-server \
        patchelf \
        perftest \
        pkg-config \
        protobuf-compiler-grpc \
        pybind11-dev \
        python3 \
        python3-pip \
        rdma-core \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    # -f replaces an existing /usr/bin/python instead of failing the build.
    && ln -sf /usr/bin/python3 /usr/bin/python


# CMake
# Install CMake 3.27.4 from the upstream self-extracting installer into /usr/local.
# Everything runs as a single RUN instruction: the installer is downloaded once,
# executed, and deleted in the same layer, and the apt package lists are removed
# before the layer is committed.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        libssl-dev \
        wget \
    && rm -rf /var/lib/apt/lists/* \
    && wget https://github.com/Kitware/CMake/releases/download/v3.27.4/cmake-3.27.4-linux-x86_64.sh \
    && chmod +x cmake-3.27.4-linux-x86_64.sh \
    # --skip-license makes the installer non-interactive.
    && ./cmake-3.27.4-linux-x86_64.sh --skip-license --prefix=/usr/local \
    && rm cmake-3.27.4-linux-x86_64.sh

# Python
# Install the interpreter and pip, then expose python3 under the name `python`.
# The dependency layer earlier in this file already creates /usr/bin/python, so
# the link is made with -f: a plain `ln -s` would abort the build with "File exists".
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        python3 \
        python3-pip \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3 /usr/bin/python

# GDRCopy
# NOTE(review): presumably the source tree installed by the gdrdrv-dkms package
# built below - confirm the path against the installed package contents.
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

# Packaging prerequisites for the GDRCopy build. `apt-get update` and the
# install share one layer so a cached, stale package index is never reused,
# and the index is removed again before the layer is committed.
RUN apt-get update && apt-get install -y \
        build-essential \
        check \
        debhelper \
        devscripts \
        dkms \
        fakeroot \
        libsubunit-dev \
        libsubunit0 \
        nvidia-dkms-535 \
        pkg-config \
    && rm -rf /var/lib/apt/lists/*

# Clone the v2.4.4 tag exactly once into /tmp/gdrcopy, build the Debian
# packages with the CUDA toolkit at /usr/local/cuda, and install them.
RUN git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 /tmp/gdrcopy \
    && cd /tmp/gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb \
    && dpkg -i libgdrapi_*.deb \
    && dpkg -i gdrcopy-tests_*.deb \
    && dpkg -i gdrcopy_*.deb
Comment on lines +68 to +76
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Remove the cloned repository and .deb files to reduce image size.

# Build GDRCopy v2.4.4 from source, install the generated Debian packages, and
# delete the checkout in the same layer so it does not add to the image size.
RUN mkdir -p /tmp \
 && cd /tmp \
 && git clone --branch v2.4.4 https://github.com/NVIDIA/gdrcopy.git \
 && cd gdrcopy/packages \
 && CUDA=/usr/local/cuda ./build-deb-packages.sh \
 && dpkg -i gdrdrv-dkms_*.deb \
 && dpkg -i libgdrapi_*.deb \
 && dpkg -i gdrcopy-tests_*.deb \
 && dpkg -i gdrcopy_*.deb \
 && rm -rf /tmp/gdrcopy


# Build the GDRCopy Debian packages and install them as one step, so the build
# and the four installs form a single layer instead of five.
WORKDIR /tmp/gdrcopy/packages
RUN CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i gdrdrv-dkms_*.deb \
    && dpkg -i libgdrapi_*.deb \
    && dpkg -i gdrcopy-tests_*.deb \
    && dpkg -i gdrcopy_*.deb

# NOTE(review): presumably the source tree installed by gdrdrv-dkms - confirm.
ENV GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/

# IBGDA dependency
# Provide an unversioned libmlx5.so name that points at the libmlx5.so.1 library.
RUN ln -sf /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so
# Refresh the package index in the same layer as the install: the index is
# deleted by the dependency layer earlier in this file, and an install without
# a preceding update depends on whatever index an earlier layer left behind.
RUN apt-get update \
    && apt-get install -y libfabric-dev \
    && rm -rf /var/lib/apt/lists/*

# DeepEP
# Clone the DeepEP sources into /sgl-workspace/DeepEP; the NVSHMEM step later in
# this file reads third-party/nvshmem.patch from this checkout.
# NOTE(review): no branch, tag or commit is specified, so the build uses
# whatever the repository's default branch contains at build time.
WORKDIR /sgl-workspace
RUN git clone https://github.com/deepseek-ai/DeepEP.git

# NVSHMEM
# Fetch the NVSHMEM 3.2.5 source archive, unpack it to /sgl-workspace/nvshmem and
# apply the patch shipped in the DeepEP checkout. All steps run exactly once in
# a single RUN instruction, and the archive is deleted in the same layer so it
# is not stored in the image.
WORKDIR /sgl-workspace
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
    && tar -xf nvshmem_src_3.2.5-1.txz \
    && rm nvshmem_src_3.2.5-1.txz \
    && mv nvshmem_src nvshmem \
    && cd /sgl-workspace/nvshmem \
    && git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
    # Insert `#include <unistd.h>` as the first line of the moe_shuffle example.
    && sed -i '1i#include <unistd.h>' /sgl-workspace/nvshmem/examples/moe_shuffle.cu \
    # Print the edited file to the build log.
    && cat /sgl-workspace/nvshmem/examples/moe_shuffle.cu

WORKDIR /sgl-workspace/nvshmem
# Compile NVSHMEM
ENV CUDA_HOME=/usr/local/cuda
RUN NVSHMEM_SHMEM_SUPPORT=0 \
RUN cd /sgl-workspace/nvshmem && NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \
NVSHMEM_MPI_SUPPORT=0 \
Expand All @@ -77,5 +110,5 @@ WORKDIR /sgl-workspace/DeepEP
# Location of the NVSHMEM install tree. ENV makes the variable visible to every
# later RUN instruction, so it is not repeated on the pip command line.
ENV NVSHMEM_DIR=/sgl-workspace/nvshmem/install
# Install the package found in the current working directory.
# --no-cache-dir keeps pip's download and wheel cache out of the image layer.
RUN pip install --no-cache-dir --break-system-packages .

# Set workspace
WORKDIR /sgl-workspace
# Install mooncake transfer engine
# --upgrade installs the newest release available at build time (no version is
# pinned); --no-cache-dir keeps pip's cache out of the image layer.
RUN pip install --no-cache-dir --upgrade mooncake_transfer_engine --break-system-packages
Loading