Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
357 changes: 357 additions & 0 deletions docker/Dockerfile.gb200
Original file line number Diff line number Diff line change
@@ -0,0 +1,357 @@
# syntax=docker/dockerfile:1
# The syntax directive above pins the Dockerfile frontend; the COPY <<EOF heredocs
# used further down in this file need a frontend that supports them.
# CUDA_VERSION selects the tag of the nvidia/cuda base image.
ARG CUDA_VERSION=12.8.1
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04

# Name of the extras group used for the editable SGLang install (python[<BUILD_TYPE>]).
ARG BUILD_TYPE=blackwell

# Environment for the build steps below; ENV values also persist into containers
# started from the resulting image.
ENV DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ \
    NVSHMEM_DIR=/sgl-workspace/nvshmem/install \
    BUILD_TYPE=${BUILD_TYPE} \
    TORCH_CUDA_ARCH_LIST="10.0 12.0"

# Preseed the tzdata answers (America/Los_Angeles), install the system packages,
# make `python` resolve to python3, and drop the apt lists in the same layer.
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
    && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
    && apt-get update \
    && apt-get install -y --no-install-recommends \
        tzdata \
        software-properties-common netcat-openbsd kmod unzip openssh-server \
        curl wget lsof zsh ccache tmux htop git-lfs tree \
        python3 python3-pip python3-dev libpython3-dev \
        build-essential cmake \
        libopenmpi-dev libnuma1 libnuma-dev \
        libibverbs-dev libibverbs1 libibumad3 \
        librdmacm1 libnl-3-200 libnl-route-3-200 libnl-route-3-dev libnl-3-dev \
        ibverbs-providers infiniband-diags perftest \
        libgoogle-glog-dev libgtest-dev libjsoncpp-dev libunwind-dev \
        libboost-all-dev libssl-dev \
        libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc \
        pybind11-dev \
        libhiredis-dev libcurl4-openssl-dev \
        libczmq4 libczmq-dev \
        libfabric-dev \
        patchelf \
        nvidia-dkms-550 \
        devscripts debhelper fakeroot dkms check libsubunit0 libsubunit-dev \
    && ln -sf /usr/bin/python3 /usr/bin/python \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean


# --- Install SGLang missing package
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir netifaces

# --- Install nightly PyTorch ---
# Pulls the newest pre-release wheels from the cu128 nightly index and reinstalls
# them even if a torch build is already present.
# --no-cache-dir keeps the downloaded wheels out of the image layer.
# NOTE(review): nightly wheels are not pinned, so rebuilds can pick up a different torch.
RUN python3 -m pip install --no-cache-dir --pre torch torchvision torchaudio \
        --index-url https://download.pytorch.org/whl/nightly/cu128 \
        --force-reinstall


# GDRCopy installation: build the v2.4.4 Debian packages from the upstream checkout,
# install them with dpkg, and remove the checkout within the same layer.
RUN mkdir -p /tmp/gdrcopy \
    && cd /tmp \
    && git clone https://github.com/NVIDIA/gdrcopy.git -b v2.4.4 \
    && cd gdrcopy/packages \
    && CUDA=/usr/local/cuda ./build-deb-packages.sh \
    && dpkg -i \
        gdrdrv-dkms_*.deb \
        libgdrapi_*.deb \
        gdrcopy-tests_*.deb \
        gdrcopy_*.deb \
    && cd / \
    && rm -rf /tmp/gdrcopy

# Fix DeepEP IBGDA symlink: expose libmlx5.so.1 under the unversioned name libmlx5.so
# in the architecture-specific library directory.
RUN MULTIARCH_LIBDIR="/usr/lib/$(uname -m)-linux-gnu" \
    && ln -sf "${MULTIARCH_LIBDIR}/libmlx5.so.1" "${MULTIARCH_LIBDIR}/libmlx5.so"

# Clone and install SGLang
# FIXME: Forcing SGLang to 2a2d3478afe8cdb336888f2e6faa3775ac40254e because sgl-kernel v0.2.5 is missing aarch64 package
# NOTE(review): CUDA_VERSION is only declared as an ARG before FROM; this step presumably
# reads it from the base image's environment — confirm.
WORKDIR /sgl-workspace
# The CUDA version is mapped to the PyTorch wheel index suffix (CUINDEX); unsupported
# versions abort the build. For CUDA 12.8.1 the NCCL and sgl-kernel wheels are then
# force-reinstalled; the two installs are chained with && so that a failure of the
# first one fails the build instead of being ignored.
RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5lib six \
    && git clone https://github.com/sgl-project/sglang.git \
    && cd sglang \
    && git checkout 2a2d3478afe8cdb336888f2e6faa3775ac40254e \
    && case "$CUDA_VERSION" in \
        12.6.1) CUINDEX=126 ;; \
        12.8.1) CUINDEX=128 ;; \
        *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
    esac \
    && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
    && if [ "$CUDA_VERSION" = "12.8.1" ]; then \
        python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.5 --force-reinstall --no-deps \
        && python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.4/sgl_kernel-0.2.4+cu128-cp39-abi3-manylinux2014_$(uname -m).whl --force-reinstall --no-deps ; \
    fi


# Build and install NVSHMEM + DeepEP
# Steps: download the NVSHMEM 3.2.5 source archive, check out the pinned DeepEP commit,
# apply DeepEP's nvshmem.patch to the NVSHMEM tree, build and install NVSHMEM into
# ${NVSHMEM_DIR}, then pip-install DeepEP against that install.
# The NVSHMEM_* variables are set only for the cmake configure command.
# The build uses --parallel "$(nproc)": a bare -j leaves the job count to the native
# build tool, which for make means no limit.
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz \
    && git clone https://github.com/fzyzcjy/DeepEP.git \
    && cd DeepEP \
    && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 \
    && cd .. \
    && tar -xf nvshmem_src_3.2.5-1.txz && mv nvshmem_src nvshmem \
    && cd nvshmem \
    && git apply /sgl-workspace/DeepEP/third-party/nvshmem.patch \
    && sed -i '1i#include <unistd.h>' examples/moe_shuffle.cu \
    && rm -f /sgl-workspace/nvshmem_src_3.2.5-1.txz \
    && NVSHMEM_SHMEM_SUPPORT=0 \
       NVSHMEM_UCX_SUPPORT=0 \
       NVSHMEM_USE_NCCL=0 \
       NVSHMEM_MPI_SUPPORT=0 \
       NVSHMEM_IBGDA_SUPPORT=1 \
       NVSHMEM_PMIX_SUPPORT=0 \
       NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
       NVSHMEM_USE_GDRCOPY=1 \
       cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=${NVSHMEM_DIR} -DCMAKE_CUDA_ARCHITECTURES="100;120" \
    && cmake --build build --target install --parallel "$(nproc)" \
    && cd /sgl-workspace/DeepEP \
    && NVSHMEM_DIR=${NVSHMEM_DIR} pip install .

# Python tools: test, formatting and build helpers, installed without keeping
# pip's download cache in the layer.
RUN python3 -m pip install --no-cache-dir \
        black \
        datamodel_code_generator \
        icdiff \
        isort \
        pre-commit \
        pytest \
        scikit-build-core \
        uv \
        wheel

# Install development tools and utilities
# Only packages that the base package step at the top of this file does not already
# install are listed; tmux, htop, wget, curl, lsof, git-lfs, zsh, tree, unzip,
# libssl-dev, ccache, openssh-server and the InfiniBand runtime libraries are
# provided there.
# apt-get update is required here because the earlier layer removed /var/lib/apt/lists.
# NOTE(review): unlike the base package step, this step installs Recommends; consider
# --no-install-recommends once it is confirmed nothing relies on them.
RUN apt-get update \
    && apt-get install -y \
        bear \
        cloc \
        gdb \
        git \
        less \
        locales \
        ninja-build \
        pkg-config \
        rdma-core \
        silversearcher-ag \
        vim \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean
Comment on lines +115 to +139
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This RUN command reinstalls many packages that are already present from the RUN command at lines 15-34 (e.g., tmux, htop, wget, curl, git-lfs, zsh, unzip, libssl-dev, ccache, and most of the infiniband related libraries). This is inefficient as it increases the layer size and makes the Dockerfile harder to maintain.

I suggest removing the duplicate packages from this RUN command and only installing the new tools required. The apt-get update is also redundant if this layer is built right after the previous apt-get layer.

RUN apt-get install -y --no-install-recommends \
    gdb \
    ninja-build \
    vim \
    locales \
    silversearcher-ag \
    cloc \
    pkg-config \
    bear \
    less \
    rdma-core \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean


# Install the Nsight Systems CLI from NVIDIA's devtools apt repository.
# The repository key is fetched over HTTPS, converted to a dedicated keyring and bound
# to this one source with signed-by, instead of being added to apt's global trust
# store through the deprecated apt-key. The repository path differs per architecture
# (sbsa/arm64 on aarch64, x86_64/amd64 otherwise). The apt lists are removed in the
# same layer.
RUN if [ "$(uname -m)" = "aarch64" ]; then \
        CUDA_REPO_ARCH=sbsa; DEVTOOLS_ARCH=arm64; \
    else \
        CUDA_REPO_ARCH=x86_64; DEVTOOLS_ARCH=amd64; \
    fi \
    && apt-get update \
    && apt-get install -y --no-install-recommends gnupg \
    && curl -fsSL "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/${CUDA_REPO_ARCH}/3bf863cc.pub" -o /tmp/nvidia-devtools.pub \
    && gpg --batch --yes --output /usr/share/keyrings/nvidia-devtools.gpg --dearmor /tmp/nvidia-devtools.pub \
    && rm -f /tmp/nvidia-devtools.pub \
    && echo "deb [signed-by=/usr/share/keyrings/nvidia-devtools.gpg] http://developer.download.nvidia.com/devtools/repos/ubuntu2204/${DEVTOOLS_ARCH} /" > /etc/apt/sources.list.d/nvidia-devtools.list \
    && apt-get update \
    && apt-get install -y nsight-systems-cli \
    && rm -rf /var/lib/apt/lists/*
Comment on lines +141 to +146
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The apt-key command is deprecated and its usage can be a security risk. It's recommended to store GPG keys in /usr/share/keyrings and reference them from the sources.list file. This avoids polluting the main keyring.

Also, this RUN command is missing cleanup of apt lists (rm -rf /var/lib/apt/lists/*), which can increase the final image size.

RUN apt-get update && apt-get install -y --no-install-recommends gnupg ca-certificates curl && \
    KEY_URL="http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/$(if [ "$(uname -m)" = "aarch64" ]; then echo "sbsa"; else echo "x86_64"; fi)/3bf863cc.pub" && \
    curl -fsSL "$KEY_URL" | gpg --dearmor -o /usr/share/keyrings/nvidia-devtools.gpg && \
    echo "deb [signed-by=/usr/share/keyrings/nvidia-devtools.gpg] http://developer.download.nvidia.com/devtools/repos/ubuntu2204/$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list > /dev/null && \
    apt-get update && \
    apt-get install -y nsight-systems-cli && \
    rm -rf /var/lib/apt/lists/*


# Build and install Mooncake with MNNVL support enabled (-DUSE_MNNVL=ON).
# The job count is bounded with $(nproc): a bare `make -j` places no limit on the
# number of parallel jobs.
# TODO(review): the clone tracks the repository's default branch; pin it to a commit
# or tag so that rebuilds are reproducible.
# NOTE(review): the checkout and build tree stay in the image; remove them in this
# layer if nothing needs them after `make install`.
RUN git clone https://github.com/kvcache-ai/Mooncake.git \
    && cd Mooncake \
    && bash dependencies.sh -y \
    && mkdir build \
    && cd build \
    && cmake .. -DUSE_MNNVL=ON \
    && make -j"$(nproc)" \
    && make install
Comment on lines +148 to +155
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

This RUN command clones the main branch of the Mooncake repository. This makes the Docker build non-reproducible, as changes to the main branch will alter the build result. It's a best practice to pin dependencies to a specific commit hash or tag.

Additionally, the cloned repository is not removed after installation, which unnecessarily increases the image size.

RUN MOONCAKE_COMMIT="<please-pin-to-a-specific-commit-hash>" && \
    git clone https://github.com/kvcache-ai/Mooncake.git /tmp/Mooncake \
    && cd /tmp/Mooncake \
    && git checkout ${MOONCAKE_COMMIT} \
    && bash dependencies.sh -y \
    && mkdir build \
    && cd build \
    && cmake .. -DUSE_MNNVL=ON \
    && make -j \
    && make install \
    && rm -rf /tmp/Mooncake


# Set up locale: generate en_US.UTF-8 and make it the default for the image.
# ENV uses the key=value form; the space-separated form is a legacy syntax.
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8

# Install minimal Python packages
# pytest, black, isort, icdiff, scikit-build-core, uv and pre-commit are already
# installed by the "Python tools" step earlier in this file, so only the packages
# that step does not cover are installed here.
RUN python3 -m pip install --no-cache-dir --break-system-packages \
        matplotlib \
        pandas \
        tabulate
Comment on lines +164 to +174
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This pip install command reinstalls several packages that were already installed in the RUN block at lines 103-112 (e.g., pytest, black, isort, uv, pre-commit). This is redundant and can be confusing. It's better to either merge the two pip install commands or remove the duplicates from this one.

RUN python3 -m pip install --no-cache-dir --break-system-packages \
    pandas \
    matplotlib \
    tabulate


# Install diff-so-fancy
# -f makes curl fail on an HTTP error instead of saving the error page as the executable.
RUN curl -fLSso /usr/local/bin/diff-so-fancy https://github.com/so-fancy/diff-so-fancy/releases/download/v1.4.4/diff-so-fancy \
    && chmod +x /usr/local/bin/diff-so-fancy

# Install clang-format
# The pinned static binary is an amd64 build, so it is only used on x86_64. On any
# other architecture (e.g. aarch64) clang-format 16 is installed from the PyPI
# `clang-format` package instead, so that the tool can actually run.
# NOTE(review): assumes PyPI provides a clang-format 16.x wheel for the target
# architecture — confirm.
# -f makes curl fail on an HTTP error instead of saving the error page as the executable.
RUN if [ "$(uname -m)" = "x86_64" ]; then \
        curl -fLSso /usr/local/bin/clang-format https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-amd64 \
        && chmod +x /usr/local/bin/clang-format ; \
    else \
        python3 -m pip install --no-cache-dir "clang-format>=16,<17" ; \
    fi
Comment on lines +181 to +182
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The URL for downloading clang-format has a hardcoded architecture (amd64). This will fail on aarch64 systems, for which this Dockerfile seems to be designed as well (given the use of uname -m elsewhere). The architecture should be determined dynamically to ensure multi-arch support.

RUN ARCH=$(uname -m) && \
    if [ "$ARCH" = "x86_64" ]; then CLANG_ARCH="amd64"; elif [ "$ARCH" = "aarch64" ]; then CLANG_ARCH="arm64"; else echo "Unsupported arch: $ARCH" >&2; exit 1; fi && \
    curl -LSso /usr/local/bin/clang-format "https://github.com/muttleyxd/clang-tools-static-binaries/releases/download/master-32d3ac78/clang-format-16_linux-${CLANG_ARCH}" \
    && chmod +x /usr/local/bin/clang-format


# Install clangd
# On x86_64 the upstream 18.1.3 release archive is unpacked into /usr/local.
# NOTE(review): that archive is presumably an x86_64 build — confirm. On other
# architectures the distribution's clangd package is installed instead, so that no
# foreign-architecture binaries and libraries end up in /usr/local; this is likely an
# older clangd release than 18.1.3.
# -f makes curl fail on an HTTP error instead of saving the error page as the archive.
RUN if [ "$(uname -m)" = "x86_64" ]; then \
        curl -fL https://github.com/clangd/clangd/releases/download/18.1.3/clangd-linux-18.1.3.zip -o clangd.zip \
        && unzip clangd.zip \
        && cp -r clangd_18.1.3/bin/* /usr/local/bin/ \
        && cp -r clangd_18.1.3/lib/* /usr/local/lib/ \
        && rm -rf clangd_18.1.3 clangd.zip ; \
    else \
        apt-get update \
        && apt-get install -y --no-install-recommends clangd \
        && rm -rf /var/lib/apt/lists/* ; \
    fi
Comment on lines +185 to +189
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The clangd binary being downloaded is for the x86_64 architecture. The official clangd GitHub releases for version 18.1.3 do not provide a pre-built binary for aarch64. This will cause the build to fail or result in a non-functional tool on aarch64 systems.

To support aarch64, you should consider one of the following:

  • Build clangd from source within the Dockerfile for the aarch64 architecture.
  • Find an alternative source that provides aarch64 binaries for clangd 18.1.3.
  • Use a version of clangd that is available for both architectures (e.g., from apt repositories, though this will likely be an older version).


# Install CMake 3.31.1 from the upstream release tarball that matches the machine
# architecture, copying its bin/ and share/ trees into /usr/local and removing the
# download afterwards.
RUN CMAKE_VERSION=3.31.1 \
    && CMAKE_DIST="cmake-${CMAKE_VERSION}-linux-$(uname -m)" \
    && wget "https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/${CMAKE_DIST}.tar.gz" \
    && tar -xzf "${CMAKE_DIST}.tar.gz" \
    && cp -r "${CMAKE_DIST}/bin/"* /usr/local/bin/ \
    && cp -r "${CMAKE_DIST}/share/"* /usr/local/share/ \
    && rm -rf "${CMAKE_DIST}" "${CMAKE_DIST}.tar.gz"

# Add yank script
# The script reads its input (files or stdin), emits it as an OSC 52 clipboard escape
# sequence (wrapped for tmux/screen when detected) and also stores it in the tmux buffer.
# --chmod sets the executable bit during the copy, so no separate chmod layer is needed.
# The quoted heredoc delimiter keeps the script text literal (no variable expansion).
COPY --chown=root:root --chmod=755 <<-"EOF" /usr/local/bin/yank
#!/bin/bash
put() {
esc=$1
test -n "$TMUX" -o -z "${TERM##screen*}" && esc="\033Ptmux;\033$esc\033\\"
printf "$esc"
}
put "\033]52;c;!\a"
buf=$( cat "$@" )
len=$( printf %s "$buf" | wc -c ) max=74994
test $len -gt $max && echo "$0: input is $(( len - max )) bytes too long" >&2
put "\033]52;c;$( printf %s "$buf" | head -c $max | base64 | tr -d '\r\n' )\a"
test -n "$TMUX" && tmux set-buffer "$buf" ||:
EOF

# Install oh-my-zsh (unattended) and clone the zsh-autosuggestions and
# zsh-syntax-highlighting plugins into its custom plugin directory.
RUN sh -c "$(curl -fsSL https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" "" --unattended \
    && git clone \
        https://github.com/zsh-users/zsh-autosuggestions \
        ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-autosuggestions \
    && git clone \
        https://github.com/zsh-users/zsh-syntax-highlighting.git \
        ${ZSH_CUSTOM:-~/.oh-my-zsh/custom}/plugins/zsh-syntax-highlighting

# Configure Vim
# Writes /root/.vimrc from the inline heredoc. The quoted delimiter keeps the content
# literal. The config pipes yanked text through the `yank` command, writes the result
# to /dev/tty on every TextYankPost event, and sets basic editing, indentation,
# status-line and encoding options.
COPY --chown=root:root <<-"EOF" /root/.vimrc
function! Yank(text) abort
let escape = system('yank', a:text)
if v:shell_error
echoerr escape
else
call writefile([escape], '/dev/tty', 'b')
endif
endfunction

noremap <silent> <Leader>y y:<C-U>call Yank(@0)<CR>

" automatically run yank(1) whenever yanking in Vim
function! CopyYank() abort
call Yank(join(v:event.regcontents, "\n"))
endfunction

autocmd TextYankPost * call CopyYank()

" Basic settings
set number
syntax on
set mouse=a
filetype indent on

" Indentation
set autoindent nosmartindent
set smarttab
set expandtab
set shiftwidth=4
set softtabstop=4

" Visual guides
set colorcolumn=120
highlight ColorColumn ctermbg=5

" Status line
set laststatus=2
set statusline=%<%f\ %h%m%r%=%{\"[\".(&fenc==\"\"?&enc:&fenc).((exists(\"+bomb\")\ &&\ &bomb)?\",B\":\"\").\"]\ \"}%k\ %-14.(%l,%c%V%)\ %P

" Backspace behavior
set backspace=2

" Encoding
set encoding=utf-8
set fileencoding=utf-8
EOF

# Configure tmux
# Writes /root/.tmux.conf from the inline heredoc (content kept literal by the quoted
# delimiter). The config restyles pane borders and the status bar, moves the prefix
# key to backtick, rebinds pane splitting to - and = (keeping the current path),
# enables vi copy mode with a Y binding that pipes the selection through `yank`,
# and turns on mouse support.
COPY --chown=root:root <<-"EOF" /root/.tmux.conf
# Pane border styling
set -g pane-border-style fg='#742727',bg=black
set -g pane-active-border-style fg=red,bg=black

# Status bar styling
set -g status-style bg='#0C8A92',fg=black

# Change prefix key to backtick
set-option -g prefix `
unbind C-b
bind-key ` send-prefix

# Split panes using - and = with current path
unbind '"'
bind - splitw -v -c '#{pane_current_path}'
unbind '%'
bind = splitw -h -c '#{pane_current_path}'

# Vi mode settings
bind-key -T copy-mode-vi Y send-keys -X copy-pipe 'yank > #{pane_tty}'
set-window-option -g mode-keys vi

# Other settings
set-option -g escape-time 0
set-option -g base-index 1
set-window-option -g mouse on
EOF

# Configure Git
# Sets the global editor, whitespace handling, the diff-so-fancy pager, colours and
# the `lg` log alias for the root user.
# TLS certificate verification is deliberately left at Git's default (enabled):
# disabling http.sslVerify globally would expose every clone and fetch made in the
# container to man-in-the-middle attacks. If a particular host needs an exception,
# scope it to that URL rather than setting it globally.
RUN git config --global core.editor "vim" \
    && git config --global core.whitespace "fix,-indent-with-non-tab,trailing-space,cr-at-eol" \
    && git config --global core.pager "diff-so-fancy | less --tabs=4 -RFX" \
    && git config --global color.ui true \
    && git config --global color."diff-highlight".oldNormal "red bold" \
    && git config --global color."diff-highlight".oldHighlight "red bold 52" \
    && git config --global color."diff-highlight".newNormal "green bold" \
    && git config --global color."diff-highlight".newHighlight "green bold 22" \
    && git config --global color.diff.meta "11" \
    && git config --global color.diff.frag "magenta bold" \
    && git config --global color.diff.commit "yellow bold" \
    && git config --global color.diff.old "red bold" \
    && git config --global color.diff.new "green bold" \
    && git config --global color.diff.whitespace "red reverse" \
    && git config --global alias.lg "log --color --graph --pretty=format:'%Cred%h%Creset - %s %Cgreen(%cr) %C(bold blue)<%an>%Creset%C(auto)%d%Creset' --abbrev-commit --" \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Disabling SSL verification for git (http.sslVerify false) is a significant security risk, as it makes git operations vulnerable to man-in-the-middle attacks. This setting should be removed. If it's required for a specific internal network or proxy, it should be configured only for that specific repository/URL, not globally.

git config --global --unset http.sslVerify

&& git config --global pull.rebase true

# Configure zsh
# Writes /root/.zshrc from the inline heredoc (content kept literal by the quoted
# delimiter). The config points ZSH at /root/.oh-my-zsh, selects the robbyrussell
# theme, enables the git, z, zsh-autosuggestions and zsh-syntax-highlighting plugins,
# and defines a few ls/vim aliases and history options.
COPY --chown=root:root <<-"EOF" /root/.zshrc
export ZSH="/root/.oh-my-zsh"

# Theme
ZSH_THEME="robbyrussell"

# Plugins
plugins=(
git
z
zsh-autosuggestions
zsh-syntax-highlighting
)

source $ZSH/oh-my-zsh.sh

# Aliases
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'
alias vi='vim'

# Enhanced history
HISTSIZE=10000
SAVEHIST=10000
setopt HIST_IGNORE_ALL_DUPS
setopt HIST_FIND_NO_DUPS
setopt INC_APPEND_HISTORY
EOF

# Install just into /usr/local/bin using the upstream install script.
# The script is downloaded to a file before it is run: with `curl ... | bash` under
# the default /bin/sh, a failed download is masked by the exit status of bash, and the
# original `set -euxo` had no option name after -o, so pipefail was never enabled.
# With `set -eux` each step below aborts the build on failure.
RUN set -eux ; \
    curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh -o /tmp/just-install.sh ; \
    bash /tmp/just-install.sh --to /usr/local/bin ; \
    rm -f /tmp/just-install.sh

# Set workspace directory
# Containers start in the SGLang checkout that was cloned under /sgl-workspace.
WORKDIR /sgl-workspace/sglang
Loading