Dockerfile (38 changes: 22 additions & 16 deletions)
@@ -2,22 +2,27 @@ ARG WORKER_CUDA_VERSION=11.8.0
 ARG BASE_IMAGE_VERSION=1.0.0
 FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base
 
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip curl \
+    && rm -rf /var/lib/apt/lists/*
 
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
+
 # Install Python dependencies
 COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r /requirements.txt
 
 # Setup for Option 2: Building the Image with the Model included
-ARG MODEL_NAME=""
-ARG TOKENIZER_NAME=""
-ARG BASE_PATH="/runpod-volume"
-ARG QUANTIZATION=""
-ARG MODEL_REVISION=""
-ARG TOKENIZER_REVISION=""
+ARG MODEL_NAME="" \
+    TOKENIZER_NAME="" \
+    BASE_PATH="/runpod-volume" \
+    QUANTIZATION="" \
+    MODEL_REVISION="" \
+    TOKENIZER_REVISION=""
 
 ENV MODEL_NAME=$MODEL_NAME \
     MODEL_REVISION=$MODEL_REVISION \
@@ -28,9 +33,8 @@ ENV MODEL_NAME=$MODEL_NAME \
     HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
     HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
     HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
-    HF_HUB_ENABLE_HF_TRANSFER=1
-
-ENV PYTHONPATH="/:/vllm-workspace"
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PYTHONPATH="/:/vllm-workspace"
 
 COPY src/download_model.py /download_model.py
 RUN --mount=type=secret,id=HF_TOKEN,required=false \
@@ -41,10 +45,12 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \
         python3 /download_model.py; \
     fi
 
-# Add source files
+# Add source files and remove download_model.py
 COPY src /src
-# Remove download_model.py
 RUN rm /download_model.py
 
+# Add a health check
+HEALTHCHECK CMD python3 -c "import vllm" || exit 1
+
 # Start the handler
 CMD ["python3", "/src/handler.py"]
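Note: every RUN --mount instruction introduced in this diff (the apt, uv, ccache, and secret mounts) requires BuildKit. As a quick sanity check for Option 2, a worker image with the model baked in might be built along these lines; the image tag, model id, and token path below are illustrative assumptions, while the build args and the HF_TOKEN secret id come from the Dockerfile above:

# Minimal sketch, assuming an example model and a local token file
DOCKER_BUILDKIT=1 docker build \
    --build-arg MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" \
    --build-arg MODEL_REVISION="main" \
    --secret id=HF_TOKEN,src=./hf_token.txt \
    -t worker-vllm:dev-with-model .

Since the secret mount is declared with required=false, omitting --secret still builds, which matters for public models.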
vllm-base-image/Dockerfile (120 changes: 64 additions & 56 deletions)
@@ -14,24 +14,29 @@ FROM nvidia/cuda:${WORKER_CUDA_VERSION}-devel-ubuntu22.04 AS dev
 ARG WORKER_CUDA_VERSION
 
 # Update and install dependencies
-RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip git curl \
+    && rm -rf /var/lib/apt/lists/*
 
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
+
 # Set working directory
 WORKDIR /vllm-installation
 
 RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 
 # Install build and runtime dependencies
-COPY vllm/requirements-common.txt requirements-common.txt
-COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-cuda.txt
+COPY vllm/requirements-common.txt vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt ./
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r requirements-common.txt -r requirements-cuda${WORKER_CUDA_VERSION}.txt
 
 # Install development dependencies
-COPY vllm/requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+COPY vllm/requirements-dev.txt .
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r requirements-dev.txt
 
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
@@ -42,108 +47,111 @@ FROM dev AS build
 ARG WORKER_CUDA_VERSION
 
 # Install build dependencies
-COPY vllm/requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+COPY vllm/requirements-build.txt .
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r requirements-build.txt
 
-# install compiler cache to speed up compilation leveraging local or remote caching
-RUN apt-get update -y && apt-get install -y ccache
+# Install compiler cache
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y && apt-get install -y --no-install-recommends ccache \
+    && rm -rf /var/lib/apt/lists/*
 
 # Copy necessary files
 COPY vllm/csrc csrc
-COPY vllm/setup.py setup.py
-COPY vllm/cmake cmake
-COPY vllm/CMakeLists.txt CMakeLists.txt
-COPY vllm/requirements-common.txt requirements-common.txt
-COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
-COPY vllm/pyproject.toml pyproject.toml
+COPY vllm/setup.py vllm/cmake vllm/CMakeLists.txt vllm/pyproject.toml ./
 COPY vllm/vllm vllm
 
 # Set environment variables for building extensions
-ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
-ENV VLLM_INSTALL_PUNICA_KERNELS=0
+ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION} \
+    VLLM_INSTALL_PUNICA_KERNELS=0 \
+    CCACHE_DIR=/root/.cache/ccache
 
 # Build extensions
-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     python3 setup.py bdist_wheel --dist-dir=dist
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip cache remove vllm_nccl*
 
 FROM dev as flash-attn-builder
-# max jobs used for build
-# flash attention version
+
 ARG flash_attn_version=v2.5.8
 ENV FLASH_ATTN_VERSION=${flash_attn_version}
 
 WORKDIR /usr/src/flash-attention-v2
 
 # Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip wheel flash-attn==${FLASH_ATTN_VERSION} \
     --no-build-isolation --no-deps --no-cache-dir
 
 FROM dev as NCCL-installer
 
 # Re-declare ARG after FROM
 ARG WORKER_CUDA_VERSION
 
 # Update and install necessary libraries
-RUN apt-get update -y \
-    && apt-get install -y wget
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends wget \
+    && rm -rf /var/lib/apt/lists/*
 
 # Install NCCL library
-RUN if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
+RUN --mount=type=cache,target=/var/cache/apt \
+    if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
         && dpkg -i cuda-keyring_1.0-1_all.deb \
         && apt-get update \
-        && apt install -y libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
+        && apt-get install -y --no-install-recommends libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
     elif [ "$WORKER_CUDA_VERSION" = "12.1.0" ]; then \
         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
         && dpkg -i cuda-keyring_1.0-1_all.deb \
         && apt-get update \
-        && apt install -y libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
+        && apt-get install -y --no-install-recommends libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
     else \
         echo "Unsupported CUDA version: $WORKER_CUDA_VERSION"; \
        exit 1; \
-    fi
+    fi \
+    && rm -rf /var/lib/apt/lists/*
 
 FROM nvidia/cuda:${WORKER_CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 
 # Re-declare ARG after FROM
 ARG WORKER_CUDA_VERSION
 
 # Update and install necessary libraries
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip curl \
+    && rm -rf /var/lib/apt/lists/*
 
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
+
 # Set working directory
 WORKDIR /vllm-workspace
 
 RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 
 RUN --mount=type=bind,from=build,src=/vllm-installation/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache dist/*.whl
 
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+    --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache /usr/src/flash-attention-v2/*.whl
 
 FROM vllm-base AS runtime
 
-# install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer modelscope tensorizer
+# Install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache accelerate hf_transfer modelscope tensorizer
 
 # Set PYTHONPATH environment variable
-ENV PYTHONPATH="/"
+ENV PYTHONPATH="/" \
+    VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"
 
 # Copy NCCL library
 COPY --from=NCCL-installer /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/x86_64-linux-gnu/libnccl.so.2
-# Set the VLLM_NCCL_SO_PATH environment variable
-ENV VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"
-
 
 # Validate the installation
 RUN python3 -c "import vllm; print(vllm.__file__)"
+
+# Add a health check
+HEALTHCHECK CMD python3 -c "import vllm" || exit 1
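Note: the base image is built once per CUDA version, and the NCCL-installer stage accepts only 11.8.0 and 12.1.0 for WORKER_CUDA_VERSION. A sketch of the matching build command, assuming vllm-base-image/ is the build context and reusing the tag format the worker Dockerfile's FROM line expects:

# Minimal sketch, assuming the build context is vllm-base-image/
DOCKER_BUILDKIT=1 docker build \
    --build-arg WORKER_CUDA_VERSION=12.1.0 \
    --target runtime \
    -t runpod/worker-vllm:base-1.0.0-cuda12.1.0 \
    vllm-base-image/

Once a container is up, the status reported by the new HEALTHCHECK can be read with docker inspect --format '{{.State.Health.Status}}' <container-id>.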