Dockerfile (38 changes: 22 additions & 16 deletions)
@@ -2,22 +2,27 @@ ARG WORKER_CUDA_VERSION=11.8.0
 ARG BASE_IMAGE_VERSION=1.0.0
 FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base
 
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip curl \
+    && rm -rf /var/lib/apt/lists/*
 
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
+
 # Install Python dependencies
 COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r /requirements.txt
 
 # Setup for Option 2: Building the Image with the Model included
-ARG MODEL_NAME=""
-ARG TOKENIZER_NAME=""
-ARG BASE_PATH="/runpod-volume"
-ARG QUANTIZATION=""
-ARG MODEL_REVISION=""
-ARG TOKENIZER_REVISION=""
+ARG MODEL_NAME="" \
+    TOKENIZER_NAME="" \
+    BASE_PATH="/runpod-volume" \
+    QUANTIZATION="" \
+    MODEL_REVISION="" \
+    TOKENIZER_REVISION=""
 
 ENV MODEL_NAME=$MODEL_NAME \
     MODEL_REVISION=$MODEL_REVISION \
@@ -28,9 +33,8 @@ ENV MODEL_NAME=$MODEL_NAME \
     HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
     HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
     HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
-    HF_HUB_ENABLE_HF_TRANSFER=1
-
-ENV PYTHONPATH="/:/vllm-workspace"
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PYTHONPATH="/:/vllm-workspace"
 
 COPY src/download_model.py /download_model.py
 RUN --mount=type=secret,id=HF_TOKEN,required=false \
@@ -41,10 +45,12 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \
         python3 /download_model.py; \
     fi
 
-# Add source files
+# Add source files and remove download_model.py
 COPY src /src
-# Remove download_model.py
 RUN rm /download_model.py
 
+# Add a health check
+HEALTHCHECK CMD python3 -c "import vllm" || exit 1
+
 # Start the handler
 CMD ["python3", "/src/handler.py"]
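Note: every RUN --mount instruction introduced in this diff (the apt, uv, ccache, and secret mounts) requires BuildKit. As a quick sanity check for Option 2, a worker image with the model baked in might be built along these lines; the image tag, model id, and token path below are illustrative assumptions, while the build args and the HF_TOKEN secret id come from the Dockerfile above:

# Minimal sketch, assuming an example model and a local token file
DOCKER_BUILDKIT=1 docker build \
    --build-arg MODEL_NAME="mistralai/Mistral-7B-Instruct-v0.2" \
    --build-arg MODEL_REVISION="main" \
    --secret id=HF_TOKEN,src=./hf_token.txt \
    -t worker-vllm:dev-with-model .

Since the secret mount is declared with required=false, omitting --secret still builds, which matters for public models.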
vllm-base-image/Dockerfile (120 changes: 64 additions & 56 deletions)
@@ -14,24 +14,29 @@ FROM nvidia/cuda:${WORKER_CUDA_VERSION}-devel-ubuntu22.04 AS dev
 ARG WORKER_CUDA_VERSION
 
 # Update and install dependencies
-RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip git curl \
+    && rm -rf /var/lib/apt/lists/*
 
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
+
 # Set working directory
 WORKDIR /vllm-installation
 
 RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 
 # Install build and runtime dependencies
-COPY vllm/requirements-common.txt requirements-common.txt
-COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-cuda.txt
+COPY vllm/requirements-common.txt vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt ./
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r requirements-common.txt -r requirements-cuda${WORKER_CUDA_VERSION}.txt
 
 # Install development dependencies
-COPY vllm/requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+COPY vllm/requirements-dev.txt .
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r requirements-dev.txt
 
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
@@ -42,108 +47,111 @@ FROM dev AS build
 ARG WORKER_CUDA_VERSION
 
 # Install build dependencies
-COPY vllm/requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+COPY vllm/requirements-build.txt .
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache -r requirements-build.txt
 
-# install compiler cache to speed up compilation leveraging local or remote caching
-RUN apt-get update -y && apt-get install -y ccache
+# Install compiler cache
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y && apt-get install -y --no-install-recommends ccache \
+    && rm -rf /var/lib/apt/lists/*
 
 # Copy necessary files
 COPY vllm/csrc csrc
-COPY vllm/setup.py setup.py
-COPY vllm/cmake cmake
-COPY vllm/CMakeLists.txt CMakeLists.txt
-COPY vllm/requirements-common.txt requirements-common.txt
-COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
-COPY vllm/pyproject.toml pyproject.toml
+COPY vllm/setup.py vllm/cmake vllm/CMakeLists.txt vllm/pyproject.toml ./
 COPY vllm/vllm vllm
 
 # Set environment variables for building extensions
-ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
-ENV VLLM_INSTALL_PUNICA_KERNELS=0
+ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION} \
+    VLLM_INSTALL_PUNICA_KERNELS=0 \
+    CCACHE_DIR=/root/.cache/ccache
 
 # Build extensions
-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     python3 setup.py bdist_wheel --dist-dir=dist
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip cache remove vllm_nccl*
 
 FROM dev as flash-attn-builder
-# max jobs used for build
-# flash attention version
+
 ARG flash_attn_version=v2.5.8
 ENV FLASH_ATTN_VERSION=${flash_attn_version}
 
 WORKDIR /usr/src/flash-attention-v2
 
 # Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip wheel flash-attn==${FLASH_ATTN_VERSION} \
     --no-build-isolation --no-deps --no-cache-dir
 
 FROM dev as NCCL-installer
 
 # Re-declare ARG after FROM
 ARG WORKER_CUDA_VERSION
 
 # Update and install necessary libraries
-RUN apt-get update -y \
-    && apt-get install -y wget
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends wget \
+    && rm -rf /var/lib/apt/lists/*
 
 # Install NCCL library
-RUN if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
+RUN --mount=type=cache,target=/var/cache/apt \
+    if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
         && dpkg -i cuda-keyring_1.0-1_all.deb \
         && apt-get update \
-        && apt install -y libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
+        && apt-get install -y --no-install-recommends libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
     elif [ "$WORKER_CUDA_VERSION" = "12.1.0" ]; then \
         wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
         && dpkg -i cuda-keyring_1.0-1_all.deb \
         && apt-get update \
-        && apt install -y libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
+        && apt-get install -y --no-install-recommends libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
     else \
         echo "Unsupported CUDA version: $WORKER_CUDA_VERSION"; \
        exit 1; \
-    fi
+    fi \
+    && rm -rf /var/lib/apt/lists/*
 
 FROM nvidia/cuda:${WORKER_CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
 
 # Re-declare ARG after FROM
 ARG WORKER_CUDA_VERSION
 
 # Update and install necessary libraries
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip curl \
+    && rm -rf /var/lib/apt/lists/*
 
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
+
 # Set working directory
 WORKDIR /vllm-workspace
 
 RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 
 RUN --mount=type=bind,from=build,src=/vllm-installation/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache dist/*.whl
 
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+    --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache /usr/src/flash-attention-v2/*.whl
 
 FROM vllm-base AS runtime
 
-# install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer modelscope tensorizer
+# Install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system --no-cache accelerate hf_transfer modelscope tensorizer
 
 # Set PYTHONPATH environment variable
-ENV PYTHONPATH="/"
+ENV PYTHONPATH="/" \
+    VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"
 
 # Copy NCCL library
 COPY --from=NCCL-installer /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/x86_64-linux-gnu/libnccl.so.2
-# Set the VLLM_NCCL_SO_PATH environment variable
-ENV VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"
-
 
 # Validate the installation
 RUN python3 -c "import vllm; print(vllm.__file__)"
+
+# Add a health check
+HEALTHCHECK CMD python3 -c "import vllm" || exit 1
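Note: the base image is built once per CUDA version, and the NCCL-installer stage accepts only 11.8.0 and 12.1.0 for WORKER_CUDA_VERSION. A sketch of the matching build command, assuming vllm-base-image/ is the build context and reusing the tag format the worker Dockerfile's FROM line expects:

# Minimal sketch, assuming the build context is vllm-base-image/
DOCKER_BUILDKIT=1 docker build \
    --build-arg WORKER_CUDA_VERSION=12.1.0 \
    --target runtime \
    -t runpod/worker-vllm:base-1.0.0-cuda12.1.0 \
    vllm-base-image/

Once a container is up, the status reported by the new HEALTHCHECK can be read with docker inspect --format '{{.State.Health.Status}}' <container-id>.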