From 19c0de7218fd6ccbfd980ae92b4866462aeb70f4 Mon Sep 17 00:00:00 2001
From: Patrick
Date: Mon, 5 Aug 2024 09:16:07 -0700
Subject: [PATCH] introduces UV

---
 Dockerfile                 |  38 +++++++-----
 vllm-base-image/Dockerfile | 125 ++++++++++++++++++++-----------------
 2 files changed, 90 insertions(+), 73 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f48b832..9a34013 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,22 +2,27 @@ ARG WORKER_CUDA_VERSION=11.8.0
 ARG BASE_IMAGE_VERSION=1.0.0
 FROM runpod/worker-vllm:base-${BASE_IMAGE_VERSION}-cuda${WORKER_CUDA_VERSION} AS vllm-base
 
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
 
 # Install Python dependencies
 COPY builder/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system -r /requirements.txt
 
 # Setup for Option 2: Building the Image with the Model included
-ARG MODEL_NAME=""
-ARG TOKENIZER_NAME=""
-ARG BASE_PATH="/runpod-volume"
-ARG QUANTIZATION=""
-ARG MODEL_REVISION=""
-ARG TOKENIZER_REVISION=""
+ARG MODEL_NAME="" \
+    TOKENIZER_NAME="" \
+    BASE_PATH="/runpod-volume" \
+    QUANTIZATION="" \
+    MODEL_REVISION="" \
+    TOKENIZER_REVISION=""
 
 ENV MODEL_NAME=$MODEL_NAME \
     MODEL_REVISION=$MODEL_REVISION \
@@ -28,9 +33,8 @@ ENV MODEL_NAME=$MODEL_NAME \
     HF_DATASETS_CACHE="${BASE_PATH}/huggingface-cache/datasets" \
     HUGGINGFACE_HUB_CACHE="${BASE_PATH}/huggingface-cache/hub" \
     HF_HOME="${BASE_PATH}/huggingface-cache/hub" \
-    HF_HUB_ENABLE_HF_TRANSFER=1
-
-ENV PYTHONPATH="/:/vllm-workspace"
+    HF_HUB_ENABLE_HF_TRANSFER=1 \
+    PYTHONPATH="/:/vllm-workspace"
 
 COPY src/download_model.py /download_model.py
 RUN --mount=type=secret,id=HF_TOKEN,required=false \
@@ -41,10 +45,12 @@ RUN --mount=type=secret,id=HF_TOKEN,required=false \
         python3 /download_model.py; \
     fi
 
-# Add source files
+# Add source files and remove download_model.py
 COPY src /src
-# Remove download_model.py
 RUN rm /download_model.py
 
+# Add a health check
+HEALTHCHECK CMD python3 -c "import vllm" || exit 1
+
 # Start the handler
 CMD ["python3", "/src/handler.py"]
\ No newline at end of file
diff --git a/vllm-base-image/Dockerfile b/vllm-base-image/Dockerfile
index 39f751a..e167655 100644
--- a/vllm-base-image/Dockerfile
+++ b/vllm-base-image/Dockerfile
@@ -14,8 +14,14 @@ FROM nvidia/cuda:${WORKER_CUDA_VERSION}-devel-ubuntu22.04 AS dev
 ARG WORKER_CUDA_VERSION
 
 # Update and install dependencies
-RUN apt-get update -y \
-    && apt-get install -y python3-pip git
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip git curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
 
 # Set working directory
 WORKDIR /vllm-installation
@@ -23,15 +29,14 @@ WORKDIR /vllm-installation
 RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 
 # Install build and runtime dependencies
-COPY vllm/requirements-common.txt requirements-common.txt
-COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-cuda.txt
+COPY vllm/requirements-common.txt vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt ./
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system -r requirements-common.txt -r requirements-cuda${WORKER_CUDA_VERSION}.txt
 
 # Install development dependencies
-COPY vllm/requirements-dev.txt requirements-dev.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+COPY vllm/requirements-dev.txt .
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system -r requirements-dev.txt
 
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
@@ -42,108 +47,114 @@ FROM dev AS build
 ARG WORKER_CUDA_VERSION
 
 # Install build dependencies
-COPY vllm/requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
+COPY vllm/requirements-build.txt .
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system -r requirements-build.txt
 
-# install compiler cache to speed up compilation leveraging local or remote caching
-RUN apt-get update -y && apt-get install -y ccache
+# Install compiler cache
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y && apt-get install -y --no-install-recommends ccache \
+    && rm -rf /var/lib/apt/lists/*
 
 # Copy necessary files
 COPY vllm/csrc csrc
-COPY vllm/setup.py setup.py
-COPY vllm/cmake cmake
-COPY vllm/CMakeLists.txt CMakeLists.txt
-COPY vllm/requirements-common.txt requirements-common.txt
-COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
-COPY vllm/pyproject.toml pyproject.toml
-COPY vllm/vllm vllm
+COPY vllm/setup.py vllm/CMakeLists.txt vllm/pyproject.toml ./
+COPY vllm/requirements-common.txt requirements-common.txt
+COPY vllm/requirements-cuda${WORKER_CUDA_VERSION}.txt requirements-cuda.txt
+COPY vllm/cmake cmake
+COPY vllm/vllm vllm
 
 # Set environment variables for building extensions
-ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
-ENV VLLM_INSTALL_PUNICA_KERNELS=0
+ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION} \
+    VLLM_INSTALL_PUNICA_KERNELS=0 \
+    CCACHE_DIR=/root/.cache/ccache
+
 # Build extensions
-ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
     python3 setup.py bdist_wheel --dist-dir=dist
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv cache clean vllm-nccl-cu11 vllm-nccl-cu12
 
 FROM dev as flash-attn-builder
-# max jobs used for build
-# flash attention version
+
 ARG flash_attn_version=v2.5.8
 ENV FLASH_ATTN_VERSION=${flash_attn_version}
 
 WORKDIR /usr/src/flash-attention-v2
 
 # Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
-    --no-build-isolation --no-deps --no-cache-dir
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps
 
 FROM dev as NCCL-installer
-# Re-declare ARG after FROM
 ARG WORKER_CUDA_VERSION
 
-# Update and install necessary libraries
-RUN apt-get update -y \
-    && apt-get install -y wget
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends wget \
+    && rm -rf /var/lib/apt/lists/*
 
 # Install NCCL library
-RUN if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
+RUN --mount=type=cache,target=/var/cache/apt \
+    if [ "$WORKER_CUDA_VERSION" = "11.8.0" ]; then \
     wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
     && dpkg -i cuda-keyring_1.0-1_all.deb \
    && apt-get update \
-    && apt install -y libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
+    && apt-get install -y --no-install-recommends libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8; \
     elif [ "$WORKER_CUDA_VERSION" = "12.1.0" ]; then \
     wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \
     && dpkg -i cuda-keyring_1.0-1_all.deb \
     && apt-get update \
-    && apt install -y libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
+    && apt-get install -y --no-install-recommends libnccl2=2.17.1-1+cuda12.1 libnccl-dev=2.17.1-1+cuda12.1; \
     else \
     echo "Unsupported CUDA version: $WORKER_CUDA_VERSION"; \
     exit 1; \
-    fi
+    fi \
+    && rm -rf /var/lib/apt/lists/*
 
 FROM nvidia/cuda:${WORKER_CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
-# Re-declare ARG after FROM
 ARG WORKER_CUDA_VERSION
 
-# Update and install necessary libraries
-RUN apt-get update -y \
-    && apt-get install -y python3-pip
+RUN --mount=type=cache,target=/var/cache/apt \
+    apt-get update -y \
+    && apt-get install -y --no-install-recommends python3-pip curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install UV
+ADD --chmod=755 https://astral.sh/uv/install.sh /install.sh
+RUN /install.sh && rm /install.sh
 
-# Set working directory
 WORKDIR /vllm-workspace
 
 RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 
 RUN --mount=type=bind,from=build,src=/vllm-installation/dist,target=/vllm-workspace/dist \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system dist/*.whl
 
 RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+    --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system /usr/src/flash-attention-v2/*.whl
 
 FROM vllm-base AS runtime
 
-# install additional dependencies for openai api server
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer modelscope tensorizer
+# Install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/uv \
+    /root/.cargo/bin/uv pip install --system accelerate hf_transfer modelscope tensorizer
 
-# Set PYTHONPATH environment variable
-ENV PYTHONPATH="/"
+ENV PYTHONPATH="/" \
+    VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"
 
 # Copy NCCL library
 COPY --from=NCCL-installer /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/lib/x86_64-linux-gnu/libnccl.so.2
 
-# Set the VLLM_NCCL_SO_PATH environment variable
-ENV VLLM_NCCL_SO_PATH="/usr/lib/x86_64-linux-gnu/libnccl.so.2"
-
 # Validate the installation
-RUN python3 -c "import vllm; print(vllm.__file__)"
\ No newline at end of file
+RUN python3 -c "import vllm; print(vllm.__file__)"
+
+# Add a health check
+HEALTHCHECK CMD python3 -c "import vllm" || exit 1
\ No newline at end of file