6 changes: 6 additions & 0 deletions .tekton/vllm-cuda-v2-19-push.yaml
@@ -30,6 +30,10 @@ spec:
value: Dockerfile.ubi
- name: path-context
value: .
- name: additional-build-secret
value: rhel-ai-private-index-auth
- name: build-args-file
value: argfile.konflux
taskRunSpecs:
- pipelineTaskName: ecosystem-cert-preflight-checks
computeResources:
@@ -294,6 +298,8 @@ spec:
- $(params.build-platforms)
name: build-images
params:
- name: ADDITIONAL_SECRET
value: $(params.additional-build-secret)
- name: IMAGE
value: $(params.output-image)
- name: DOCKERFILE
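For context on the two new params: in Konflux's buildah task, ADDITIONAL_SECRET mounts the named Kubernetes secret into the build so that Dockerfile RUN steps can read its keys as files, and build-args-file points at a KEY=VALUE file whose entries become --build-arg values. A minimal sketch of the consuming side, mirroring the secret mount used later in this diff (treating the mount path layout as /run/secrets/<secret-name>/<key> is an assumption about the task's mounting convention):

    # Inside a `RUN --mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT ...` step,
    # the secret value is exposed as a file, not an env var:
    BOT_PAT="$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT)"
    # Use $BOT_PAT for authenticated index access only; never print it or write it into a layer.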
138 changes: 47 additions & 91 deletions Dockerfile.ubi
@@ -1,12 +1,9 @@
## Global Args #################################################################
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
ARG PYTHON_VERSION=3.12

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
ARG BASE_UBI_IMAGE_TAG
ARG PYTHON_VERSION

## Base Layer ##################################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf -y update && microdnf install -y --nodocs \
@@ -19,25 +16,28 @@ ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8

# Some utils for dev purposes - tar required for kubectl cp

RUN microdnf install -y --nodocs \
which procps findutils tar vim git\
which procps findutils tar vim git \
&& microdnf clean all


## Python Installer ############################################################
FROM base as python-install
FROM base AS python-install
ARG PYTHON_VERSION

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y --nodocs \
python${PYTHON_VERSION}-devel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && \
pip install --no-cache -U pip wheel uv && \
microdnf clean all


## CUDA Base ###################################################################
FROM python-install as cuda-base
FROM python-install AS cuda-base

RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
@@ -51,88 +51,30 @@ RUN microdnf install -y --nodocs \
ln -s ${CUDA_HOME}/lib64/stubs/libcuda.so /usr/lib64/



## Python cuda base #################################################################
FROM cuda-base AS python-cuda-base

ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"

# install cuda and common dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
uv pip install \
-r requirements-cuda.txt


## Development #################################################################
FROM python-cuda-base AS dev

# install build and runtime dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
--mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
--mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
--mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
uv pip install \
-r requirements-cuda.txt \
-r requirements-dev.txt

## Builder #####################################################################
FROM dev AS build

# install build dependencies
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
uv pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# git is required for the cutlass kernels
RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y --nodocs git ccache && microdnf clean all

COPY . .

ARG TORCH_CUDA_ARCH_LIST
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ARG vllm_fa_cmake_gpu_arches
ENV VLLM_FA_CMAKE_GPU_ARCHES=${vllm_fa_cmake_gpu_arches}

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Make sure the cuda environment is in the PATH
ENV PATH=/usr/local/cuda/bin:$PATH

ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=.git,target=/workspace/.git \
env CFLAGS="-march=haswell" \
CXXFLAGS="$CFLAGS $CXXFLAGS" \
CMAKE_BUILD_TYPE=Release \
python3 setup.py bdist_wheel --dist-dir=dist

#################### libsodium Build IMAGE ####################
FROM base as libsodium-builder
FROM base AS libsodium-builder

RUN microdnf install -y --nodocs gcc gzip \
&& microdnf clean all

WORKDIR /usr/src/libsodium

ARG LIBSODIUM_VERSION=1.0.20
ARG LIBSODIUM_VERSION
RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
&& tar -xzvf libsodium*.tar.gz \
&& rm -f libsodium*.tar.gz \
Expand All @@ -156,25 +98,32 @@ ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nv
ENV LD_LIBRARY_PATH="${VIRTUAL_ENV}/lib/python${PYTHON_VERSION}/site-packages/nvidia/nvtx/lib:${LD_LIBRARY_PATH}"

# Triton needs a CC compiler

RUN microdnf install -y --nodocs gcc \
rsync \
&& microdnf clean all

# install vllm wheel first, so that torch etc will be installed
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install "$(echo dist/*.whl)[tensorizer]" --verbose

# Install libsodium for Tensorizer encryption
RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
cd /usr/src/libsodium \
&& make install
make -C /usr/src/libsodium install

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
uv pip install \
"https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.0.post2/flashinfer_python-0.2.0.post2+cu124torch2.5-cp312-cp312-linux_x86_64.whl"
COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/

# install vllm by running the payload script and then install flashinfer

ARG VLLM_WHEEL_VERSION
ARG VLLM_WHEEL_INDEX
ARG FLASHINFER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
env BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh && \
uv pip install "${FLASHINFER_VERSION}"

ENV HF_HUB_OFFLINE=1 \
HOME=/home/vllm \
@@ -199,25 +148,32 @@ ENV HF_HUB_OFFLINE=1 \
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /home/vllm && \
chmod g+rwx /home/vllm /usr/src /workspace

COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/
chmod g+rwx /home/vllm

USER 2000
WORKDIR /home/vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]


FROM vllm-openai as vllm-grpc-adapter
## TGIS Adapter layer #####################################################################
FROM vllm-openai AS vllm-grpc-adapter

USER root

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
HOME=/root uv pip install "$(echo /workspace/dist/*.whl)[tensorizer]" vllm-tgis-adapter==0.6.3
ARG VLLM_WHEEL_VERSION
ARG VLLM_WHEEL_INDEX
ARG VLLM_TGIS_ADAPTER_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=payload,target=/workspace/payload \
--mount=type=secret,id=rhel-ai-private-index-auth/BOT_PAT \
cd /workspace && \
env HOME=/root \
BOT_PAT=$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT) \
VLLM_WHEEL_VERSION=${VLLM_WHEEL_VERSION} \
VLLM_TGIS_ADAPTER_VERSION=${VLLM_TGIS_ADAPTER_VERSION} \
VLLM_WHEEL_INDEX=${VLLM_WHEEL_INDEX} \
./payload/run.sh


ENV GRPC_PORT=8033 \
PORT=8000 \
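A rough local equivalent of the pipeline build, for anyone iterating on this Dockerfile outside Konflux. Assumptions: podman with build-secret support, a token stored at ~/.secrets/bot_pat, and that podman accepts the slashed secret id used by the mounts above; none of this is pipeline-verified:

    # Expand argfile.konflux (shown in the next file) into --build-arg flags.
    args=()
    while IFS= read -r kv; do
      [ -n "$kv" ] && args+=(--build-arg "$kv")
    done < argfile.konflux

    podman build "${args[@]}" \
      --secret id=rhel-ai-private-index-auth/BOT_PAT,src="$HOME/.secrets/bot_pat" \
      -f Dockerfile.ubi -t vllm-cuda:dev .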
7 changes: 7 additions & 0 deletions argfile.konflux
@@ -0,0 +1,7 @@
BASE_UBI_IMAGE_TAG=9.5-1739420147
PYTHON_VERSION=3.11
LIBSODIUM_VERSION=1.0.20
VLLM_TGIS_ADAPTER_VERSION=0.6.3
FLASHINFER_VERSION=https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post1/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
VLLM_WHEEL_VERSION=0.7.2
VLLM_WHEEL_INDEX=https://gitlab.com/api/v4/projects/66664052/packages/pypi/simple
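Because these lines become ARG defaults via build-args-file, a malformed entry fails late and cryptically; a quick local check that every non-empty line parses as KEY=VALUE can catch that early (a convenience sketch, not part of the PR):

    awk -F= '$0 == "" { next } NF < 2 || $1 !~ /^[A-Za-z_][A-Za-z0-9_]*$/ { bad = 1; print "bad line: " $0 } END { exit bad }' argfile.konflux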
34 changes: 34 additions & 0 deletions payload/run.sh
@@ -0,0 +1,34 @@
#!/bin/bash
# required env vars:
# $BOT_PAT
# $VLLM_WHEEL_INDEX
# optional:
# $VLLM_TGIS_ADAPTER_VERSION
# $VLLM_WHEEL_VERSION
set -ex

cat <<EOF > ${HOME}/.netrc
machine gitlab.com
login rhel-ai-wheels-prefetch-token-rhoai
password $BOT_PAT
EOF

trap "rm ${HOME}/.netrc" EXIT

# https://docs.astral.sh/uv/configuration/indexes/#searching-across-multiple-indexes
# This will prefer to use the custom index, and fall back to pypi if needed
export UV_EXTRA_INDEX_URL=${VLLM_WHEEL_INDEX}

[Inline review comment] This won't work: RHEL AI wheels are an all-or-nothing solution and cannot be mixed with PyPI wheels.

export UV_INDEX_STRATEGY=unsafe-first-match
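# Note (assumption, based on uv's documented index strategies): unsafe-first-match
# resolves each package against all configured indexes and takes the first index
# offering a compatible version, which is what enables the PyPI fallback above.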

vllm="vllm[tensorizer,audio,video]"

if [[ -n "$VLLM_TGIS_ADAPTER_VERSION" ]]; then
vllm_tgis_adapter="vllm-tgis-adapter==${VLLM_TGIS_ADAPTER_VERSION}"
fi

if [[ -n "$VLLM_WHEEL_VERSION" ]]; then
vllm="${vllm}==${$VLLM_WHEEL_VERSION}"
fi

uv pip install $vllm $vllm_tgis_adapter
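
For reference, a sketch of how the two Dockerfile stages above drive this script; the env var names match the RUN steps, and the version and index values are the ones pinned in argfile.konflux:

    # vllm-openai stage: install a pinned vllm from the private index.
    env BOT_PAT="$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT)" \
        VLLM_WHEEL_VERSION=0.7.2 \
        VLLM_WHEEL_INDEX=https://gitlab.com/api/v4/projects/66664052/packages/pypi/simple \
        ./payload/run.sh

    # vllm-grpc-adapter stage: additionally pin the TGIS adapter.
    env BOT_PAT="$(cat /run/secrets/rhel-ai-private-index-auth/BOT_PAT)" \
        VLLM_WHEEL_VERSION=0.7.2 \
        VLLM_TGIS_ADAPTER_VERSION=0.6.3 \
        VLLM_WHEEL_INDEX=https://gitlab.com/api/v4/projects/66664052/packages/pypi/simple \
        ./payload/run.sh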