diff --git a/.github/workflows/add_label_automerge.yml b/.github/workflows/add_label_automerge.yml
deleted file mode 100644
index cd53b764c720..000000000000
--- a/.github/workflows/add_label_automerge.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: Add label on auto-merge enabled
-on:
-  pull_request_target:
-    types:
-      - auto_merge_enabled
-jobs:
-  add-label-on-auto-merge:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Add label
-        uses: actions/github-script@v5
-        with:
-          script: |
-            github.rest.issues.addLabels({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-              labels: ['ready']
-            })
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/add_label_ready_comment.yml b/.github/workflows/add_label_ready_comment.yml
deleted file mode 100644
index 729c1452af03..000000000000
--- a/.github/workflows/add_label_ready_comment.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-name: Add Ready Label on Ready Comment
-
-on:
-  issue_comment:
-    types: [created]
-
-jobs:
-  add-ready-label:
-    runs-on: ubuntu-latest
-    if: github.event.issue.pull_request && contains(github.event.comment.body, '/ready')
-    steps:
-      - name: Add label
-        uses: actions/github-script@v5
-        with:
-          script: |
-            github.rest.issues.addLabels({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-              labels: ['ready']
-            })
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml
deleted file mode 100644
index 390c88bb6530..000000000000
--- a/.github/workflows/reminder_comment.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-name: PR Reminder Comment Bot
-on:
-  pull_request_target:
-    types: [opened]
-
-jobs:
-  pr_reminder:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Remind to run full CI on PR
-        uses: actions/github-script@v6
-        with:
-          script: |
-            github.rest.issues.createComment({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: context.issue.number,
-              body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which consists a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of default ones by unblocking the steps in your `fast-check` build on Buildkite UI. \n\nOnce the PR is approved and ready to go, please make sure to run full CI as it is required to merge (or just use auto-merge).\n\n To run full CI, you can do one of these:\n- Comment `/ready` on the PR\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
-            })
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/sync-with-upstream.yml b/.github/workflows/sync-with-upstream.yml
new file mode 100644
index 000000000000..53751552f4d2
--- /dev/null
+++ b/.github/workflows/sync-with-upstream.yml
@@ -0,0 +1,84 @@
+name: "Sync with upstream"
+
+on:
+  schedule:
+    - cron: 20 4 * * *
+
+  workflow_dispatch:
+
+
+env:
+  # repo to fetch changes from
+  UPSTREAM_REPO: vllm-project/vllm
+  # branch to sync
+  BRANCH: main
+
+jobs:
+  upstream-sync:
+    name: Sync with upstream
+    runs-on: ubuntu-latest
+    permissions:
+      pull-requests: write
+      contents: write
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch upstream repo
+        run: |
+          git remote add upstream https://github.com/${UPSTREAM_REPO}
+          git fetch upstream
+
+      - name: Check diff
+        id: diff
+        shell: bash
+        run: |
+          echo 'diff<<EOF' >> $GITHUB_OUTPUT
+          git diff --stat upstream/${BRANCH} | tee -a >(cat >> $GITHUB_OUTPUT)
+          echo 'EOF' >> $GITHUB_OUTPUT
+
+      - name: Create PR
+        if: ${{ steps.diff.outputs.diff != '' }}
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          set -xeu
+
+          git_hash="$(git rev-parse upstream/${BRANCH})"
+          echo "git_hash=$git_hash" >> $GITHUB_OUTPUT
+          git_describe="$(git describe --tags upstream/${BRANCH})"
+          echo "git_describe=$git_describe" >> $GITHUB_OUTPUT
+
+          # echo 'commits<<EOF' >> $GITHUB_OUTPUT
+          # git log --oneline ..upstream/${BRANCH} >> $GITHUB_OUTPUT
+          # echo 'EOF' >> $GITHUB_OUTPUT
+
+          upstream_url="https://github.com/${UPSTREAM_REPO}"
+          upstream_branch="$upstream_url/tree/${BRANCH}"
+
+          title="Sync with upstream@${git_describe}"
+          body="Merge [${UPSTREAM_REPO}]($upstream_url):[${BRANCH}]($upstream_branch)@[${git_describe}](${upstream_url}/commit/$git_hash) into $BRANCH"
+
+          gh repo set-default $GITHUB_REPOSITORY
+          pr_number=$(gh pr list -S "Sync with upstream@" --json number --jq '.[0].number')
+
+          if [[ -z $pr_number ]]; then
+            echo "Creating PR"
+            gh pr create \
+              --head $(echo $UPSTREAM_REPO | sed 's|/|:|g'):${BRANCH} \
+              --base ${BRANCH} \
+              --label code-sync \
+              --title "$title" \
+              --body "$body" \
+              --draft \
+              --no-maintainer-edit
+            exit 0
+          fi
+
+          echo "Updating PR \#${pr_number}"
+          gh pr edit \
+            $pr_number \
+            --body "$body" \
+            --title "$title"
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
new file mode 100644
index 000000000000..2d37e232a465
--- /dev/null
+++ b/Dockerfile.ubi
@@ -0,0 +1,202 @@
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.4
+ARG PYTHON_VERSION=3.11
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+ARG PYTHON_VERSION
+
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel \
+    && microdnf clean all
+
+WORKDIR /workspace
+
+ENV LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+        which procps findutils tar vim git\
+    && microdnf clean all
+
+
+## Python Installer ############################################################
+FROM base as python-install
+
+ARG PYTHON_VERSION
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN microdnf install -y \
+    python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
+    python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U uv pip wheel && microdnf clean all
+
+
+## CUDA Base ###################################################################
+FROM python-install as cuda-base
+
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+    https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+
+RUN microdnf install -y \
+        cuda-nvcc-12-4 cuda-nvtx-12-4 cuda-libraries-devel-12-4 && \
+    microdnf clean all
+
+ENV CUDA_HOME="/usr/local/cuda"
+ENV PATH="${CUDA_HOME}/bin:${PATH}" \
+    LD_LIBRARY_PATH="${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+
+## Python cuda base #################################################################
+FROM cuda-base AS python-cuda-base
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    uv pip install \
+        -r requirements-cuda.txt
+
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+    --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+    --mount=type=bind,source=requirements-lint.txt,target=requirements-lint.txt \
+    --mount=type=bind,source=requirements-test.txt,target=requirements-test.txt \
+    uv pip install \
+        -r requirements-cuda.txt \
+        -r requirements-dev.txt
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+    uv pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+# git is required for the cutlass kernels
+RUN rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && rpm -ql epel-release && microdnf install -y git ccache && microdnf clean all
+# install build dependencies
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Make sure the cuda environment is in the PATH
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Copy the entire directory before building wheel
+COPY vllm vllm
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=.git,target=/workspace/.git \
+    env CFLAGS="-march=haswell" \
+        CXXFLAGS="$CFLAGS $CXXFLAGS" \
+        CMAKE_BUILD_TYPE=Release \
+        python3 setup.py bdist_wheel --dist-dir=dist
+
+#################### libsodium Build IMAGE ####################
+FROM base as libsodium-builder
+
+RUN microdnf install -y gcc gzip \
+    && microdnf clean all
+
+WORKDIR /usr/src/libsodium
+
+ARG LIBSODIUM_VERSION=1.0.20
+RUN curl -LO https://github.com/jedisct1/libsodium/releases/download/${LIBSODIUM_VERSION}-RELEASE/libsodium-${LIBSODIUM_VERSION}.tar.gz \
+    && tar -xzvf libsodium*.tar.gz \
+    && rm -f libsodium*.tar.gz \
+    && mv libsodium*/* ./
+
+RUN ./configure --prefix="/usr/" && make && make check
+
+## Release #####################################################################
+FROM python-install AS vllm-openai
+
+WORKDIR /workspace
+
+ENV VIRTUAL_ENV=/opt/vllm
+ENV PATH=$VIRTUAL_ENV/bin/:$PATH
+
+# Triton needs a CC compiler
+RUN microdnf install -y gcc \
+    && microdnf clean all
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+
+# Install libsodium for Tensorizer encryption
+RUN --mount=type=bind,from=libsodium-builder,src=/usr/src/libsodium,target=/usr/src/libsodium \
+    cd /usr/src/libsodium \
+    && make install
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+
+ENV HF_HUB_OFFLINE=1 \
+    PORT=8000 \
+    HOME=/home/vllm \
+    VLLM_USAGE_SOURCE=production-docker-image \
+    VLLM_WORKER_MULTIPROC_METHOD=fork
+
+# setup non-root user for OpenShift
+RUN umask 002 \
+    && useradd --uid 2000 --gid 0 vllm \
+    && chmod g+rwx $HOME /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+
+
+FROM vllm-openai as vllm-grpc-adapter
+
+USER root
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/uv \
+    uv pip install vllm-tgis-adapter==0.2.3
+
+ENV GRPC_PORT=8033
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm_tgis_adapter"]
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 000000000000..09b25dab41c0
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,28 @@
+approvers:
+  - dtrifiro
+  - fialhocoelho
+  - heyselbi
+  - joerunde
+  - maxdebayser
+  - njhill
+  - prashantgupta24
+  - RH-steve-grubb
+  - rpancham
+  - terrytangyuan
+  - vaibhavjainwiz
+  - z103cb
+  - Xaenalt
+reviewers:
+  - dtrifiro
+  - fialhocoelho
+  - heyselbi
+  - joerunde
+  - maxdebayser
+  - njhill
+  - prashantgupta24
+  - RH-steve-grubb
+  - rpancham
+  - terrytangyuan
+  - vaibhavjainwiz
+  - Xaenalt
+  - z103cb
diff --git a/extras/smoke-test.sh b/extras/smoke-test.sh
new file mode 100644
index 000000000000..f03edea4f619
--- /dev/null
+++ b/extras/smoke-test.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+set -uxo pipefail
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+export HTTP_PORT=8080
+export GRPC_PORT=8033
+
+
+function wait_for(){
+    trap "" ERR # we don't care about errors in this function
+
+    name=$1
+    shift
+    command=$@
+
+    max_retries=10
+    until $command ; do
+        echo "Waiting for $name to be up (retries_left=$max_retries)..."
+        sleep 30
+        max_retries=$((max_retries-1))
+        if [[ max_retries -le 0 ]]; then
+            echo "Timed out waiting for $name server" >&2
+            exit 1
+        fi
+    done
+}
+
+# stop the server on any errors
+trap 'kill -9 $server_pid && exit 1' ERR
+
+# spin up the OpenAI server in the background
+python -m vllm.entrypoints.openai.api_server --port $HTTP_PORT &
+server_pid=$!
+server_url="http://localhost:$HTTP_PORT"
+
+wait_for "http server" curl --verbose --connect-timeout 1 --fail-with-body --no-progress-meter "${server_url}/health"
+
+curl -v --no-progress-meter --fail-with-body \
+    "${server_url}/v1/models" | python -m json.tool
+
+curl -v --no-progress-meter --fail-with-body \
+    --header "Content-Type: application/json" \
+    --data '{
+    "prompt": "A red fedora symbolizes ",
+    "model": "facebook/opt-125m"
+}' \
+    "${server_url}/v1/completions" | python -m json.tool
+
+echo "OpenAI API success" && kill -9 $server_pid
+
+
+# spin up the grpc server in the background
+python -m vllm_tgis_adapter --grpc-port $GRPC_PORT &
+server_pid=$!
+server_url="localhost:$GRPC_PORT"
+
+# get grpcurl
+curl --no-progress-meter --location --output /tmp/grpcurl.tar.gz \
+    https://github.com/fullstorydev/grpcurl/releases/download/v1.9.1/grpcurl_1.9.1_linux_x86_64.tar.gz
+tar -xf /tmp/grpcurl.tar.gz --directory /tmp
+
+wait_for "grpc_server" grpc_healthcheck # healthcheck is part of vllm_tgis_adapter
+
+/tmp/grpcurl -v \
+    -plaintext \
+    -use-reflection \
+    -d '{ "requests": [{"text": "A red fedora symbolizes "}]}' \
+    "$server_url" \
+    fmaas.GenerationService/Generate
+
+echo "GRPC API success" && kill -9 $server_pid
diff --git a/extras/unit-tests.sh b/extras/unit-tests.sh
new file mode 100644
index 000000000000..08b2388b646e
--- /dev/null
+++ b/extras/unit-tests.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# partially copied from .buildkite/test-pipeline.yml
+set -e
+
+cd tests || exit 1
+
+# we will need to download test models off HF hub
+unset HF_HUB_OFFLINE
+
+# basic correctness
+pytest -v -s test_regression.py
+pytest -v -s async_engine
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+
+# core
+pytest -v -s core
+
+# note: distributed tests are disabled
+
+# engine tests
+pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+# entrypoint
+pytest -v -s entrypoints -m openai
+
+#inputs (note: multimodal tests are skipped)
+pytest -v -s test_inputs.py
+
+#models
+pytest -v -s models -m "not vlm"
+
+# misc
+pytest -v -s prefix_caching
+pytest -v -s samplers
+pytest -v -s test_logits_processor.py
+pytest -v -s models -m "not vlm"
+pytest -v -s worker
+VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s spec_decode
+# pytest -v -s tensorizer_loader # disabled: requires libsodium
+pytest -v -s metrics
+pytest -v -s quantization