Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
05921a9
Changed scheduler to use deques instead of lists (#2290)
NadavShmayo Jan 7, 2024
c884819
Fix eager mode performance (#2377)
WoosukKwon Jan 8, 2024
28c3f12
[Minor] Remove unused code in attention (#2384)
WoosukKwon Jan 8, 2024
74cd5ab
Add baichuan chat template jinjia file (#2390)
kky42 Jan 9, 2024
79d64c4
[Speculative decoding 1/9] Optimized rejection sampler (#2336)
cadedaniel Jan 9, 2024
4b61c6b
`get_ip()`: Fix ipv4 ipv6 dualstack (#2408)
yunfeng-scale Jan 10, 2024
50376fa
Rename phi_1_5 -> phi (#2385)
WoosukKwon Jan 12, 2024
b20ed29
multi step worker
cadedaniel Jan 12, 2024
6549aef
[DOC] Add additional comments for LLMEngine and AsyncLLMEngine (#1011)
litone01 Jan 12, 2024
f745847
[Minor] Fix the format in quick start guide related to Model Scope (#…
zhuohan123 Jan 12, 2024
9746058
Add gradio chatbot for openai webserver (#2307)
arkohut Jan 12, 2024
48cf1e4
fix: deque mutated during iteration in abort_seq_group (#2371)
chenxu2048 Jan 12, 2024
ce03624
Allow setting fastapi root_path argument (#2341)
chiragjn Jan 12, 2024
7878958
Address Phi modeling update 2 (#2428)
huiwy Jan 12, 2024
cb7a1c1
Suggest using dtype=half when OOM.
aiopx2024 Jan 12, 2024
827cbcd
Update quickstart.rst (#2369)
nautsimon Jan 12, 2024
218dc2c
Aligning `top_p` and `top_k` Sampling (#1885)
chenxu2048 Jan 12, 2024
35c4bc2
[Minor] Fix err msg (#2431)
WoosukKwon Jan 12, 2024
9f659bf
[Minor] Optimize cuda graph memory usage (#2437)
esmeetu Jan 14, 2024
6e01e8c
[CI] Add Buildkite (#2355)
simon-mo Jan 14, 2024
2a18da2
Announce the second vLLM meetup (#2444)
WoosukKwon Jan 15, 2024
bfc072a
Allow buildkite to retry build on agent lost (#2446)
simon-mo Jan 15, 2024
f780504
fix weigit loading for GQA with TP (#2379)
zhangch9 Jan 15, 2024
947f0b2
CI: make sure benchmark script exit on error (#2449)
simon-mo Jan 16, 2024
8cd5a99
ci: retry on build failure as well (#2457)
simon-mo Jan 16, 2024
e1957c6
Add StableLM3B model (#2372)
ita9naiwa Jan 17, 2024
14cc317
OpenAI Server refactoring (#2360)
FlorianJoncour Jan 17, 2024
5ffceca
fix block table size miscalculation
cadedaniel Jan 17, 2024
6a22d07
Merge remote-tracking branch 'upstream/main' into public-vllm-upstrea…
cadedaniel Jan 17, 2024
0e1c3b3
lint
cadedaniel Jan 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .buildkite/run-benchmarks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# This script is run by buildkite to run the benchmarks and upload the results to buildkite

set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
# NOTE: $? after a pipeline is the status of the LAST command (tee, which
# always succeeds), so read the benchmark's real exit code from PIPESTATUS.
bench_latency_exit_code=${PIPESTATUS[0]}

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=${PIPESTATUS[0]}

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
    exit $bench_latency_exit_code
fi

if [ $bench_throughput_exit_code -ne 0 ]; then
    exit $bench_throughput_exit_code
fi
44 changes: 44 additions & 0 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This file will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional; the template defaults to /vllm-workspace/tests

- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
  command: pytest -v -s engine

- label: Entrypoints Test
  command: pytest -v -s entrypoints

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true # a failure here will not fail the whole build

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true

- label: Samplers Test
  command: pytest -v -s samplers --forked

- label: Worker Test
  command: pytest -v -s worker

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  # multiple commands are joined with " && " by the template
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
54 changes: 54 additions & 0 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{# Jinja template that expands the steps listed in test-pipeline.yaml into a
   full Buildkite pipeline: one docker build step, then one kubernetes-backed
   test step per entry. #}
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    commands:
      - "docker build --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
  - wait

{% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
    plugins:
      - kubernetes:
          podSpec:
            {# /dev/shm backing for torch/nccl shared-memory use #}
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - "-c"
                  {# a step provides either `command` (string) or `commands` (list) #}
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                env:
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
{% endfor %}
36 changes: 24 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip
&& apt-get install -y python3-pip git

WORKDIR /workspace

Expand All @@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################


# image to build pytorch extensions
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
Expand All @@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
Expand All @@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base

Expand All @@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
Expand All @@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ Easy, fast, and cheap LLM serving for everyone

---

**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)**

We are thrilled to announce our second vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations.
Please register [here](https://lu.ma/ygxbpzhl) and join us!

---

*Latest News* 🔥
- [2023/12] Added ROCm support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
Expand Down Expand Up @@ -68,6 +77,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
Expand Down
43 changes: 36 additions & 7 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,22 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

import os
import sys
from sphinx.ext import autodoc
import logging

sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))

logger = logging.getLogger(__name__)

# -- Project information -----------------------------------------------------

project = 'vLLM'
copyright = '2023, vLLM Team'
author = 'the vLLM Team'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
Expand All @@ -32,6 +35,8 @@
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
]

# Add any paths that contain templates here, relative to this directory.
Expand All @@ -55,7 +60,6 @@
html_theme = 'sphinx_book_theme'
html_logo = 'assets/logos/vllm-logo-text-light.png'
html_theme_options = {
'logo_only': True,
'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True,
Expand All @@ -64,4 +68,29 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# html_static_path = ['_static']

# Mock out external dependencies here so autodoc can import vllm modules
# without the heavyweight (or compiled) packages being installed.
autodoc_mock_imports = [
    "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
    "vllm.cuda_utils", "vllm._C"
]

# autodoc_mock_imports only takes effect for modules that have NOT been
# imported yet, so warn about any target already present in sys.modules.
for mock_target in autodoc_mock_imports:
    if mock_target in sys.modules:
        # Use lazy %-style logging args instead of an f-string so the
        # message is only formatted when the record is actually emitted.
        logger.info(
            "Potentially problematic mock target (%s) found; "
            "autodoc_mock_imports cannot mock modules that have already "
            "been loaded into sys.modules when the sphinx build starts.",
            mock_target)


class MockedClassDocumenter(autodoc.ClassDocumenter):
    """Remove note about base class when a class is derived from object."""

    def add_line(self, line: str, source: str, *lineno: int) -> None:
        # Drop exactly the "Bases: object" line autodoc emits for classes
        # that derive only from object — it adds noise to the rendered docs.
        if line == "   Bases: :py:class:`object`":
            return
        super().add_line(line, source, *lineno)


# Monkey-patch so sphinx uses the filtered documenter for every class.
autodoc.ClassDocumenter = MockedClassDocumenter
7 changes: 7 additions & 0 deletions docs/source/dev/engine/async_llm_engine.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

AsyncLLMEngine
=================================

.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
:members: generate, abort
:show-inheritance:
13 changes: 13 additions & 0 deletions docs/source/dev/engine/engine_index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
vLLM Engine
=================================

.. automodule:: vllm.engine
.. currentmodule:: vllm.engine

.. toctree::
:maxdepth: 2
:caption: Engines

llm_engine
async_llm_engine

6 changes: 6 additions & 0 deletions docs/source/dev/engine/llm_engine.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
LLMEngine
=================================

.. autoclass:: vllm.engine.llm_engine.LLMEngine
:members: add_request, abort_request, step, _init_cache
:show-inheritance:
Loading