Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
05921a9
Changed scheduler to use deques instead of lists (#2290)
NadavShmayo Jan 7, 2024
c884819
Fix eager mode performance (#2377)
WoosukKwon Jan 8, 2024
28c3f12
[Minor] Remove unused code in attention (#2384)
WoosukKwon Jan 8, 2024
74cd5ab
Add baichuan chat template jinjia file (#2390)
kky42 Jan 9, 2024
79d64c4
[Speculative decoding 1/9] Optimized rejection sampler (#2336)
cadedaniel Jan 9, 2024
4b61c6b
`get_ip()`: Fix ipv4 ipv6 dualstack (#2408)
yunfeng-scale Jan 10, 2024
50376fa
Rename phi_1_5 -> phi (#2385)
WoosukKwon Jan 12, 2024
b20ed29
multi step worker
cadedaniel Jan 12, 2024
6549aef
[DOC] Add additional comments for LLMEngine and AsyncLLMEngine (#1011)
litone01 Jan 12, 2024
f745847
[Minor] Fix the format in quick start guide related to Model Scope (#…
zhuohan123 Jan 12, 2024
9746058
Add gradio chatbot for openai webserver (#2307)
arkohut Jan 12, 2024
48cf1e4
fix: deque mutated during iteration in abort_seq_group (#2371)
chenxu2048 Jan 12, 2024
ce03624
Allow setting fastapi root_path argument (#2341)
chiragjn Jan 12, 2024
7878958
Address Phi modeling update 2 (#2428)
huiwy Jan 12, 2024
cb7a1c1
Suggest using dtype=half when OOM.
aiopx2024 Jan 12, 2024
827cbcd
Update quickstart.rst (#2369)
nautsimon Jan 12, 2024
218dc2c
Aligning `top_p` and `top_k` Sampling (#1885)
chenxu2048 Jan 12, 2024
35c4bc2
[Minor] Fix err msg (#2431)
WoosukKwon Jan 12, 2024
9f659bf
[Minor] Optimize cuda graph memory usage (#2437)
esmeetu Jan 14, 2024
6e01e8c
[CI] Add Buildkite (#2355)
simon-mo Jan 14, 2024
2a18da2
Announce the second vLLM meetup (#2444)
WoosukKwon Jan 15, 2024
bfc072a
Allow buildkite to retry build on agent lost (#2446)
simon-mo Jan 15, 2024
f780504
fix weigit loading for GQA with TP (#2379)
zhangch9 Jan 15, 2024
947f0b2
CI: make sure benchmark script exit on error (#2449)
simon-mo Jan 16, 2024
8cd5a99
ci: retry on build failure as well (#2457)
simon-mo Jan 16, 2024
e1957c6
Add StableLM3B model (#2372)
ita9naiwa Jan 17, 2024
14cc317
OpenAI Server refactoring (#2360)
FlorianJoncour Jan 17, 2024
5ffceca
fix block table size miscalculation
cadedaniel Jan 17, 2024
6a22d07
Merge remote-tracking branch 'upstream/main' into public-vllm-upstrea…
cadedaniel Jan 17, 2024
0e1c3b3
lint
cadedaniel Jan 17, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions .buildkite/run-benchmarks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# This script is run by buildkite to run the benchmarks and upload the results to buildkite

set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
# NOTE: $? after a pipeline is the status of the LAST command (tee, which
# always succeeds), so read the benchmark's real exit code from PIPESTATUS.
bench_latency_exit_code=${PIPESTATUS[0]}

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=${PIPESTATUS[0]}

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
    exit $bench_latency_exit_code
fi

if [ $bench_throughput_exit_code -ne 0 ]; then
    exit $bench_throughput_exit_code
fi
44 changes: 44 additions & 0 deletions .buildkite/test-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This file will be fed into the Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional; the template defaults to /vllm-workspace/tests

- label: AsyncEngine Test
  command: pytest -v -s async_engine

- label: Distributed Test
  command: pytest -v -s test_comm_ops.py
  working_dir: "/vllm-workspace/tests/distributed"
  num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
  command: pytest -v -s engine

- label: Entrypoints Test
  command: pytest -v -s entrypoints

- label: Kernels Test
  command: pytest -v -s kernels
  soft_fail: true # a failure here will not fail the whole build

- label: Models Test
  commands:
  - pytest -v -s models --forked
  soft_fail: true

- label: Samplers Test
  command: pytest -v -s samplers --forked

- label: Worker Test
  command: pytest -v -s worker

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  # multiple commands are joined with " && " by the template
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh
54 changes: 54 additions & 0 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{# Jinja template that expands the steps listed in test-pipeline.yaml into a
   full Buildkite pipeline: one docker build step, then one kubernetes-backed
   test step per entry. #}
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    commands:
      - "docker build --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
  - wait

{% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
    plugins:
      - kubernetes:
          podSpec:
            {# /dev/shm backing for torch/nccl shared-memory use #}
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - "-c"
                  {# a step provides either `command` (string) or `commands` (list) #}
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                env:
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
{% endfor %}
36 changes: 24 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip
&& apt-get install -y python3-pip git

WORKDIR /workspace

Expand All @@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################


# image to build pytorch extensions
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
Expand All @@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
Expand All @@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base

Expand All @@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
Expand All @@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ Easy, fast, and cheap LLM serving for everyone

---

**The Second vLLM Bay Area Meetup (Jan 31st 5pm-7:30pm PT)**

We are thrilled to announce our second vLLM Meetup!
The vLLM team will share recent updates and roadmap.
We will also have vLLM collaborators from IBM coming up to the stage to discuss their insights on LLM optimizations.
Please register [here](https://lu.ma/ygxbpzhl) and join us!

---

*Latest News* 🔥
- [2023/12] Added ROCm support to vLLM.
- [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
Expand Down Expand Up @@ -68,6 +77,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)

Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
Expand Down
43 changes: 36 additions & 7 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,22 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))

import os
import sys
from sphinx.ext import autodoc
import logging

sys.path.insert(0, os.path.abspath(os.path.join('..', '..')))

logger = logging.getLogger(__name__)

# -- Project information -----------------------------------------------------

project = 'vLLM'
copyright = '2023, vLLM Team'
author = 'the vLLM Team'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
Expand All @@ -32,6 +35,8 @@
"sphinx.ext.viewcode",
"sphinx.ext.intersphinx",
"sphinx_copybutton",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
]

# Add any paths that contain templates here, relative to this directory.
Expand All @@ -55,7 +60,6 @@
html_theme = 'sphinx_book_theme'
html_logo = 'assets/logos/vllm-logo-text-light.png'
html_theme_options = {
'logo_only': True,
'path_to_docs': 'docs/source',
'repository_url': 'https://github.com/vllm-project/vllm',
'use_repository_button': True,
Expand All @@ -64,4 +68,29 @@
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# html_static_path = ['_static']

# Mock out external dependencies here so autodoc can import vllm modules
# without the heavyweight (or compiled) packages being installed.
autodoc_mock_imports = [
    "torch", "transformers", "psutil", "aioprometheus", "sentencepiece",
    "vllm.cuda_utils", "vllm._C"
]

# autodoc_mock_imports only takes effect for modules that have NOT been
# imported yet, so warn about any target already present in sys.modules.
for mock_target in autodoc_mock_imports:
    if mock_target in sys.modules:
        # Use lazy %-style logging args instead of an f-string so the
        # message is only formatted when the record is actually emitted.
        logger.info(
            "Potentially problematic mock target (%s) found; "
            "autodoc_mock_imports cannot mock modules that have already "
            "been loaded into sys.modules when the sphinx build starts.",
            mock_target)


class MockedClassDocumenter(autodoc.ClassDocumenter):
    """Remove note about base class when a class is derived from object."""

    def add_line(self, line: str, source: str, *lineno: int) -> None:
        # Drop exactly the "Bases: object" line autodoc emits for classes
        # that derive only from object — it adds noise to the rendered docs.
        if line == "   Bases: :py:class:`object`":
            return
        super().add_line(line, source, *lineno)


# Monkey-patch so sphinx uses the filtered documenter for every class.
autodoc.ClassDocumenter = MockedClassDocumenter
7 changes: 7 additions & 0 deletions docs/source/dev/engine/async_llm_engine.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

AsyncLLMEngine
=================================

.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
:members: generate, abort
:show-inheritance:
13 changes: 13 additions & 0 deletions docs/source/dev/engine/engine_index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
vLLM Engine
=================================

.. automodule:: vllm.engine
.. currentmodule:: vllm.engine

.. toctree::
:maxdepth: 2
:caption: Engines

llm_engine
async_llm_engine

6 changes: 6 additions & 0 deletions docs/source/dev/engine/llm_engine.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
LLMEngine
=================================

.. autoclass:: vllm.engine.llm_engine.LLMEngine
:members: add_request, abort_request, step, _init_cache
:show-inheritance:
Loading