Merged

74 commits
9885a22
add initial pipelien setup
simon-mo Jan 5, 2024
4c72ed6
fix interpolation
simon-mo Jan 5, 2024
aa94936
skip interpolate
simon-mo Jan 5, 2024
78715c5
add docker build target
simon-mo Jan 5, 2024
8caed4c
Add K8s workers
simon-mo Jan 8, 2024
c758a6e
Add dependencies
simon-mo Jan 8, 2024
9999af4
Use wait instead of depends
simon-mo Jan 8, 2024
07d9770
Merge branch 'main' of github.com:vllm-project/vllm into ci-buildkite
simon-mo Jan 8, 2024
3dec83c
set workingDir
simon-mo Jan 9, 2024
c2ab470
try bash inteasd
simon-mo Jan 9, 2024
ccf8ef4
move the test dir location
simon-mo Jan 9, 2024
9035ad8
re-install in new workspace
simon-mo Jan 9, 2024
457e056
fix
simon-mo Jan 9, 2024
748cfe5
try pip install instead?
simon-mo Jan 9, 2024
702d299
copy everything
simon-mo Jan 9, 2024
63c885f
copy v2
simon-mo Jan 9, 2024
e9aa19a
fix workspace
simon-mo Jan 9, 2024
f81af48
fix workspace
simon-mo Jan 9, 2024
7375cb0
fix workspace
simon-mo Jan 9, 2024
dd45ab3
fix workspace
simon-mo Jan 9, 2024
80f2741
fix workspace
simon-mo Jan 9, 2024
06d21ca
fix workspace
simon-mo Jan 9, 2024
3bcfd93
fix workspace
simon-mo Jan 9, 2024
8d11184
fix workspace
simon-mo Jan 9, 2024
59052e0
fix workspace
simon-mo Jan 9, 2024
8f92995
fix workspace
simon-mo Jan 9, 2024
6a5ffd0
fix workspace
simon-mo Jan 9, 2024
0821757
fix workspace
simon-mo Jan 9, 2024
bc8edfa
fix workspace
simon-mo Jan 9, 2024
f2fff98
fix workspace
simon-mo Jan 9, 2024
5497b5b
fix workspace
simon-mo Jan 9, 2024
b5d6e6b
fix workspace
simon-mo Jan 9, 2024
87ae7ec
install vllm as a module instead of raw source
simon-mo Jan 9, 2024
7d41bbc
debug pip install
simon-mo Jan 9, 2024
a0663ca
trigger
simon-mo Jan 9, 2024
a84ae05
fix workspace
simon-mo Jan 9, 2024
7549435
fix workspace
simon-mo Jan 9, 2024
02e4795
fix workspace
simon-mo Jan 9, 2024
211ebe2
fix workspace
simon-mo Jan 9, 2024
2e7b535
fix workspace
simon-mo Jan 9, 2024
6f108f6
working install
simon-mo Jan 9, 2024
db56161
fix workspace
simon-mo Jan 9, 2024
427ba10
fix workspace
simon-mo Jan 9, 2024
921de99
fix workspace
simon-mo Jan 9, 2024
99617a3
fix workspace
simon-mo Jan 9, 2024
12129c6
fix workspace
simon-mo Jan 9, 2024
0b88055
fix workspace
simon-mo Jan 9, 2024
8a578ee
fix workspace
simon-mo Jan 9, 2024
de7ff02
fix workspace
simon-mo Jan 9, 2024
f97588c
fix workspace
simon-mo Jan 9, 2024
0ba20b4
add benchmarks
simon-mo Jan 9, 2024
f3b266b
fix some tests
simon-mo Jan 10, 2024
cb448df
fix few more tests!
simon-mo Jan 10, 2024
60511d4
more fix
simon-mo Jan 10, 2024
1bfb763
fix workspace
simon-mo Jan 10, 2024
7f9ec5c
fix workspace
simon-mo Jan 10, 2024
01946b5
fix workspace
simon-mo Jan 10, 2024
3f97d88
fix workspace
simon-mo Jan 10, 2024
7c76677
try fix tests
simon-mo Jan 11, 2024
d2a5b51
another round of fixes
simon-mo Jan 11, 2024
a697e9b
another round
simon-mo Jan 11, 2024
b445350
add git
simon-mo Jan 11, 2024
2ed9005
Merge branch 'main' of github.com:vllm-project/vllm into ci-buildkite
simon-mo Jan 12, 2024
a984d6b
change test cache config
simon-mo Jan 12, 2024
58d9ca7
use fp16 for tests
simon-mo Jan 12, 2024
c5c9bb1
more fixes
simon-mo Jan 12, 2024
e650571
soft_fail kernels and models tests due to partial failure
simon-mo Jan 12, 2024
8d055f5
lint
simon-mo Jan 12, 2024
4b5636d
small nits
simon-mo Jan 12, 2024
2979bb0
Apply suggestions from code review
simon-mo Jan 12, 2024
7193779
Merge branch 'main' into ci-buildkite
simon-mo Jan 12, 2024
9856edd
fix lint
simon-mo Jan 12, 2024
ac31f21
address comment
simon-mo Jan 14, 2024
14fe8a9
just keep waiting
simon-mo Jan 14, 2024
24 changes: 24 additions & 0 deletions .buildkite/run-benchmarks.sh
@@ -0,0 +1,24 @@
# This script is run by Buildkite to execute the benchmarks and upload the results to Buildkite

set -ex

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

# run benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md
echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
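The markdown summary above keeps only the first and last non-trivial line of each benchmark log, mirroring the `sed -n '1p'` / `sed -n '$p'` pair. A minimal Python sketch of the same idea (not part of the PR; it additionally skips blank lines, which plain `sed` would not):

```python
def summarize_log(text: str) -> str:
    # Keep only the first and last non-blank lines of a benchmark log,
    # separated by an empty line, as the sed calls above do.
    lines = [line for line in text.splitlines() if line.strip()]
    if not lines:
        return ""
    # First line typically echoes the benchmark config, last line the result.
    return "\n".join([lines[0], "", lines[-1]])

log = "Namespace(model='opt-125m')\nWarming up...\nAvg latency: 1.23 s"
print(summarize_log(log))
```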
41 changes: 41 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -0,0 +1,41 @@
# In this file, you can add more tests by adding a new step or by adding a new
# command to an existing step; see the steps below for examples. This file is
# fed into the Jinja template in `test-template.j2` to generate the final
# pipeline YAML file.

steps:
- label: Regression Test
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
command: pytest -v -s async_engine

- label: Distributed Test
command: pytest -v -s test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
command: pytest -v -s engine

- label: Kernels Test
command: pytest -v -s kernels
soft_fail: true

- label: Models Test
commands:
- pytest -v -s models --forked
soft_fail: true

- label: Samplers Test
command: pytest -v -s samplers --forked

- label: Worker Test
command: pytest -v -s worker

- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
- pip install aiohttp
- bash run-benchmarks.sh
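Each step may declare either a single `command` or a list of `commands`; the template below flattens these into one shell string via `step.command or (step.commands | join(' && '))`. A hedged sketch of that normalization (illustrative names, not the real pipeline code):

```python
def step_to_shell(step: dict,
                  default_working_dir: str = "/vllm-workspace/tests") -> str:
    # A step carries either `command` (string) or `commands` (list);
    # join the list with `&&` so one failure aborts the whole step.
    cmd = step.get("command") or " && ".join(step.get("commands", []))
    cwd = step.get("working_dir", default_working_dir)
    return f"cd {cwd} && {cmd}"

print(step_to_shell({"label": "Benchmarks",
                     "working_dir": "/vllm-workspace/.buildkite",
                     "commands": ["pip install aiohttp",
                                  "bash run-benchmarks.sh"]}))
# cd /vllm-workspace/.buildkite && pip install aiohttp && bash run-benchmarks.sh
```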
46 changes: 46 additions & 0 deletions .buildkite/test-template.j2
@@ -0,0 +1,46 @@
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
- wait

{% for step in steps %}
- label: "{{ step.label }}"
agents:
queue: kubernetes
soft_fail: {{ step.soft_fail or false }}
plugins:
- kubernetes:
podSpec:
volumes:
- name: dshm
emptyDir:
medium: Memory
containers:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
env:
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
volumeMounts:
- mountPath: /dev/shm
name: dshm
{% endfor %}
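The template pins the GPU `requests` and `limits` to the same value (`step.num_gpus or default_num_gpu`), so the Kubernetes scheduler reserves exactly that many GPUs for the pod. A small sketch of that fallback logic, with an assumed dict shape for a step:

```python
def gpu_resources(step: dict, default_num_gpu: int = 1) -> dict:
    # Mirrors `step.num_gpus or default_num_gpu` in the Jinja template:
    # requests and limits are identical so the reservation is exact.
    n = step.get("num_gpus") or default_num_gpu
    return {"requests": {"nvidia.com/gpu": str(n)},
            "limits": {"nvidia.com/gpu": str(n)}}

print(gpu_resources({"label": "Distributed Test", "num_gpus": 2}))
# {'requests': {'nvidia.com/gpu': '2'}, 'limits': {'nvidia.com/gpu': '2'}}
```

The `dshm` volume mounted at `/dev/shm` serves a similar resource purpose: PyTorch data loaders need more shared memory than the container default provides.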
36 changes: 24 additions & 12 deletions Dockerfile
@@ -1,7 +1,11 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.

#################### BASE BUILD IMAGE ####################
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

RUN apt-get update -y \
&& apt-get install -y python3-pip
&& apt-get install -y python3-pip git

WORKDIR /workspace

@@ -14,8 +18,10 @@ RUN --mount=type=cache,target=/root/.cache/pip \
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
#################### BASE BUILD IMAGE ####################


# image to build pytorch extensions
#################### EXTENSION BUILD IMAGE ####################
FROM dev AS build

# install build dependencies
@@ -30,6 +36,7 @@ COPY requirements.txt requirements.txt
COPY pyproject.toml pyproject.toml
COPY vllm/__init__.py vllm/__init__.py

# cuda arch list used by torch
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
@@ -40,18 +47,26 @@ ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads

RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY tests tests
COPY vllm vllm
WORKDIR /vllm-workspace
# ADD is used to preserve directory structure
ADD . /vllm-workspace/
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# skip installing build dependencies because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################

ENTRYPOINT ["python3", "-m", "pytest", "tests"]

#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base

@@ -63,14 +78,10 @@ WORKDIR /workspace
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements.txt
#################### RUNTIME BASE IMAGE ####################

FROM vllm-base AS vllm
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

EXPOSE 8000
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
# install additional dependencies for openai api server
Expand All @@ -81,3 +92,4 @@ COPY --from=build /workspace/vllm/*.so /workspace/vllm/
COPY vllm vllm

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
4 changes: 3 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -13,4 +13,6 @@ types-setuptools
pytest
pytest-forked
pytest-asyncio

httpx
einops # required for MPT
flash_attn # required for HuggingFace's llama implementation
7 changes: 6 additions & 1 deletion setup.py
@@ -293,6 +293,11 @@ def get_requirements() -> List[str]:
return requirements


package_data = {"vllm": ["py.typed"]}
if os.environ.get("VLLM_USE_PRECOMPILED"):
ext_modules = []
package_data["vllm"].append("*.so")

setuptools.setup(
name="vllm",
version=get_vllm_version(),
@@ -321,5 +326,5 @@ def get_requirements() -> List[str]:
install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
package_data={"vllm": ["py.typed"]},
package_data=package_data,
)
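The `setup.py` change above lets the test image reuse the `.so` extensions built in an earlier Docker stage: when `VLLM_USE_PRECOMPILED` is set, no CUDA extensions are compiled and the copied `.so` files are packaged instead. A self-contained sketch of that switch (the helper function is illustrative, not in the PR):

```python
import os

def resolve_build_config(ext_modules: list) -> tuple:
    # When VLLM_USE_PRECOMPILED is set, skip compiling extensions and
    # package the pre-built .so files that were copied into the source tree.
    package_data = {"vllm": ["py.typed"]}
    if os.environ.get("VLLM_USE_PRECOMPILED"):
        ext_modules = []
        package_data["vllm"].append("*.so")
    return ext_modules, package_data

os.environ["VLLM_USE_PRECOMPILED"] = "1"
print(resolve_build_config(["dummy_ext"]))
# ([], {'vllm': ['py.typed', '*.so']})
```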
12 changes: 10 additions & 2 deletions tests/async_engine/test_api_server.py
@@ -29,8 +29,13 @@ def api_server():
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
uvicorn_process = subprocess.Popen([
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m"
sys.executable,
"-u",
str(script_path),
"--model",
"facebook/opt-125m",
"--host",
"127.0.0.1",
])
yield
uvicorn_process.terminate()
@@ -81,6 +86,9 @@ def test_api_server(api_server):
pool.join()

# check cancellation stats
# give it some time to update the stats
time.sleep(1)

num_aborted_requests = requests.get(
"http://localhost:8000/stats").json()["num_aborted_requests"]
assert num_aborted_requests > 0
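The fixed `time.sleep(1)` above can still race on a slow machine. A hedged alternative, not in the PR, is to poll until the condition holds or a deadline passes:

```python
import time

def wait_for(predicate, timeout: float = 5.0, interval: float = 0.1) -> bool:
    # Poll `predicate` until it returns True or `timeout` seconds elapse.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False

# Illustrative use: stand-in for polling the /stats endpoint.
counter = {"n": 0}
def stats_updated():
    counter["n"] += 1
    return counter["n"] >= 3

assert wait_for(stats_updated)
```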
15 changes: 9 additions & 6 deletions tests/async_engine/test_openai_server.py
@@ -1,28 +1,32 @@
from argparse import Namespace
from dataclasses import dataclass
import os
import pathlib

import pytest
from fastapi.testclient import TestClient

from vllm.entrypoints.openai.api_server import *

chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
"""<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
<|im_start|>user
What is the capital of<|im_end|>
<|im_start|>assistant
"""),
("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
"""<|im_start|>user
("facebook/opt-125m", chatml_jinja_path, False, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
Hi there!<|im_end|>
@@ -54,8 +58,7 @@ class MockTokenizer:

def test_load_chat_template():
# Testing chatml template
template = "../../examples/template_chatml.jinja"
mock_args = Namespace(chat_template=template)
mock_args = Namespace(chat_template=chatml_jinja_path)
tokenizer = MockTokenizer()

# Call the function with the mocked args
26 changes: 14 additions & 12 deletions tests/distributed/test_comm_ops.py
@@ -2,10 +2,9 @@

Run `pytest tests/distributed/test_comm_ops.py --forked`.
"""
from multiprocessing import Process, set_start_method

import pytest
import torch
import ray
Member: Just curious, why change to ray?

Collaborator (Author): In the code below, I added a comment explaining it: Ray gives much better logs for debugging; I couldn't figure out the failures from multiprocessing.

Member: Gotcha! Makes a lot of sense.


from vllm.config import ParallelConfig
from vllm.utils import get_open_port
@@ -23,11 +22,11 @@ def init_test_distributed_environment(pipeline_parallel_size: int,
tensor_parallel_size,
worker_use_ray=True)
distributed_init_method = f"tcp://localhost:{distributed_init_port}"
torch.cuda.set_device(rank)
_init_distributed_environment(parallel_config, rank,
distributed_init_method)


@ray.remote(num_gpus=1, max_calls=1)
def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -43,6 +42,7 @@ def all_reduce_test_worker(tensor_parallel_size: int, rank: int,
assert torch.allclose(t, expected)


@ray.remote(num_gpus=1, max_calls=1)
def all_gather_test_worker(tensor_parallel_size: int, rank: int,
distributed_init_port: str):
init_test_distributed_environment(1, tensor_parallel_size, rank,
@@ -70,14 +70,16 @@ def all_gather_test_worker(tensor_parallel_size: int, rank: int,
@pytest.mark.parametrize("test_target",
[all_reduce_test_worker, all_gather_test_worker])
def test_multi_process_tensor_parallel(tensor_parallel_size, test_target):
set_start_method("spawn", force=True)
    # Ray is used here because it surfaces worker failures with much better
    # logs than multiprocessing, making errors easier to debug.
ray.init()

distributed_init_port = get_open_port()
processes = []
refs = []
for rank in range(tensor_parallel_size):
p = Process(target=test_target,
args=(tensor_parallel_size, rank, distributed_init_port))
p.start()
processes.append(p)
for p in processes:
p.join()
assert all(p.exitcode == 0 for p in processes)
refs.append(
test_target.remote(tensor_parallel_size, rank,
distributed_init_port))
ray.get(refs)

ray.shutdown()
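The diff replaces `multiprocessing.Process` with Ray's submit-then-gather pattern: collect handles via `refs.append(test_target.remote(...))`, then block on `ray.get(refs)`. A stdlib-only sketch of the same fan-out/gather shape, using threads and a hypothetical worker in place of the per-rank GPU test workers:

```python
from concurrent.futures import ThreadPoolExecutor

def worker(rank: int) -> int:
    # Stand-in for the per-rank distributed test body.
    return rank * rank

with ThreadPoolExecutor() as pool:
    # Submit all ranks up front (analogous to `test_target.remote(...)`),
    # then gather every result (analogous to `ray.get(refs)`).
    futures = [pool.submit(worker, rank) for rank in range(4)]
    results = [f.result() for f in futures]

print(results)  # [0, 1, 4, 9]
```

Unlike the `Process` version, which only exposed an exit code, gathering results this way re-raises any worker exception in the driver, which is the debuggability win the author describes.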
2 changes: 1 addition & 1 deletion tests/kernels/test_attention.py
@@ -13,7 +13,7 @@
# This will change depending on the compute capability.
# - 512 as a buffer
MAX_SEQ_LEN = get_max_shared_memory_bytes() // FLOAT32_BYTES - 512
NUM_BLOCKS = 40000 # Arbitrary values for testing
NUM_BLOCKS = 12000 # Arbitrary values for testing
PARTITION_SIZE = 512

DTYPES = [torch.half, torch.bfloat16, torch.float]
4 changes: 2 additions & 2 deletions tests/kernels/test_cache.py
@@ -6,12 +6,12 @@
from vllm._C import cache_ops

DTYPES = [torch.half, torch.bfloat16, torch.float]
NUM_TOKENS = [83] # Arbitrary values for testing
NUM_TOKENS = [42] # Arbitrary values for testing
NUM_LAYERS = [1] # Arbitrary values for testing
NUM_HEADS = [8] # Arbitrary values for testing
HEAD_SIZES = [64, 80, 96, 112, 128, 256]
BLOCK_SIZES = [8, 16, 32]
NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing
NUM_BLOCKS = [1024, 3600] # Arbitrary values for testing
NUM_MAPPINGS = [256] # Arbitrary values for testing
SEEDS = [0]
DEVICES = [i for i in range(1 if torch.cuda.device_count() == 1 else 2)]
1 change: 1 addition & 0 deletions tests/samplers/test_logprobs.py
@@ -30,6 +30,7 @@ def test_get_prompt_logprobs(
temperature=0.0)
vllm_results = vllm_model.model.generate(
example_prompts, sampling_params=vllm_sampling_params)
del vllm_model

# Test whether logprobs are included in the results.
for result in vllm_results: