36 commits
2872bcb
add GPTBigCode
michaelfeil Jun 22, 2023
b5aeb74
add suggested changes
michaelfeil Jun 22, 2023
947e1e5
reorder inputs
michaelfeil Jun 22, 2023
0fd6d9b
reorder imports
michaelfeil Jun 22, 2023
54a5939
reorder imports
michaelfeil Jun 22, 2023
e0400f4
reorder imports
michaelfeil Jun 22, 2023
f57f2b2
add colon
michaelfeil Jun 22, 2023
be23063
Remove a redundant space
zhuohan123 Jun 22, 2023
f3f5e7d
commit gptj
michaelfeil Jun 22, 2023
1d55fcc
add codegen
michaelfeil Jun 24, 2023
a1d4d9f
[Bugfix] Fix a bug in RequestOutput.finished (#202)
WoosukKwon Jun 22, 2023
df2f758
[Fix] Better error message when there is OOM during cache initializat…
zhuohan123 Jun 22, 2023
04e1a31
Bump up version to 0.1.1 (#204)
zhuohan123 Jun 22, 2023
b9456eb
[Docs] Add GPTBigCode to supported models (#213)
WoosukKwon Jun 22, 2023
4e19d44
GPTBigCode (StarCoder, SantaCoder Support) (#209)
michaelfeil Jun 22, 2023
af2c09e
fix wrong using getattr to get dict value (#232)
metacryptom Jun 25, 2023
993408e
Update README.md (#236)
merrymercy Jun 25, 2023
14e0136
Compatible with Decapoda Research llama hf version (#251)
BasicCoder Jun 26, 2023
4dac6b5
[Bug] Fix the OOM condition for CPU cache (#260)
zhuohan123 Jun 26, 2023
b04f5cb
[Doc] Documentation for distributed inference (#261)
zhuohan123 Jun 26, 2023
2419f77
[BugFix] Fix a bug in counting running sequences (#266)
WoosukKwon Jun 26, 2023
fbb7c29
[Fix] Fix default port number in benchmark scripts (#265)
zhuohan123 Jun 26, 2023
270931b
expand coverage of gpt2 model loading (#271)
twaka Jun 27, 2023
6b9fd47
Update setup.py (#282)
comaniac Jun 27, 2023
04e0c93
Add LLM.set_tokenizer (#283)
JRC1995 Jun 28, 2023
f8206d5
[Tokenizer] Add an option to specify tokenizer (#284)
WoosukKwon Jun 28, 2023
c0734b5
remove floats == 0 comparison (#285)
LiuXiaoxuanPKU Jun 28, 2023
5673e14
[Tokenizer] Add tokenizer mode (#298)
WoosukKwon Jun 28, 2023
3de69a9
Update README.md (#306)
bm777 Jun 29, 2023
6692256
Add news for the vllm+skypilot example (#314)
Michaelvll Jun 29, 2023
e0afe06
[Fix] Do not pin memory when in WSL (#312)
zhuohan123 Jun 29, 2023
4a88944
[Fix] Weight loading for GPTBigCode (#313)
zhuohan123 Jun 30, 2023
183792a
Raise error for long prompt (#273)
LiuXiaoxuanPKU Jul 1, 2023
fbb5b17
changes to codegen
michaelfeil Jul 1, 2023
90f4eb2
temporarily add helper functions
michaelfeil Jul 1, 2023
c11357a
update helper func
michaelfeil Jul 1, 2023
3 changes: 3 additions & 0 deletions .gitignore
@@ -170,3 +170,6 @@ cython_debug/

# Python pickle files
*.pkl

# Sphinx documentation
_build/
9 changes: 5 additions & 4 deletions README.md
@@ -17,8 +17,8 @@ Easy, fast, and cheap LLM serving for everyone
---

*Latest News* 🔥

- [2023/06] We officially released vLLM! vLLM has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid April. Check out our [blog post](https://vllm.ai).
- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

---

@@ -28,7 +28,7 @@ vLLM is fast with:

- State-of-the-art serving throughput
- Efficient management of attention key and value memory with **PagedAttention**
- Dynamic batching of incoming requests
- Continuous batching of incoming requests
- Optimized CUDA kernels

vLLM is flexible and easy to use with:
@@ -42,7 +42,8 @@ vLLM is flexible and easy to use with:
vLLM seamlessly supports many Huggingface models, including the following architectures:

- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPTNeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- LLaMA (`lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)

2 changes: 2 additions & 0 deletions benchmarks/benchmark_latency.py
@@ -17,6 +17,7 @@ def main(args: argparse.Namespace):
# the engine will automatically process the request in multiple batches.
llm = LLM(
model=args.model,
tokenizer=args.tokenizer,
tensor_parallel_size=args.tensor_parallel_size,
max_num_seqs=args.batch_size,
max_num_batched_tokens=args.batch_size * args.input_len,
@@ -63,6 +64,7 @@ def run_to_completion(profile: bool = False):
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
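With the new flag, the latency benchmark can use a tokenizer that differs from the model checkpoint; when it is omitted, the LLM constructor is expected to fall back to the model's own tokenizer. An illustrative invocation (model name and lengths are example values only):

python benchmarks/benchmark_latency.py \
    --model bigcode/gpt_bigcode-santacoder \
    --tokenizer bigcode/gpt_bigcode-santacoder \
    --input-len 32 --output-len 128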
13 changes: 3 additions & 10 deletions benchmarks/benchmark_serving.py
@@ -24,20 +24,13 @@

import aiohttp
import numpy as np
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer

# (prompt len, output len, latency)
REQUEST_LATENCY: List[Tuple[int, int, float]] = []


def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
config = AutoConfig.from_pretrained(model_name)
if config.model_type == "llama":
# A workaround for potential protobuf errors.
model_name = "hf-internal-testing/llama-tokenizer"
return AutoTokenizer.from_pretrained(model_name)


def sample_requests(
dataset_path: str,
num_requests: int,
@@ -217,7 +210,7 @@ def main(args: argparse.Namespace):
parser.add_argument("--backend", type=str, default="vllm",
choices=["vllm", "tgi"])
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8001)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--dataset", type=str, required=True,
help="Path to the dataset.")
parser.add_argument("--tokenizer", type=str, required=True,
35 changes: 15 additions & 20 deletions benchmarks/benchmark_throughput.py
@@ -6,23 +6,11 @@
from typing import List, Tuple

import torch
from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM,
PreTrainedTokenizerBase)
from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
from tqdm import tqdm

from vllm import LLM, SamplingParams


def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
config = AutoConfig.from_pretrained(model_name)
if config.model_type == "llama":
# A workaround for potential protobuf errors.
model_name = "hf-internal-testing/llama-tokenizer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
return tokenizer
return AutoTokenizer.from_pretrained(model_name)
from vllm.transformers_utils.tokenizer import get_tokenizer


def sample_requests(
@@ -74,13 +62,15 @@ def sample_requests(
def run_vllm(
requests: List[Tuple[str, int, int]],
model: str,
tokenizer: str,
tensor_parallel_size: int,
seed: int,
n: int,
use_beam_search: bool,
) -> float:
llm = LLM(
model=model,
tokenizer=tokenizer,
tensor_parallel_size=tensor_parallel_size,
seed=seed,
)
@@ -118,9 +108,10 @@ def run_hf(
max_batch_size: int,
) -> float:
assert not use_beam_search
tokenizer = get_tokenizer(model)
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16)
llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
if llm.config.model_type == "llama":
# To enable padding in the HF backend.
tokenizer.pad_token = tokenizer.eos_token
llm = llm.cuda()

pbar = tqdm(total=len(requests))
@@ -170,13 +161,13 @@ def main(args: argparse.Namespace):
random.seed(args.seed)

# Sample the requests.
tokenizer = get_tokenizer(args.model)
tokenizer = get_tokenizer(args.tokenizer)
requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

if args.backend == "vllm":
elapsed_time = run_vllm(
requests, args.model, args.tensor_parallel_size, args.seed, args.n,
args.use_beam_search)
requests, args.model, args.tokenizer, args.tensor_parallel_size,
args.seed, args.n, args.use_beam_search)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -198,6 +189,7 @@ def main(args: argparse.Namespace):
parser.add_argument("--dataset", type=str, required=True,
help="Path to the dataset.")
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n", type=int, default=1,
help="Number of generated sequences per prompt.")
Expand All @@ -208,11 +200,14 @@ def main(args: argparse.Namespace):
parser.add_argument("--hf-max-batch-size", type=int, default=None,
help="Maximum batch size for HF backend.")
args = parser.parse_args()

if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.tokenizer is None:
args.tokenizer = args.model

main(args)
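Both benchmark scripts now import get_tokenizer from vllm.transformers_utils.tokenizer instead of keeping their own copies. As a rough sketch only, the shared helper presumably consolidates the logic removed above (the LLaMA protobuf workaround plus the slow/fast tokenizer mode added in this branch); the names and details below are assumptions, not the actual vLLM implementation:

# Hypothetical sketch of the shared tokenizer helper; the real code lives in
# vllm/transformers_utils/tokenizer.py and may differ in detail.
from typing import Union

from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)


def get_tokenizer(
    tokenizer_name: str,
    tokenizer_mode: str = "auto",
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
    if tokenizer_mode not in ("auto", "slow"):
        raise ValueError(f"Unknown tokenizer mode: {tokenizer_mode}")
    config = AutoConfig.from_pretrained(tokenizer_name)
    if config.model_type == "llama":
        # A workaround for potential protobuf errors, as in the helpers
        # removed from the benchmark scripts above.
        tokenizer_name = "hf-internal-testing/llama-tokenizer"
    # "auto" prefers the fast (Rust) tokenizer when available; "slow" forces
    # the pure-Python tokenizer.
    return AutoTokenizer.from_pretrained(
        tokenizer_name, use_fast=(tokenizer_mode == "auto"))

In the throughput benchmark this is called as get_tokenizer(args.tokenizer), with args.tokenizer falling back to args.model when the new --tokenizer flag is not given.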
2 changes: 1 addition & 1 deletion benchmarks/launch_tgi_server.sh
@@ -1,6 +1,6 @@
#!/bin/bash

PORT=8001
PORT=8000
MODEL=$1
TOKENS=$2

4 changes: 2 additions & 2 deletions docs/README.md
@@ -4,14 +4,14 @@

```bash
# Install dependencies.
pip -r requirements-docs.txt
pip install -r requirements-docs.txt

# Build the docs.
make clean
make html
```

## Open the docs with your brower
## Open the docs with your browser

```bash
python -m http.server -d build/html/
14 changes: 12 additions & 2 deletions docs/source/index.rst
@@ -29,7 +29,7 @@ vLLM is fast with:

* State-of-the-art serving throughput
* Efficient management of attention key and value memory with **PagedAttention**
* Dynamic batching of incoming requests
* Continuous batching of incoming requests
* Optimized CUDA kernels

vLLM is flexible and easy to use with:
@@ -40,7 +40,11 @@ vLLM is flexible and easy to use with:
* Streaming outputs
* OpenAI-compatible API server

For more information, please refer to our `blog post <https://vllm.ai>`_.
For more information, check out the following:

* `vLLM announcing blog post <https://vllm.ai>`_ (intro to PagedAttention)
* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency <https://www.anyscale.com/blog/continuous-batching-llm-inference>`_ by Cade Daniel et al.



Documentation
@@ -53,6 +57,12 @@ Documentation
getting_started/installation
getting_started/quickstart

.. toctree::
:maxdepth: 1
:caption: Serving

serving/distributed_serving

.. toctree::
:maxdepth: 1
:caption: Models
3 changes: 3 additions & 0 deletions docs/source/models/supported_models.rst
@@ -17,6 +17,9 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`GPT2LMHeadModel`
- GPT-2
- :code:`gpt2`, :code:`gpt2-xl`, etc.
* - :code:`GPTBigCodeForCausalLM`
- StarCoder, SantaCoder, WizardCoder
- :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
* - :code:`GPTNeoXForCausalLM`
- GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
- :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
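With the GPTBigCodeForCausalLM entry above, StarCoder-family checkpoints load through the standard LLM entry point. A small usage sketch; the prompt and sampling values are illustrative only:

from vllm import LLM, SamplingParams

# SantaCoder is the smaller GPT BigCode checkpoint; bigcode/starcoder works
# the same way but needs substantially more GPU memory.
llm = LLM(model="bigcode/gpt_bigcode-santacoder")

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
outputs = llm.generate(["def fibonacci(n):"], sampling_params)
print(outputs[0].outputs[0].text)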
38 changes: 38 additions & 0 deletions docs/source/serving/distributed_serving.rst
@@ -0,0 +1,38 @@
.. _distributed_serving:

Distributed Inference and Serving
=================================

vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with `Ray <https://github.com/ray-project/ray>`_. To run distributed inference, install Ray with:

.. code-block:: console

$ pip install ray

To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:

.. code-block:: python

from vllm import LLM
llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
output = llm.generate("San Francisco is a")

To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:

.. code-block:: console

$ python -m vllm.entrypoints.api_server \
$ --model facebook/opt-13b \
$ --tensor-parallel-size 4

To scale vLLM beyond a single machine, start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:

.. code-block:: console

$ # On head node
$ ray start --head

$ # On worker nodes
$ ray start --address=<ray-head-address>

After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node and setting :code:`tensor_parallel_size` to the total number of GPUs across all machines.
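To make the multi-node case concrete, here is an illustrative launch assuming two machines with 4 GPUs each (8 GPUs in total) and the Ray cluster from the previous step already running; the model is only an example:

.. code-block:: console

$ # On the head node, with all worker nodes attached to the Ray cluster:
$ python -m vllm.entrypoints.api_server \
$     --model facebook/opt-13b \
$     --tensor-parallel-size 8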
2 changes: 1 addition & 1 deletion examples/llm_engine_example.py
@@ -30,7 +30,7 @@ def main(args: argparse.Namespace):

request_outputs = engine.step()
for request_output in request_outputs:
if request_output.finished():
if request_output.finished:
print(request_output)

if not (engine.has_unfinished_requests() or test_prompts):
5 changes: 2 additions & 3 deletions setup.py
@@ -20,10 +20,9 @@
CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]

if not torch.cuda.is_available():
if CUDA_HOME is None:
raise RuntimeError(
f"Cannot find CUDA at CUDA_HOME: {CUDA_HOME}. "
"CUDA must be available in order to build the package.")
f"Cannot find CUDA_HOME. CUDA must be available in order to build the package.")


def get_nvcc_cuda_version(cuda_dir: str) -> Version:
2 changes: 1 addition & 1 deletion vllm/__init__.py
@@ -6,7 +6,7 @@
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams

__version__ = "0.1.0"
__version__ = "0.1.1"

__all__ = [
"LLM",
20 changes: 20 additions & 0 deletions vllm/config.py
@@ -16,6 +16,9 @@ class ModelConfig:

Args:
model: Name or path of the huggingface model to use.
tokenizer: Name or path of the huggingface tokenizer to use.
tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
available, and "slow" will always use the slow tokenizer.
download_dir: Directory to download and load the weights, default to the
default cache directory of huggingface.
use_np_weights: Save a numpy copy of model weights for faster loading.
@@ -30,20 +33,33 @@ def __init__(
def __init__(
self,
model: str,
tokenizer: str,
tokenizer_mode: str,
download_dir: Optional[str],
use_np_weights: bool,
use_dummy_weights: bool,
dtype: str,
seed: int,
) -> None:
self.model = model
self.tokenizer = tokenizer
self.tokenizer_mode = tokenizer_mode
self.download_dir = download_dir
self.use_np_weights = use_np_weights
self.use_dummy_weights = use_dummy_weights
self.seed = seed

self.hf_config: PretrainedConfig = AutoConfig.from_pretrained(model)
self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
self._verify_tokenizer_mode()

def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
if tokenizer_mode not in ["auto", "slow"]:
raise ValueError(
f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
"either 'auto' or 'slow'.")
self.tokenizer_mode = tokenizer_mode

def verify_with_parallel_config(
self,
@@ -170,14 +186,18 @@ class SchedulerConfig:
a single iteration.
max_num_seqs: Maximum number of sequences to be processed in a single
iteration.
max_seq_len: Maximum length of a sequence (including prompt
and generated text).
"""
def __init__(
self,
max_num_batched_tokens: int,
max_num_seqs: int,
max_seq_len: int
) -> None:
self.max_num_batched_tokens = max_num_batched_tokens
self.max_num_seqs = max_num_seqs
self.max_seq_len = max_seq_len


_STR_DTYPE_TO_TORCH_DTYPE = {
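Taken together, ModelConfig now carries the tokenizer settings and SchedulerConfig gains max_seq_len, which underpins the "Raise error for long prompt (#273)" change in this branch. A hedged sketch of constructing these configs directly, with illustrative values; in normal use the engine assembles them from its command-line and engine arguments:

from vllm.config import ModelConfig, SchedulerConfig

# Illustrative values only; normally the engine builds these for you.
model_config = ModelConfig(
    model="bigcode/gpt_bigcode-santacoder",
    tokenizer="bigcode/gpt_bigcode-santacoder",
    tokenizer_mode="auto",   # "auto": fast tokenizer if available; "slow": always slow
    download_dir=None,
    use_np_weights=False,
    use_dummy_weights=False,
    dtype="half",
    seed=0,
)

scheduler_config = SchedulerConfig(
    max_num_batched_tokens=2560,
    max_num_seqs=256,
    max_seq_len=2048,        # used to reject prompts that are too long (see #273)
)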