Closed
38 commits
92f98a8
pushing changes to my fork
KrishnaM251 Feb 29, 2024
b59ed0c
implemented and some testing
KrishnaM251 Mar 4, 2024
b646393
Merge branch 'vllm-project:main' into max-queue-time
KrishnaM251 Mar 4, 2024
18fdf16
fixed the format of my edits
KrishnaM251 Mar 4, 2024
2442f41
attempting to sync with main repo
KrishnaM251 Mar 14, 2024
3e8f938
rebased to upstream repo?
KrishnaM251 Mar 14, 2024
45b4cb2
done with max_queue_length
KrishnaM251 Apr 15, 2024
dd75ff0
finished max-queue-len implementation
KrishnaM251 Apr 16, 2024
8c94850
attempt to resolve max-queue-len issues
KrishnaM251 Apr 18, 2024
a814e0d
should be up-to-date with upstream
KrishnaM251 Apr 18, 2024
7a8a231
adding tests
KrishnaM251 Apr 19, 2024
3585833
Merge branch 'vllm-project:main' into max-queue-len
KrishnaM251 Apr 19, 2024
28e758a
opening PR
KrishnaM251 Apr 19, 2024
375d7f4
sync for pr
KrishnaM251 Apr 19, 2024
d8d7fe3
removing unncecessary function
KrishnaM251 Apr 19, 2024
631420b
actual change
KrishnaM251 Apr 19, 2024
b669d4b
addressed simon's notes
KrishnaM251 Apr 19, 2024
2c931eb
fixing rebasing conflicts
KrishnaM251 Jun 25, 2024
e5dc13a
implemented and some testing
KrishnaM251 Mar 4, 2024
939f597
fixed the format of my edits
KrishnaM251 Mar 4, 2024
074dfa8
half to bfloat16
KrishnaM251 Jun 25, 2024
2f3e49c
fixed double change bfloat16
KrishnaM251 Jun 25, 2024
5eca9d9
finished max-queue-len implementation
KrishnaM251 Apr 16, 2024
c56b89c
queueoverflow + imports
KrishnaM251 Jun 25, 2024
f83d79e
opening PR
KrishnaM251 Apr 19, 2024
302aadb
removing unncecessary function
KrishnaM251 Apr 19, 2024
9833cc7
actual change
KrishnaM251 Apr 19, 2024
054b578
more import error handling
KrishnaM251 Jun 25, 2024
2f372c5
enhanced error catching + added test_openai_server testcase
KrishnaM251 Jun 26, 2024
beea8f7
resolve remote branch conflicts
KrishnaM251 Jun 26, 2024
319a7f3
Merge branch 'vllm-project:main' into max-queue-len
KrishnaM251 Jun 26, 2024
fee6fcd
ran format.sh
KrishnaM251 Jun 26, 2024
74a3d39
resolve conflicts
KrishnaM251 Jun 26, 2024
7e4d793
removing max pad
KrishnaM251 Jun 26, 2024
60c0c01
correct teest behaviour
KrishnaM251 Jun 27, 2024
7e1ac35
tested, formatted
KrishnaM251 Jun 27, 2024
0981444
Merge branch 'vllm-project:main' into max-queue-len
KrishnaM251 Jun 27, 2024
7314f68
Merge branch 'max-queue-len' of github.com:KrishnaM251/vllm-fork into…
KrishnaM251 Jun 27, 2024
112 changes: 107 additions & 5 deletions docs/source/models/engine_args.rst
Expand Up @@ -16,8 +16,110 @@ Async Engine Arguments

Below are the additional arguments related to the asynchronous engine:

.. argparse::
:module: vllm.engine.arg_utils
:func: _async_engine_args_parser
:prog: -m vllm.entrypoints.openai.api_server
:nodefaultconst:
Name or path of the huggingface tokenizer to use.

.. option:: --revision <revision>

The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

.. option:: --tokenizer-revision <revision>

The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.

.. option:: --tokenizer-mode {auto,slow}

The tokenizer mode.

* "auto" will use the fast tokenizer if available.
* "slow" will always use the slow tokenizer.

.. option:: --trust-remote-code

Trust remote code from Hugging Face.

.. option:: --download-dir <directory>

Directory to download and load the weights. Defaults to the Hugging Face cache directory.

.. option:: --load-format {auto,pt,safetensors,npcache,dummy}

The format of the model weights to load.

* "auto" will try to load the weights in the safetensors format and fall back to the PyTorch bin format if safetensors is not available.
* "pt" will load the weights in the PyTorch bin format.
* "safetensors" will load the weights in the safetensors format.
* "npcache" will load the weights in PyTorch format and store a numpy cache to speed up loading.
* "dummy" will initialize the weights with random values, mainly for profiling.

.. option:: --dtype {auto,half,float16,bfloat16,float,float32}

Data type for model weights and activations.

* "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
* "half" for FP16. Recommended for AWQ quantization.
* "float16" is the same as "half".
* "bfloat16" for a balance between precision and range.
* "float" is shorthand for FP32 precision.
* "float32" for FP32 precision.

.. option:: --max-model-len <length>

Model context length. If unspecified, will be automatically derived from the model config.

.. option:: --worker-use-ray

Use Ray for distributed serving. This is set automatically when using more than one GPU.

.. option:: --pipeline-parallel-size (-pp) <size>

Number of pipeline stages.

.. option:: --tensor-parallel-size (-tp) <size>

Number of tensor parallel replicas.

.. option:: --max-parallel-loading-workers <workers>

Load the model sequentially in multiple batches, to avoid RAM OOM when using tensor parallelism with large models.

.. option:: --block-size {8,16,32}

Token block size for contiguous chunks of tokens.

.. option:: --seed <seed>

Random seed for operations.

.. option:: --swap-space <size>

CPU swap space size (GiB) per GPU.

.. option:: --gpu-memory-utilization <fraction>

The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.
For example, a value of 0.5 would imply 50% GPU memory utilization.
If unspecified, will use the default value of 0.9.

.. option:: --max-num-batched-tokens <tokens>

Maximum number of batched tokens per iteration.

.. option:: --max-num-seqs <sequences>

Maximum number of sequences per iteration.

.. option:: --max-paddings <paddings>

Maximum number of paddings in a batch.

.. option:: --max-queue-length <size>

Maximum number of requests that can be present across all queues.

.. option:: --disable-log-stats

Disable logging statistics.

.. option:: --quantization (-q) {awq,squeezellm,None}

Method used to quantize the weights.
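As an illustration of how the documented flags combine, the sketch below wires a hypothetical stand-in parser with the new ``--max-queue-length`` option alongside ``--max-num-seqs``. The defaults shown here are assumptions for the example, not vLLM's actual defaults:

```python
import argparse

# Hypothetical stand-in for the engine-args parser; for illustration only.
parser = argparse.ArgumentParser(prog="vllm.entrypoints.openai.api_server")
parser.add_argument("--max-num-seqs", type=int, default=256,
                    help="Maximum number of sequences per iteration.")
parser.add_argument("--max-queue-length", type=int, default=None,
                    help="Maximum number of requests that can be present "
                         "across all queues (None means unbounded).")

# Parse a sample command line that bounds the queue at 2 requests.
args = parser.parse_args(["--max-num-seqs", "1", "--max-queue-length", "2"])
print(args.max_num_seqs, args.max_queue_length)
```

With ``--max-num-seqs 1``, only one sequence runs per iteration, so additional requests accumulate in the waiting queue until the ``--max-queue-length`` bound is hit.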
107 changes: 107 additions & 0 deletions tests/engine/test_max_queue_length.py
@@ -0,0 +1,107 @@
import argparse
from typing import List, Tuple

import pytest

from vllm.engine.llm_engine import QueueOverflowError
from vllm.logger import init_logger

from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput

# Initialize the logger
logger = init_logger(__name__)


@pytest.fixture
def test_prompts() -> List[Tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
("A robot may not injure a human being",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
("To be or not to be,",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
("What is the meaning of life?",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
("It is only with the heart that one can see rightly",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
]


def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params = test_prompts.pop(0)
try:
engine.add_request(str(request_id), prompt, sampling_params)
except ValueError as e:
# Log error, cleanup, end test
logger.info(f"{e}")
for i in range(request_id):
engine.abort_request(str(i))
raise QueueOverflowError(
f"Queue exceeded max length: {e}") from e
request_id += 1

request_outputs: List[RequestOutput] = engine.step()

for request_output in request_outputs:
if request_output.finished:
print(request_output)


@pytest.mark.parametrize(
"max_wait_q_len, expect_error",
[
(1, True), # error expected
(2, True),
(3, False), # No error expected
(4, False),
])
def test_max_queue_length(max_wait_q_len, expect_error, test_prompts):

# Setup engine with appropriate max_queue_length value
parser = argparse.ArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args_to_test = [
'--max-num-seqs',
str(1),
'--max-queue-length',
str(max_wait_q_len),
"--max-num-batched-tokens",
"2048",
"--gpu-memory-utilization",
"0.7",
"--max-model-len",
"1024",
]
args = parser.parse_args(args_to_test)
engine_args = EngineArgs.from_cli_args(args)
engine = LLMEngine.from_engine_args(engine_args)

# Test engine against request
try:
process_requests(engine, test_prompts)
assert not expect_error
print("QueueOverflowError did not occur, as expected.")
except QueueOverflowError as e:
assert expect_error
print(f"QueueOverflowError occurred as expected: {e}")
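The overflow handling exercised by the test above can be sketched without a GPU by stubbing the engine. ``StubEngine``, ``submit_all``, and the local ``QueueOverflowError`` below are hypothetical stand-ins for illustration, not part of vLLM's API:

```python
class QueueOverflowError(Exception):
    """Local stand-in for vllm.engine.llm_engine.QueueOverflowError."""


class StubEngine:
    """Hypothetical engine with a bounded waiting queue (not the real LLMEngine)."""

    def __init__(self, max_queue_length: int):
        self.max_queue_length = max_queue_length
        self.waiting: list[str] = []

    def add_request(self, request_id: str, prompt: str) -> None:
        # Mirror the PR's behavior: reject once the queue bound is reached.
        if len(self.waiting) >= self.max_queue_length:
            raise ValueError(
                f"Waiting queue is full (max_queue_length={self.max_queue_length})")
        self.waiting.append(request_id)

    def abort_request(self, request_id: str) -> None:
        if request_id in self.waiting:
            self.waiting.remove(request_id)


def submit_all(engine: StubEngine, prompts: list[str]) -> None:
    """Mirror the test's loop: abort everything and re-raise on overflow."""
    for i, prompt in enumerate(prompts):
        try:
            engine.add_request(str(i), prompt)
        except ValueError as e:
            for j in range(i):
                engine.abort_request(str(j))
            raise QueueOverflowError(f"Queue exceeded max length: {e}") from e
```

Submitting three prompts to a ``StubEngine(max_queue_length=2)`` raises ``QueueOverflowError`` on the third and leaves the queue empty, which is the cleanup behavior the real test asserts.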
100 changes: 100 additions & 0 deletions tests/engine/tmql.py
@@ -0,0 +1,100 @@
import argparse
from typing import List, Tuple

import pytest

from vllm.logger import init_logger

from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput

# init variables
max_wait_q_len = 2

logger = init_logger(__name__)


class QueueOverflowError(Exception):
pass


def create_test_prompts() -> List[Tuple[str, SamplingParams]]:
"""Create a list of test prompts with their sampling parameters."""
return [
("A robot may not injure a human being",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
("To be or not to be,",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
("What is the meaning of life?",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
("It is only with the heart that one can see rightly",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
ignore_eos=True,
max_tokens=1000)),
]


def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
# NOTE: max_num_seqs must be set to 1 so that requests accumulate in the waiting queue.
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params = test_prompts.pop(0)
try:
engine.add_request(str(request_id), prompt, sampling_params)
except ValueError as e:
# Log error, cleanup, end test
logger.info(f"{e}")
for i in range(request_id):
engine.abort_request(str(i))
raise QueueOverflowError(
f"Queue exceeded max length: {e}") from e
request_id += 1

request_outputs: List[RequestOutput] = engine.step()

for request_output in request_outputs:
if request_output.finished:
print(request_output)


def initialize_engine(args: argparse.Namespace) -> LLMEngine:
"""Initialize the LLMEngine from the command line arguments."""
engine_args = EngineArgs.from_cli_args(args)
return LLMEngine.from_engine_args(engine_args)


def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args)
test_prompts = create_test_prompts()
with pytest.raises(QueueOverflowError,
match="Queue exceeded max length: .*"):
process_requests(engine, test_prompts)


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args_to_test = [
'--max-num-seqs',
str(1),
'--max-queue-length',
str(max_wait_q_len),
]
args = parser.parse_args(args_to_test)
main(args)