Merged
Commits
24 commits
7d93bde
Add batch requests to vllm api
MatejRojec Mar 24, 2026
47e124e
Fix n>1 validation and echo condition for batched chat completions
MatejRojec Mar 24, 2026
b8a91ff
Fix pre-commit warnings
MatejRojec Mar 24, 2026
eb2e531
Update system prompt in .sh file
MatejRojec Mar 24, 2026
2c76e2a
Add edge case to manual tests
MatejRojec Mar 24, 2026
6e268e9
Improve tests by making them more robust
MatejRojec Mar 24, 2026
724baee
Improve documentation and add ordering so results are always ordered …
MatejRojec Mar 24, 2026
0d2848a
Fix pre-commit checks with ruff
MatejRojec Mar 24, 2026
09177f4
Implement batch completions as a separate endpoint
MatejRojec Mar 25, 2026
819052d
Fix None request_id validation in to_chat_completion_request
MatejRojec Mar 25, 2026
cf107e3
Replace shell script example with Python script
MatejRojec Mar 25, 2026
47b92b9
Move render_batch_chat_request into OpenAIServingChat
MatejRojec Mar 25, 2026
27706be
Move render_batch_chat_request after render_chat_request
MatejRojec Mar 25, 2026
158ad59
Move batch chat completion logic into OpenAIServingChatBatch subclass
MatejRojec Mar 25, 2026
d199a6f
Move import json to top of test_chat_completion.py
MatejRojec Mar 25, 2026
b92996a
Move batch chat completion tests to dedicated test file
MatejRojec Mar 25, 2026
92b996a
Add separate state for batch chat completion handler
MatejRojec Mar 25, 2026
9bd033a
Fix mypy errors in OpenAIServingChatBatch
MatejRojec Mar 25, 2026
f2600be
Add BatchChatCompletionRequest to ChatLikeRequest type alias
MatejRojec Mar 25, 2026
bc23224
Add validator to BatchChatCompletionRequest
MatejRojec Mar 25, 2026
eddd17b
Fix schemathesis failures for batch chat completion endpoint
MatejRojec Mar 25, 2026
434a529
Merge branch 'main' into feature/add-batch-requests-to-chat-completio…
MatejRojec Mar 25, 2026
2dc0065
Fix ProcessorInputs import path in batch_serving.py
MatejRojec Mar 25, 2026
cc69644
Fix EngineInput import in batch_serving.py
MatejRojec Mar 25, 2026
194 changes: 194 additions & 0 deletions examples/online_serving/batched_chat_completions.py
@@ -0,0 +1,194 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Examples of batched chat completions via the vLLM OpenAI-compatible API.

The /v1/chat/completions/batch endpoint accepts ``messages`` as a list of
conversations. Each conversation is processed independently and the response
contains one choice per conversation, indexed 0, 1, ..., N-1.

Start a server first, e.g.:
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000

Current limitations compared to /v1/chat/completions:
- Streaming is not supported.
- Tool use is not supported.
- Beam search is not supported.
"""

import json
import os

import httpx

BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000")
MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
BATCH_URL = f"{BASE_URL}/v1/chat/completions/batch"


def post_batch(payload: dict) -> dict:
response = httpx.post(BATCH_URL, json=payload, timeout=60)
response.raise_for_status()
return response.json()


def main() -> None:
print("=== Example 1a: single conversation (standard endpoint) ===")
response = httpx.post(
f"{BASE_URL}/v1/chat/completions",
json={
"model": MODEL,
"messages": [{"role": "user", "content": "What is the capital of Japan?"}],
},
timeout=60,
)
response.raise_for_status()
data = response.json()
for choice in data["choices"]:
print(f" [{choice['index']}] {choice['message']['content']}")

print("\n=== Example 1b: batched plain text (2 conversations) ===")
data = post_batch(
{
"model": MODEL,
"messages": [
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "What is the capital of Japan?"}],
],
}
)
for choice in data["choices"]:
print(f" [{choice['index']}] {choice['message']['content']}")

print("\n=== Example 2: batch with regex constraint (yes|no) ===")
data = post_batch(
{
"model": MODEL,
"messages": [
[{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
[{"role": "user", "content": "Is fire cold? Answer yes or no."}],
],
"structured_outputs": {"regex": "(yes|no)"},
}
)
for choice in data["choices"]:
print(f" [{choice['index']}] {choice['message']['content']}")

print("\n=== Example 3: batch with json_schema ===")
person_schema = {
"type": "object",
"properties": {
"name": {"type": "string", "description": "Full name of the person"},
"age": {"type": "integer", "description": "Age in years"},
},
"required": ["name", "age"],
}
data = post_batch(
{
"model": MODEL,
"messages": [
[
{
"role": "user",
"content": "Describe the person: name Alice, age 30.",
}
],
[{"role": "user", "content": "Describe the person: name Bob, age 25."}],
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "person",
"strict": True,
"schema": person_schema,
},
},
}
)
for choice in data["choices"]:
person = json.loads(choice["message"]["content"])
print(f" [{choice['index']}] {person}")

print("\n=== Example 4: batch book summaries ===")
book_schema = {
"type": "object",
"properties": {
"author": {
"type": "string",
"description": "Full name of the author",
},
"num_pages": {
"type": "integer",
"description": "Number of pages in the book",
},
"short_summary": {
"type": "string",
"description": "A one-sentence summary of the book",
},
"long_summary": {
"type": "string",
"description": (
"A detailed two to three sentence summary covering "
"the main themes and plot"
),
},
},
"required": ["author", "num_pages", "short_summary", "long_summary"],
}
system_msg = {
"role": "system",
"content": (
"You are a literary analyst. Extract structured information "
"from book descriptions."
),
}
data = post_batch(
{
"model": MODEL,
"messages": [
[
system_msg,
{
"role": "user",
"content": (
"Extract information from this book: '1984' by George"
" Orwell, published in 1949, 328 pages. A dystopian"
" novel set in a totalitarian society ruled by Big"
" Brother, following Winston Smith as he secretly"
" rebels against the oppressive Party that surveils"
" and controls every aspect of life."
),
},
],
[
system_msg,
{
"role": "user",
"content": (
"Extract information from this book: 'The Hitchhiker's"
" Guide to the Galaxy' by Douglas Adams, published in"
" 1979, 193 pages. A comedic science fiction novel"
" following Arthur Dent, an ordinary Englishman who is"
" whisked off Earth moments before it is demolished to"
" make way for a hyperspace bypass, and his subsequent"
" absurd adventures across the universe."
),
},
],
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "book_summary",
"strict": True,
"schema": book_schema,
},
},
}
)
for choice in data["choices"]:
book = json.loads(choice["message"]["content"])
print(f" [{choice['index']}] {book}")


if __name__ == "__main__":
main()
@@ -0,0 +1,113 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import httpx
import pytest

from tests.utils import RemoteOpenAIServer

# any model with a chat template defined in tokenizer_config should work here
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"


@pytest.fixture(scope="module")
def default_server_args():
return [
# keep the server configuration lightweight for speed and memory savings in CI
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
]


@pytest.fixture(scope="module")
def server(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_batched_chat_completions(
server: RemoteOpenAIServer, model_name: str
) -> None:
conversations = [
[{"role": "user", "content": "Reply with exactly the word: alpha"}],
[{"role": "user", "content": "Reply with exactly the word: beta"}],
]

async with httpx.AsyncClient() as http_client:
response = await http_client.post(
f"{server.url_for('v1/chat/completions/batch')}",
json={
"model": model_name,
"messages": conversations,
},
timeout=60,
)

assert response.status_code == 200, response.text
data = response.json()

choices = data["choices"]
assert len(choices) == 2

indices = {choice["index"] for choice in choices}
assert indices == {0, 1}

# Each conversation should produce a non-empty text response.
for choice in choices:
assert choice["message"]["content"]


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_batched_chat_completions_with_json_schema(
server: RemoteOpenAIServer, model_name: str
) -> None:
schema = {
"type": "object",
"properties": {
"answer": {"type": "string", "enum": ["yes", "no"]},
},
"required": ["answer"],
}
conversations = [
[{"role": "user", "content": "Is the sky blue? Answer in JSON."}],
[{"role": "user", "content": "Is fire cold? Answer in JSON."}],
]

async with httpx.AsyncClient() as http_client:
response = await http_client.post(
f"{server.url_for('v1/chat/completions/batch')}",
json={
"model": model_name,
"messages": conversations,
"response_format": {
"type": "json_schema",
"json_schema": {"name": "answer", "schema": schema, "strict": True},
},
},
timeout=60,
)

assert response.status_code == 200, response.text
data = response.json()

choices = data["choices"]
assert len(choices) == 2

for choice in choices:
parsed = json.loads(choice["message"]["content"])
assert "answer" in parsed
assert parsed["answer"] in ("yes", "no")
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_openai_schema.py
@@ -174,6 +174,7 @@ def test_openapi_stateless(case: Case):
timeout = {
# requires a longer timeout
("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
("POST", "/v1/chat/completions/batch"): LONG_TIMEOUT_SECONDS,
("POST", "/v1/completions"): LONG_TIMEOUT_SECONDS,
("POST", "/v1/messages"): LONG_TIMEOUT_SECONDS,
}.get(key, DEFAULT_TIMEOUT_SECONDS)
34 changes: 34 additions & 0 deletions vllm/entrypoints/openai/chat_completion/api_router.py
@@ -7,7 +7,9 @@
from fastapi import APIRouter, Depends, FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse

from vllm.entrypoints.openai.chat_completion.batch_serving import OpenAIServingChatBatch
from vllm.entrypoints.openai.chat_completion.protocol import (
BatchChatCompletionRequest,
ChatCompletionRequest,
ChatCompletionResponse,
)
@@ -31,6 +33,10 @@ def chat(request: Request) -> OpenAIServingChat | None:
return request.app.state.openai_serving_chat


def batch_chat(request: Request) -> OpenAIServingChatBatch | None:
return request.app.state.openai_serving_chat_batch


@router.post(
"/v1/chat/completions",
dependencies=[Depends(validate_json_request)],
@@ -68,5 +74,33 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
return StreamingResponse(content=generator, media_type="text/event-stream")


@router.post(
"/v1/chat/completions/batch",
dependencies=[Depends(validate_json_request)],
responses={
HTTPStatus.OK.value: {},
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
},
)
@with_cancellation
@load_aware_call
async def create_batch_chat_completion(
request: BatchChatCompletionRequest, raw_request: Request
):
handler = batch_chat(raw_request)
if handler is None:
raise NotImplementedError("The model does not support Chat Completions API")

result = await handler.create_batch_chat_completion(request, raw_request)

if isinstance(result, ErrorResponse):
return JSONResponse(content=result.model_dump(), status_code=result.error.code)

return JSONResponse(content=result.model_dump())


def attach_router(app: FastAPI):
app.include_router(router)
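For reference, `batch_chat()` above resolves the handler from `app.state.openai_serving_chat_batch`, the separate server state added in commit 92b996a. A minimal sketch of how that state might be populated during start-up — the constructor arguments below are hypothetical placeholders, not taken from this diff:

    # Hypothetical wiring sketch (argument names are placeholders, not from this PR):
    state.openai_serving_chat_batch = OpenAIServingChatBatch(
        engine_client,      # same async engine client the regular chat handler uses
        model_config,
        models,
        request_logger=request_logger,
    )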