Merged
Commits
24 commits
7d93bde
Add batch requests to vllm api
MatejRojec Mar 24, 2026
47e124e
Fix n>1 validation and echo condition for batched chat completions
MatejRojec Mar 24, 2026
b8a91ff
Fix pre-commit warnings
MatejRojec Mar 24, 2026
eb2e531
Update system prompt in .sh file
MatejRojec Mar 24, 2026
2c76e2a
Add edge case to manual tests
MatejRojec Mar 24, 2026
6e268e9
Improve tests by making them more robust
MatejRojec Mar 24, 2026
724baee
Improve documentation and add ordering so results are always ordered …
MatejRojec Mar 24, 2026
0d2848a
Fix pre-commit checks with ruff
MatejRojec Mar 24, 2026
09177f4
Implement batch completions as a separate endpoint
MatejRojec Mar 25, 2026
819052d
Fix None request_id validation in to_chat_completion_request
MatejRojec Mar 25, 2026
cf107e3
Replace shell script example with Python script
MatejRojec Mar 25, 2026
47b92b9
Move render_batch_chat_request into OpenAIServingChat
MatejRojec Mar 25, 2026
27706be
Move render_batch_chat_request after render_chat_request
MatejRojec Mar 25, 2026
158ad59
Move batch chat completion logic into OpenAIServingChatBatch subclass
MatejRojec Mar 25, 2026
d199a6f
Move import json to top of test_chat_completion.py
MatejRojec Mar 25, 2026
b92996a
Move batch chat completion tests to dedicated test file
MatejRojec Mar 25, 2026
92b996a
Add separate state for batch chat completion handler
MatejRojec Mar 25, 2026
9bd033a
Fix mypy errors in OpenAIServingChatBatch
MatejRojec Mar 25, 2026
f2600be
Add BatchChatCompletionRequest to ChatLikeRequest type alias
MatejRojec Mar 25, 2026
bc23224
Add validator to BatchChatCompletionRequest
MatejRojec Mar 25, 2026
eddd17b
Fix schemathesis failures for batch chat completion endpoint
MatejRojec Mar 25, 2026
434a529
Merge branch 'main' into feature/add-batch-requests-to-chat-completio…
MatejRojec Mar 25, 2026
2dc0065
Fix ProcessorInputs import path in batch_serving.py
MatejRojec Mar 25, 2026
cc69644
Fix EngineInput import in batch_serving.py
MatejRojec Mar 25, 2026
194 changes: 194 additions & 0 deletions examples/online_serving/batched_chat_completions.py
@@ -0,0 +1,194 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Examples of batched chat completions via the vLLM OpenAI-compatible API.

The /v1/chat/completions/batch endpoint accepts ``messages`` as a list of
conversations. Each conversation is processed independently and the response
contains one choice per conversation, indexed 0, 1, ..., N-1.

Start a server first, e.g.:
vllm serve Qwen/Qwen2.5-1.5B-Instruct --port 8000

Current limitations compared to /v1/chat/completions:
- Streaming is not supported.
- Tool use is not supported.
- Beam search is not supported.
"""

import json
import os

import httpx

BASE_URL = os.environ.get("VLLM_BASE_URL", "http://localhost:8000")
MODEL = os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-1.5B-Instruct")
BATCH_URL = f"{BASE_URL}/v1/chat/completions/batch"


def post_batch(payload: dict) -> dict:
response = httpx.post(BATCH_URL, json=payload, timeout=60)
response.raise_for_status()
return response.json()


def main() -> None:
print("=== Example 1a: single conversation (standard endpoint) ===")
response = httpx.post(
f"{BASE_URL}/v1/chat/completions",
json={
"model": MODEL,
"messages": [{"role": "user", "content": "What is the capital of Japan?"}],
},
timeout=60,
)
response.raise_for_status()
data = response.json()
for choice in data["choices"]:
print(f" [{choice['index']}] {choice['message']['content']}")

print("\n=== Example 1b: batched plain text (2 conversations) ===")
data = post_batch(
{
"model": MODEL,
"messages": [
[{"role": "user", "content": "What is the capital of France?"}],
[{"role": "user", "content": "What is the capital of Japan?"}],
],
}
)
for choice in data["choices"]:
print(f" [{choice['index']}] {choice['message']['content']}")

print("\n=== Example 2: batch with regex constraint (yes|no) ===")
data = post_batch(
{
"model": MODEL,
"messages": [
[{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
[{"role": "user", "content": "Is fire cold? Answer yes or no."}],
],
"structured_outputs": {"regex": "(yes|no)"},
}
)
for choice in data["choices"]:
print(f" [{choice['index']}] {choice['message']['content']}")

print("\n=== Example 3: batch with json_schema ===")
person_schema = {
"type": "object",
"properties": {
"name": {"type": "string", "description": "Full name of the person"},
"age": {"type": "integer", "description": "Age in years"},
},
"required": ["name", "age"],
}
data = post_batch(
{
"model": MODEL,
"messages": [
[
{
"role": "user",
"content": "Describe the person: name Alice, age 30.",
}
],
[{"role": "user", "content": "Describe the person: name Bob, age 25."}],
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "person",
"strict": True,
"schema": person_schema,
},
},
}
)
for choice in data["choices"]:
person = json.loads(choice["message"]["content"])
print(f" [{choice['index']}] {person}")

print("\n=== Example 4: batch book summaries ===")
book_schema = {
"type": "object",
"properties": {
"author": {
"type": "string",
"description": "Full name of the author",
},
"num_pages": {
"type": "integer",
"description": "Number of pages in the book",
},
"short_summary": {
"type": "string",
"description": "A one-sentence summary of the book",
},
"long_summary": {
"type": "string",
"description": (
"A detailed two to three sentence summary covering "
"the main themes and plot"
),
},
},
"required": ["author", "num_pages", "short_summary", "long_summary"],
}
system_msg = {
"role": "system",
"content": (
"You are a literary analyst. Extract structured information "
"from book descriptions."
),
}
data = post_batch(
{
"model": MODEL,
"messages": [
[
system_msg,
{
"role": "user",
"content": (
"Extract information from this book: '1984' by George"
" Orwell, published in 1949, 328 pages. A dystopian"
" novel set in a totalitarian society ruled by Big"
" Brother, following Winston Smith as he secretly"
" rebels against the oppressive Party that surveils"
" and controls every aspect of life."
),
},
],
[
system_msg,
{
"role": "user",
"content": (
"Extract information from this book: 'The Hitchhiker's"
" Guide to the Galaxy' by Douglas Adams, published in"
" 1979, 193 pages. A comedic science fiction novel"
" following Arthur Dent, an ordinary Englishman who is"
" whisked off Earth moments before it is demolished to"
" make way for a hyperspace bypass, and his subsequent"
" absurd adventures across the universe."
),
},
],
],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "book_summary",
"strict": True,
"schema": book_schema,
},
},
}
)
for choice in data["choices"]:
book = json.loads(choice["message"]["content"])
print(f" [{choice['index']}] {book}")


if __name__ == "__main__":
main()
@@ -0,0 +1,113 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import httpx
import pytest

from tests.utils import RemoteOpenAIServer

# any model with a chat template defined in tokenizer_config should work here
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"


@pytest.fixture(scope="module")
def default_server_args():
return [
# keep the server configuration lightweight for speed and memory savings in CI
"--max-model-len",
"2048",
"--max-num-seqs",
"128",
"--enforce-eager",
]


@pytest.fixture(scope="module")
def server(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_batched_chat_completions(
server: RemoteOpenAIServer, model_name: str
) -> None:
conversations = [
[{"role": "user", "content": "Reply with exactly the word: alpha"}],
[{"role": "user", "content": "Reply with exactly the word: beta"}],
]

async with httpx.AsyncClient() as http_client:
response = await http_client.post(
f"{server.url_for('v1/chat/completions/batch')}",
json={
"model": model_name,
"messages": conversations,
},
timeout=60,
)

assert response.status_code == 200, response.text
data = response.json()

choices = data["choices"]
assert len(choices) == 2

indices = {choice["index"] for choice in choices}
assert indices == {0, 1}

# Each conversation should produce a non-empty text response.
for choice in choices:
assert choice["message"]["content"]


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[MODEL_NAME],
)
async def test_batched_chat_completions_with_json_schema(
server: RemoteOpenAIServer, model_name: str
) -> None:
schema = {
"type": "object",
"properties": {
"answer": {"type": "string", "enum": ["yes", "no"]},
},
"required": ["answer"],
}
conversations = [
[{"role": "user", "content": "Is the sky blue? Answer in JSON."}],
[{"role": "user", "content": "Is fire cold? Answer in JSON."}],
]

async with httpx.AsyncClient() as http_client:
response = await http_client.post(
f"{server.url_for('v1/chat/completions/batch')}",
json={
"model": model_name,
"messages": conversations,
"response_format": {
"type": "json_schema",
"json_schema": {"name": "answer", "schema": schema, "strict": True},
},
},
timeout=60,
)

assert response.status_code == 200, response.text
data = response.json()

choices = data["choices"]
assert len(choices) == 2

for choice in choices:
parsed = json.loads(choice["message"]["content"])
assert "answer" in parsed
assert parsed["answer"] in ("yes", "no")
1 change: 1 addition & 0 deletions tests/entrypoints/openai/test_openai_schema.py
@@ -174,6 +174,7 @@ def test_openapi_stateless(case: Case):
timeout = {
# requires a longer timeout
("POST", "/v1/chat/completions"): LONG_TIMEOUT_SECONDS,
("POST", "/v1/chat/completions/batch"): LONG_TIMEOUT_SECONDS,
("POST", "/v1/completions"): LONG_TIMEOUT_SECONDS,
("POST", "/v1/messages"): LONG_TIMEOUT_SECONDS,
}.get(key, DEFAULT_TIMEOUT_SECONDS)
34 changes: 34 additions & 0 deletions vllm/entrypoints/openai/chat_completion/api_router.py
@@ -7,7 +7,9 @@
from fastapi import APIRouter, Depends, FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse

from vllm.entrypoints.openai.chat_completion.batch_serving import OpenAIServingChatBatch
from vllm.entrypoints.openai.chat_completion.protocol import (
BatchChatCompletionRequest,
ChatCompletionRequest,
ChatCompletionResponse,
)
@@ -31,6 +33,10 @@ def chat(request: Request) -> OpenAIServingChat | None:
return request.app.state.openai_serving_chat


def batch_chat(request: Request) -> OpenAIServingChatBatch | None:
return request.app.state.openai_serving_chat_batch


@router.post(
"/v1/chat/completions",
dependencies=[Depends(validate_json_request)],
@@ -68,5 +74,33 @@ async def create_chat_completion(request: ChatCompletionRequest, raw_request: Re
return StreamingResponse(content=generator, media_type="text/event-stream")


@router.post(
"/v1/chat/completions/batch",
dependencies=[Depends(validate_json_request)],
responses={
HTTPStatus.OK.value: {},
HTTPStatus.BAD_REQUEST.value: {"model": ErrorResponse},
HTTPStatus.NOT_FOUND.value: {"model": ErrorResponse},
HTTPStatus.INTERNAL_SERVER_ERROR.value: {"model": ErrorResponse},
HTTPStatus.NOT_IMPLEMENTED.value: {"model": ErrorResponse},
},
)
@with_cancellation
@load_aware_call
async def create_batch_chat_completion(
request: BatchChatCompletionRequest, raw_request: Request
):
handler = batch_chat(raw_request)
if handler is None:
raise NotImplementedError("The model does not support Chat Completions API")

result = await handler.create_batch_chat_completion(request, raw_request)

if isinstance(result, ErrorResponse):
return JSONResponse(content=result.model_dump(), status_code=result.error.code)

return JSONResponse(content=result.model_dump())


def attach_router(app: FastAPI):
app.include_router(router)
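For reference, `batch_chat()` above resolves the handler from `app.state.openai_serving_chat_batch`, the separate server state added in commit 92b996a. A minimal sketch of how that state might be populated during start-up — the constructor arguments below are hypothetical placeholders, not taken from this diff:

    # Hypothetical wiring sketch (argument names are placeholders, not from this PR):
    state.openai_serving_chat_batch = OpenAIServingChatBatch(
        engine_client,      # same async engine client the regular chat handler uses
        model_config,
        models,
        request_logger=request_logger,
    )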