Merged
28 commits
20f42bd
support anthropic endpoint
Liliu1997 Aug 6, 2025
82851a3
add antropic api test
Liliu1997 Aug 13, 2025
34f17fc
add antropic unit test
Liliu1997 Aug 13, 2025
0697fe8
add spdx-header
Liliu1997 Aug 13, 2025
9b3fc78
stream bug fix
Liliu1997 Aug 14, 2025
d5b9860
python lint fix
Liliu1997 Aug 15, 2025
8f54993
Merge remote-tracking branch 'origin/main'
Liliu1997 Aug 20, 2025
78b1601
some pylint fix
Liliu1997 Aug 20, 2025
8764e7f
Merge remote-tracking branch 'origin/main'
Liliu1997 Sep 3, 2025
39f72bc
support anthropic list system prompt
Liliu1997 Sep 3, 2025
2afcd93
add debug logs
Liliu1997 Sep 3, 2025
9d7db4e
skip empty user block
Liliu1997 Sep 5, 2025
6828d9b
Merge remote-tracking branch 'origin/main' into dev/antropic_v2
Liliu1997 Oct 17, 2025
b3db9c2
pre-commit fix
Liliu1997 Oct 17, 2025
ed7cacc
pre-commit fix
Liliu1997 Oct 17, 2025
c691e08
resolve comments
Liliu1997 Oct 17, 2025
767209d
pre-commit fix
Liliu1997 Oct 18, 2025
ef96847
rm model_config in AnthropicServingMessages
Liliu1997 Oct 21, 2025
48e01a9
add anthropic dependency
Liliu1997 Oct 21, 2025
e70286a
Merge branch 'main' into dev/antropic_v2
DarkLight1337 Oct 21, 2025
94f9731
Change Python command to use python3
mgoin Oct 21, 2025
78b6608
unit-test fix
Liliu1997 Oct 22, 2025
904fa56
merge master
Liliu1997 Oct 22, 2025
1f8c940
unit-test fix
Liliu1997 Oct 22, 2025
fc96a55
fix unit test
Liliu1997 Oct 22, 2025
26816e4
Merge remote-tracking branch 'origin/dev/antropic_v2' into dev/antrop…
Liliu1997 Oct 22, 2025
037f0d1
fix unit test
Liliu1997 Oct 22, 2025
ddc9645
fix unit test
Liliu1997 Oct 22, 2025
1 change: 1 addition & 0 deletions requirements/common.txt
@@ -48,3 +48,4 @@ pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
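For context, a minimal sketch of how the newly pinned anthropic SDK can talk to a local vLLM server; the base URL, port, and dummy key below are assumptions mirroring the test fixtures in this PR, not part of the diff itself:

import anthropic

# Point the official SDK at a vLLM Anthropic-compatible endpoint
# (assumed here to listen on localhost:8000; vLLM does not validate the key).
client = anthropic.Anthropic(
    base_url="http://localhost:8000",
    api_key="token-abc123",
    max_retries=0,
)
resp = client.messages.create(
    model="claude-3-7-sonnet-latest",
    max_tokens=64,
    messages=[{"role": "user", "content": "how are you!"}],
)
print(resp.stop_reason)  # the tests below expect "end_turn"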
Empty file.
141 changes: 141 additions & 0 deletions tests/entrypoints/anthropic/test_messages.py
@@ -0,0 +1,141 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import anthropic
import pytest
import pytest_asyncio

from ...utils import RemoteAnthropicServer

MODEL_NAME = "Qwen/Qwen3-0.6B"


@pytest.fixture(scope="module")
def server():  # noqa: F811
    args = [
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--enable-auto-tool-choice",
        "--tool-call-parser",
        "hermes",
        "--served-model-name",
        "claude-3-7-sonnet-latest",
    ]

    with RemoteAnthropicServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_simple_messages(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_system_message(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        system="you are a helpful assistant",
        messages=[{"role": "user", "content": "how are you!"}],
    )
    assert resp.stop_reason == "end_turn"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[{"role": "user", "content": "how are you!"}],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())


@pytest.mark.asyncio
async def test_anthropic_tool_call(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {"role": "user", "content": "What's the weather like in New York today?"}
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=False,
    )
    assert resp.stop_reason == "tool_use"
    assert resp.role == "assistant"

    print(f"Anthropic response: {resp.model_dump_json()}")


@pytest.mark.asyncio
async def test_anthropic_tool_call_streaming(client: anthropic.AsyncAnthropic):
    resp = await client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": "What's the weather like in New York today?",
            }
        ],
        tools=[
            {
                "name": "get_current_weather",
                "description": "Useful for querying the weather "
                "in a specified city.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "location": {
                            "type": "string",
                            "description": "City or region, for example: "
                            "New York, London, Tokyo, etc.",
                        }
                    },
                    "required": ["location"],
                },
            }
        ],
        stream=True,
    )

    async for chunk in resp:
        print(chunk.model_dump_json())
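The two streaming tests above only print each chunk. A hedged sketch of a stricter check, using the event types the anthropic SDK exposes on its stream (the helper name collect_text is illustrative, not part of this PR):

async def collect_text(resp) -> str:
    # Accumulate only the text deltas from the Anthropic event stream.
    text: list[str] = []
    async for chunk in resp:
        if chunk.type == "content_block_delta" and chunk.delta.type == "text_delta":
            text.append(chunk.delta.text)
    return "".join(text)

# e.g. assert await collect_text(resp) != ""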
126 changes: 126 additions & 0 deletions tests/utils.py
@@ -23,6 +23,7 @@
from typing import Any, Literal
from unittest.mock import patch

import anthropic
import cloudpickle
import httpx
import openai
@@ -294,6 +295,131 @@ def __exit__(self, exc_type, exc_value, traceback):
        self.proc.kill()


class RemoteAnthropicServer:
    DUMMY_API_KEY = "token-abc123"  # vLLM's Anthropic server does not need API key

    def __init__(
        self,
        model: str,
        vllm_serve_args: list[str],
        *,
        env_dict: dict[str, str] | None = None,
        seed: int | None = 0,
        auto_port: bool = True,
        max_wait_seconds: float | None = None,
    ) -> None:
        if auto_port:
            if "-p" in vllm_serve_args or "--port" in vllm_serve_args:
                raise ValueError(
                    "You have manually specified the port when `auto_port=True`."
                )

            # Don't mutate the input args
            vllm_serve_args = vllm_serve_args + ["--port", str(get_open_port())]
        if seed is not None:
            if "--seed" in vllm_serve_args:
                raise ValueError(
                    f"You have manually specified the seed when `seed={seed}`."
                )

            vllm_serve_args = vllm_serve_args + ["--seed", str(seed)]

        parser = FlexibleArgumentParser(description="vLLM's remote Anthropic server.")
        subparsers = parser.add_subparsers(required=False, dest="subparser")
        parser = ServeSubcommand().subparser_init(subparsers)
        args = parser.parse_args(["--model", model, *vllm_serve_args])
        self.host = str(args.host or "localhost")
        self.port = int(args.port)

        self.show_hidden_metrics = args.show_hidden_metrics_for_version is not None

        # download the model before starting the server to avoid timeout
        is_local = os.path.isdir(model)
        if not is_local:
            engine_args = AsyncEngineArgs.from_cli_args(args)
            model_config = engine_args.create_model_config()
            load_config = engine_args.create_load_config()

            model_loader = get_model_loader(load_config)
            model_loader.download_model(model_config)

        env = os.environ.copy()
        # the current process might initialize cuda,
        # to be safe, we should use spawn method
        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
        if env_dict is not None:
            env.update(env_dict)
        self.proc = subprocess.Popen(
            [
                sys.executable,
                "-m",
                "vllm.entrypoints.anthropic.api_server",
                model,
                *vllm_serve_args,
            ],
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
        max_wait_seconds = max_wait_seconds or 240
        self._wait_for_server(url=self.url_for("health"), timeout=max_wait_seconds)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.proc.terminate()
        try:
            self.proc.wait(8)
        except subprocess.TimeoutExpired:
            # force kill if needed
            self.proc.kill()

    def _wait_for_server(self, *, url: str, timeout: float):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    break
            except Exception:
                # this exception can only be raised by requests.get,
                # which means the server is not ready yet.
                # the stack trace is not useful, so we suppress it
                # by using `raise from None`.
                result = self.proc.poll()
                if result is not None and result != 0:
                    raise RuntimeError("Server exited unexpectedly.") from None

                time.sleep(0.5)
                if time.time() - start > timeout:
                    raise RuntimeError("Server failed to start in time.") from None

    @property
    def url_root(self) -> str:
        return f"http://{self.host}:{self.port}"

    def url_for(self, *parts: str) -> str:
        return self.url_root + "/" + "/".join(parts)

    def get_client(self, **kwargs):
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return anthropic.Anthropic(
            base_url=self.url_for(),
            api_key=self.DUMMY_API_KEY,
            max_retries=0,
            **kwargs,
        )

    def get_async_client(self, **kwargs):
        if "timeout" not in kwargs:
            kwargs["timeout"] = 600
        return anthropic.AsyncAnthropic(
            base_url=self.url_for(), api_key=self.DUMMY_API_KEY, max_retries=0, **kwargs
        )


def _test_completion(
    client: openai.OpenAI,
    model: str,
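For reference, the fixtures in test_messages.py exercise this helper roughly as follows; a sketch reusing the same model and flags as the tests, not code from this PR:

# Start a throwaway server, wait for /health, then talk to it via the SDK.
with RemoteAnthropicServer(
    "Qwen/Qwen3-0.6B",
    ["--enforce-eager", "--served-model-name", "claude-3-7-sonnet-latest"],
) as server:
    client = server.get_client()
    resp = client.messages.create(
        model="claude-3-7-sonnet-latest",
        max_tokens=32,
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.stop_reason)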
Empty file.