54 changes: 54 additions & 0 deletions tests/entrypoints/openai/test_return_routed_experts.py
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TitanML/tiny-mixtral"

# tiny-mixtral config: 8 local experts, top-2 routing, 2 hidden layers
NUM_LOCAL_EXPERTS = 8
NUM_EXPERTS_PER_TOK = 2
NUM_HIDDEN_LAYERS = 2


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len",
        "256",
        "--max-num-seqs",
        "32",
        "--enforce-eager",
        "--enable-return-routed-experts",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
async def test_routed_experts(server):
    """Test that /v1/completions returns routed_experts when enabled."""
    async with server.get_async_client() as client:
        result = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world",
            max_tokens=10,
            temperature=0,
            extra_body={"return_token_ids": True},
        )

    choice = result.model_dump()["choices"][0]

    assert choice["routed_experts"] is not None
    assert choice["token_ids"] is not None

    routed_experts = choice["routed_experts"]
    assert len(routed_experts) > 0
    for token_experts in routed_experts:
        assert len(token_experts) == NUM_HIDDEN_LAYERS
        for layer_experts in token_experts:
            assert len(layer_experts) == NUM_EXPERTS_PER_TOK
            for expert_id in layer_experts:
                assert 0 <= expert_id < NUM_LOCAL_EXPERTS
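
For orientation, the nested structure these assertions expect has one entry per generated token, then one per hidden layer, then the top-k expert indices for that layer. A minimal illustrative value for tiny-mixtral (3 tokens; expert indices made up):

```python
# Illustrative only: a routed_experts value that satisfies the assertions
# above for tiny-mixtral (2 hidden layers, top-2 routing, 8 local experts).
# Shape is [seq_len][num_layers][topk]; real expert indices will differ.
example_routed_experts = [
    [[3, 7], [0, 5]],  # token 0: top-2 experts in layer 0, then layer 1
    [[3, 1], [0, 2]],  # token 1
    [[6, 7], [4, 5]],  # token 2
]
for token_experts in example_routed_experts:
    assert len(token_experts) == 2  # NUM_HIDDEN_LAYERS
    for layer_experts in token_experts:
        assert len(layer_experts) == 2  # NUM_EXPERTS_PER_TOK
        assert all(0 <= e < 8 for e in layer_experts)  # NUM_LOCAL_EXPERTS
```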
71 changes: 71 additions & 0 deletions tests/entrypoints/serve/disagg/test_return_routed_experts.py
@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import httpx
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer

MODEL_NAME = "TitanML/tiny-mixtral"
GEN_ENDPOINT = "/inference/v1/generate"

# tiny-mixtral config: 8 local experts, top-2 routing, 2 hidden layers
NUM_LOCAL_EXPERTS = 8
NUM_EXPERTS_PER_TOK = 2
NUM_HIDDEN_LAYERS = 2


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len",
        "256",
        "--max-num-seqs",
        "32",
        "--enforce-eager",
        "--enable-return-routed-experts",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    async with httpx.AsyncClient(
        transport=transport,
        base_url=server.url_root,
        timeout=600,
        headers=headers,
    ) as c:
        yield c


@pytest.mark.asyncio
async def test_generate_routed_experts(client):
    """Test that /inference/v1/generate returns routed_experts when enabled."""
    payload = {
        "model": MODEL_NAME,
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 10, "temperature": 0.0},
        "stream": False,
    }
    resp = await client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()

    choice = data["choices"][0]

    assert choice["routed_experts"] is not None
    assert choice["token_ids"] is not None

    routed_experts = choice["routed_experts"]
    assert len(routed_experts) > 0
    for token_experts in routed_experts:
        assert len(token_experts) == NUM_HIDDEN_LAYERS
        for layer_experts in token_experts:
            assert len(layer_experts) == NUM_EXPERTS_PER_TOK
            for expert_id in layer_experts:
                assert 0 <= expert_id < NUM_LOCAL_EXPERTS
8 changes: 8 additions & 0 deletions vllm/entrypoints/openai/chat_completion/protocol.py
@@ -92,6 +92,14 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
    # not part of the OpenAI spec but is useful for tracing the tokens
    # in agent scenarios
    token_ids: list[int] | None = None
    routed_experts: list[list[list[int]]] | None = Field(
        default=None,
        description=(
            "The routed expert indices for each generated token, with shape "
            "[seq_len, num_layers, topk]. Only present when the server is "
            "started with --enable-return-routed-experts."
        ),
    )


class ChatCompletionResponse(OpenAIBaseModel):
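
The same field is populated for /v1/chat/completions via the serving change below, although the tests in this PR only cover /v1/completions and /inference/v1/generate. A minimal client-side sketch, assuming a locally running server started with --enable-return-routed-experts; the base URL, API key, and prompt are placeholders:

```python
# Hedged sketch (not part of this diff): reading routed_experts from
# /v1/chat/completions with the OpenAI Python client. Assumes the server
# was launched with --enable-return-routed-experts; URL/key are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="TitanML/tiny-mixtral",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=10,
)
choice = resp.model_dump()["choices"][0]
# routed_experts is a [seq_len][num_layers][topk] nested list, or None
# when the server flag is not set.
print(choice.get("routed_experts"))
```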
10 changes: 10 additions & 0 deletions vllm/entrypoints/openai/chat_completion/serving.py
@@ -1241,6 +1241,11 @@ async def chat_completion_full_generator(
                token_ids=(
                    as_list(output.token_ids) if request.return_token_ids else None
                ),
                routed_experts=(
                    output.routed_experts.tolist()
                    if output.routed_experts is not None
                    else None
                ),
            )
            choices.append(choice_data)
            continue
@@ -1462,6 +1467,11 @@ async def chat_completion_full_generator(
                token_ids=(
                    as_list(output.token_ids) if request.return_token_ids else None
                ),
                routed_experts=(
                    output.routed_experts.tolist()
                    if output.routed_experts is not None
                    else None
                ),
            )
            choice_data = maybe_filter_parallel_tool_calls(choice_data, request)

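
In each serving path, `output.routed_experts` is converted with `.tolist()` before it goes into the response model. A standalone sketch of that conversion, assuming the array arrives with shape (seq_len, num_layers, topk); the NumPy array here is only a stand-in for whatever array type vLLM returns:

```python
# Standalone sketch of the .tolist() conversion used above. The NumPy array
# is an assumption standing in for the engine's routed_experts output;
# only its (seq_len, num_layers, topk) shape matters here.
import numpy as np

seq_len, num_layers, topk = 10, 2, 2
routed_experts = np.random.randint(0, 8, size=(seq_len, num_layers, topk))

nested = routed_experts.tolist()  # -> list[list[list[int]]]
assert len(nested) == seq_len
assert len(nested[0]) == num_layers
assert len(nested[0][0]) == topk
```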
8 changes: 8 additions & 0 deletions vllm/entrypoints/openai/completion/protocol.py
@@ -468,6 +468,14 @@ class CompletionResponseChoice(OpenAIBaseModel):
    token_ids: list[int] | None = None  # For response
    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
    prompt_token_ids: list[int] | None = None  # For prompt
    routed_experts: list[list[list[int]]] | None = Field(
        default=None,
        description=(
            "The routed expert indices for each generated token, with shape "
            "[seq_len, num_layers, topk]. Only present when the server is "
            "started with --enable-return-routed-experts."
        ),
    )


class CompletionResponse(OpenAIBaseModel):
5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/completion/serving.py
@@ -541,6 +541,11 @@ def request_output_to_completion_response(
                token_ids=(
                    as_list(output.token_ids) if request.return_token_ids else None
                ),
                routed_experts=(
                    output.routed_experts.tolist()
                    if output.routed_experts is not None
                    else None
                ),
            )
            choices.append(choice_data)

8 changes: 8 additions & 0 deletions vllm/entrypoints/serve/disagg/protocol.py
@@ -121,6 +121,14 @@ class GenerateResponseChoice(BaseModel):
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    token_ids: list[int] | None = None
    routed_experts: list[list[list[int]]] | None = Field(
        default=None,
        description=(
            "The routed expert indices for each generated token, with shape "
            "[seq_len, num_layers, topk]. Only present when the server is "
            "started with --enable-return-routed-experts."
        ),
    )


class GenerateResponseStreamChoice(BaseModel):
5 changes: 5 additions & 0 deletions vllm/entrypoints/serve/disagg/serving.py
@@ -234,6 +234,11 @@ async def serve_tokens_full_generator(
            logprobs=logprobs,
            finish_reason=output.finish_reason if output.finish_reason else "stop",
            token_ids=as_list(output.token_ids),
            routed_experts=(
                output.routed_experts.tolist()
                if output.routed_experts is not None
                else None
            ),
        )

        choices.append(choice_data)