diff --git a/tests/entrypoints/openai/test_return_routed_experts.py b/tests/entrypoints/openai/test_return_routed_experts.py
new file mode 100644
index 000000000000..097119f4858c
--- /dev/null
+++ b/tests/entrypoints/openai/test_return_routed_experts.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "TitanML/tiny-mixtral"
+
+# tiny-mixtral config: 8 local experts, top-2 routing, 2 hidden layers
+NUM_LOCAL_EXPERTS = 8
+NUM_EXPERTS_PER_TOK = 2
+NUM_HIDDEN_LAYERS = 2
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len",
+        "256",
+        "--max-num-seqs",
+        "32",
+        "--enforce-eager",
+        "--enable-return-routed-experts",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest.mark.asyncio
+async def test_routed_experts(server):
+    """Test that /v1/completions returns routed_experts when enabled."""
+    async with server.get_async_client() as client:
+        result = await client.completions.create(
+            model=MODEL_NAME,
+            prompt="Hello, world",
+            max_tokens=10,
+            temperature=0,
+            extra_body={"return_token_ids": True},
+        )
+
+    choice = result.model_dump()["choices"][0]
+
+    assert choice["routed_experts"] is not None
+    assert choice["token_ids"] is not None
+
+    routed_experts = choice["routed_experts"]
+    assert len(routed_experts) > 0
+    for token_experts in routed_experts:
+        assert len(token_experts) == NUM_HIDDEN_LAYERS
+        for layer_experts in token_experts:
+            assert len(layer_experts) == NUM_EXPERTS_PER_TOK
+            for expert_id in layer_experts:
+                assert 0 <= expert_id < NUM_LOCAL_EXPERTS
diff --git a/tests/entrypoints/serve/disagg/test_return_routed_experts.py b/tests/entrypoints/serve/disagg/test_return_routed_experts.py
new file mode 100644
index 000000000000..a09f90394f10
--- /dev/null
+++ b/tests/entrypoints/serve/disagg/test_return_routed_experts.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import httpx
+import pytest
+import pytest_asyncio
+
+from tests.utils import RemoteOpenAIServer
+
+MODEL_NAME = "TitanML/tiny-mixtral"
+GEN_ENDPOINT = "/inference/v1/generate"
+
+# tiny-mixtral config: 8 local experts, top-2 routing, 2 hidden layers
+NUM_LOCAL_EXPERTS = 8
+NUM_EXPERTS_PER_TOK = 2
+NUM_HIDDEN_LAYERS = 2
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--max-model-len",
+        "256",
+        "--max-num-seqs",
+        "32",
+        "--enforce-eager",
+        "--enable-return-routed-experts",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server: RemoteOpenAIServer):
+    transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
+    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
+    async with httpx.AsyncClient(
+        transport=transport,
+        base_url=server.url_root,
+        timeout=600,
+        headers=headers,
+    ) as c:
+        yield c
+
+
+@pytest.mark.asyncio
+async def test_generate_routed_experts(client):
+    """Test that /inference/v1/generate returns routed_experts when enabled."""
+    payload = {
+        "model": MODEL_NAME,
+        "token_ids": [1, 2, 3],
+        "sampling_params": {"max_tokens": 10, "temperature": 0.0},
+        "stream": False,
+    }
+    resp = await client.post(GEN_ENDPOINT, json=payload)
+    resp.raise_for_status()
+    data = resp.json()
+
+    choice = data["choices"][0]
+
+    assert choice["routed_experts"] is not None
+    assert choice["token_ids"] is not None
+
+    routed_experts = choice["routed_experts"]
+    assert len(routed_experts) > 0
+    for token_experts in routed_experts:
+        assert len(token_experts) == NUM_HIDDEN_LAYERS
+        for layer_experts in token_experts:
+            assert len(layer_experts) == NUM_EXPERTS_PER_TOK
+            for expert_id in layer_experts:
+                assert 0 <= expert_id < NUM_LOCAL_EXPERTS
diff --git a/vllm/entrypoints/openai/chat_completion/protocol.py b/vllm/entrypoints/openai/chat_completion/protocol.py
index c92cc13da01f..a0a7d0578bd0 100644
--- a/vllm/entrypoints/openai/chat_completion/protocol.py
+++ b/vllm/entrypoints/openai/chat_completion/protocol.py
@@ -92,6 +92,14 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
     # not part of the OpenAI spec but is useful for tracing the tokens
     # in agent scenarios
     token_ids: list[int] | None = None
+    routed_experts: list[list[list[int]]] | None = Field(
+        default=None,
+        description=(
+            "The routed expert indices for each generated token, with shape "
+            "[seq_len, num_layers, topk]. Only present when the server is "
+            "started with --enable-return-routed-experts."
+        ),
+    )
 
 
 class ChatCompletionResponse(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index b8ad54adb5a6..e64c89138dcb 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -1241,6 +1241,11 @@ async def chat_completion_full_generator(
                 token_ids=(
                     as_list(output.token_ids) if request.return_token_ids else None
                 ),
+                routed_experts=(
+                    output.routed_experts.tolist()
+                    if output.routed_experts is not None
+                    else None
+                ),
             )
             choices.append(choice_data)
             continue
@@ -1462,6 +1467,11 @@ async def chat_completion_full_generator(
                 token_ids=(
                     as_list(output.token_ids) if request.return_token_ids else None
                 ),
+                routed_experts=(
+                    output.routed_experts.tolist()
+                    if output.routed_experts is not None
+                    else None
+                ),
             )
 
             choice_data = maybe_filter_parallel_tool_calls(choice_data, request)
diff --git a/vllm/entrypoints/openai/completion/protocol.py b/vllm/entrypoints/openai/completion/protocol.py
index 7edea50d73aa..e6115c811169 100644
--- a/vllm/entrypoints/openai/completion/protocol.py
+++ b/vllm/entrypoints/openai/completion/protocol.py
@@ -468,6 +468,14 @@ class CompletionResponseChoice(OpenAIBaseModel):
     token_ids: list[int] | None = None  # For response
     prompt_logprobs: list[dict[int, Logprob] | None] | None = None
    prompt_token_ids: list[int] | None = None  # For prompt
+    routed_experts: list[list[list[int]]] | None = Field(
+        default=None,
+        description=(
+            "The routed expert indices for each generated token, with shape "
+            "[seq_len, num_layers, topk]. Only present when the server is "
+            "started with --enable-return-routed-experts."
+        ),
+    )
 
 
 class CompletionResponse(OpenAIBaseModel):
diff --git a/vllm/entrypoints/openai/completion/serving.py b/vllm/entrypoints/openai/completion/serving.py
index 454b170a5fa5..670cd45b73b6 100644
--- a/vllm/entrypoints/openai/completion/serving.py
+++ b/vllm/entrypoints/openai/completion/serving.py
@@ -541,6 +541,11 @@ def request_output_to_completion_response(
                 token_ids=(
                     as_list(output.token_ids) if request.return_token_ids else None
                 ),
+                routed_experts=(
+                    output.routed_experts.tolist()
+                    if output.routed_experts is not None
+                    else None
+                ),
             )
 
             choices.append(choice_data)
diff --git a/vllm/entrypoints/serve/disagg/protocol.py b/vllm/entrypoints/serve/disagg/protocol.py
index 633696824663..a3a8962447cb 100644
--- a/vllm/entrypoints/serve/disagg/protocol.py
+++ b/vllm/entrypoints/serve/disagg/protocol.py
@@ -121,6 +121,14 @@ class GenerateResponseChoice(BaseModel):
     # per OpenAI spec this is the default
     finish_reason: str | None = "stop"
     token_ids: list[int] | None = None
+    routed_experts: list[list[list[int]]] | None = Field(
+        default=None,
+        description=(
+            "The routed expert indices for each generated token, with shape "
+            "[seq_len, num_layers, topk]. Only present when the server is "
+            "started with --enable-return-routed-experts."
+        ),
+    )
 
 
 class GenerateResponseStreamChoice(BaseModel):
diff --git a/vllm/entrypoints/serve/disagg/serving.py b/vllm/entrypoints/serve/disagg/serving.py
index d510125fd034..55616c0047fd 100644
--- a/vllm/entrypoints/serve/disagg/serving.py
+++ b/vllm/entrypoints/serve/disagg/serving.py
@@ -234,6 +234,11 @@ async def serve_tokens_full_generator(
             logprobs=logprobs,
             finish_reason=output.finish_reason if output.finish_reason else "stop",
             token_ids=as_list(output.token_ids),
+            routed_experts=(
+                output.routed_experts.tolist()
+                if output.routed_experts is not None
+                else None
+            ),
         )
         choices.append(choice_data)
 
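For reference, a minimal client-side sketch of how the new field can be read from /v1/completions once a server is running with --enable-return-routed-experts. The base URL, API key, model name, and prompt are placeholders; the request mirrors the test above rather than prescribing an API beyond what this diff adds.

```python
# Sketch: read routed_experts from /v1/completions.
# Assumes a vLLM server launched with --enable-return-routed-experts;
# base_url, api_key, model, and prompt below are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

result = client.completions.create(
    model="TitanML/tiny-mixtral",
    prompt="Hello, world",
    max_tokens=10,
    temperature=0,
    extra_body={"return_token_ids": True},
)

choice = result.model_dump()["choices"][0]

# routed_experts has shape [seq_len, num_layers, topk]:
# one list of top-k expert indices per MoE layer, per generated token.
for position, per_layer in enumerate(choice["routed_experts"]):
    print(f"token {position}: {per_layer}")
```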