54 changes: 54 additions & 0 deletions tests/entrypoints/openai/test_return_routed_experts.py
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from ...utils import RemoteOpenAIServer

MODEL_NAME = "TitanML/tiny-mixtral"

# tiny-mixtral config: 8 local experts, top-2 routing, 2 hidden layers
NUM_LOCAL_EXPERTS = 8
NUM_EXPERTS_PER_TOK = 2
NUM_HIDDEN_LAYERS = 2


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len",
        "256",
        "--max-num-seqs",
        "32",
        "--enforce-eager",
        "--enable-return-routed-experts",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest.mark.asyncio
async def test_routed_experts(server):
    """Test that /v1/completions returns routed_experts when enabled."""
    async with server.get_async_client() as client:
        result = await client.completions.create(
            model=MODEL_NAME,
            prompt="Hello, world",
            max_tokens=10,
            temperature=0,
            extra_body={"return_token_ids": True},
        )

    choice = result.model_dump()["choices"][0]

    assert choice["routed_experts"] is not None
    assert choice["token_ids"] is not None

    routed_experts = choice["routed_experts"]
    assert len(routed_experts) > 0
    for token_experts in routed_experts:
        assert len(token_experts) == NUM_HIDDEN_LAYERS
        for layer_experts in token_experts:
            assert len(layer_experts) == NUM_EXPERTS_PER_TOK
            for expert_id in layer_experts:
                assert 0 <= expert_id < NUM_LOCAL_EXPERTS
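
For orientation, the nested structure these assertions expect has one entry per generated token, then one per hidden layer, then the top-k expert indices for that layer. A minimal illustrative value for tiny-mixtral (3 tokens; expert indices made up):

```python
# Illustrative only: a routed_experts value that satisfies the assertions
# above for tiny-mixtral (2 hidden layers, top-2 routing, 8 local experts).
# Shape is [seq_len][num_layers][topk]; real expert indices will differ.
example_routed_experts = [
    [[3, 7], [0, 5]],  # token 0: top-2 experts in layer 0, then layer 1
    [[3, 1], [0, 2]],  # token 1
    [[6, 7], [4, 5]],  # token 2
]
for token_experts in example_routed_experts:
    assert len(token_experts) == 2  # NUM_HIDDEN_LAYERS
    for layer_experts in token_experts:
        assert len(layer_experts) == 2  # NUM_EXPERTS_PER_TOK
        assert all(0 <= e < 8 for e in layer_experts)  # NUM_LOCAL_EXPERTS
```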
71 changes: 71 additions & 0 deletions tests/entrypoints/serve/disagg/test_return_routed_experts.py
@@ -0,0 +1,71 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import httpx
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer

MODEL_NAME = "TitanML/tiny-mixtral"
GEN_ENDPOINT = "/inference/v1/generate"

# tiny-mixtral config: 8 local experts, top-2 routing, 2 hidden layers
NUM_LOCAL_EXPERTS = 8
NUM_EXPERTS_PER_TOK = 2
NUM_HIDDEN_LAYERS = 2


@pytest.fixture(scope="module")
def server():
    args = [
        "--max-model-len",
        "256",
        "--max-num-seqs",
        "32",
        "--enforce-eager",
        "--enable-return-routed-experts",
    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server: RemoteOpenAIServer):
    transport = httpx.AsyncHTTPTransport(uds=server.uds) if server.uds else None
    headers = {"Authorization": f"Bearer {server.DUMMY_API_KEY}"}
    async with httpx.AsyncClient(
        transport=transport,
        base_url=server.url_root,
        timeout=600,
        headers=headers,
    ) as c:
        yield c


@pytest.mark.asyncio
async def test_generate_routed_experts(client):
    """Test that /inference/v1/generate returns routed_experts when enabled."""
    payload = {
        "model": MODEL_NAME,
        "token_ids": [1, 2, 3],
        "sampling_params": {"max_tokens": 10, "temperature": 0.0},
        "stream": False,
    }
    resp = await client.post(GEN_ENDPOINT, json=payload)
    resp.raise_for_status()
    data = resp.json()

    choice = data["choices"][0]

    assert choice["routed_experts"] is not None
    assert choice["token_ids"] is not None

    routed_experts = choice["routed_experts"]
    assert len(routed_experts) > 0
    for token_experts in routed_experts:
        assert len(token_experts) == NUM_HIDDEN_LAYERS
        for layer_experts in token_experts:
            assert len(layer_experts) == NUM_EXPERTS_PER_TOK
            for expert_id in layer_experts:
                assert 0 <= expert_id < NUM_LOCAL_EXPERTS
8 changes: 8 additions & 0 deletions vllm/entrypoints/openai/chat_completion/protocol.py
@@ -92,6 +92,14 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
    # not part of the OpenAI spec but is useful for tracing the tokens
    # in agent scenarios
    token_ids: list[int] | None = None
    routed_experts: list[list[list[int]]] | None = Field(
        default=None,
        description=(
            "The routed expert indices for each generated token, with shape "
            "[seq_len, num_layers, topk]. Only present when the server is "
            "started with --enable-return-routed-experts."
        ),
    )


class ChatCompletionResponse(OpenAIBaseModel):
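
The same field is populated for /v1/chat/completions via the serving change below, although the tests in this PR only cover /v1/completions and /inference/v1/generate. A minimal client-side sketch, assuming a locally running server started with --enable-return-routed-experts; the base URL, API key, and prompt are placeholders:

```python
# Hedged sketch (not part of this diff): reading routed_experts from
# /v1/chat/completions with the OpenAI Python client. Assumes the server
# was launched with --enable-return-routed-experts; URL/key are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="TitanML/tiny-mixtral",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=10,
)
choice = resp.model_dump()["choices"][0]
# routed_experts is a [seq_len][num_layers][topk] nested list, or None
# when the server flag is not set.
print(choice.get("routed_experts"))
```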
10 changes: 10 additions & 0 deletions vllm/entrypoints/openai/chat_completion/serving.py
@@ -1241,6 +1241,11 @@ async def chat_completion_full_generator(
                token_ids=(
                    as_list(output.token_ids) if request.return_token_ids else None
                ),
                routed_experts=(
                    output.routed_experts.tolist()
                    if output.routed_experts is not None
                    else None
                ),
            )
            choices.append(choice_data)
            continue
@@ -1462,6 +1467,11 @@ async def chat_completion_full_generator(
                token_ids=(
                    as_list(output.token_ids) if request.return_token_ids else None
                ),
                routed_experts=(
                    output.routed_experts.tolist()
                    if output.routed_experts is not None
                    else None
                ),
            )
            choice_data = maybe_filter_parallel_tool_calls(choice_data, request)

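
In each serving path, `output.routed_experts` is converted with `.tolist()` before it goes into the response model. A standalone sketch of that conversion, assuming the array arrives with shape (seq_len, num_layers, topk); the NumPy array here is only a stand-in for whatever array type vLLM returns:

```python
# Standalone sketch of the .tolist() conversion used above. The NumPy array
# is an assumption standing in for the engine's routed_experts output;
# only its (seq_len, num_layers, topk) shape matters here.
import numpy as np

seq_len, num_layers, topk = 10, 2, 2
routed_experts = np.random.randint(0, 8, size=(seq_len, num_layers, topk))

nested = routed_experts.tolist()  # -> list[list[list[int]]]
assert len(nested) == seq_len
assert len(nested[0]) == num_layers
assert len(nested[0][0]) == topk
```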
8 changes: 8 additions & 0 deletions vllm/entrypoints/openai/completion/protocol.py
@@ -468,6 +468,14 @@ class CompletionResponseChoice(OpenAIBaseModel):
    token_ids: list[int] | None = None  # For response
    prompt_logprobs: list[dict[int, Logprob] | None] | None = None
    prompt_token_ids: list[int] | None = None  # For prompt
    routed_experts: list[list[list[int]]] | None = Field(
        default=None,
        description=(
            "The routed expert indices for each generated token, with shape "
            "[seq_len, num_layers, topk]. Only present when the server is "
            "started with --enable-return-routed-experts."
        ),
    )


class CompletionResponse(OpenAIBaseModel):
5 changes: 5 additions & 0 deletions vllm/entrypoints/openai/completion/serving.py
@@ -541,6 +541,11 @@ def request_output_to_completion_response(
                token_ids=(
                    as_list(output.token_ids) if request.return_token_ids else None
                ),
                routed_experts=(
                    output.routed_experts.tolist()
                    if output.routed_experts is not None
                    else None
                ),
            )
            choices.append(choice_data)

8 changes: 8 additions & 0 deletions vllm/entrypoints/serve/disagg/protocol.py
@@ -121,6 +121,14 @@ class GenerateResponseChoice(BaseModel):
    # per OpenAI spec this is the default
    finish_reason: str | None = "stop"
    token_ids: list[int] | None = None
    routed_experts: list[list[list[int]]] | None = Field(
        default=None,
        description=(
            "The routed expert indices for each generated token, with shape "
            "[seq_len, num_layers, topk]. Only present when the server is "
            "started with --enable-return-routed-experts."
        ),
    )


class GenerateResponseStreamChoice(BaseModel):
5 changes: 5 additions & 0 deletions vllm/entrypoints/serve/disagg/serving.py
@@ -234,6 +234,11 @@ async def serve_tokens_full_generator(
            logprobs=logprobs,
            finish_reason=output.finish_reason if output.finish_reason else "stop",
            token_ids=as_list(output.token_ids),
            routed_experts=(
                output.routed_experts.tolist()
                if output.routed_experts is not None
                else None
            ),
        )

        choices.append(choice_data)