Merged
97 commits
8ce4561
feat: limit thinking tokens
llsj14 Jul 12, 2025
b815e9c
remove comment
llsj14 Jul 12, 2025
2001c36
update states only in update_state method
llsj14 Jul 14, 2025
c71cf86
make precommit and lint
llsj14 Jul 14, 2025
7ae0725
support think start/end as token sequences
llsj14 Jul 16, 2025
03d3495
refactor and change logic faster
llsj14 Jul 17, 2025
5442d0c
rename parameter and logit processor
llsj14 Jul 18, 2025
283a07a
add reasoning effort param
llsj14 Jul 18, 2025
3780d55
remove constraint of the reasoning model
llsj14 Jul 18, 2025
7a509fb
update logit processor
llsj14 Jul 19, 2025
a44e956
pass ruff
llsj14 Jul 19, 2025
0272a72
pass precommit
llsj14 Jul 19, 2025
79c7061
fix format
llsj14 Jul 19, 2025
44f2acb
fix: loads none error
llsj14 Jul 21, 2025
47da378
fix return type
llsj14 Jul 21, 2025
11ac0ef
fix error
llsj14 Jul 21, 2025
7fe7fe4
update ReasoningConfig handling
llsj14 Jul 21, 2025
336efe6
fix config and EngineArgs
llsj14 Jul 21, 2025
4b64abf
simplify reasoning config checks and fix errors
llsj14 Jul 22, 2025
ace7c4f
reafctor ThinkingTokenBudgetLogitsProcessor
llsj14 Jul 27, 2025
43dd440
fix import error from rebase
llsj14 Jul 27, 2025
9ee7f2f
fix: remove duplicate reasoning_effort field in ChatCompletionRequest
llsj14 Aug 16, 2025
117ca92
fix runtime error after rebase
llsj14 Aug 17, 2025
60a275f
check reasoning is enabled
llsj14 Aug 18, 2025
f4afba9
add test and implement processor with incremental token processing op…
llsj14 Aug 19, 2025
9371120
remove connection between reasoning_effort and thinking_token_budget
llsj14 Aug 20, 2025
4b9b87d
fix: support corner cases
llsj14 Aug 23, 2025
93afdf0
cleanup unused parameters
llsj14 Aug 23, 2025
24334b2
optimize speed up performance while apply logit processor
llsj14 Aug 23, 2025
0efea75
utilize logits processor when it is needed, not every step for speed up
llsj14 Sep 4, 2025
81362dc
refactor processor
llsj14 Sep 5, 2025
8312aa8
add comment on state
llsj14 Sep 17, 2025
3b5df9b
fix tokenizer init bug
llsj14 Sep 17, 2025
88fa857
make precommit
llsj14 Sep 17, 2025
998b19a
fix change condition of using tokenizer
llsj14 Sep 18, 2025
3fadb67
make precommit
llsj14 Oct 3, 2025
9a91759
make precommit
llsj14 Oct 3, 2025
899e4a9
fix: support zero thinking token budget
llsj14 Oct 3, 2025
86526fb
refactor: move reasoning token initialization to config level
llsj14 Oct 3, 2025
918ac00
Merge commit '17edd8a' into feat/thinking-budget
llsj14 Oct 12, 2025
18a61b9
ruff
llsj14 Oct 12, 2025
b7ae2c6
Merge commit 'd6953be' into feat/thinking-budget
llsj14 Oct 12, 2025
6b070c0
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Oct 12, 2025
93c310e
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Nov 1, 2025
de53277
make is_thinking_enabled property
llsj14 Nov 1, 2025
c215575
fix readthedocs failed
llsj14 Nov 2, 2025
219ab7b
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Dec 12, 2025
7af86e5
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Dec 18, 2025
0816eb4
Merge branch 'main' into feat/thinking-budget
llsj14 Jan 1, 2026
faa6bfb
Merge branch 'main' into feat/thinking-budget
chaunceyjiang Jan 5, 2026
2a5e6c0
Update vllm/config/reasoning.py
chaunceyjiang Jan 5, 2026
e8c020d
Update vllm/config/reasoning.py
chaunceyjiang Jan 5, 2026
b031c57
Merge branch 'main' into feat/thinking-budget
chaunceyjiang Jan 5, 2026
b600cd0
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Jan 9, 2026
fbaaf12
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Jan 11, 2026
284a398
Merge branch 'main' into feat/thinking-budget
llsj14 Feb 5, 2026
cf34815
Merge branch 'main' into feat/thinking-budget
llsj14 Feb 6, 2026
975a16a
Merge branch 'main' into feat/thinking-budget
llsj14 Feb 8, 2026
d3b06cb
Remove unused import from reasoning.py
hmellor Feb 10, 2026
6563a48
Merge branch 'main' into feat/thinking-budget
hmellor Feb 26, 2026
10f5685
Merge branch 'main' into feat/thinking-budget
llsj14 Feb 27, 2026
be1e8b6
make thinking budget logits processor working with async scheduling o…
llsj14 Feb 27, 2026
5cfa548
make precommit
llsj14 Feb 27, 2026
c035ea0
remove obsolte file
llsj14 Feb 27, 2026
a5d078c
Merge remote-tracking branch 'upstream/main' into feat/thinking-budget
llsj14 Feb 27, 2026
651635c
add docs for thinking budget control
llsj14 Feb 27, 2026
7bd0db0
fix docs
llsj14 Mar 1, 2026
5628941
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 1, 2026
12023dc
do not expose think start end token ids field
llsj14 Mar 3, 2026
f493792
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 3, 2026
4b49c07
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 3, 2026
520a3b8
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 4, 2026
064bbed
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 5, 2026
5db5920
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 5, 2026
7149465
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 6, 2026
e643d5b
make think_start/end_str are required and remove is_thinking_enabled …
llsj14 Mar 8, 2026
cceb341
fix swap part
llsj14 Mar 8, 2026
43ae6c4
fix: ensure reasoning token count exactly matches thinking_token_budget
llsj14 Mar 8, 2026
29bb069
add e2e test
llsj14 Mar 8, 2026
eee2045
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 8, 2026
56ea934
remove gpu util option from e2e test
llsj14 Mar 8, 2026
21288ec
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 9, 2026
00df8fe
make precommit
llsj14 Mar 9, 2026
0fde04f
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 10, 2026
7d7c93a
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 11, 2026
74e5448
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 11, 2026
8d5d70e
use tokenizer encode instead of convert_token_to_ids
llsj14 Mar 13, 2026
45bed67
raise ValueError when thinking_token_budget is set but reasoning_conf…
llsj14 Mar 13, 2026
8252175
make sure that think start/end token ids are derived from string
llsj14 Mar 13, 2026
4624a77
add comment about automation related to ReasoningConfig
llsj14 Mar 13, 2026
02de2da
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 13, 2026
c4f0816
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 16, 2026
a8d512f
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 17, 2026
4661874
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 18, 2026
8131a4b
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 20, 2026
66e7883
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 22, 2026
e5cd2e5
Merge branch 'main' into feat/thinking-budget
llsj14 Mar 23, 2026
75 changes: 75 additions & 0 deletions docs/features/reasoning_outputs.md
@@ -240,6 +240,81 @@ response = client.chat.completions.create(
)
```

## Thinking Budget Control

Some models, such as [Qwen3](https://qwen.readthedocs.io/en/latest/getting_started/quickstart.html#thinking-budget), [DeepSeek](https://www.alibabacloud.com/help/en/model-studio/deep-thinking), and [Nemotron3](https://huggingface.co/nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16), support a thinking budget that limits the maximum number of tokens used for reasoning.

Token counting starts from `think_start_str`. Once the reasoning token count reaches the configured `thinking_token_budget`, vLLM forces the model to produce `think_end_str`, effectively terminating the reasoning block.
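Conceptually, the enforcement resembles a logits processor: track when the think-start token appears and, once the per-request budget is spent, mask every logit except the think-end token. Below is a minimal sketch of that idea only, using hypothetical token ids and a plain Python list as the logit vector; vLLM's actual processor additionally handles multi-token start/end sequences and incremental per-request state:

```python
import math

# Hypothetical token ids for the boundary tokens; in practice the ids are
# derived from the model's tokenizer and may be multi-token sequences.
THINK_START_ID = 1000
THINK_END_ID = 1001


def enforce_thinking_budget(output_token_ids, logits, budget):
    """Once `budget` tokens have been generated after THINK_START_ID,
    mask the logits so that only THINK_END_ID can be sampled."""
    if THINK_START_ID not in output_token_ids:
        return logits  # reasoning has not started yet
    if THINK_END_ID in output_token_ids:
        return logits  # reasoning block already closed
    start = output_token_ids.index(THINK_START_ID)
    reasoning_len = len(output_token_ids) - start - 1
    if reasoning_len >= budget:
        # Budget exhausted: force the think-end token.
        masked = [-math.inf] * len(logits)
        masked[THINK_END_ID] = 0.0
        return masked
    return logits
```

In this sketch the function would be called once per decoding step with the tokens generated so far; returning a fully masked vector forces the sampler to emit the think-end token, terminating the reasoning block.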

To use this feature:

- `--reasoning-parser` enables reasoning extraction.
- `--reasoning-config` defines the reasoning boundary tokens (e.g., `think_start_str`, `think_end_str`).
- `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit.

If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`.

`--reasoning-config` accepts a JSON object corresponding to
[ReasoningConfig][vllm.config.ReasoningConfig] with the following fields:

| Field | Type | Description |
|-------------------|----------------|--------------------------------------------------|
| `think_start_str` | `str \| null` | String that marks the start of reasoning content |
| `think_end_str` | `str \| null` | String that marks the end of reasoning content |

!!! note
`think_end_str` can include a transition phrase before the think end token. For example, setting `think_end_str` to `"I have to give the solution based on the thinking directly now.</think>"` instructs the model to emit that phrase when the budget is exhausted, making the reasoning termination more natural.

### Online Serving

```bash
vllm serve Qwen/Qwen3-0.6B \
    --reasoning-parser qwen3 \
    --reasoning-config '{"think_start_str": "<think>", "think_end_str": "I have to give the solution based on the thinking directly now.</think>"}'
```

Then make a request with `thinking_token_budget` to limit the reasoning tokens. In the raw JSON body it is a top-level request field (the OpenAI Python client would pass it via `extra_body`):

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen3-0.6B",
        "messages": [
            {"role": "user", "content": "9.11 and 9.8, which is greater?"}
        ],
        "thinking_token_budget": 10
    }'
```

### Offline Inference

```python
from vllm import LLM, SamplingParams
from vllm.config import ReasoningConfig

llm = LLM(
    model="Qwen/Qwen3-0.6B",
    reasoning_config=ReasoningConfig(
        think_start_str="<think>",
        think_end_str="I have to give the solution based on the thinking directly now.</think>",
    ),
)

sampling_params = SamplingParams(thinking_token_budget=10)

messages = [
    {"role": "user", "content": "9.11 and 9.8, which is greater?"},
]

outputs = llm.chat(messages, sampling_params=sampling_params)

for output in outputs:
    print("text:", output.outputs[0].text)
```

## Limitations

- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
87 changes: 87 additions & 0 deletions tests/v1/entrypoints/openai/test_thinking_token_budget.py
@@ -0,0 +1,87 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""E2E tests for thinking_token_budget with reasoning models."""

import openai
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer

MODEL_NAME = "Qwen/Qwen3-0.6B"
MESSAGES = [{"role": "user", "content": "What is 1+1? Be concise."}]
THINK_BUDGET = 5


@pytest.fixture(scope="module")
def server():
    args = [
        "--reasoning-parser",
        "qwen3",
        "--reasoning-config",
        '{"think_start_str": "<think>", "think_end_str": "</think>"}',
        "--max-model-len",
        "2048",
        "--enforce-eager",
        "--no-async-scheduling",
llsj14 (Contributor, Author) commented:
I added an e2e test (run with `python -m pytest tests/v1/entrypoints/openai/test_thinking_token_budget.py`).

Limiting the thinking token budget works with async scheduling, but exact budget enforcement is difficult there because output token IDs are not updated in sync with each token generation step. I think this could also be addressed by @rishitdholakia13's PR (#34668), which aims to enable this feature with speculative decoding, another case where more than one token can be generated per step.

rishitdholakia13 (Contributor), Mar 8, 2026:
Yes, I have addressed the issue in the spec + thinking budget PR and added e2e tests that ensure the exact thinking budget limit is enforced in both spec and non-spec mode, using both sync and async scheduling.

    ]
    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


@pytest.mark.asyncio
async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
    """Test that mixed requests (some with thinking_token_budget, some without)
    complete successfully without errors."""

    response_with_budget = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    response_without_budget = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
    )

    msg_with = response_with_budget.choices[0].message
    msg_without = response_without_budget.choices[0].message

    assert msg_with.content or getattr(msg_with, "reasoning", None)
    assert msg_without.content or getattr(msg_without, "reasoning", None)


@pytest.mark.asyncio
async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
    """Test that thinking_token_budget limits the number of reasoning tokens.

    In streaming mode each reasoning delta corresponds to one token, so
    counting non-empty `reasoning` deltas gives the exact token count.
    """

    reasoning_token_count = 0
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=MESSAGES,
        max_tokens=100,
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    async for chunk in stream:
        delta = chunk.choices[0].delta
        if getattr(delta, "reasoning", None):
            reasoning_token_count += 1

    assert reasoning_token_count == THINK_BUDGET, (
        f"reasoning tokens ({reasoning_token_count}) != "
        f"thinking_token_budget ({THINK_BUDGET})"
    )