2 changes: 1 addition & 1 deletion docs/features/reasoning_outputs.md
@@ -249,7 +249,7 @@ Token counting starts from `reasoning_start_str`. Once the reasoning token count
 To use this feature:
 
 - `--reasoning-parser` enables reasoning extraction.
-- `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`).
+- `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`). If not set, vLLM will attempt to automatically initialize these tokens from the reasoning parser.
 - `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit.
 
 If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`.
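For illustration, a minimal client-side sketch of the flags and per-request budget described above. The `extra_body` route for passing `thinking_token_budget`, the endpoint, and the model name are assumptions; only the flag and key names come from the docs hunk:

```python
# Minimal sketch, assuming an OpenAI-compatible vLLM server started roughly as:
#   vllm serve Qwen/Qwen3-0.6B \
#     --reasoning-parser qwen3 \
#     --reasoning-config '{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}'
# (model name and boundary strings are illustrative, not from this PR)
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    # Assumption: non-standard sampling parameters such as
    # thinking_token_budget are forwarded via extra_body.
    extra_body={"thinking_token_budget": 128},
)
print(response.choices[0].message.content)
```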
@@ -24,19 +24,43 @@ def server():
         "--max-model-len",
         "2048",
         "--enforce-eager",
+        "--gpu-memory-utilization",
+        "0.4",
+        "--no-async-scheduling",
     ]
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
+@pytest.fixture(scope="module")
+def server_with_auto_reasoning_config():
+    args = [
+        "--reasoning-parser",
+        "qwen3",
+        "--max-model-len",
+        "2048",
+        "--enforce-eager",
+        "--gpu-memory-utilization",
+        "0.4",
+        "--no-async-scheduling",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(request, server, server_with_auto_reasoning_config):
+    server_map = {
+        "default": server,
+        "auto_config": server_with_auto_reasoning_config,
+    }
+    target_server = server_map[request.param]
+    async with target_server.get_async_client() as async_client:
         yield async_client
 
 
 @pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
 async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
     """Test that mixed requests (some with thinking_token_budget, some without)
     complete successfully without errors."""
@@ -61,6 +85,7 @@ async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
 async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
     """Test that thinking_token_budget limits the number of reasoning tokens.
 
@@ -82,6 +107,6 @@ async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
             reasoning_token_count += 1
 
     assert reasoning_token_count == THINK_BUDGET, (
-        f"reasoning tokens ({reasoning_token_count}) != "
+        f"reasoning tokens ({reasoning_token_count}) did not match "
         f"thinking_token_budget ({THINK_BUDGET})"
     )
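Since the hunk above shows only the tail of the counting loop, here is a self-contained sketch of the shape it implies. This is a sketch, not the test's actual code: it assumes streamed deltas expose a `reasoning_content` field while a reasoning parser is enabled and that one delta corresponds to one reasoning token; the model name and budget value are illustrative:

```python
import asyncio

import openai

MODEL_NAME = "Qwen/Qwen3-0.6B"  # assumed model for illustration
THINK_BUDGET = 10  # assumed per-request reasoning token limit


async def count_reasoning_tokens() -> int:
    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1", api_key="EMPTY"
    )
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Why is the sky blue?"}],
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    reasoning_token_count = 0
    async for chunk in stream:
        if not chunk.choices:
            continue  # e.g. a trailing usage-only chunk
        delta = chunk.choices[0].delta
        # Assumption: reasoning deltas carry reasoning_content,
        # regular content deltas do not.
        if getattr(delta, "reasoning_content", None):
            reasoning_token_count += 1
    return reasoning_token_count


print(asyncio.run(count_reasoning_tokens()))
```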
17 changes: 17 additions & 0 deletions tests/kernels/attention/test_attention_selector.py
@@ -368,6 +368,23 @@ def test_auto_backend_selection_behavior():
     assert backend_auto.get_name() == backend_none.get_name()
 
 
+def test_flash_attn_rejects_int4_kv_cache(monkeypatch: pytest.MonkeyPatch):
+    try:
+        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
+    except ImportError:
+        pytest.skip("vllm_flash_attn extension is not available in this env")
+
+    monkeypatch.setattr(
+        "vllm.v1.attention.backends.flash_attn.flash_attn_supports_fp8",
+        lambda: True,
+    )
+
+    assert FlashAttentionBackend.supports_kv_cache_dtype("fp8")
+    assert not FlashAttentionBackend.supports_kv_cache_dtype(
+        "int4_per_token_head"
+    )
+
+
 @pytest.mark.parametrize(
     "backend_name,flash_attn_version,should_succeed",
     [