2 changes: 1 addition & 1 deletion docs/features/reasoning_outputs.md
@@ -249,7 +249,7 @@ Token counting starts from `reasoning_start_str`. Once the reasoning token count
 To use this feature:
 
 - `--reasoning-parser` enables reasoning extraction.
-- `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`).
+- `--reasoning-config` defines the reasoning boundary tokens (e.g., `reasoning_start_str`, `reasoning_end_str`). If not set, vLLM will attempt to automatically initialize these tokens from the reasoning parser.
 - `thinking_token_budget` (a sampling parameter) sets the per-request reasoning token limit.
 
 If `thinking_token_budget` is not specified, no explicit reasoning limit is applied beyond normal generation constraints such as `max_tokens`.
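For illustration, a minimal client-side sketch of the flags and per-request budget described above. The `extra_body` route for passing `thinking_token_budget`, the endpoint, and the model name are assumptions; only the flag and key names come from the docs hunk:

```python
# Minimal sketch, assuming an OpenAI-compatible vLLM server started roughly as:
#   vllm serve Qwen/Qwen3-0.6B \
#     --reasoning-parser qwen3 \
#     --reasoning-config '{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}'
# (model name and boundary strings are illustrative, not from this PR)
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[{"role": "user", "content": "What is 17 * 24?"}],
    # Assumption: non-standard sampling parameters such as
    # thinking_token_budget are forwarded via extra_body.
    extra_body={"thinking_token_budget": 128},
)
print(response.choices[0].message.content)
```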
@@ -24,19 +24,43 @@ def server():
         "--max-model-len",
         "2048",
         "--enforce-eager",
+        "--gpu-memory-utilization",
+        "0.4",
+        "--no-async-scheduling",
     ]
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
         yield remote_server
 
 
+@pytest.fixture(scope="module")
+def server_with_auto_reasoning_config():
+    args = [
+        "--reasoning-parser",
+        "qwen3",
+        "--max-model-len",
+        "2048",
+        "--enforce-eager",
+        "--gpu-memory-utilization",
+        "0.4",
+        "--no-async-scheduling",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
 @pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
+async def client(request, server, server_with_auto_reasoning_config):
+    server_map = {
+        "default": server,
+        "auto_config": server_with_auto_reasoning_config,
+    }
+    target_server = server_map[request.param]
+    async with target_server.get_async_client() as async_client:
         yield async_client
 
 
 @pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
 async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
     """Test that mixed requests (some with thinking_token_budget, some without)
     complete successfully without errors."""
@@ -61,6 +85,7 @@ async def test_thinking_token_budget_mixed_requests(client: openai.AsyncOpenAI):
 
 
 @pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
 async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
     """Test that thinking_token_budget limits the number of reasoning tokens.
 
@@ -82,6 +107,6 @@ async def test_thinking_token_budget_limits_reasoning(client: openai.AsyncOpenAI):
             reasoning_token_count += 1
 
     assert reasoning_token_count == THINK_BUDGET, (
-        f"reasoning tokens ({reasoning_token_count}) != "
+        f"reasoning tokens ({reasoning_token_count}) did not match "
         f"thinking_token_budget ({THINK_BUDGET})"
     )
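Since the hunk above shows only the tail of the counting loop, here is a self-contained sketch of the shape it implies. This is a sketch, not the test's actual code: it assumes streamed deltas expose a `reasoning_content` field while a reasoning parser is enabled and that one delta corresponds to one reasoning token; the model name and budget value are illustrative:

```python
import asyncio

import openai

MODEL_NAME = "Qwen/Qwen3-0.6B"  # assumed model for illustration
THINK_BUDGET = 10  # assumed per-request reasoning token limit


async def count_reasoning_tokens() -> int:
    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1", api_key="EMPTY"
    )
    stream = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": "Why is the sky blue?"}],
        stream=True,
        extra_body={"thinking_token_budget": THINK_BUDGET},
    )
    reasoning_token_count = 0
    async for chunk in stream:
        if not chunk.choices:
            continue  # e.g. a trailing usage-only chunk
        delta = chunk.choices[0].delta
        # Assumption: reasoning deltas carry reasoning_content,
        # regular content deltas do not.
        if getattr(delta, "reasoning_content", None):
            reasoning_token_count += 1
    return reasoning_token_count


print(asyncio.run(count_reasoning_tokens()))
```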
17 changes: 17 additions & 0 deletions tests/kernels/attention/test_attention_selector.py
@@ -368,6 +368,23 @@ def test_auto_backend_selection_behavior():
     assert backend_auto.get_name() == backend_none.get_name()
 
 
+def test_flash_attn_rejects_int4_kv_cache(monkeypatch: pytest.MonkeyPatch):
+    try:
+        from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
+    except ImportError:
+        pytest.skip("vllm_flash_attn extension is not available in this env")
+
+    monkeypatch.setattr(
+        "vllm.v1.attention.backends.flash_attn.flash_attn_supports_fp8",
+        lambda: True,
+    )
+
+    assert FlashAttentionBackend.supports_kv_cache_dtype("fp8")
+    assert not FlashAttentionBackend.supports_kv_cache_dtype(
+        "int4_per_token_head"
+    )
+
+
 @pytest.mark.parametrize(
     "backend_name,flash_attn_version,should_succeed",
     [