Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions tests/entrypoints/openai/test_chat_with_tool_reasoning.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,3 +139,32 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
assert len(tool_calls.choices[0].message.reasoning_content) > 0
assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS

@pytest.mark.asyncio
async def test_stop_str_with_reasoning(client: openai.AsyncOpenAI):
    """Verify that a ``stop`` string truncates visible content, not reasoning.

    First request: with ``stop="9.8"`` the stop string may appear in
    ``reasoning_content`` (stop strings are not applied to the reasoning
    channel) but must be absent from ``content``, which is cut off before
    the stop string is emitted.

    Second request: without ``stop`` the string "9.8" appears in both
    fields, confirming the first request's truncation was caused by the
    stop string rather than the model simply never producing it.
    """
    # check that the response is correctly stopped at "9.8"
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{
            "role": "user",
            "content": "9.11 and 9.8, which is greater?"
        }],
        temperature=1.0,
        stop="9.8",
    )

    # Stop string must not truncate the reasoning channel...
    assert "9.8" in response.choices[0].message.reasoning_content
    # ...but must truncate the content channel before the stop string.
    assert "9.8" not in response.choices[0].message.content

    # check no stop string
    response = await client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{
            "role": "user",
            "content": "9.11 and 9.8, which is greater?"
        }],
        temperature=1.0,
    )
    assert "9.8" in response.choices[0].message.reasoning_content
    # check that the response is not stopped at "9.8"
    assert "9.8" in response.choices[0].message.content
36 changes: 20 additions & 16 deletions tests/tokenization/test_detokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pytest
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast

from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer
from vllm.v1.engine import EngineCoreRequest
Expand Down Expand Up @@ -60,25 +61,28 @@ def _run_incremental_decode(
skip_special_tokens=skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
request = EngineCoreRequest(
request_id="",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None,
)

request = EngineCoreRequest(request_id="",
prompt_token_ids=prompt_token_ids,
mm_features=None,
sampling_params=params,
pooling_params=None,
eos_token_id=None,
arrival_time=0.0,
lora_request=None,
cache_salt=None,
data_parallel_rank=None)
vllm_config = VllmConfig()
if fast is None:
detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
detokenizer = IncrementalDetokenizer.from_new_request(
vllm_config=vllm_config, tokenizer=tokenizer, request=request)
elif fast:
detokenizer = FastIncrementalDetokenizer(tokenizer, request)
detokenizer = FastIncrementalDetokenizer(vllm_config=vllm_config,
tokenizer=tokenizer,
request=request)
else:
detokenizer = SlowIncrementalDetokenizer(tokenizer, request)
detokenizer = SlowIncrementalDetokenizer(vllm_config=vllm_config,
tokenizer=tokenizer,
request=request)

output_text = ""
for i, token_id in enumerate(all_input_ids[starting_index:]):
Expand Down
6 changes: 4 additions & 2 deletions tests/v1/engine/test_fast_incdec_prefix_err.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from transformers import AutoTokenizer

from vllm.config import VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import EngineCoreRequest
from vllm.v1.engine.detokenizer import IncrementalDetokenizer
Expand All @@ -21,7 +22,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
https://gist.github.com/fpaupier/0ed1375bd7633c5be6c894b1c7ac1be3.
"""
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")

vllm_config = VllmConfig()
# Create a test request
prompt_token_ids = [107, 4606, 236787, 107]
params = SamplingParams(skip_special_tokens=True)
Expand All @@ -38,7 +39,8 @@ def test_fast_inc_detok_invalid_utf8_err_case():
data_parallel_rank=None,
)

detokenizer = IncrementalDetokenizer.from_new_request(tokenizer, request)
detokenizer = IncrementalDetokenizer.from_new_request(
vllm_config, tokenizer, request)

assert detokenizer.__class__.__name__ == "FastIncrementalDetokenizer", (
"Should use FastIncrementalDetokenizer by default"
Expand Down
75 changes: 47 additions & 28 deletions tests/v1/engine/test_output_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
MockEngineCore,
)
from vllm import PoolingParams
from vllm.config import VllmConfig, StructuredOutputsConfig
from vllm.config import DecodingConfig, VllmConfig
from vllm.logprobs import PromptLogprobs, SampleLogprobs
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import RequestOutputKind, SamplingParams
Expand All @@ -41,13 +43,17 @@ def _ref_convert_id_to_token(


@pytest.mark.parametrize(
"request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
def test_incremental_detokenization(
request_output_kind: RequestOutputKind, dummy_test_vectors
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
engine_core = MockEngineCore(tokens_list=dummy_test_vectors.generation_tokens)
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
def test_incremental_detokenization(request_output_kind: RequestOutputKind,
dummy_test_vectors):
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens)

# Make N requests.
requests = [
Expand Down Expand Up @@ -407,17 +413,21 @@ def _validate_logprobs(


@pytest.mark.parametrize(
"request_output_kind", [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]
)
@pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
@pytest.mark.parametrize("num_prompt_logprobs", [None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_logprobs_processor(
request_output_kind: RequestOutputKind,
num_sample_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
"request_output_kind",
[RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY])
@pytest.mark.parametrize("num_sample_logprobs",
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
@pytest.mark.parametrize("num_prompt_logprobs",
[None, NUM_PROMPT_LOGPROBS_UNDER_TEST])
def test_logprobs_processor(request_output_kind: RequestOutputKind,
num_sample_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
dummy_test_vectors):
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=None
Expand Down Expand Up @@ -588,8 +598,11 @@ def test_stop_token(
dummy_test_vectors.tokenizer.eos_token_id if is_eos_test else None
) # '<|end_of_text|>'
stop_token_ids = [128009] if not is_eos_test else None # '<|eot_id|>'

output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
# Dummy engine core outputs, with control tokens suffixed to test stops
suffix_token = [eos_token_id] if is_eos_test else stop_token_ids
assert suffix_token is not None and isinstance(suffix_token[0], int)
Expand Down Expand Up @@ -693,13 +706,15 @@ def test_stop_token(


@pytest.mark.parametrize("include_stop_str_in_output", [True, False])
@pytest.mark.parametrize("num_sample_logprobs", [None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
def test_stop_string(
include_stop_str_in_output: bool,
num_sample_logprobs: Optional[int],
dummy_test_vectors,
):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=False)
@pytest.mark.parametrize("num_sample_logprobs",
[None, NUM_SAMPLE_LOGPROBS_UNDER_TEST])
def test_stop_string(include_stop_str_in_output: bool,
num_sample_logprobs: Optional[int], dummy_test_vectors):
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=False)
engine_core = MockEngineCore(
tokens_list=dummy_test_vectors.generation_tokens,
generated_logprobs_raw=dummy_test_vectors.generation_logprobs
Expand Down Expand Up @@ -827,7 +842,11 @@ def test_stop_string(


def test_iteration_stats(dummy_test_vectors):
output_processor = OutputProcessor(dummy_test_vectors.tokenizer, log_stats=True)
vllm_config = VllmConfig(
structured_outputs_config=StructuredOutputsConfig())
output_processor = OutputProcessor(vllm_config=vllm_config,
tokenizer=dummy_test_vectors.tokenizer,
log_stats=True)
engine_core = MockEngineCore(dummy_test_vectors.generation_tokens)
engine_core_timestamp = time.monotonic()

Expand Down
6 changes: 3 additions & 3 deletions vllm/v1/engine/async_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@ def __init__(
)

# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
self.output_processor = OutputProcessor(
self.tokenizer, log_stats=self.log_stats
)
self.output_processor = OutputProcessor(self.vllm_config,
self.tokenizer,
log_stats=self.log_stats)
if self.observability_config.otlp_traces_endpoint is not None:
tracer = init_tracer(
"vllm.llm_engine", self.observability_config.otlp_traces_endpoint
Expand Down
Loading
Loading