From c851d60ef04782df77d5f0e87bfd1d854f345706 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 12 Feb 2026 14:10:28 -0500 Subject: [PATCH 1/4] [Bugfix]: Fix structured output in multi-turn gpt-oss The logic in the gptoss_reasoning_parser to detect when the model has finished outputting reasoning content and is starting to output content to the final channel was inadvertently matching on final channel messages from previous messages for multi-turn scenarios. In practice this meant that vLLM started applying the grammar bitmasks to the entirety of the model's output in these multi-turn conversations prematurely, causing the model to deviate from its trained Harmony format and leading to empty or invalid outputs. This PR fixes things by never looking for the final channel marker in any message prior to the current one the model is generating so that we don't falsely believe the model is starting generation of the final channel unless it's actually doing so during this turn of the conversation. Prior to vLLM v0.13.0 this bug existed but we didn't actually trip over it because the way we handle multi-turn conversation state with gpt-oss models was missing important tokens that coincidentally caused those prior conversations to not actually match these token id checks. But, once we fixed multi-turn conversation state, that caused structured output usage with things like `json_object` response formats to then hit this bug in the reasoning parser. 
Fixes #32791 Signed-off-by: Ben Browning --- .../reasoning/test_gptoss_reasoning_parser.py | 19 ++++++++++++++++++- vllm/reasoning/gptoss_reasoning_parser.py | 9 +++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py b/tests/reasoning/test_gptoss_reasoning_parser.py index 873135d5717f..6013fa642edd 100644 --- a/tests/reasoning/test_gptoss_reasoning_parser.py +++ b/tests/reasoning/test_gptoss_reasoning_parser.py @@ -17,7 +17,9 @@ def gpt_oss_tokenizer(): USER_MESSAGE_START = "<|start|>user<|message|>" REASONING_SECTION_START = "<|end|><|start|>assistant<|channel|>analysis<|message|>" -ASSISTANT_CONTENT_START_PREFIX = "<|end|><|start|>assistant<|channel|>final" +END = "<|end|>" +ASSISTANT_START = "<|start|>assistant" +ASSISTANT_CONTENT_START_PREFIX = END + ASSISTANT_START + "<|channel|>final" ASSISTANT_CONTENT_START_SUFFIX = "<|message|>" ASSISTANT_CONTENT_START = ( ASSISTANT_CONTENT_START_PREFIX + ASSISTANT_CONTENT_START_SUFFIX @@ -97,6 +99,20 @@ def gpt_oss_tokenizer(): "is_reasoning_end": True, } +MULTI_TURN_CONTENT = { + "output": USER_MESSAGE_START + + "1st turn user message" + + REASONING_SECTION_START + + "1st turn reasoning" + + ASSISTANT_CONTENT_START + + "1st turn response" + + END + + USER_MESSAGE_START + + "2nd turn user message" + + END + + ASSISTANT_START, + "is_reasoning_end": False, +} TEST_CASES = [ BASIC_CONTENT, BASIC_REASONING_ONLY, @@ -106,6 +122,7 @@ def gpt_oss_tokenizer(): COMPLEX_CONTENT_1, COMPLEX_CONTENT_1_WITH_CONTENT, COMPLEX_CONTENT_2, + MULTI_TURN_CONTENT, ] diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 186c4e5c7f98..33b0b45a7dc4 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -76,6 +76,9 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): "<|channel|>final" ) self.reasoning_end_token_ids_suffix = 
self.model_tokenizer.encode("<|message|>") + # We also need to check for the <|end|> token to avoid false positives from + # previous messages in multi-turn conversations. + self.eom_token_id = self.model_tokenizer.encode("<|end|>")[0] self.reasoning_max_num_between_tokens = 20 def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: @@ -86,6 +89,12 @@ def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: # Check if the end sequence is present in the input_ids. # We search from the end of input_ids to find the last match. for i in range(len(input_ids) - len(end_token_ids_prefix), -1, -1): + if input_ids[i] == self.eom_token_id: + # We looped backwards far enough to find the end of a previous message, + # which means we have searched the entirety of the current message + # and can exit early without searching further back into prior + # messages of the conversation. + return False if input_ids[i : i + len(end_token_ids_prefix)] == end_token_ids_prefix: # We have found the prefix, now we look for the suffix after the prefix. suffix_start = i + len(end_token_ids_prefix) From aae49f96955f5ea468641a75f02b7a869b7fb67a Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 12 Feb 2026 14:36:11 -0500 Subject: [PATCH 2/4] Be explicit about expecting gpt-oss eom to be a single token Signed-off-by: Ben Browning --- vllm/reasoning/gptoss_reasoning_parser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 33b0b45a7dc4..ac298cded607 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -78,7 +78,13 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>") # We also need to check for the <|end|> token to avoid false positives from # previous messages in multi-turn conversations. 
- self.eom_token_id = self.model_tokenizer.encode("<|end|>")[0] + eom_token_ids = self.model_tokenizer.encode("<|end|>") + if len(eom_token_ids) != 1: + raise ValueError( + "Expected '<|end|>' to be a single token, but got " + f"{len(eom_token_ids)} tokens: {eom_token_ids}" + ) + self.eom_token_id = eom_token_ids[0] self.reasoning_max_num_between_tokens = 20 def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: From b6c9fc9de8bccf823c8719c8b06b7a8ed4a3eef9 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 13 Feb 2026 08:12:26 -0500 Subject: [PATCH 3/4] Use model_tokenizer.vocab for single token ids Instead of .encode followed by taking the first token, it's cleaner to just directly use model_tokenizer.vocab to fetch single token ids. Co-authored-by: Cyrus Leung Signed-off-by: Ben Browning --- vllm/reasoning/gptoss_reasoning_parser.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index ac298cded607..599392e36374 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -78,13 +78,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>") # We also need to check for the <|end|> token to avoid false positives from # previous messages in multi-turn conversations. 
- eom_token_ids = self.model_tokenizer.encode("<|end|>") - if len(eom_token_ids) != 1: - raise ValueError( - "Expected '<|end|>' to be a single token, but got " - f"{len(eom_token_ids)} tokens: {eom_token_ids}" - ) - self.eom_token_id = eom_token_ids[0] + self.eom_token_id = self.model_tokenizer.vocab["<|end|>"] self.reasoning_max_num_between_tokens = 20 def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: From 209c6bb2caae3dc8665e7751749bb28aed33a425 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 13 Feb 2026 09:13:04 -0500 Subject: [PATCH 4/4] Adjust mocks for other gptoss_reasoning_parser tests CI discovered some additional tests that use gptoss_reasoning_parser but with a mocked tokenizer. So, this adds a mocked `vocab` to that mock tokenizer so that these tests also pass. Signed-off-by: Ben Browning --- .../openai/test_gptoss_structural_tags_integration.py | 1 + tests/v1/structured_output/test_gptoss_structural_tags.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py index 2c481cc711dc..47f841540eba 100644 --- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py +++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py @@ -23,6 +23,7 @@ def mock_tokenizer(self): """Create a mock tokenizer.""" tokenizer = Mock() tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5]) + tokenizer.vocab = {"<|end|>": 6} return tokenizer @pytest.fixture diff --git a/tests/v1/structured_output/test_gptoss_structural_tags.py b/tests/v1/structured_output/test_gptoss_structural_tags.py index 0d49487302f4..fafa9d8ed465 100644 --- a/tests/v1/structured_output/test_gptoss_structural_tags.py +++ b/tests/v1/structured_output/test_gptoss_structural_tags.py @@ -25,6 +25,7 @@ def mock_tokenizer(self): """Create a mock tokenizer for testing.""" tokenizer = Mock() tokenizer.encode = Mock(return_value=[1, 2, 3, 
4, 5]) + tokenizer.vocab = {"<|end|>": 6} return tokenizer @pytest.fixture