From c851d60ef04782df77d5f0e87bfd1d854f345706 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 12 Feb 2026 14:10:28 -0500 Subject: [PATCH 1/4] [Bugfix]: Fix structured output in multi-turn gpt-oss The logic in the gptoss_reasoning_parser to detect when the model has finished outputting reasoning content and is starting to output content to the final channel was inadvertently matching on final channel messages from previous messages for multi-turn scenarios. In practice this meant that vLLM started applying the grammar bitmasks to the entirety of the model's output in these multi-turn conversations prematurely, causing the model to deviate from its trained Harmony format and leading to empty or invalid outputs. This PR fixes things by never looking for the final channel marker in any message prior to the current one the model is generating so that we don't falsely believe the model is starting generation of the final channel unless it's actually doing so during this turn of the conversation. Prior to vLLM v0.13.0 this bug existed but we didn't actually trip over it because the way we handle multi-turn conversation state with gpt-oss models was missing important tokens that coincidentally caused those prior conversations to not actually match these token id checks. But, once we fixed multi-turn conversation state, that caused structured output usage with things like `json_object` response formats to then hit this bug in the reasoning parser. 
Fixes #32791 Signed-off-by: Ben Browning --- .../reasoning/test_gptoss_reasoning_parser.py | 19 ++++++++++++++++++- vllm/reasoning/gptoss_reasoning_parser.py | 9 +++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/reasoning/test_gptoss_reasoning_parser.py b/tests/reasoning/test_gptoss_reasoning_parser.py index 873135d5717f..6013fa642edd 100644 --- a/tests/reasoning/test_gptoss_reasoning_parser.py +++ b/tests/reasoning/test_gptoss_reasoning_parser.py @@ -17,7 +17,9 @@ def gpt_oss_tokenizer(): USER_MESSAGE_START = "<|start|>user<|message|>" REASONING_SECTION_START = "<|end|><|start|>assistant<|channel|>analysis<|message|>" -ASSISTANT_CONTENT_START_PREFIX = "<|end|><|start|>assistant<|channel|>final" +END = "<|end|>" +ASSISTANT_START = "<|start|>assistant" +ASSISTANT_CONTENT_START_PREFIX = END + ASSISTANT_START + "<|channel|>final" ASSISTANT_CONTENT_START_SUFFIX = "<|message|>" ASSISTANT_CONTENT_START = ( ASSISTANT_CONTENT_START_PREFIX + ASSISTANT_CONTENT_START_SUFFIX @@ -97,6 +99,20 @@ def gpt_oss_tokenizer(): "is_reasoning_end": True, } +MULTI_TURN_CONTENT = { + "output": USER_MESSAGE_START + + "1st turn user message" + + REASONING_SECTION_START + + "1st turn reasoning" + + ASSISTANT_CONTENT_START + + "1st turn response" + + END + + USER_MESSAGE_START + + "2nd turn user message" + + END + + ASSISTANT_START, + "is_reasoning_end": False, +} TEST_CASES = [ BASIC_CONTENT, BASIC_REASONING_ONLY, @@ -106,6 +122,7 @@ def gpt_oss_tokenizer(): COMPLEX_CONTENT_1, COMPLEX_CONTENT_1_WITH_CONTENT, COMPLEX_CONTENT_2, + MULTI_TURN_CONTENT, ] diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 186c4e5c7f98..33b0b45a7dc4 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -76,6 +76,9 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): "<|channel|>final" ) self.reasoning_end_token_ids_suffix = 
self.model_tokenizer.encode("<|message|>") + # We also need to check for the <|end|> token to avoid false positives from + # previous messages in multi-turn conversations. + self.eom_token_id = self.model_tokenizer.encode("<|end|>")[0] self.reasoning_max_num_between_tokens = 20 def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: @@ -86,6 +89,12 @@ def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: # Check if the end sequence is present in the input_ids. # We search from the end of input_ids to find the last match. for i in range(len(input_ids) - len(end_token_ids_prefix), -1, -1): + if input_ids[i] == self.eom_token_id: + # We looped backwards far enough to find the end of a previous message, + # which means we have searched the entirety of the current message + # and can exit early without searching further back into prior + # messages of the conversation. + return False if input_ids[i : i + len(end_token_ids_prefix)] == end_token_ids_prefix: # We have found the prefix, now we look for the suffix after the prefix. suffix_start = i + len(end_token_ids_prefix) From aae49f96955f5ea468641a75f02b7a869b7fb67a Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Thu, 12 Feb 2026 14:36:11 -0500 Subject: [PATCH 2/4] Be explicit about expecting gpt-oss eom to be a single token Signed-off-by: Ben Browning --- vllm/reasoning/gptoss_reasoning_parser.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 33b0b45a7dc4..ac298cded607 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -78,7 +78,13 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>") # We also need to check for the <|end|> token to avoid false positives from # previous messages in multi-turn conversations. 
- self.eom_token_id = self.model_tokenizer.encode("<|end|>")[0] + eom_token_ids = self.model_tokenizer.encode("<|end|>") + if len(eom_token_ids) != 1: + raise ValueError( + "Expected '<|end|>' to be a single token, but got " + f"{len(eom_token_ids)} tokens: {eom_token_ids}" + ) + self.eom_token_id = eom_token_ids[0] self.reasoning_max_num_between_tokens = 20 def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: From b6c9fc9de8bccf823c8719c8b06b7a8ed4a3eef9 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 13 Feb 2026 08:12:26 -0500 Subject: [PATCH 3/4] Use model_tokenizer.vocab for single token ids Instead of .encode followed by taking the first token, it's cleaner to just directly use model_tokenizer.vocab to fetch single token ids. Co-authored-by: Cyrus Leung Signed-off-by: Ben Browning --- vllm/reasoning/gptoss_reasoning_parser.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index ac298cded607..599392e36374 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -78,13 +78,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): self.reasoning_end_token_ids_suffix = self.model_tokenizer.encode("<|message|>") # We also need to check for the <|end|> token to avoid false positives from # previous messages in multi-turn conversations. 
- eom_token_ids = self.model_tokenizer.encode("<|end|>") - if len(eom_token_ids) != 1: - raise ValueError( - "Expected '<|end|>' to be a single token, but got " - f"{len(eom_token_ids)} tokens: {eom_token_ids}" - ) - self.eom_token_id = eom_token_ids[0] + self.eom_token_id = self.model_tokenizer.vocab["<|end|>"] self.reasoning_max_num_between_tokens = 20 def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: From 209c6bb2caae3dc8665e7751749bb28aed33a425 Mon Sep 17 00:00:00 2001 From: Ben Browning Date: Fri, 13 Feb 2026 09:13:04 -0500 Subject: [PATCH 4/4] Adjust mocks for other gptoss_reasoning_parser tests CI discovered some additional tests that use gptoss_reasoning_parser but with a mocked tokenizer. So, this adds a mocked `vocab` to that mock tokenizer so that these tests also pass. Signed-off-by: Ben Browning --- .../openai/test_gptoss_structural_tags_integration.py | 1 + tests/v1/structured_output/test_gptoss_structural_tags.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py index 2c481cc711dc..47f841540eba 100644 --- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py +++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py @@ -23,6 +23,7 @@ def mock_tokenizer(self): """Create a mock tokenizer.""" tokenizer = Mock() tokenizer.encode = Mock(return_value=[1, 2, 3, 4, 5]) + tokenizer.vocab = {"<|end|>": 6} return tokenizer @pytest.fixture diff --git a/tests/v1/structured_output/test_gptoss_structural_tags.py b/tests/v1/structured_output/test_gptoss_structural_tags.py index 0d49487302f4..fafa9d8ed465 100644 --- a/tests/v1/structured_output/test_gptoss_structural_tags.py +++ b/tests/v1/structured_output/test_gptoss_structural_tags.py @@ -25,6 +25,7 @@ def mock_tokenizer(self): """Create a mock tokenizer for testing.""" tokenizer = Mock() tokenizer.encode = Mock(return_value=[1, 2, 3, 
4, 5]) + tokenizer.vocab = {"<|end|>": 6} return tokenizer @pytest.fixture