[Bugfix] Fix GLM4 MoE and SeedOSS reasoning parser regressions #37044
Open

he-yufeng wants to merge 2 commits into `vllm-project:main` from `he-yufeng:fix/reasoning-parser-regressions`
+124
−2
New file (46 lines added):

```python
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import TYPE_CHECKING

from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser

if TYPE_CHECKING:
    from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
    from vllm.entrypoints.openai.responses.protocol import ResponsesRequest


class Glm4MoeReasoningParser(BaseThinkingReasoningParser):
    """
    Reasoning parser for GLM-4 MoE models.

    Unlike DeepSeek R1, GLM-4 injects <think> via the chat template rather
    than generating it. When the model output lacks </think>, the entire
    output is treated as *content* (not reasoning), because the absence of
    the end tag means the model chose not to reason.
    """

    @property
    def start_token(self) -> str:
        return "<think>"

    @property
    def end_token(self) -> str:
        return "</think>"

    def extract_reasoning(
        self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
    ) -> tuple[str | None, str | None]:
        if self.end_token not in model_output:
            # No closing tag — model didn't produce reasoning.
            # Return the full original output as content.
            return None, model_output

        # Normal case: <think>reasoning</think>content
        parts = model_output.partition(self.start_token)
        after_start = parts[2] if parts[1] else parts[0]
        reasoning, _, content = after_start.partition(self.end_token)

        # Normalize empty strings to None -- <think></think> means
        # the model chose not to reason, not that reasoning is "".
        return reasoning or None, content or None
```
There's an inconsistency between the non-streaming and streaming behavior of this parser for outputs that contain `<think>` but not `</think>`. The `extract_reasoning` method correctly implements the logic described in the docstring: if `</think>` is absent, the entire output is treated as content. For an input like `"<think>some reasoning"`, it will return `(None, "<think>some reasoning")`. However, this class inherits `extract_reasoning_streaming` from `BaseThinkingReasoningParser`. The base implementation will treat `"<think>some reasoning"` as reasoning during streaming, which contradicts this parser's stated logic for handling outputs without a closing `</think>` tag. This is the same type of inconsistency that this PR fixes for `SeedOSSReasoningParser`. To ensure consistent behavior, `Glm4MoeReasoningParser` should also override `extract_reasoning_streaming`. A potential approach is to buffer content after `<think>` and only flush it as reasoning once `</think>` is seen. If the stream ends before `</think>`, the buffer would be flushed as content.
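The buffering approach suggested in the review could be sketched as a simplified standalone generator. This is not vLLM's actual streaming interface (which works on token-ID deltas and per-request state); the function name and chunk-tagging scheme below are illustrative assumptions:

```python
from collections.abc import Iterable, Iterator


def stream_with_buffering(deltas: Iterable[str],
                          start_token: str = "<think>",
                          end_token: str = "</think>") -> Iterator[tuple[str, str]]:
    """Yield ("reasoning", text) or ("content", text) chunks.

    Text is buffered until end_token is seen; only then is the buffered
    prefix flushed as reasoning. If the stream ends first, the whole
    buffer is flushed as content, matching the non-streaming behavior.
    """
    buffer = ""
    end_seen = False
    for delta in deltas:
        if end_seen:
            # Past the closing tag: everything is content, pass through.
            yield ("content", delta)
            continue
        buffer += delta
        if end_token in buffer:
            # Closing tag arrived: the buffered prefix was real reasoning.
            head, _, tail = buffer.partition(end_token)
            reasoning = head.removeprefix(start_token)
            if reasoning:
                yield ("reasoning", reasoning)
            if tail:
                yield ("content", tail)
            end_seen = True
    if not end_seen and buffer:
        # Stream ended without end_token: flush the buffer as content,
        # including any leading start_token, like the batch path does.
        yield ("content", buffer)

print(list(stream_with_buffering(["<think>a", "b</think>", "ans"])))
print(list(stream_with_buffering(["<think>a", "bc"])))
```

The obvious tradeoff is latency: buffering defers all reasoning tokens until `</think>` arrives, so clients see reasoning in one burst instead of incrementally. A production implementation would likely buffer only a small tail (enough to detect a partial closing tag split across deltas) rather than the entire reasoning span.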