Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions tests/reasoning/test_glm4_moe_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,20 @@ def glm45_tokenizer():
"content": "This is the rest\nThat",
"is_reasoning_end": True,
}
# "<think></think>" followed by text: an empty think block yields no
# reasoning — the parser normalizes "" to None — and the trailing text
# is returned as content.
EMPTY_THINK_BLOCK = {
    "output": "<think></think>This is the rest",
    "reasoning": None,
    "content": "This is the rest",
    "is_reasoning_end": True,
}

# "<think></think>" alone: both the reasoning and the content segments
# are empty, so both normalize to None.
EMPTY_THINK_BLOCK_NO_CONTENT = {
    "output": "<think></think>",
    "reasoning": None,
    "content": None,
    "is_reasoning_end": True,
}

ONLY_OPEN_TAG = {
"output": "<think>This is a reasoning section",
"reasoning": None,
Expand Down Expand Up @@ -114,6 +128,26 @@ def glm45_tokenizer():
MULTILINE_REASONING,
id="multiline_reasoning_stream",
),
pytest.param(
False,
EMPTY_THINK_BLOCK,
id="empty_think_block",
),
pytest.param(
True,
EMPTY_THINK_BLOCK,
id="empty_think_block_stream",
),
pytest.param(
False,
EMPTY_THINK_BLOCK_NO_CONTENT,
id="empty_think_block_no_content",
),
pytest.param(
True,
EMPTY_THINK_BLOCK_NO_CONTENT,
id="empty_think_block_no_content_stream",
),
pytest.param(
False,
ONLY_OPEN_TAG,
Expand Down
4 changes: 2 additions & 2 deletions vllm/reasoning/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@
"Ernie45ReasoningParser",
),
"glm45": (
"deepseek_v3_reasoning_parser",
"DeepSeekV3ReasoningWithThinkingParser",
"glm4_moe_reasoning_parser",
"Glm4MoeReasoningParser",
),
"openai_gptoss": (
"gptoss_reasoning_parser",
Expand Down
46 changes: 46 additions & 0 deletions vllm/reasoning/glm4_moe_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import TYPE_CHECKING

from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser

if TYPE_CHECKING:
from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.responses.protocol import ResponsesRequest


class Glm4MoeReasoningParser(BaseThinkingReasoningParser):
"""
Reasoning parser for GLM-4 MoE models.

Unlike DeepSeek R1, GLM-4 injects <think> via the chat template rather
than generating it. When the model output lacks </think>, the entire
output is treated as *content* (not reasoning), because the absence of
the end tag means the model chose not to reason.
"""
Comment on lines +13 to +21
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

There's an inconsistency between the non-streaming and streaming behavior of this parser for outputs that contain <think> but not </think>.

  • The extract_reasoning method correctly implements the logic described in the docstring: if </think> is absent, the entire output is treated as content. For an input like "<think>some reasoning", it will return (None, "<think>some reasoning").

  • However, this class inherits extract_reasoning_streaming from BaseThinkingReasoningParser. The base implementation will treat "<think>some reasoning" as reasoning during streaming, which contradicts this parser's stated logic for handling outputs without a closing </think> tag.

This is the same type of inconsistency that this PR fixes for SeedOSSReasoningParser. To ensure consistent behavior, Glm4MoeReasoningParser should also override extract_reasoning_streaming. A potential approach is to buffer content after <think> and only flush it as reasoning once </think> is seen. If the stream ends before </think>, the buffer would be flushed as content.


@property
def start_token(self) -> str:
    """Token that opens a reasoning block (injected by the chat template)."""
    return "<think>"

@property
def end_token(self) -> str:
    """Token that closes a reasoning block."""
    return "</think>"

def extract_reasoning(
    self, model_output: str, request: "ChatCompletionRequest | ResponsesRequest"
) -> tuple[str | None, str | None]:
    """Split a completed model output into (reasoning, content).

    When no end token is present the entire output is content: GLM-4
    injects the start token via the chat template, so a missing end tag
    means the model declined to reason. Empty segments on either side of
    the end token are normalized to None rather than "".
    """
    # Guard: no closing tag at all -> the whole original output is content.
    if self.end_token not in model_output:
        return None, model_output

    # Skip everything up to and including the first start token, if one
    # was echoed back; otherwise keep the full output.
    head, tag_found, tail = model_output.partition(self.start_token)
    body = tail if tag_found else head

    # Split at the first end token. "<think></think>" means the model
    # chose not to reason, so empty strings collapse to None.
    reasoning, _, content = body.partition(self.end_token)
    return (reasoning if reasoning else None, content if content else None)
42 changes: 42 additions & 0 deletions vllm/reasoning/seedoss_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from collections.abc import Sequence

from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser


Expand All @@ -25,3 +27,43 @@ def start_token(self) -> str:
def end_token(self) -> str:
"""The token that ends reasoning content."""
return "</seed:think>"

def extract_reasoning_streaming(
    self,
    previous_text: str,
    current_text: str,
    delta_text: str,
    previous_token_ids: Sequence[int],
    current_token_ids: Sequence[int],
    delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
    """Classify the streamed *delta_text* as reasoning and/or content.

    Delegates to the base parser first, then corrects its result for the
    case where the start token never appeared in the stream: everything
    before the end token is reasoning, everything after it is content.

    Returns the base parser's delta unchanged when a start token has
    been seen, or None when the base parser returned None.
    """
    # Like R1, SeedOSS may not emit the start token (it's in the chat
    # template). When neither previous nor delta contains the start
    # token, treat text as reasoning unless the end token has been seen.
    ret = super().extract_reasoning_streaming(
        previous_text,
        current_text,
        delta_text,
        previous_token_ids,
        current_token_ids,
        delta_token_ids,
    )
    if (
        ret is not None
        and self.start_token_id not in previous_token_ids
        and self.start_token_id not in delta_token_ids
    ):
        if self.end_token_id in delta_token_ids:
            # End token arrives inside this delta: split it in place.
            # NOTE(review): this assumes self.end_token's text is present
            # in delta_text whenever end_token_id is in delta_token_ids;
            # if the tag text could straddle two deltas, find() would
            # return -1 and mis-split — TODO confirm tokenizer guarantees.
            end_index = delta_text.find(self.end_token)
            reasoning = delta_text[:end_index]
            content = delta_text[end_index + len(self.end_token) :]
            return DeltaMessage(
                reasoning=reasoning,
                content=content if content else None,
            )
        elif self.end_token_id in previous_token_ids:
            # End tag already closed in an earlier delta: pure content now.
            return DeltaMessage(content=delta_text)
        else:
            # Still before the end tag: the whole delta is reasoning.
            return DeltaMessage(reasoning=delta_text)

    return ret
Loading