Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 32 additions & 133 deletions vllm/reasoning/glm4_moe_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,11 @@

from collections.abc import Sequence

from transformers import PreTrainedTokenizerBase
from vllm.entrypoints.openai.protocol import DeltaMessage
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser

from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser

logger = init_logger(__name__)


class Glm4MoeModelReasoningParser(ReasoningParser):
class Glm4MoeModelReasoningParser(BaseThinkingReasoningParser):
"""
Reasoning parser for the Glm4MoeModel model.

Expand All @@ -23,52 +18,15 @@ class Glm4MoeModelReasoningParser(ReasoningParser):
from the model's output.
"""

def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
self.think_start_token = "<think>"
self.think_end_token = "</think>"
self.assistant_token = "<|assistant|>"

if not self.model_tokenizer:
raise ValueError(
"The model tokenizer must be passed to the ReasoningParser "
"constructor during construction."
)

self.think_start_token_id = self.vocab.get(self.think_start_token)
self.think_end_token_id = self.vocab.get(self.think_end_token)
self.assistant_token_id = self.vocab.get(self.assistant_token)
if (
self.think_start_token_id is None
or self.think_end_token_id is None
or self.assistant_token_id is None
):
raise RuntimeError(
"Glm4MoeModel reasoning parser could not locate "
"think start/end or assistant tokens in the tokenizer!"
)

def is_reasoning_end(self, input_ids: list[int]) -> bool:
"""
GLM's chat template has <think></think> tokens after every
<|assistant|> token. Thus, we need to check if </think> is
after the most recent <|assistant|> token (if present).
"""
for token_id in input_ids[::-1]:
if token_id == self.think_end_token_id:
return True
elif token_id == self.assistant_token_id:
return False
return False
@property
def start_token(self) -> str:
"""The token that starts reasoning content."""
return "<think>"

def extract_content_ids(self, input_ids: list[int]) -> list[int]:
"""
Extract the content after the end tokens
"""
if self.think_end_token_id not in input_ids[:-1]:
return []
else:
return input_ids[input_ids.index(self.think_end_token_id) + 1 :]
@property
def end_token(self) -> str:
"""The token that ends reasoning content."""
return "</think>"

def extract_reasoning_streaming(
self,
Expand All @@ -79,93 +37,34 @@ def extract_reasoning_streaming(
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
"""
Extract reasoning content from a delta message.
Handles streaming output where previous + delta = current.
Uses token IDs for faster processing.
For text <think>abc</think>xyz:
- 'abc' goes to reasoning
- 'xyz' goes to content
"""
# Skip single special tokens
if len(delta_token_ids) == 1 and (
delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]
ret = super().extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
delta_token_ids,
)
if (
ret is not None
and self.start_token_id not in previous_token_ids
and self.start_token_id not in delta_token_ids
):

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] Normal outputs are treated as reasoning when think tokens are absent

When no <think> start token has ever been seen, the new branch routes text into the reasoning field even though the model never emitted thinking markers. The class also now inherits the base extract_reasoning (basic_parsers.py, lines 151-175), which likewise returns the entire output as reasoning when <think> is missing. For GLM calls with enable_thinking=False, or for variants that simply omit the tags, content therefore stays None, so the OpenAI response builders (serving_responses.py, lines 844-889) skip creating the assistant message and the user receives no answer even though the model produced one. The previous parser returned the text as content when the tags were absent, so non-thinking generations are now silently dropped.

Useful? React with 👍 / 👎.

return None

if self.think_start_token_id in previous_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in previous, </think> in delta,
# extract reasoning content
end_index = delta_text.find(self.think_end_token)
if self.end_token_id in delta_token_ids:
# end token in delta with more tokens,
# extract reasoning content and content
end_index = delta_text.find(self.end_token)
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.think_end_token) :]
content = delta_text[end_index + len(self.end_token) :]
return DeltaMessage(
reasoning=reasoning,
content=content if content else None,
)
elif self.think_end_token_id in previous_token_ids:
# <think> in previous, </think> in previous,
# reasoning content continues
elif self.end_token_id in previous_token_ids:
# end token in previous, thinking content ends
return DeltaMessage(content=delta_text)
else:
# <think> in previous, no </think> in previous or delta,
# reasoning content continues
return DeltaMessage(reasoning=delta_text)
elif self.think_start_token_id in delta_token_ids:
if self.think_end_token_id in delta_token_ids:
# <think> in delta, </think> in delta, extract reasoning content
start_index = delta_text.find(self.think_start_token)
end_index = delta_text.find(self.think_end_token)
reasoning = delta_text[
start_index + len(self.think_start_token) : end_index
]
content = delta_text[end_index + len(self.think_end_token) :]
return DeltaMessage(
reasoning=reasoning,
content=content if content else None,
)
else:
# <think> in delta, no </think> in delta,
# reasoning content continues
# no end token in previous or delta, reasoning content continues
return DeltaMessage(reasoning=delta_text)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The logic within this if block for handling streaming extraction when a start token is missing is a duplication of existing logic in BaseThinkingReasoningParser for when a start token is present. This code duplication introduces a maintenance risk: future changes to the parsing logic will need to be made in two places, which is error-prone.

To improve maintainability, this duplicated logic should be extracted into a shared helper method within the base class.

else:
# thinking is disabled, just content
return DeltaMessage(content=delta_text)

def extract_reasoning(
self, model_output: str, request: ChatCompletionRequest
) -> tuple[str | None, str | None]:
"""
Extract reasoning content from the model output.

For text <think>abc</think>xyz:
- 'abc' goes to reasoning
- 'xyz' goes to content

Returns:
tuple[Optional[str], Optional[str]]: reasoning content and content
"""

# Check if the model output contains the <think> and </think> tokens.
if (
self.think_start_token not in model_output
or self.think_end_token not in model_output
):
return None, model_output
# Check if the <think> is present in the model output, remove it
# if it is present.
model_output_parts = model_output.partition(self.think_start_token)
model_output = (
model_output_parts[2] if model_output_parts[1] else model_output_parts[0]
)
# Check if the model output contains the </think> tokens.
# If the end token is not found, return the model output as is.
if self.think_end_token not in model_output:
return None, model_output

# Extract reasoning content from the model output.
reasoning, _, content = model_output.partition(self.think_end_token)

final_content = content or None
return reasoning, final_content
return ret