-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
fix no think of GLM-4.5 / GLM-4.7 #31449
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,16 +3,11 @@ | |
|
|
||
| from collections.abc import Sequence | ||
|
|
||
| from transformers import PreTrainedTokenizerBase | ||
| from vllm.entrypoints.openai.protocol import DeltaMessage | ||
| from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser | ||
|
|
||
| from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage | ||
| from vllm.logger import init_logger | ||
| from vllm.reasoning import ReasoningParser | ||
|
|
||
| logger = init_logger(__name__) | ||
|
|
||
|
|
||
| class Glm4MoeModelReasoningParser(ReasoningParser): | ||
| class Glm4MoeModelReasoningParser(BaseThinkingReasoningParser): | ||
| """ | ||
| Reasoning parser for the Glm4MoeModel model. | ||
|
|
||
|
|
@@ -23,52 +18,15 @@ class Glm4MoeModelReasoningParser(ReasoningParser): | |
| from the model's output. | ||
| """ | ||
|
|
||
| def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): | ||
| super().__init__(tokenizer, *args, **kwargs) | ||
| self.think_start_token = "<think>" | ||
| self.think_end_token = "</think>" | ||
| self.assistant_token = "<|assistant|>" | ||
|
|
||
| if not self.model_tokenizer: | ||
| raise ValueError( | ||
| "The model tokenizer must be passed to the ReasoningParser " | ||
| "constructor during construction." | ||
| ) | ||
|
|
||
| self.think_start_token_id = self.vocab.get(self.think_start_token) | ||
| self.think_end_token_id = self.vocab.get(self.think_end_token) | ||
| self.assistant_token_id = self.vocab.get(self.assistant_token) | ||
| if ( | ||
| self.think_start_token_id is None | ||
| or self.think_end_token_id is None | ||
| or self.assistant_token_id is None | ||
| ): | ||
| raise RuntimeError( | ||
| "Glm4MoeModel reasoning parser could not locate " | ||
| "think start/end or assistant tokens in the tokenizer!" | ||
| ) | ||
|
|
||
| def is_reasoning_end(self, input_ids: list[int]) -> bool: | ||
| """ | ||
| GLM's chat template has <think></think> tokens after every | ||
| <|assistant|> token. Thus, we need to check if </think> is | ||
| after the most recent <|assistant|> token (if present). | ||
| """ | ||
| for token_id in input_ids[::-1]: | ||
| if token_id == self.think_end_token_id: | ||
| return True | ||
| elif token_id == self.assistant_token_id: | ||
| return False | ||
| return False | ||
| @property | ||
| def start_token(self) -> str: | ||
| """The token that starts reasoning content.""" | ||
| return "<think>" | ||
|
|
||
| def extract_content_ids(self, input_ids: list[int]) -> list[int]: | ||
| """ | ||
| Extract the content after the end tokens | ||
| """ | ||
| if self.think_end_token_id not in input_ids[:-1]: | ||
| return [] | ||
| else: | ||
| return input_ids[input_ids.index(self.think_end_token_id) + 1 :] | ||
| @property | ||
| def end_token(self) -> str: | ||
| """The token that ends reasoning content.""" | ||
| return "</think>" | ||
|
|
||
| def extract_reasoning_streaming( | ||
| self, | ||
|
|
@@ -79,93 +37,34 @@ def extract_reasoning_streaming( | |
| current_token_ids: Sequence[int], | ||
| delta_token_ids: Sequence[int], | ||
| ) -> DeltaMessage | None: | ||
| """ | ||
| Extract reasoning content from a delta message. | ||
| Handles streaming output where previous + delta = current. | ||
| Uses token IDs for faster processing. | ||
| For text <think>abc</think>xyz: | ||
| - 'abc' goes to reasoning | ||
| - 'xyz' goes to content | ||
| """ | ||
| # Skip single special tokens | ||
| if len(delta_token_ids) == 1 and ( | ||
| delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id] | ||
| ret = super().extract_reasoning_streaming( | ||
| previous_text, | ||
| current_text, | ||
| delta_text, | ||
| previous_token_ids, | ||
| current_token_ids, | ||
| delta_token_ids, | ||
| ) | ||
| if ( | ||
| ret is not None | ||
| and self.start_token_id not in previous_token_ids | ||
| and self.start_token_id not in delta_token_ids | ||
| ): | ||
| return None | ||
|
|
||
| if self.think_start_token_id in previous_token_ids: | ||
| if self.think_end_token_id in delta_token_ids: | ||
| # <think> in previous, </think> in delta, | ||
| # extract reasoning content | ||
| end_index = delta_text.find(self.think_end_token) | ||
| if self.end_token_id in delta_token_ids: | ||
| # end token in delta with more tokens, | ||
| # extract reasoning content and content | ||
| end_index = delta_text.find(self.end_token) | ||
| reasoning = delta_text[:end_index] | ||
| content = delta_text[end_index + len(self.think_end_token) :] | ||
| content = delta_text[end_index + len(self.end_token) :] | ||
| return DeltaMessage( | ||
| reasoning=reasoning, | ||
| content=content if content else None, | ||
| ) | ||
| elif self.think_end_token_id in previous_token_ids: | ||
| # <think> in previous, </think> in previous, | ||
| # reasoning content continues | ||
| elif self.end_token_id in previous_token_ids: | ||
| # end token in previous, thinking content ends | ||
| return DeltaMessage(content=delta_text) | ||
| else: | ||
| # <think> in previous, no </think> in previous or delta, | ||
| # reasoning content continues | ||
| return DeltaMessage(reasoning=delta_text) | ||
| elif self.think_start_token_id in delta_token_ids: | ||
| if self.think_end_token_id in delta_token_ids: | ||
| # <think> in delta, </think> in delta, extract reasoning content | ||
| start_index = delta_text.find(self.think_start_token) | ||
| end_index = delta_text.find(self.think_end_token) | ||
| reasoning = delta_text[ | ||
| start_index + len(self.think_start_token) : end_index | ||
| ] | ||
| content = delta_text[end_index + len(self.think_end_token) :] | ||
| return DeltaMessage( | ||
| reasoning=reasoning, | ||
| content=content if content else None, | ||
| ) | ||
| else: | ||
| # <think> in delta, no </think> in delta, | ||
| # reasoning content continues | ||
| # no end token in previous or delta, reasoning content continues | ||
| return DeltaMessage(reasoning=delta_text) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The logic within this block duplicates logic that already exists elsewhere. To improve maintainability, this duplicated logic should be extracted into a shared helper method within the base class. |
||
| else: | ||
| # thinking is disabled, just content | ||
| return DeltaMessage(content=delta_text) | ||
|
|
||
| def extract_reasoning( | ||
| self, model_output: str, request: ChatCompletionRequest | ||
| ) -> tuple[str | None, str | None]: | ||
| """ | ||
| Extract reasoning content from the model output. | ||
|
|
||
| For text <think>abc</think>xyz: | ||
| - 'abc' goes to reasoning | ||
| - 'xyz' goes to content | ||
|
|
||
| Returns: | ||
| tuple[Optional[str], Optional[str]]: reasoning content and content | ||
| """ | ||
|
|
||
| # Check if the model output contains the <think> and </think> tokens. | ||
| if ( | ||
| self.think_start_token not in model_output | ||
| or self.think_end_token not in model_output | ||
| ): | ||
| return None, model_output | ||
| # Check if the <think> is present in the model output, remove it | ||
| # if it is present. | ||
| model_output_parts = model_output.partition(self.think_start_token) | ||
| model_output = ( | ||
| model_output_parts[2] if model_output_parts[1] else model_output_parts[0] | ||
| ) | ||
| # Check if the model output contains the </think> tokens. | ||
| # If the end token is not found, return the model output as is. | ||
| if self.think_end_token not in model_output: | ||
| return None, model_output | ||
|
|
||
| # Extract reasoning content from the model output. | ||
| reasoning, _, content = model_output.partition(self.think_end_token) | ||
|
|
||
| final_content = content or None | ||
| return reasoning, final_content | ||
| return ret | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new branch that runs when no `<think>` start token has ever been seen now routes text into the reasoning field even though the model never emitted thinking markers, and the class now inherits the base `extract_reasoning` (basic_parsers.py 151-175), which likewise returns the entire output as reasoning when `<think>` is missing. For GLM calls with `enable_thinking=False`, or for variants that simply omit the tags, `content` stays `None`, so the OpenAI response builders (serving_responses.py 844-889) skip creating the assistant message and the user receives no answer despite the model producing one. The previous parser returned the text as content when the tags were absent, so non-thinking generations are now silently dropped.
Useful? React with 👍 / 👎.