Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions vllm/entrypoints/openai/chat_completion/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,34 @@ async def chat_completion_stream_generator(
finish_reason_ = (
output.finish_reason if output.finish_reason else "stop"
)

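# MTP truncation detection: with speculative
# decoding, the model may emit EOS during the
# reasoning-to-tool-call transition, so the
# request finishes with finish_reason="stop"
# after producing only reasoning. Detect this
# case and raise GenerationError so the client
# can retry.
# NOTE(review): the condition below inspects only
# the current delta_message; confirm it cannot
# fire when content was already streamed in an
# earlier chunk.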
if (
finish_reason_ == "stop"
and request.tools
and not tools_streamed[i]
and not auto_tools_called
and reasoning_parser is not None
and delta_message is not None
and not delta_message.content
and not delta_message.tool_calls
):
logger.warning(
"MTP truncation detected for request %s: "
"finished with 'stop' but tools configured "
"and only reasoning produced.",
request_id,
)
raise GenerationError(
"MTP speculative decoding truncated tool "
"call generation. Please retry."
)

choice_data = ChatCompletionResponseStreamChoice(
index=i,
delta=delta_message,
Expand Down
Loading