diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py index 694ff80047c7..0ca8c4d5c008 100644 --- a/vllm/entrypoints/openai/chat_completion/serving.py +++ b/vllm/entrypoints/openai/chat_completion/serving.py @@ -1036,6 +1036,34 @@ async def chat_completion_stream_generator( finish_reason_ = ( output.finish_reason if output.finish_reason else "stop" ) + + # MTP truncation detection: with speculative + # decoding the model may produce EOS during the + # reasoning-to-tool-call transition, causing + # finish_reason="stop" with only reasoning + # produced. Detect this and raise a retryable + # error so the client retries. + if ( + finish_reason_ == "stop" + and request.tools + and not tools_streamed[i] + and not auto_tools_called + and reasoning_parser is not None + and delta_message is not None + and not delta_message.content + and not delta_message.tool_calls + ): + logger.warning( + "MTP truncation detected for request %s: " + "finished with 'stop' but tools configured " + "and only reasoning produced.", + request_id, + ) + raise GenerationError( + "MTP speculative decoding truncated tool " + "call generation. Please retry." + ) + choice_data = ChatCompletionResponseStreamChoice( index=i, delta=delta_message,