From be8d11e403c3daffe7ecdc3a6fe84d7145ee729d Mon Sep 17 00:00:00 2001
From: RioS
Date: Thu, 11 Dec 2025 08:21:17 +0900
Subject: [PATCH 1/6] [bugfix] fix missing tokens in harmony streaming

Signed-off-by: RioS
---
 vllm/entrypoints/openai/serving_responses.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index 60d14337dcaa..e4c3b787aa30 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -1663,7 +1663,7 @@ async def _process_harmony_streaming_events(
                 )
 
             # stream the output of a harmony message
-            if ctx.parser.last_content_delta:
+            if ctx.last_delta:
                 if (
                     ctx.parser.current_channel == "final"
                     and ctx.parser.current_recipient is None
@@ -1708,7 +1708,7 @@ async def _process_harmony_streaming_events(
                             content_index=current_content_index,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                            delta=ctx.parser.last_content_delta,
+                            delta=ctx.last_delta,
                             # TODO, use logprobs from ctx.last_request_output
                             logprobs=[],
                         )
@@ -1753,7 +1753,7 @@ async def _process_harmony_streaming_events(
                             item_id=current_item_id,
                             output_index=current_output_index,
                             content_index=current_content_index,
-                            delta=ctx.parser.last_content_delta,
+                            delta=ctx.last_delta,
                             sequence_number=-1,
                         )
                     )
@@ -1796,7 +1796,7 @@ async def _process_harmony_streaming_events(
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                            delta=ctx.parser.last_content_delta,
+                            delta=ctx.last_delta,
                         )
                     )
 
@@ -1966,7 +1966,7 @@ async def _process_harmony_streaming_events(
                 yield _increment_sequence_number_and_return(
                     ResponseFunctionCallArgumentsDeltaEvent(
                         item_id=current_item_id,
-                        delta=ctx.parser.last_content_delta,
+                        delta=ctx.last_delta,
                         output_index=current_output_index,
                         sequence_number=-1,
                         type="response.function_call_arguments.delta",

From 71a92033d359babe86e0d6af602c88071b942515 Mon Sep 17 00:00:00 2001
From: RioS
Date: Thu, 11 Dec 2025 08:21:37 +0900
Subject: [PATCH 2/6] [bugfix] fix missing tokens in harmony streaming

Signed-off-by: RioS
---
 vllm/entrypoints/context.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index c70eaaa082fe..0bbd612a6115 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -767,6 +767,7 @@ def __init__(self, *args, **kwargs):
         self.encoding = get_encoding()
         self.last_tok = None
         self.first_tok_of_message = True
+        self.last_delta = None
 
     @property
     def messages(self) -> list:
@@ -775,6 +776,7 @@ def messages(self) -> list:
     def append_output(self, output: RequestOutput) -> None:
         # append_output is called for each output token in streaming case,
         # so we only want to add the prompt tokens once for each message.
+        self.last_delta = None
         if self.first_tok_of_message:
             self._update_prefill_token_usage(output)
         # Reset self.first_tok_of_message if needed:
@@ -782,8 +784,12 @@ def append_output(self, output: RequestOutput) -> None:
         # (finished=True), then the next token processed will mark the
         # beginning of a new message
         self.first_tok_of_message = output.finished
+        last_delta_text = ''
         for tok in output.outputs[0].token_ids:
             self.parser.process(tok)
+            last_delta_text += self.parser.last_content_delta or ''
+        if last_delta_text:
+            self.last_delta = last_delta_text
         self._update_decode_token_usage(output)
 
         # For streaming, update previous turn when message is complete

From b897335f42473161cdbc0f30f8a21bccad19f1 Mon Sep 17 00:00:00 2001
From: Ri0S
Date: Thu, 11 Dec 2025 09:06:15 +0900
Subject: [PATCH 3/6] [bugfix] fix missing tokens in harmony streaming

Signed-off-by: Ri0S
---
 vllm/entrypoints/context.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 0bbd612a6115..9408edf6c5c9 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -784,10 +784,10 @@ def append_output(self, output: RequestOutput) -> None:
         # (finished=True), then the next token processed will mark the
         # beginning of a new message
         self.first_tok_of_message = output.finished
-        last_delta_text = ''
+        last_delta_text = ""
         for tok in output.outputs[0].token_ids:
             self.parser.process(tok)
-            last_delta_text += self.parser.last_content_delta or ''
+            last_delta_text += self.parser.last_content_delta or ""
         if last_delta_text:
             self.last_delta = last_delta_text
         self._update_decode_token_usage(output)

From 6ae8af98e75084576840f78c8883a2987b4e6fd4 Mon Sep 17 00:00:00 2001
From: RioS
Date: Tue, 6 Jan 2026 10:07:48 +0900
Subject: [PATCH 4/6] Update vllm/entrypoints/context.py

Co-authored-by: Chauncey
Signed-off-by: RioS
---
 vllm/entrypoints/context.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 9408edf6c5c9..e8c1db5ef8f4 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -767,7 +767,7 @@ def __init__(self, *args, **kwargs):
         self.encoding = get_encoding()
         self.last_tok = None
         self.first_tok_of_message = True
-        self.last_delta = None
+        self.last_content_delta = None
 
     @property
     def messages(self) -> list:

From 5a0ae40caf6022eb74a66307c8951dd7e3b009a2 Mon Sep 17 00:00:00 2001
From: Ri0S
Date: Tue, 6 Jan 2026 10:13:33 +0900
Subject: [PATCH 5/6] [bugfix] fix missing tokens in harmony streaming

Signed-off-by: Ri0S
---
 vllm/entrypoints/context.py                  |  4 ++--
 vllm/entrypoints/openai/serving_responses.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index e8c1db5ef8f4..76eb6fe8057c 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -776,7 +776,7 @@ def messages(self) -> list:
     def append_output(self, output: RequestOutput) -> None:
         # append_output is called for each output token in streaming case,
         # so we only want to add the prompt tokens once for each message.
-        self.last_delta = None
+        self.last_content_delta = None
         if self.first_tok_of_message:
             self._update_prefill_token_usage(output)
         # Reset self.first_tok_of_message if needed:
@@ -789,7 +789,7 @@ def append_output(self, output: RequestOutput) -> None:
             self.parser.process(tok)
             last_delta_text += self.parser.last_content_delta or ""
         if last_delta_text:
-            self.last_delta = last_delta_text
+            self.last_content_delta = last_delta_text
         self._update_decode_token_usage(output)
 
         # For streaming, update previous turn when message is complete
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index e4c3b787aa30..d0002d3e4a33 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -1663,7 +1663,7 @@ async def _process_harmony_streaming_events(
                 )
 
             # stream the output of a harmony message
-            if ctx.last_delta:
+            if ctx.last_content_delta:
                 if (
                     ctx.parser.current_channel == "final"
                     and ctx.parser.current_recipient is None
@@ -1708,7 +1708,7 @@ async def _process_harmony_streaming_events(
                             content_index=current_content_index,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                            delta=ctx.last_delta,
+                            delta=ctx.last_content_delta,
                             # TODO, use logprobs from ctx.last_request_output
                             logprobs=[],
                         )
@@ -1753,7 +1753,7 @@ async def _process_harmony_streaming_events(
                             item_id=current_item_id,
                             output_index=current_output_index,
                             content_index=current_content_index,
-                            delta=ctx.last_delta,
+                            delta=ctx.last_content_delta,
                             sequence_number=-1,
                         )
                     )
@@ -1796,7 +1796,7 @@ async def _process_harmony_streaming_events(
                             sequence_number=-1,
                             output_index=current_output_index,
                             item_id=current_item_id,
-                            delta=ctx.last_delta,
+                            delta=ctx.last_content_delta,
                         )
                     )
 
@@ -1966,7 +1966,7 @@ async def _process_harmony_streaming_events(
                 yield _increment_sequence_number_and_return(
                     ResponseFunctionCallArgumentsDeltaEvent(
                         item_id=current_item_id,
-                        delta=ctx.last_delta,
+                        delta=ctx.last_content_delta,
                         output_index=current_output_index,
                         sequence_number=-1,
                         type="response.function_call_arguments.delta",

From 6e419669398a40831da3b5414bc76dda1515ecaa Mon Sep 17 00:00:00 2001
From: Ri0S
Date: Fri, 9 Jan 2026 11:00:27 +0900
Subject: [PATCH 6/6] Resolve merge conflict

Signed-off-by: Ri0S
---
 vllm/entrypoints/context.py                  |  6 ++++++
 vllm/entrypoints/openai/serving_responses.py | 14 +++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 58d57905c68a..c9bece08f188 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -824,6 +824,7 @@ def __init__(self, *args, **kwargs):
         self.encoding = get_encoding()
         self.last_tok = None
         self.first_tok_of_message = True
+        self.last_content_delta = None
 
     @property
     def messages(self) -> list:
@@ -832,6 +833,7 @@ def messages(self) -> list:
     def append_output(self, output: RequestOutput) -> None:
         # append_output is called for each output token in streaming case,
         # so we only want to add the prompt tokens once for each message.
+        self.last_content_delta = None
         if self.first_tok_of_message:
             self._update_prefill_token_usage(output)
         # Reset self.first_tok_of_message if needed:
@@ -839,8 +841,12 @@ def append_output(self, output: RequestOutput) -> None:
         # (finished=True), then the next token processed will mark the
         # beginning of a new message
         self.first_tok_of_message = output.finished
+        last_delta_text = ""
         for tok in output.outputs[0].token_ids:
             self.parser.process(tok)
+            last_delta_text += self.parser.last_content_delta or ""
+        if last_delta_text:
+            self.last_content_delta = last_delta_text
         self._update_decode_token_usage(output)
 
         # For streaming, update previous turn when message is complete
diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py
index c574ae4d30de..2ff69a5d7bda 100644
--- a/vllm/entrypoints/openai/serving_responses.py
+++ b/vllm/entrypoints/openai/serving_responses.py
@@ -1811,7 +1811,7 @@ def _emit_final_channel_delta_events(
                 content_index=state.current_content_index,
                 output_index=state.current_output_index,
                 item_id=state.current_item_id,
-                delta=ctx.parser.last_content_delta,
+                delta=ctx.last_content_delta,
                 # TODO, use logprobs from ctx.last_request_output
                 logprobs=[],
             )
@@ -1861,7 +1861,7 @@ def _emit_analysis_channel_delta_events(
                 item_id=state.current_item_id,
                 output_index=state.current_output_index,
                 content_index=state.current_content_index,
-                delta=ctx.parser.last_content_delta,
+                delta=ctx.last_content_delta,
                 sequence_number=-1,
             )
         )
@@ -1908,7 +1908,7 @@ def _emit_mcp_tool_delta_events(
                 sequence_number=-1,
                 output_index=state.current_output_index,
                 item_id=state.current_item_id,
-                delta=ctx.parser.last_content_delta,
+                delta=ctx.last_content_delta,
             )
         )
     return events
@@ -1952,7 +1952,7 @@ def _emit_code_interpreter_delta_events(
                 sequence_number=-1,
                 output_index=state.current_output_index,
                 item_id=state.current_item_id,
-                delta=ctx.parser.last_content_delta,
+                delta=ctx.last_content_delta,
             )
         )
     return events
@@ -1999,7 +1999,7 @@ def _emit_mcp_prefix_delta_events(
                 sequence_number=-1,
                 output_index=state.current_output_index,
                 item_id=state.current_item_id,
-                delta=ctx.parser.last_content_delta,
+                delta=ctx.last_content_delta,
             )
         )
     return events
@@ -2010,7 +2010,7 @@ def _emit_content_delta_events(
     state: HarmonyStreamingState,
 ) -> list[StreamingResponsesResponse]:
     """Emit events for content delta streaming based on channel type."""
-    if not ctx.parser.last_content_delta:
+    if not ctx.last_content_delta:
         return []
 
     if (
@@ -2364,7 +2364,7 @@ def _emit_function_call_delta_events(
     events.append(
         ResponseFunctionCallArgumentsDeltaEvent(
             item_id=state.current_item_id,
-            delta=ctx.parser.last_content_delta,
+            delta=ctx.last_content_delta,
            output_index=state.current_output_index,
            sequence_number=-1,
            type="response.function_call_arguments.delta",
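
For context on the fix itself: openai-harmony's StreamableParser exposes only the delta of the most recent parser.process(tok) call via last_content_delta, so when a single RequestOutput carries several token_ids (as it can, e.g., with speculative decoding), the serving layer read one delta per batch and silently dropped the rest. This series instead buffers the concatenation of all per-token deltas on the context as last_content_delta and has serving_responses.py read that. Below is a minimal, self-contained sketch of the before/after behavior; FakeParser, old_delta, and new_delta are hypothetical names for illustration, with FakeParser standing in for the real StreamableParser.

class FakeParser:
    """Hypothetical stand-in for openai-harmony's StreamableParser: only
    the last_content_delta behavior relevant to this series is modeled."""

    def __init__(self):
        self.last_content_delta = None

    def process(self, tok):
        # Pretend every token decodes to exactly one visible character.
        self.last_content_delta = chr(tok)


def old_delta(parser, token_ids):
    # Pre-fix behavior: process the whole batch, then read the parser's
    # last delta once -- deltas from all but the final token are lost.
    for tok in token_ids:
        parser.process(tok)
    return parser.last_content_delta


def new_delta(parser, token_ids):
    # Post-fix behavior (mirrors append_output in this series): collect
    # the delta after every process() call and return the concatenation.
    last_delta_text = ""
    for tok in token_ids:
        parser.process(tok)
        last_delta_text += parser.last_content_delta or ""
    return last_delta_text or None


if __name__ == "__main__":
    batch = [ord(c) for c in "abc"]  # one RequestOutput carrying 3 tokens
    print(old_delta(FakeParser(), batch))  # "c"   -> "a" and "b" dropped
    print(new_delta(FakeParser(), batch))  # "abc" -> nothing dropped

Resetting self.last_content_delta to None at the top of append_output() also matters: without it, a batch that produces no visible text (control tokens only) would leave the previous batch's delta in place, and the streaming loop could emit it a second time.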