35 changes: 23 additions & 12 deletions vllm/entrypoints/context.py
@@ -214,6 +214,8 @@ def _update_num_reasoning_tokens(self):
 
     def append_output(self, output: RequestOutput) -> None:
         output_token_ids = output.outputs[0].token_ids
+        # Reset parser for each append_output call to handle multi-turn scenarios
+        # where the parser needs to start fresh for each assistant response
         self.parser = get_streamable_parser_for_assistant()
Member Author (njhill):

@heheda12345 it looks like you added this line in #22512... wonder if you could check this and Claude's logic when you have a chance?

Member Author (njhill), Nov 20, 2025:

Actually, in response to the new test failure below, Claude suggested reinstating this line.

Contributor:

What's not clear to me is why the non-streaming case resets the parser between turns while the streaming case keeps the same parser and feeds it the missing tokens to catch it back up to the expected state. On the surface, it feels like both should take the same approach: either reset the parser between turns, or do the extra catch-up logic the streaming side does down in render_for_completion.

But, for the sake of getting this going without digging deeper here, I think putting this back the way it was, with the added comment, at least makes it clear why it's doing this.
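For illustration, here is a minimal sketch of the two synchronization strategies contrasted above. ToyParser and both driver functions are hypothetical stand-ins, not the actual vllm / openai-harmony API:

# Sketch only: ToyParser stands in for the real streamable Harmony parser.
class ToyParser:
    def __init__(self):
        self.tokens: list[int] = []

    def process(self, tok: int) -> None:
        self.tokens.append(tok)

# Strategy A (non-streaming append_output): start a fresh parser for each
# assistant response and feed it only that turn's output tokens.
def reset_per_turn(output_token_ids: list[int]) -> ToyParser:
    parser = ToyParser()
    for tok in output_token_ids:
        parser.process(tok)
    return parser

# Strategy B (streaming render_for_completion): keep one parser alive across
# turns and, before the next completion, feed it only the rendered tokens it
# has not yet seen, tracked by a running count.
def catch_up(parser: ToyParser, rendered: list[int], processed: int) -> int:
    for tok in rendered[processed:]:
        parser.process(tok)
    return len(rendered)  # the new processed-token count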

         for token_id in output_token_ids:
             self.parser.process(token_id)
@@ -504,6 +506,8 @@ def __init__(self, *args, **kwargs):
         self.encoding = get_encoding()
         self.last_tok = None
         self.first_tok_of_message = True
+        # Track how many tokens have been processed to avoid buggy token search
+        self.processed_token_count = 0

     @property
     def messages(self) -> list:
@@ -519,8 +523,10 @@ def append_output(self, output: RequestOutput) -> None:
         # (finished=True), then the next token processed will mark the
         # beginning of a new message
         self.first_tok_of_message = output.finished
-        for tok in output.outputs[0].token_ids:
+        token_ids = output.outputs[0].token_ids
+        for tok in token_ids:
             self.parser.process(tok)
+            self.processed_token_count += 1
         self._update_decode_token_usage(output)
 
         # For streaming, update previous turn when message is complete
@@ -529,7 +535,9 @@ def append_output(self, output: RequestOutput) -> None:
             self.current_turn_metrics.reset()
         # Check if the current token is part of reasoning content
         self._update_num_reasoning_tokens()
-        self.last_tok = tok
+        # Only update last_tok if we actually processed tokens
+        if token_ids:
+            self.last_tok = tok
         if len(self._messages) - self.num_init_messages < len(self.parser.messages):
             self._messages.extend(
                 self.parser.messages[len(self._messages) - self.num_init_messages :]
@@ -546,8 +554,13 @@ def append_tool_output(self, output: list[Message]) -> None:
             toks = self.encoding.render(msg)
             for tok in toks:
                 self.parser.process(tok)
+                self.processed_token_count += 1
             self.last_tok = toks[-1]
-        # TODO: add tool_output messages to self._messages
+        # Add tool output messages from parser to self._messages
+        # (same pattern as append_output)
+        msg_count = len(self._messages) - self.num_init_messages
+        if msg_count < len(self.parser.messages):
+            self._messages.extend(self.parser.messages[msg_count:])

     def is_expecting_start(self) -> bool:
         return self.parser.state == StreamState.EXPECT_START
@@ -556,17 +569,15 @@ def is_assistant_action_turn(self) -> bool:
         return self.last_tok in self.encoding.stop_tokens_for_assistant_actions()
 
     def render_for_completion(self) -> list[int]:
-        # now this list of tokens as next turn's starting tokens
-        # `<|start|>assistant`,
-        # we need to process them in parser.
+        # Render all messages including the new turn start tokens
+        # e.g. [...previous tokens...] [<|start|>] [assistant]
         rendered_tokens = super().render_for_completion()
 
-        last_n = -1
-        to_process = []
-        while rendered_tokens[last_n] != self.last_tok:
-            to_process.append(rendered_tokens[last_n])
-            last_n -= 1
-        for tok in reversed(to_process):
+        # Process only the NEW tokens that we haven't seen before
+        # This avoids the buggy token search that could match at wrong positions
+        to_process = rendered_tokens[self.processed_token_count :]
+        for tok in to_process:
             self.parser.process(tok)
+            self.processed_token_count += 1
 
         return rendered_tokens
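The counting approach fixes a real failure mode in the removed reverse search: if self.last_tok happens to appear again among the newly rendered turn-start tokens, the while loop stops at the wrong occurrence and under-feeds the parser. A self-contained illustration with made-up token values (not real Harmony token ids):

# Made-up token values; 7 plays the role of self.last_tok.
rendered_tokens = [1, 2, 3, 7,   # previously processed tokens, ending in 7
                   9, 7, 5]      # new turn-start tokens; 7 appears again
last_tok = 7
processed_token_count = 4        # tokens the parser has already consumed

# Old approach: scan backwards until last_tok is found.
last_n, to_process = -1, []
while rendered_tokens[last_n] != last_tok:
    to_process.append(rendered_tokens[last_n])
    last_n -= 1
print(list(reversed(to_process)))               # [5] -- stopped at the wrong 7

# New approach: slice off everything already processed.
print(rendered_tokens[processed_token_count:])  # [9, 7, 5] -- correct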