From d1545acbf4e51abf40c18b17383271168f613dd0 Mon Sep 17 00:00:00 2001 From: George Nagy II Date: Fri, 22 Aug 2025 17:22:30 -0500 Subject: [PATCH 1/3] Add support for num_cached_tokens and num_reasoning_tokens tracking - Add _update_num_cached_tokens() method to track cached tokens from RequestOutput - Add _update_num_reasoning_tokens() method to track reasoning tokens based on: - Analysis channel content (parser.current_channel == 'analysis') - Tool directed messages - Integrate token tracking into append_output() methods for both context types - Cached tokens only tracked on first token in streaming mode Signed-off-by: George Nagy II --- vllm/entrypoints/context.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index f70e1fc207f8..c213733e3108 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -81,17 +81,34 @@ def _update_num_prompt_tokens(self, output: RequestOutput): # as new prompt each time. Hence the sum. self.num_prompt_tokens += len(output.prompt_token_ids) + def _update_num_cached_tokens(self, output: RequestOutput): + if output.num_cached_tokens is not None: + #Similar to num_prompt_tokens + self.num_cached_tokens += output.num_cached_tokens + def _update_num_output_tokens(self, token_ids: Sequence[int]): self.num_output_tokens += len(token_ids) + def _update_num_reasoning_tokens(self, token_ids: Sequence[int]): + # Count tokens that are part of reasoning content (analysis channel + # or tool-directed messages like python/browser calls) + if self.parser.current_channel == "analysis" or ( + self.parser.current_recipient is not None and + (self.parser.current_recipient.startswith("python") + or self.parser.current_recipient.startswith("browser."))): + self.num_reasoning_tokens += len(token_ids) + def append_output(self, output) -> None: if isinstance(output, RequestOutput): self._update_num_prompt_tokens(output) + self._update_num_cached_tokens(output) output_token_ids = output.outputs[0].token_ids self._update_num_output_tokens(output_token_ids) self.parser = get_streamable_parser_for_assistant() for token_id in output_token_ids: self.parser.process(token_id) + # Check if the current token is part of reasoning content + self._update_num_reasoning_tokens([token_id]) output_msgs = self.parser.messages else: # Tool output. @@ -183,6 +200,7 @@ def append_output(self, output) -> None: # so we only want to add the prompt tokens once for each message. if self.first_tok_of_message: self._update_num_prompt_tokens(output) + self._update_num_cached_tokens(output) # Reset self.first_tok_of_message if needed: # if the current token is the last one of the current message # (finished=True), then the next token processed will mark the @@ -191,6 +209,8 @@ def append_output(self, output) -> None: tok = output.outputs[0].token_ids[0] self.parser.process(tok) self._update_num_output_tokens(output.outputs[0].token_ids) + # Check if the current token is part of reasoning content + self._update_num_reasoning_tokens([tok]) self.last_tok = tok else: # Handle the case of tool output in direct message format From c4ca23f29877064285e161ae33a25dae6dc3f664 Mon Sep 17 00:00:00 2001 From: George Nagy II Date: Sat, 23 Aug 2025 03:25:07 -0500 Subject: [PATCH 2/3] Update vllm/entrypoints/context.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: George Nagy II --- vllm/entrypoints/context.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index c213733e3108..4b773b11d726 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -92,10 +92,12 @@ def _update_num_output_tokens(self, token_ids: Sequence[int]): def _update_num_reasoning_tokens(self, token_ids: Sequence[int]): # Count tokens that are part of reasoning content (analysis channel # or tool-directed messages like python/browser calls) - if self.parser.current_channel == "analysis" or ( - self.parser.current_recipient is not None and - (self.parser.current_recipient.startswith("python") - or self.parser.current_recipient.startswith("browser."))): + is_analysis = self.parser.current_channel == "analysis" + is_tool_call = ( + self.parser.current_recipient is not None and + (self.parser.current_recipient.startswith("python") or + self.parser.current_recipient.startswith("browser."))) + if is_analysis or is_tool_call: self.num_reasoning_tokens += len(token_ids) def append_output(self, output) -> None: From 6c8610373ca7ae537417924a9956cfb89e75c360 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Wed, 3 Sep 2025 11:55:41 -0700 Subject: [PATCH 3/3] format Signed-off-by: Chen Zhang --- vllm/entrypoints/context.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 42c7c13dd90b..52e35bcac961 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -105,10 +105,9 @@ def _update_num_reasoning_tokens(self, token_ids: Sequence[int]): # Count tokens that are part of reasoning content (analysis channel # or tool-directed messages like python/browser calls) is_analysis = self.parser.current_channel == "analysis" - is_tool_call = ( - self.parser.current_recipient is not None and - (self.parser.current_recipient.startswith("python") or - self.parser.current_recipient.startswith("browser."))) + is_tool_call = (self.parser.current_recipient is not None and + (self.parser.current_recipient.startswith("python") or + self.parser.current_recipient.startswith("browser."))) if is_analysis or is_tool_call: self.num_reasoning_tokens += len(token_ids)