From d1545acbf4e51abf40c18b17383271168f613dd0 Mon Sep 17 00:00:00 2001
From: George Nagy II <george.nagy0969@gmail.com>
Date: Fri, 22 Aug 2025 17:22:30 -0500
Subject: [PATCH 1/3] Add support for num_cached_tokens and
 num_reasoning_tokens tracking

  - Add _update_num_cached_tokens() method to track cached tokens from RequestOutput
  - Add _update_num_reasoning_tokens() method to track reasoning tokens based on:
    - Analysis channel content (parser.current_channel == 'analysis')
    - Tool-directed messages (parser.current_recipient starting with
      'python' or 'browser.')
  - Integrate token tracking into the append_output() methods of both context types
  - In streaming mode, cached tokens are only counted on the first token of
    each message

Signed-off-by: George Nagy II <george.nagy0969@gmail.com>
---
 vllm/entrypoints/context.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index f70e1fc207f8..c213733e3108 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -81,17 +81,34 @@ def _update_num_prompt_tokens(self, output: RequestOutput):
             # as new prompt each time. Hence the sum.
             self.num_prompt_tokens += len(output.prompt_token_ids)
 
+    def _update_num_cached_tokens(self, output: RequestOutput):
+        if output.num_cached_tokens is not None:
+            # Similar to num_prompt_tokens
+            self.num_cached_tokens += output.num_cached_tokens
+
     def _update_num_output_tokens(self, token_ids: Sequence[int]):
         self.num_output_tokens += len(token_ids)
 
+    def _update_num_reasoning_tokens(self, token_ids: Sequence[int]):
+        # Count tokens that are part of reasoning content (analysis channel
+        # or tool-directed messages like python/browser calls)
+        if self.parser.current_channel == "analysis" or (
+                self.parser.current_recipient is not None and
+            (self.parser.current_recipient.startswith("python")
+             or self.parser.current_recipient.startswith("browser."))):
+            self.num_reasoning_tokens += len(token_ids)
+
     def append_output(self, output) -> None:
         if isinstance(output, RequestOutput):
             self._update_num_prompt_tokens(output)
+            self._update_num_cached_tokens(output)
             output_token_ids = output.outputs[0].token_ids
             self._update_num_output_tokens(output_token_ids)
             self.parser = get_streamable_parser_for_assistant()
             for token_id in output_token_ids:
                 self.parser.process(token_id)
+                # Check if the current token is part of reasoning content
+                self._update_num_reasoning_tokens([token_id])
             output_msgs = self.parser.messages
         else:
             # Tool output.
@@ -183,6 +200,7 @@ def append_output(self, output) -> None:
             # so we only want to add the prompt tokens once for each message.
             if self.first_tok_of_message:
                 self._update_num_prompt_tokens(output)
+                self._update_num_cached_tokens(output)
             # Reset self.first_tok_of_message if needed:
             # if the current token is the last one of the current message
             # (finished=True), then the next token processed will mark the
@@ -191,6 +209,8 @@ def append_output(self, output) -> None:
             tok = output.outputs[0].token_ids[0]
             self.parser.process(tok)
             self._update_num_output_tokens(output.outputs[0].token_ids)
+            # Check if the current token is part of reasoning content
+            self._update_num_reasoning_tokens([tok])
             self.last_tok = tok
         else:
             # Handle the case of tool output in direct message format

From c4ca23f29877064285e161ae33a25dae6dc3f664 Mon Sep 17 00:00:00 2001
From: George Nagy II <george.nagy0969@gmail.com>
Date: Sat, 23 Aug 2025 03:25:07 -0500
Subject: [PATCH 2/3] Update vllm/entrypoints/context.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: George Nagy II <george.nagy0969@gmail.com>
---
 vllm/entrypoints/context.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index c213733e3108..4b773b11d726 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -92,10 +92,12 @@ def _update_num_output_tokens(self, token_ids: Sequence[int]):
     def _update_num_reasoning_tokens(self, token_ids: Sequence[int]):
         # Count tokens that are part of reasoning content (analysis channel
         # or tool-directed messages like python/browser calls)
-        if self.parser.current_channel == "analysis" or (
-                self.parser.current_recipient is not None and
-            (self.parser.current_recipient.startswith("python")
-             or self.parser.current_recipient.startswith("browser."))):
+        is_analysis = self.parser.current_channel == "analysis"
+        is_tool_call = (
+            self.parser.current_recipient is not None and
+            (self.parser.current_recipient.startswith("python") or
+             self.parser.current_recipient.startswith("browser.")))
+        if is_analysis or is_tool_call:
             self.num_reasoning_tokens += len(token_ids)
 
     def append_output(self, output) -> None:

From 6c8610373ca7ae537417924a9956cfb89e75c360 Mon Sep 17 00:00:00 2001
From: Chen Zhang <zhangch99@outlook.com>
Date: Wed, 3 Sep 2025 11:55:41 -0700
Subject: [PATCH 3/3] format

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
---
 vllm/entrypoints/context.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 42c7c13dd90b..52e35bcac961 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -105,10 +105,9 @@ def _update_num_reasoning_tokens(self, token_ids: Sequence[int]):
         # Count tokens that are part of reasoning content (analysis channel
         # or tool-directed messages like python/browser calls)
         is_analysis = self.parser.current_channel == "analysis"
-        is_tool_call = (
-            self.parser.current_recipient is not None and
-            (self.parser.current_recipient.startswith("python") or
-             self.parser.current_recipient.startswith("browser.")))
+        is_tool_call = (self.parser.current_recipient is not None and
+                        (self.parser.current_recipient.startswith("python") or
+                         self.parser.current_recipient.startswith("browser.")))
         if is_analysis or is_tool_call:
             self.num_reasoning_tokens += len(token_ids)