Skip to content

Commit a515fb7

Browse files
NagyGeorgeeicherseiji
authored and committed
[Feature][gpt-oss] Add support for num_cached_tokens and num_reasoning_tokens tracking (vllm-project#23460)
Signed-off-by: George Nagy II <[email protected]> Signed-off-by: Chen Zhang <[email protected]>
1 parent 7288100 commit a515fb7

File tree

1 file changed

+21
-0
lines changed

1 file changed

+21
-0
lines changed

vllm/entrypoints/context.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,17 +93,35 @@ def _update_num_prompt_tokens(self, output: RequestOutput):
9393
# as new prompt each time. Hence the sum.
9494
self.num_prompt_tokens += len(output.prompt_token_ids)
9595

96+
def _update_num_cached_tokens(self, output: RequestOutput):
97+
if output.num_cached_tokens is not None:
98+
#Similar to num_prompt_tokens
99+
self.num_cached_tokens += output.num_cached_tokens
100+
96101
def _update_num_output_tokens(self, token_ids: Sequence[int]):
97102
self.num_output_tokens += len(token_ids)
98103

104+
def _update_num_reasoning_tokens(self, token_ids: Sequence[int]):
105+
# Count tokens that are part of reasoning content (analysis channel
106+
# or tool-directed messages like python/browser calls)
107+
is_analysis = self.parser.current_channel == "analysis"
108+
is_tool_call = (self.parser.current_recipient is not None and
109+
(self.parser.current_recipient.startswith("python") or
110+
self.parser.current_recipient.startswith("browser.")))
111+
if is_analysis or is_tool_call:
112+
self.num_reasoning_tokens += len(token_ids)
113+
99114
def append_output(self, output) -> None:
100115
if isinstance(output, RequestOutput):
101116
self._update_num_prompt_tokens(output)
117+
self._update_num_cached_tokens(output)
102118
output_token_ids = output.outputs[0].token_ids
103119
self._update_num_output_tokens(output_token_ids)
104120
self.parser = get_streamable_parser_for_assistant()
105121
for token_id in output_token_ids:
106122
self.parser.process(token_id)
123+
# Check if the current token is part of reasoning content
124+
self._update_num_reasoning_tokens([token_id])
107125
output_msgs = self.parser.messages
108126
else:
109127
# Tool output.
@@ -204,6 +222,7 @@ def append_output(self, output) -> None:
204222
# so we only want to add the prompt tokens once for each message.
205223
if self.first_tok_of_message:
206224
self._update_num_prompt_tokens(output)
225+
self._update_num_cached_tokens(output)
207226
# Reset self.first_tok_of_message if needed:
208227
# if the current token is the last one of the current message
209228
# (finished=True), then the next token processed will mark the
@@ -212,6 +231,8 @@ def append_output(self, output) -> None:
212231
tok = output.outputs[0].token_ids[0]
213232
self.parser.process(tok)
214233
self._update_num_output_tokens(output.outputs[0].token_ids)
234+
# Check if the current token is part of reasoning content
235+
self._update_num_reasoning_tokens([tok])
215236
self.last_tok = tok
216237
else:
217238
# Handle the case of tool output in direct message format

0 commit comments

Comments
 (0)