@@ -189,6 +189,8 @@ async def chat_completion_stream_generator(
189189 previous_num_tokens = [0 ] * num_choices
190190 reasoning_num_tokens = [0 ] * num_choices
191191 num_prompt_tokens = 0
192+ num_cached_tokens = 0
193+ num_image_tokens = [0 ] * num_choices
192194 tool_called = [False ] * num_choices
193195 max_streaming_response_tokens = (
194196 request .max_streaming_response_tokens
@@ -321,6 +323,9 @@ async def chat_completion_stream_generator(
321323 output_top_logprobs = output ["top_logprobs" ]
322324 output_draft_top_logprobs = output ["draft_top_logprobs" ]
323325 previous_num_tokens [idx ] += len (output ["token_ids" ])
326+ if output .get ("num_image_tokens" ):
327+ previous_num_tokens [idx ] += output .get ("num_image_tokens" )
328+ num_image_tokens [idx ] += output .get ("num_image_tokens" )
324329 reasoning_num_tokens [idx ] += output .get ("reasoning_token_num" , 0 )
325330 logprobs_res : Optional [LogProbs ] = None
326331 draft_logprobs_res : Optional [LogProbs ] = None
@@ -389,8 +394,10 @@ async def chat_completion_stream_generator(
389394 prompt_tokens = num_prompt_tokens ,
390395 completion_tokens = previous_num_tokens [idx ],
391396 total_tokens = num_prompt_tokens + previous_num_tokens [idx ],
397+ prompt_tokens_details = PromptTokenUsageInfo (cached_tokens = num_cached_tokens ),
392398 completion_tokens_details = CompletionTokenUsageInfo (
393- reasoning_tokens = reasoning_num_tokens [idx ]
399+ reasoning_tokens = reasoning_num_tokens [idx ],
400+ image_tokens = num_image_tokens [idx ],
394401 ),
395402 )
396403 choices .append (choice )
@@ -409,7 +416,10 @@ async def chat_completion_stream_generator(
409416 prompt_tokens = num_prompt_tokens ,
410417 completion_tokens = completion_tokens ,
411418 total_tokens = num_prompt_tokens + completion_tokens ,
412- completion_tokens_details = CompletionTokenUsageInfo (reasoning_tokens = reasoning_tokens ),
419+ prompt_tokens_details = PromptTokenUsageInfo (cached_tokens = num_cached_tokens ),
420+ completion_tokens_details = CompletionTokenUsageInfo (
421+ image_tokens = sum (num_image_tokens ), reasoning_tokens = reasoning_tokens
422+ ),
413423 )
414424 chunk = ChatCompletionStreamResponse (
415425 id = request_id ,
@@ -466,6 +476,7 @@ async def chat_completion_full_generator(
466476 draft_logprob_contents = [[] for _ in range (num_choices )]
467477 completion_token_ids = [[] for _ in range (num_choices )]
468478 num_cached_tokens = [0 ] * num_choices
479+ num_image_tokens = [0 ] * num_choices
469480 response_processor = ChatResponseProcessor (
470481 data_processor = self .engine_client .data_processor ,
471482 enable_mm_output = self .enable_mm_output ,
@@ -531,6 +542,9 @@ async def chat_completion_full_generator(
531542 if data ["finished" ]:
532543 num_choices -= 1
533544 reasoning_num_tokens [idx ] = data ["outputs" ].get ("reasoning_token_num" , 0 )
545+ if data ["outputs" ].get ("image_token_num" ):
546+ previous_num_tokens [idx ] += data ["outputs" ].get ("image_token_num" )
547+ num_image_tokens [idx ] = data ["outputs" ].get ("image_token_num" )
534548 choice = await self ._create_chat_completion_choice (
535549 output = output ,
536550 index = idx ,
@@ -540,6 +554,7 @@ async def chat_completion_full_generator(
540554 prompt_tokens = prompt_tokens ,
541555 completion_token_ids = completion_token_ids [idx ],
542556 num_cached_tokens = num_cached_tokens ,
557+ num_image_tokens = num_image_tokens ,
543558 logprob_contents = logprob_contents ,
544559 response_processor = response_processor ,
545560 )
@@ -557,7 +572,9 @@ async def chat_completion_full_generator(
557572 completion_tokens = num_generated_tokens ,
558573 total_tokens = num_prompt_tokens + num_generated_tokens ,
559574 prompt_tokens_details = PromptTokenUsageInfo (cached_tokens = sum (num_cached_tokens )),
560- completion_tokens_details = CompletionTokenUsageInfo (reasoning_tokens = num_reasoning_tokens ),
575+ completion_tokens_details = CompletionTokenUsageInfo (
576+ reasoning_tokens = num_reasoning_tokens , image_tokens = sum (num_image_tokens )
577+ ),
561578 )
562579 choices = sorted (choices , key = lambda x : x .index )
563580 res = ChatCompletionResponse (
@@ -580,6 +597,7 @@ async def _create_chat_completion_choice(
580597 prompt_tokens : str ,
581598 completion_token_ids : list ,
582599 num_cached_tokens : list ,
600+ num_image_tokens : list ,
583601 logprob_contents : list ,
584602 response_processor : ChatResponseProcessor ,
585603 ) -> ChatCompletionResponseChoice :
@@ -609,6 +627,7 @@ async def _create_chat_completion_choice(
609627 has_no_token_limit = request .max_tokens is None and request .max_completion_tokens is None
610628 max_tokens = request .max_completion_tokens or request .max_tokens
611629 num_cached_tokens [index ] = output .get ("num_cached_tokens" , 0 )
630+ num_image_tokens [index ] = output .get ("num_image_tokens" , 0 )
612631
613632 finish_reason = "stop"
614633 if has_no_token_limit or previous_num_tokens != max_tokens :
0 commit comments