
Commit 2a9ed72

feat: add support for API usage with multimodal models (#4548)
* feat: add support for API usage with multimodal models
* completion_tokens contains num_image_tokens
* remove test_request.py
* fix: paddle.device.is_compiled_with_cuda()
* fix test_unstream_without_logprobs
1 parent e1ac90d commit 2a9ed72

10 files changed, +256 −21 lines changed

fastdeploy/collect_env.py

Lines changed: 1 addition & 1 deletion
@@ -561,7 +561,7 @@ def get_env_info():

    if PADDLE_AVAILABLE:
        paddle_version_str = paddle.__version__
-       paddle_cuda_available_str = str(torch.cuda.is_available())
+       paddle_cuda_available_str = str(paddle.device.is_compiled_with_cuda())
        paddle_cuda_version_str = str(paddle.version.cuda())
    else:
        version_str = paddle_cuda_available_str = cuda_version_str = "N/A"
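
The fix swaps an accidental torch call for Paddle's own API. A minimal standalone sketch of the corrected check, outside collect_env.py, just to show the two Paddle calls involved:

# Minimal sketch: querying Paddle's CUDA build info (not the collect_env.py code itself).
import paddle

cuda_available = paddle.device.is_compiled_with_cuda()
cuda_version = str(paddle.version.cuda()) if cuda_available else "N/A"
print(f"Paddle built with CUDA: {cuda_available}, CUDA version: {cuda_version}")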

fastdeploy/engine/common_engine.py

Lines changed: 1 addition & 0 deletions
@@ -1068,6 +1068,7 @@ def _exit_sub_services(self):
        """
        exit sub services
        """
+       llm_logger.info("Exit sub services.....")
        self.running = False
        if hasattr(self, "engine_worker_queue_server") and self.engine_worker_queue_server is not None:
            self.engine_worker_queue_server.cleanup()

fastdeploy/entrypoints/openai/protocol.py

Lines changed: 3 additions & 0 deletions
@@ -66,6 +66,7 @@ class CompletionTokenUsageInfo(BaseModel):
    """

    reasoning_tokens: Optional[int] = None
+   image_tokens: Optional[int] = None


class PromptTokenUsageInfo(BaseModel):
@@ -74,6 +75,8 @@ class PromptTokenUsageInfo(BaseModel):
    """

    cached_tokens: Optional[int] = None
+   image_tokens: Optional[int] = None
+   video_tokens: Optional[int] = None


class UsageInfo(BaseModel):
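
For illustration, a hedged sketch of how these optional fields might be populated once a multimodal request is served; the numbers are invented, and in the server they come from the per-request outputs handled in the serving changes below:

# Illustrative only: a usage payload carrying the new multimodal token details.
from fastdeploy.entrypoints.openai.protocol import (
    CompletionTokenUsageInfo,
    PromptTokenUsageInfo,
    UsageInfo,
)

usage = UsageInfo(
    prompt_tokens=128,
    completion_tokens=96,  # includes any generated image tokens
    total_tokens=224,
    prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=0, image_tokens=64, video_tokens=0),
    completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=10, image_tokens=32),
)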

fastdeploy/entrypoints/openai/response_processors.py

Lines changed: 5 additions & 0 deletions
@@ -16,6 +16,7 @@

from typing import Any, List, Optional

+from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
from fastdeploy.utils import api_server_logger

@@ -104,6 +105,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
                image_output = self._end_image_code_request_output
                image_output["outputs"]["multipart"] = [image]
                image_output["outputs"]["token_ids"] = all_tokens
+               image_output["outputs"]["num_image_tokens"] = count_tokens(all_tokens)
                yield image_output

            self.data_processor.process_response_dict(
@@ -124,6 +126,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
                token_ids = request_output["outputs"]["token_ids"]
                if token_ids[-1] == self.eos_token_id:
                    multipart = []
+                   num_image_tokens = 0
                    for part in self._multipart_buffer:
                        if part["decode_type"] == 0:
                            self.data_processor.process_response_dict(
@@ -139,6 +142,7 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,
                        if self.decoder_client:
                            req_id = part["request_output"]["request_id"]
                            all_tokens = part["request_output"]["outputs"]["token_ids"]
+                           num_image_tokens += count_tokens(all_tokens)

                            image_ret = await self.decoder_client.decode_image(
                                request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
@@ -150,4 +154,5 @@ async def process_response_chat(self, request_outputs, stream, enable_thinking,

                    lasrt_request_output = self._multipart_buffer[-1]["request_output"]
                    lasrt_request_output["outputs"]["multipart"] = multipart
+                   lasrt_request_output["outputs"]["num_image_tokens"] = num_image_tokens
                    yield lasrt_request_output

fastdeploy/entrypoints/openai/serving_chat.py

Lines changed: 22 additions & 3 deletions
@@ -189,6 +189,8 @@ async def chat_completion_stream_generator(
        previous_num_tokens = [0] * num_choices
        reasoning_num_tokens = [0] * num_choices
        num_prompt_tokens = 0
+       num_cached_tokens = 0
+       num_image_tokens = [0] * num_choices
        tool_called = [False] * num_choices
        max_streaming_response_tokens = (
            request.max_streaming_response_tokens
@@ -321,6 +323,9 @@ async def chat_completion_stream_generator(
                    output_top_logprobs = output["top_logprobs"]
                    output_draft_top_logprobs = output["draft_top_logprobs"]
                    previous_num_tokens[idx] += len(output["token_ids"])
+                   if output.get("num_image_tokens"):
+                       previous_num_tokens[idx] += output.get("num_image_tokens")
+                       num_image_tokens[idx] += output.get("num_image_tokens")
                    reasoning_num_tokens[idx] += output.get("reasoning_token_num", 0)
                    logprobs_res: Optional[LogProbs] = None
                    draft_logprobs_res: Optional[LogProbs] = None
@@ -389,8 +394,10 @@ async def chat_completion_stream_generator(
                        prompt_tokens=num_prompt_tokens,
                        completion_tokens=previous_num_tokens[idx],
                        total_tokens=num_prompt_tokens + previous_num_tokens[idx],
+                       prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
                        completion_tokens_details=CompletionTokenUsageInfo(
-                           reasoning_tokens=reasoning_num_tokens[idx]
+                           reasoning_tokens=reasoning_num_tokens[idx],
+                           image_tokens=num_image_tokens[idx],
                        ),
                    )
                    choices.append(choice)
@@ -409,7 +416,10 @@ async def chat_completion_stream_generator(
                    prompt_tokens=num_prompt_tokens,
                    completion_tokens=completion_tokens,
                    total_tokens=num_prompt_tokens + completion_tokens,
-                   completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=reasoning_tokens),
+                   prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
+                   completion_tokens_details=CompletionTokenUsageInfo(
+                       image_tokens=sum(num_image_tokens), reasoning_tokens=reasoning_tokens
+                   ),
                )
                chunk = ChatCompletionStreamResponse(
                    id=request_id,
@@ -466,6 +476,7 @@ async def chat_completion_full_generator(
        draft_logprob_contents = [[] for _ in range(num_choices)]
        completion_token_ids = [[] for _ in range(num_choices)]
        num_cached_tokens = [0] * num_choices
+       num_image_tokens = [0] * num_choices
        response_processor = ChatResponseProcessor(
            data_processor=self.engine_client.data_processor,
            enable_mm_output=self.enable_mm_output,
@@ -531,6 +542,9 @@ async def chat_completion_full_generator(
                if data["finished"]:
                    num_choices -= 1
                    reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0)
+                   if data["outputs"].get("image_token_num"):
+                       previous_num_tokens[idx] += data["outputs"].get("image_token_num")
+                       num_image_tokens[idx] = data["outputs"].get("image_token_num")
                    choice = await self._create_chat_completion_choice(
                        output=output,
                        index=idx,
@@ -540,6 +554,7 @@ async def chat_completion_full_generator(
                        prompt_tokens=prompt_tokens,
                        completion_token_ids=completion_token_ids[idx],
                        num_cached_tokens=num_cached_tokens,
+                       num_image_tokens=num_image_tokens,
                        logprob_contents=logprob_contents,
                        response_processor=response_processor,
                    )
@@ -557,7 +572,9 @@ async def chat_completion_full_generator(
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
            prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=sum(num_cached_tokens)),
-           completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
+           completion_tokens_details=CompletionTokenUsageInfo(
+               reasoning_tokens=num_reasoning_tokens, image_tokens=sum(num_image_tokens)
+           ),
        )
        choices = sorted(choices, key=lambda x: x.index)
        res = ChatCompletionResponse(
@@ -580,6 +597,7 @@ async def _create_chat_completion_choice(
        prompt_tokens: str,
        completion_token_ids: list,
        num_cached_tokens: list,
+       num_image_tokens: list,
        logprob_contents: list,
        response_processor: ChatResponseProcessor,
    ) -> ChatCompletionResponseChoice:
@@ -609,6 +627,7 @@ async def _create_chat_completion_choice(
        has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
        max_tokens = request.max_completion_tokens or request.max_tokens
        num_cached_tokens[index] = output.get("num_cached_tokens", 0)
+       num_image_tokens[index] = output.get("num_image_tokens", 0)

        finish_reason = "stop"
        if has_no_token_limit or previous_num_tokens != max_tokens:
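
Both the streaming and full generators follow the same accumulation pattern: a per-choice counter picks up any image tokens reported by the output, folds them into the completion-token total, and also reports them separately under completion_tokens_details. A simplified standalone sketch of that logic (it mirrors the diff but is not the server loop itself):

# Simplified sketch of the per-choice image-token accumulation (not the actual server code).
num_choices = 2
previous_num_tokens = [0] * num_choices  # completion tokens per choice, image tokens included
num_image_tokens = [0] * num_choices     # image tokens per choice, reported separately

def accumulate(idx: int, output: dict) -> None:
    previous_num_tokens[idx] += len(output.get("token_ids", []))
    if output.get("num_image_tokens"):
        # image tokens count toward completion_tokens and are also broken out in the details
        previous_num_tokens[idx] += output["num_image_tokens"]
        num_image_tokens[idx] += output["num_image_tokens"]

accumulate(0, {"token_ids": [101, 102, 103], "num_image_tokens": 4})
assert previous_num_tokens[0] == 7 and num_image_tokens[0] == 4

serving_completion.py below applies the same per-choice pattern for the /completions endpoint.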

fastdeploy/entrypoints/openai/serving_completion.py

Lines changed: 20 additions & 3 deletions
@@ -33,6 +33,7 @@
    CompletionTokenUsageInfo,
    ErrorInfo,
    ErrorResponse,
+   PromptTokenUsageInfo,
    UsageInfo,
)
from fastdeploy.utils import (
@@ -370,6 +371,8 @@ async def completion_stream_generator(
            req_id = f"{request_id}_{i}"
            dealer.write([b"", req_id.encode("utf-8")])  # send the multiplexed requests
        output_tokens = [0] * num_choices
+       num_cache_tokens = [0] * num_choices
+       num_image_tokens = [0] * num_choices
        inference_start_time = [0] * num_choices
        reasoning_tokens = [0] * num_choices
        first_iteration = [True] * num_choices
@@ -459,7 +462,11 @@ async def completion_stream_generator(
                        draft_logprobs_res = self._create_completion_logprobs(
                            output_draft_top_logprobs, request.logprobs, 0
                        )
-                   output_tokens[idx] += 1
+                   output_tokens[idx] += len(output.get("token_ids", [])) or 0
+                   num_cache_tokens[idx] += output.get("num_cache_tokens") or 0
+                   if output.get("num_image_tokens"):
+                       output_tokens[idx] += output.get("num_image_tokens")
+                       num_image_tokens[idx] += output.get("num_image_tokens")
                    reasoning_tokens[idx] += output.get("reasoning_token_num", 0)
                    delta_message = CompletionResponseStreamChoice(
                        index=idx,
@@ -527,8 +534,9 @@ async def completion_stream_generator(
                                prompt_batched_token_ids[idx // (1 if request.n is None else request.n)]
                            )
                            + output_tokens[idx],
+                           prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens[idx]),
                            completion_tokens_details=CompletionTokenUsageInfo(
-                               reasoning_tokens=reasoning_tokens[idx]
+                               image_tokens=num_image_tokens[idx], reasoning_tokens=reasoning_tokens[idx]
                            ),
                        ),
                    )
@@ -559,6 +567,8 @@ def request_output_to_completion_response(
        choices: List[CompletionResponseChoice] = []
        num_prompt_tokens = 0
        num_generated_tokens = 0
+       num_cache_tokens = 0
+       num_image_tokens = 0
        num_reasoning_tokens = 0

        for idx in range(len(final_res_batch)):
@@ -614,6 +624,10 @@ def request_output_to_completion_response(
                num_generated_tokens += final_res["output_token_ids"]

            num_prompt_tokens += len(prompt_token_ids)
+           num_cache_tokens += output.get("num_cache_tokens") or 0
+           if output.get("num_image_tokens"):
+               num_generated_tokens += output.get("num_image_tokens")
+               num_image_tokens += output.get("num_image_tokens")

            num_reasoning_tokens += output.get("reasoning_token_num", 0)

@@ -622,7 +636,10 @@ def request_output_to_completion_response(
            prompt_tokens=num_prompt_tokens,
            completion_tokens=num_generated_tokens,
            total_tokens=num_prompt_tokens + num_generated_tokens,
-           completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
+           prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens),
+           completion_tokens_details=CompletionTokenUsageInfo(
+               reasoning_tokens=num_reasoning_tokens, image_tokens=num_image_tokens
+           ),
        )
        del request
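
A small detail in the counters above: output.get("num_cache_tokens") or 0 is used rather than output.get("num_cache_tokens", 0), because the former also normalises an explicitly stored None before it is added to a running total. A tiny sketch of the difference:

# Why "or 0": a .get default does not protect against an explicitly stored None.
output = {"num_cache_tokens": None}

print(output.get("num_cache_tokens", 0))    # None -> would break "+=" accumulation
print(output.get("num_cache_tokens") or 0)  # 0    -> safe to add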

fastdeploy/entrypoints/openai/usage_calculator.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import numpy as np
+
+
+def count_tokens(tokens):
+    """
+    Count the number of tokens in a nested list or array structure.
+    """
+    count = 0
+    stack = [tokens]
+    while stack:
+        current = stack.pop()
+        if isinstance(current, (list, tuple, np.ndarray)):
+            for item in reversed(current):
+                stack.append(item)
+        else:
+            count += 1
+    return count
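
count_tokens flattens arbitrarily nested containers iteratively, so it handles the mixed lists, tuples, and numpy arrays found in multimodal token_ids buffers without hitting recursion limits. A quick usage sketch with made-up data:

# Example use of count_tokens on nested token structures (illustrative data only).
import numpy as np

from fastdeploy.entrypoints.openai.usage_calculator import count_tokens

print(count_tokens([1, 2, 3]))                          # 3
print(count_tokens([[1, 2], (3,), np.array([4, 5])]))   # 5
print(count_tokens(7))                                  # 1: a bare scalar counts as one token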

tests/ce/server/test_logprobs.py

Lines changed: 6 additions & 14 deletions
@@ -32,13 +32,9 @@ def test_unstream_with_logprobs():
        "bytes": [231, 137, 155, 233, 161, 191],
        "top_logprobs": None,
    }
-   assert resp_json["usage"] == {
-       "prompt_tokens": 22,
-       "total_tokens": 25,
-       "completion_tokens": 3,
-       "prompt_tokens_details": {"cached_tokens": 0},
-       "completion_tokens_details": {"reasoning_tokens": 0},
-   }
+   assert resp_json["usage"]["prompt_tokens"] == 22
+   assert resp_json["usage"]["completion_tokens"] == 3
+   assert resp_json["usage"]["total_tokens"] == 25


def test_unstream_without_logprobs():
@@ -65,13 +61,9 @@ def test_unstream_without_logprobs():
    # verify the returned content and the logprobs field
    assert resp_json["choices"][0]["message"]["content"] == "牛顿的"
    assert resp_json["choices"][0]["logprobs"] is None
-   assert resp_json["usage"] == {
-       "prompt_tokens": 22,
-       "total_tokens": 25,
-       "completion_tokens": 3,
-       "prompt_tokens_details": {"cached_tokens": 0},
-       "completion_tokens_details": {"reasoning_tokens": 0},
-   }
+   assert resp_json["usage"]["prompt_tokens"] == 22
+   assert resp_json["usage"]["completion_tokens"] == 3
+   assert resp_json["usage"]["total_tokens"] == 25


def test_stream_with_logprobs():

tests/entrypoints/openai/test_max_streaming_tokens.py

Lines changed: 7 additions & 0 deletions
@@ -388,6 +388,7 @@ async def test_create_chat_completion_choice(self):
                    "reasoning_content": "Normal reasoning",
                    "tool_call": None,
                    "num_cached_tokens": 3,
+                   "num_image_tokens": 2,
                    "raw_prediction": "raw_answer_0",
                },
                "finished": True,
@@ -403,6 +404,7 @@ async def test_create_chat_completion_choice(self):
                    "tool_calls": None,
                    "raw_prediction": "raw_answer_0",
                    "num_cached_tokens": 3,
+                   "num_image_tokens": 2,
                    "finish_reason": "stop",
                },
            },
@@ -415,6 +417,7 @@ async def test_create_chat_completion_choice(self):
                    "reasoning_content": None,
                    "tool_call": None,
                    "num_cached_tokens": 0,
+                   "num_image_tokens": 0,
                    "raw_prediction": None,
                },
                "finished": True,
@@ -430,6 +433,7 @@ async def test_create_chat_completion_choice(self):
                    "tool_calls": None,
                    "raw_prediction": None,
                    "num_cached_tokens": 0,
+                   "num_image_tokens": 0,
                    "finish_reason": "stop",
                },
            },
@@ -442,6 +446,7 @@ async def test_create_chat_completion_choice(self):
        mock_response_processor.enable_multimodal_content.return_value = False
        completion_token_ids = [[], []]
        num_cached_tokens = [0, 0]
+       num_image_tokens = [0, 0]

        for idx, case in enumerate(test_cases):
            actual_choice = await self.chat_serving._create_chat_completion_choice(
@@ -453,6 +458,7 @@ async def test_create_chat_completion_choice(self):
                prompt_tokens=prompt_tokens,
                completion_token_ids=completion_token_ids[idx],
                num_cached_tokens=num_cached_tokens,
+               num_image_tokens=num_image_tokens,
                logprob_contents=logprob_contents,
                response_processor=mock_response_processor,
            )
@@ -468,6 +474,7 @@ async def test_create_chat_completion_choice(self):
            self.assertEqual(actual_choice.message.completion_token_ids, completion_token_ids[idx])

            self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
+           self.assertEqual(num_image_tokens[expected["index"]], expected["num_image_tokens"])
            self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])
            assert actual_choice.logprobs is not None
