
Commit ee9d8a8

[fix] Rename the continuation (resume-generation) parameters and revise the thinking-length validation (#4086)
* Rename the continuation parameter generated_token_ids to completion_token_ids; change how the thinking length is validated * add completion_token_ids * add logger * fix reasoning_max_tokens ParameterError * add unit tests
1 parent 66a98b4 commit ee9d8a8

File tree

6 files changed, +75 -24 lines changed

fastdeploy/entrypoints/engine_client.py

Lines changed: 7 additions & 2 deletions
@@ -236,8 +236,13 @@ def valid_parameters(self, data):
             raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).")
 
         if data.get("reasoning_max_tokens") is not None:
-            if data["reasoning_max_tokens"] > data["max_tokens"] or data["reasoning_max_tokens"] < 1:
-                raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be between max_tokens and 1")
+            if data["reasoning_max_tokens"] < 1:
+                raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1")
+            if data["reasoning_max_tokens"] > data["max_tokens"]:
+                data["reasoning_max_tokens"] = data["max_tokens"]
+                api_server_logger.warning(
+                    f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens"
+                )
 
         # logprobs
         logprobs = data.get("logprobs")
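For readers skimming the diff: a reasoning_max_tokens below 1 is now rejected outright, while a value larger than max_tokens is clamped to max_tokens with a warning instead of being rejected. Below is a minimal standalone sketch of that logic; the ParameterError class and the logger are simplified stand-ins, not the FastDeploy implementations.

# Minimal sketch of the new reasoning_max_tokens handling (stand-in error class and logger).
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("api_server")


class ParameterError(ValueError):
    """Stand-in for FastDeploy's ParameterError."""

    def __init__(self, param, message):
        super().__init__(f"{param}: {message}")


def check_reasoning_max_tokens(data: dict) -> dict:
    if data.get("reasoning_max_tokens") is not None:
        if data["reasoning_max_tokens"] < 1:
            raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1")
        if data["reasoning_max_tokens"] > data["max_tokens"]:
            # Clamp instead of rejecting, and warn so the caller can see the adjustment.
            data["reasoning_max_tokens"] = data["max_tokens"]
            logger.warning(
                "req_id: %s, reasoning_max_tokens exceeds max_tokens, adjusted to max_tokens",
                data.get("request_id"),
            )
    return data


print(check_reasoning_max_tokens({"request_id": "r1", "max_tokens": 10, "reasoning_max_tokens": 20}))
# -> {'request_id': 'r1', 'max_tokens': 10, 'reasoning_max_tokens': 10}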

fastdeploy/entrypoints/openai/protocol.py

Lines changed: 4 additions & 0 deletions
@@ -593,6 +593,7 @@ class ChatCompletionRequest(BaseModel):
     prompt_token_ids: Optional[List[int]] = None
     max_streaming_response_tokens: Optional[int] = None
     disable_chat_template: Optional[bool] = False
+    completion_token_ids: Optional[List[int]] = None
     # doc: end-chat-completion-extra-params
 
     def to_dict_for_infer(self, request_id=None):
@@ -618,6 +619,9 @@ def to_dict_for_infer(self, request_id=None):
             ), "The parameter `raw_request` is not supported now, please use completion api instead."
             for key, value in self.metadata.items():
                 req_dict[key] = value
+            from fastdeploy.utils import api_server_logger
+
+            api_server_logger.warning("The parameter metadata is obsolete.")
         for key, value in self.dict().items():
             if value is not None:
                 req_dict[key] = value
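With the new ChatCompletionRequest field in place, clients can pass previously returned tokens back through the OpenAI-compatible API via extra_body instead of the now-deprecated metadata. A hedged usage sketch mirroring the e2e test added further down; the base_url, port, and api_key are placeholders for a locally running FastDeploy api_server, not values taken from this diff.

# Usage sketch (assumed local server address); completion_token_ids carries the
# previously generated tokens so the server resumes from them.
import openai

client = openai.OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello"}],
    extra_body={
        "completion_token_ids": [94936],  # tokens from an earlier response
        "return_token_ids": True,
    },
    max_tokens=10,
    stream=False,
)
print(response.choices[0].message.content)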

fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py

Lines changed: 6 additions & 8 deletions
@@ -241,10 +241,8 @@ def process_request_dict(self, request, max_model_len=None):
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        metadata = request.get("metadata")
-        # If metadata contains previously generated tokens, append them to the end of input_ids
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
         outputs = self.pack_outputs(outputs)
         request["prompt_token_ids"] = outputs["input_ids"].tolist()
         request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
@@ -259,11 +257,11 @@ def process_request_dict(self, request, max_model_len=None):
 
         return request
 
-    def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
-        "append already generated tokens"
+    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
+        "append completion tokens"
 
-        num_tokens = len(generated_token_ids)
-        multimodal_inputs["input_ids"].extend(generated_token_ids)
+        num_tokens = len(completion_token_ids)
+        multimodal_inputs["input_ids"].extend(completion_token_ids)
         multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
 
         start = multimodal_inputs["cur_position"]
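The renamed helper does the same work as before: the completion tokens are appended to the flattened multimodal inputs as plain text tokens, and the running position counter advances past them. A standalone sketch of that bookkeeping follows; IDS_TYPE_FLAG and the flat 1-D position handling are simplified assumptions, since the real processor maintains richer position ids.

# Simplified sketch of append_completion_tokens' bookkeeping (assumed flat positions).
IDS_TYPE_FLAG = {"text": 0, "image": 1}  # stand-in for the processor's constant


def append_completion_tokens(multimodal_inputs: dict, completion_token_ids: list) -> None:
    num_tokens = len(completion_token_ids)
    # Completion tokens are ordinary text tokens as far as the model input is concerned.
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
    # Positions continue from wherever the prompt left off.
    start = multimodal_inputs["cur_position"]
    multimodal_inputs["position_ids"].extend(range(start, start + num_tokens))
    multimodal_inputs["cur_position"] = start + num_tokens


inputs = {"input_ids": [1, 2, 3], "token_type_ids": [0, 0, 0], "position_ids": [0, 1, 2], "cur_position": 3}
append_completion_tokens(inputs, [94936, 94937])
print(inputs["input_ids"])     # [1, 2, 3, 94936, 94937]
print(inputs["cur_position"])  # 5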

fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py

Lines changed: 6 additions & 10 deletions
@@ -245,15 +245,11 @@ def process_request_dict(self, request, max_model_len=None):
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        metadata = request.get("metadata")
         # Handle continuation of previous generation by appending existing tokens
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
 
         enable_thinking = False
-        if metadata:
-            enable_thinking = metadata.get("enable_thinking", False)
-
         if request.get("chat_template_kwargs"):
             chat_template_kwargs = request.get("chat_template_kwargs")
             enable_thinking = chat_template_kwargs.get("enable_thinking", False)
@@ -278,16 +274,16 @@ def process_request_dict(self, request, max_model_len=None):
 
         return request
 
-    def append_generated_tokens(self, outputs, generated_token_ids):
+    def append_completion_tokens(self, outputs, completion_token_ids):
         """
-        Append generated tokens to existing outputs.
+        Append completion tokens to existing outputs.
 
         Args:
             outputs: Current model outputs
-            generated_token_ids: Generated tokens to append
+            completion_token_ids: completion tokens to append
         """
         out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
-        self.processor._add_text(generated_token_ids, out)
+        self.processor._add_text(completion_token_ids, out)
 
         outputs["input_ids"] = np.concatenate(
             [outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0

tests/e2e/test_EB_VL_Lite_serving.py

Lines changed: 50 additions & 0 deletions
@@ -255,6 +255,16 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
     assert content1 == content2
 
 
+def test_with_metadata(api_url, headers, consistent_payload):
+    """
+    Test that a request carrying enable_thinking via metadata still succeeds.
+    """
+    # request
+    consistent_payload["metadata"] = {"enable_thinking": True}
+    resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
+    assert resp1.status_code == 200
+
+
 # ==========================
 # OpenAI Client Chat Completion Test
 # ==========================
@@ -555,6 +565,46 @@ def test_chat_with_thinking(openai_client, capsys):
     assert reasoning_tokens <= reasoning_max_tokens
 
 
+def test_chat_with_completion_token_ids(openai_client):
+    """Test completion_token_ids"""
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello"}],
+        extra_body={
+            "completion_token_ids": [94936],
+            "return_token_ids": True,
+            "reasoning_max_tokens": 20,
+            "max_tokens": 10,
+        },
+        max_tokens=10,
+        stream=False,
+    )
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert 94936 in response.choices[0].message.prompt_token_ids
+
+
+def test_chat_with_reasoning_max_tokens(openai_client):
+    """Test that a reasoning_max_tokens below 1 is rejected"""
+    assertion_executed = False
+    try:
+        openai_client.chat.completions.create(
+            model="default",
+            messages=[{"role": "user", "content": "Hello"}],
+            extra_body={"completion_token_ids": [18900], "return_token_ids": True, "reasoning_max_tokens": -1},
+            max_tokens=10,
+            stream=False,
+        )
+    except openai.InternalServerError as e:
+        error_message = str(e)
+        assertion_executed = True
+        assert "reasoning_max_tokens must be greater than 1" in error_message
+    assert assertion_executed, "Assertion was not executed (no exception raised)"
+
+
 def test_profile_reset_block_num():
     """Test the profile reset_block_num feature; the diff from the baseline must not exceed 5%"""
     log_file = "./log/config.log"
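For completeness, the request the new tests exercise can also be issued with plain requests, the way the existing api_url/headers fixtures do. A hedged sketch; the host, port, and endpoint path are assumptions about a locally running api_server, not values taken from this diff.

# Raw-HTTP sketch of the new completion_token_ids request (assumed local endpoint).
import requests

api_url = "http://localhost:8188/v1/chat/completions"  # placeholder address
headers = {"Content-Type": "application/json"}
payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 10,
    "completion_token_ids": [94936],
    "return_token_ids": True,
}

resp = requests.post(api_url, headers=headers, json=payload)
assert resp.status_code == 200
print(resp.json()["choices"][0]["message"]["content"])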

tests/input/test_qwen_vl_processor.py

Lines changed: 2 additions & 4 deletions
@@ -176,12 +176,10 @@ def test_process_request_dict(self):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        num_generated_token_ids = 10
+        num_completion_token_ids = 10
         request = {
             "request_id": "12345",
-            "metadata": {
-                "generated_token_ids": [1] * num_generated_token_ids,
-            },
+            "completion_token_ids": [1] * num_completion_token_ids,
             "stop": ["stop", "eof"],
             "messages": [
                 {