
Commit ee9d8a8

[fix] Rename the continuation (resume-generation) parameters and revise the thinking-length validation (#4086)
* Rename the continuation parameter generated_token_ids to completion_token_ids; change how the thinking length is validated * add completion_token_ids * add logger * fix reasoning_max_tokens ParameterError * add unit tests
1 parent 66a98b4 commit ee9d8a8

File tree

6 files changed, +75 -24 lines changed

fastdeploy/entrypoints/engine_client.py

Lines changed: 7 additions & 2 deletions
@@ -236,8 +236,13 @@ def valid_parameters(self, data):
             raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).")
 
         if data.get("reasoning_max_tokens") is not None:
-            if data["reasoning_max_tokens"] > data["max_tokens"] or data["reasoning_max_tokens"] < 1:
-                raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be between max_tokens and 1")
+            if data["reasoning_max_tokens"] < 1:
+                raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1")
+            if data["reasoning_max_tokens"] > data["max_tokens"]:
+                data["reasoning_max_tokens"] = data["max_tokens"]
+                api_server_logger.warning(
+                    f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to match that of max_tokens"
+                )
 
         # logprobs
         logprobs = data.get("logprobs")
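For readers skimming the diff: a reasoning_max_tokens below 1 is now rejected outright, while a value larger than max_tokens is clamped to max_tokens with a warning instead of being rejected. Below is a minimal standalone sketch of that logic; the ParameterError class and the logger are simplified stand-ins, not the FastDeploy implementations.

# Minimal sketch of the new reasoning_max_tokens handling (stand-in error class and logger).
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("api_server")


class ParameterError(ValueError):
    """Stand-in for FastDeploy's ParameterError."""

    def __init__(self, param, message):
        super().__init__(f"{param}: {message}")


def check_reasoning_max_tokens(data: dict) -> dict:
    if data.get("reasoning_max_tokens") is not None:
        if data["reasoning_max_tokens"] < 1:
            raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1")
        if data["reasoning_max_tokens"] > data["max_tokens"]:
            # Clamp instead of rejecting, and warn so the caller can see the adjustment.
            data["reasoning_max_tokens"] = data["max_tokens"]
            logger.warning(
                "req_id: %s, reasoning_max_tokens exceeds max_tokens, adjusted to max_tokens",
                data.get("request_id"),
            )
    return data


print(check_reasoning_max_tokens({"request_id": "r1", "max_tokens": 10, "reasoning_max_tokens": 20}))
# -> {'request_id': 'r1', 'max_tokens': 10, 'reasoning_max_tokens': 10}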

fastdeploy/entrypoints/openai/protocol.py

Lines changed: 4 additions & 0 deletions
@@ -593,6 +593,7 @@ class ChatCompletionRequest(BaseModel):
     prompt_token_ids: Optional[List[int]] = None
     max_streaming_response_tokens: Optional[int] = None
     disable_chat_template: Optional[bool] = False
+    completion_token_ids: Optional[List[int]] = None
     # doc: end-chat-completion-extra-params
 
     def to_dict_for_infer(self, request_id=None):
@@ -618,6 +619,9 @@ def to_dict_for_infer(self, request_id=None):
             ), "The parameter `raw_request` is not supported now, please use completion api instead."
             for key, value in self.metadata.items():
                 req_dict[key] = value
+            from fastdeploy.utils import api_server_logger
+
+            api_server_logger.warning("The parameter metadata is obsolete.")
         for key, value in self.dict().items():
             if value is not None:
                 req_dict[key] = value
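With the new ChatCompletionRequest field in place, clients can pass previously returned tokens back through the OpenAI-compatible API via extra_body instead of the now-deprecated metadata. A hedged usage sketch mirroring the e2e test added further down; the base_url, port, and api_key are placeholders for a locally running FastDeploy api_server, not values taken from this diff.

# Usage sketch (assumed local server address); completion_token_ids carries the
# previously generated tokens so the server resumes from them.
import openai

client = openai.OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello"}],
    extra_body={
        "completion_token_ids": [94936],  # tokens from an earlier response
        "return_token_ids": True,
    },
    max_tokens=10,
    stream=False,
)
print(response.choices[0].message.content)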

fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py

Lines changed: 6 additions & 8 deletions
@@ -241,10 +241,8 @@ def process_request_dict(self, request, max_model_len=None):
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        metadata = request.get("metadata")
-        # If metadata contains previously generated tokens, append them to the end of input_ids
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
         outputs = self.pack_outputs(outputs)
         request["prompt_token_ids"] = outputs["input_ids"].tolist()
         request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
@@ -259,11 +257,11 @@ def process_request_dict(self, request, max_model_len=None):
 
         return request
 
-    def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
-        "append already generated tokens"
+    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
+        "append completion tokens"
 
-        num_tokens = len(generated_token_ids)
-        multimodal_inputs["input_ids"].extend(generated_token_ids)
+        num_tokens = len(completion_token_ids)
+        multimodal_inputs["input_ids"].extend(completion_token_ids)
         multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
 
         start = multimodal_inputs["cur_position"]
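The renamed helper does the same work as before: the completion tokens are appended to the flattened multimodal inputs as plain text tokens, and the running position counter advances past them. A standalone sketch of that bookkeeping follows; IDS_TYPE_FLAG and the flat 1-D position handling are simplified assumptions, since the real processor maintains richer position ids.

# Simplified sketch of append_completion_tokens' bookkeeping (assumed flat positions).
IDS_TYPE_FLAG = {"text": 0, "image": 1}  # stand-in for the processor's constant


def append_completion_tokens(multimodal_inputs: dict, completion_token_ids: list) -> None:
    num_tokens = len(completion_token_ids)
    # Completion tokens are ordinary text tokens as far as the model input is concerned.
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
    # Positions continue from wherever the prompt left off.
    start = multimodal_inputs["cur_position"]
    multimodal_inputs["position_ids"].extend(range(start, start + num_tokens))
    multimodal_inputs["cur_position"] = start + num_tokens


inputs = {"input_ids": [1, 2, 3], "token_type_ids": [0, 0, 0], "position_ids": [0, 1, 2], "cur_position": 3}
append_completion_tokens(inputs, [94936, 94937])
print(inputs["input_ids"])     # [1, 2, 3, 94936, 94937]
print(inputs["cur_position"])  # 5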

fastdeploy/input/qwen_vl_processor/qwen_vl_processor.py

Lines changed: 6 additions & 10 deletions
@@ -245,15 +245,11 @@ def process_request_dict(self, request, max_model_len=None):
         else:
             raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
 
-        metadata = request.get("metadata")
         # Handle continuation of previous generation by appending existing tokens
-        if metadata and metadata.get("generated_token_ids"):
-            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
+        if request.get("completion_token_ids"):
+            self.append_completion_tokens(outputs, request["completion_token_ids"])
 
         enable_thinking = False
-        if metadata:
-            enable_thinking = metadata.get("enable_thinking", False)
-
         if request.get("chat_template_kwargs"):
             chat_template_kwargs = request.get("chat_template_kwargs")
             enable_thinking = chat_template_kwargs.get("enable_thinking", False)
@@ -278,16 +274,16 @@ def process_request_dict(self, request, max_model_len=None):
 
         return request
 
-    def append_generated_tokens(self, outputs, generated_token_ids):
+    def append_completion_tokens(self, outputs, completion_token_ids):
         """
-        Append generated tokens to existing outputs.
+        Append completion tokens to existing outputs.
 
         Args:
             outputs: Current model outputs
-            generated_token_ids: Generated tokens to append
+            completion_token_ids: completion tokens to append
         """
         out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
-        self.processor._add_text(generated_token_ids, out)
+        self.processor._add_text(completion_token_ids, out)
 
         outputs["input_ids"] = np.concatenate(
             [outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0

tests/e2e/test_EB_VL_Lite_serving.py

Lines changed: 50 additions & 0 deletions
@@ -255,6 +255,16 @@ def test_consistency_between_runs(api_url, headers, consistent_payload):
     assert content1 == content2
 
 
+def test_with_metadata(api_url, headers, consistent_payload):
+    """
+    Test that a request carrying enable_thinking via metadata still succeeds.
+    """
+    # request
+    consistent_payload["metadata"] = {"enable_thinking": True}
+    resp1 = requests.post(api_url, headers=headers, json=consistent_payload)
+    assert resp1.status_code == 200
+
+
 # ==========================
 # OpenAI Client Chat Completion Test
 # ==========================
@@ -555,6 +565,46 @@ def test_chat_with_thinking(openai_client, capsys):
     assert reasoning_tokens <= reasoning_max_tokens
 
 
+def test_chat_with_completion_token_ids(openai_client):
+    """Test completion_token_ids"""
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Hello"}],
+        extra_body={
+            "completion_token_ids": [94936],
+            "return_token_ids": True,
+            "reasoning_max_tokens": 20,
+            "max_tokens": 10,
+        },
+        max_tokens=10,
+        stream=False,
+    )
+    assert hasattr(response, "choices")
+    assert len(response.choices) > 0
+    assert hasattr(response.choices[0], "message")
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
+    assert isinstance(response.choices[0].message.prompt_token_ids, list)
+    assert 94936 in response.choices[0].message.prompt_token_ids
+
+
+def test_chat_with_reasoning_max_tokens(openai_client):
+    """Test that a reasoning_max_tokens below 1 is rejected"""
+    assertion_executed = False
+    try:
+        openai_client.chat.completions.create(
+            model="default",
+            messages=[{"role": "user", "content": "Hello"}],
+            extra_body={"completion_token_ids": [18900], "return_token_ids": True, "reasoning_max_tokens": -1},
+            max_tokens=10,
+            stream=False,
+        )
+    except openai.InternalServerError as e:
+        error_message = str(e)
+        assertion_executed = True
+        assert "reasoning_max_tokens must be greater than 1" in error_message
+    assert assertion_executed, "Assertion was not executed (no exception raised)"
+
+
 def test_profile_reset_block_num():
     """Test the profile reset_block_num feature; the diff from the baseline must not exceed 5%"""
     log_file = "./log/config.log"
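For completeness, the request the new tests exercise can also be issued with plain requests, the way the existing api_url/headers fixtures do. A hedged sketch; the host, port, and endpoint path are assumptions about a locally running api_server, not values taken from this diff.

# Raw-HTTP sketch of the new completion_token_ids request (assumed local endpoint).
import requests

api_url = "http://localhost:8188/v1/chat/completions"  # placeholder address
headers = {"Content-Type": "application/json"}
payload = {
    "model": "default",
    "messages": [{"role": "user", "content": "Hello"}],
    "max_tokens": 10,
    "completion_token_ids": [94936],
    "return_token_ids": True,
}

resp = requests.post(api_url, headers=headers, json=payload)
assert resp.status_code == 200
print(resp.json()["choices"][0]["message"]["content"])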

tests/input/test_qwen_vl_processor.py

Lines changed: 2 additions & 4 deletions
@@ -176,12 +176,10 @@ def test_process_request_dict(self):
         3. Video processing produces expected output dimensions
         4. Correct counts for images (1) and videos (1)
         """
-        num_generated_token_ids = 10
+        num_completion_token_ids = 10
         request = {
             "request_id": "12345",
-            "metadata": {
-                "generated_token_ids": [1] * num_generated_token_ids,
-            },
+            "completion_token_ids": [1] * num_completion_token_ids,
             "stop": ["stop", "eof"],
             "messages": [
                 {