From 654c457332c6a731578e70382a8563abd5f681a3 Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Fri, 1 Nov 2024 20:05:37 +0800
Subject: [PATCH] Support min_tokens, min_p parameters for api_server (#2681)

* Support min_tokens for api_server

* fix

* use min_new_tokens

* add min_p
---
 lmdeploy/serve/openai/api_server.py | 16 ++++++++++++++++
 lmdeploy/serve/openai/protocol.py   |  4 ++++
 2 files changed, 20 insertions(+)

diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 019a617acd..a12cadaa7d 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -323,6 +323,12 @@ async def chat_completions_v1(request: ChatCompletionRequest,
     - ignore_eos (bool): indicator for ignoring eos
     - skip_special_tokens (bool): Whether or not to remove special tokens
         in the decoding. Default to be True.
+    - min_new_tokens (int): Minimum number of tokens to generate.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, which is
+        comparable in selectivity to setting `top_p` in the 0.99-0.8
+        range (use the opposite of normal `top_p` values).

     Currently we do not support the following features:
     - presence_penalty (replaced with repetition_penalty)
@@ -386,6 +392,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
         skip_special_tokens=request.skip_special_tokens,
         response_format=response_format,
         logits_processors=logits_processors,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)

     tools = None
@@ -826,6 +834,12 @@ async def chat_interactive_v1(request: GenerateRequest,
         in the decoding. Default to be True.
     - adapter_name (str): For slora inference. Choose which lora to do the
         inference.
+    - min_new_tokens (int): Minimum number of tokens to generate.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, which is
+        comparable in selectivity to setting `top_p` in the 0.99-0.8
+        range (use the opposite of normal `top_p` values).
     """
     if request.cancel:
         if request.session_id != -1:
@@ -867,6 +881,8 @@ async def chat_interactive_v1(request: GenerateRequest,
         ignore_eos=request.ignore_eos,
         stop_words=request.stop,
         skip_special_tokens=request.skip_special_tokens,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
     if request.image_url:
         from lmdeploy.vl import load_image
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index bd54028c39..d4bf8ed315 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -137,6 +137,8 @@ class ChatCompletionRequest(BaseModel):
     skip_special_tokens: Optional[bool] = True
     top_k: Optional[int] = 40
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0


 class FunctionResponse(BaseModel):
@@ -339,6 +341,8 @@ class GenerateRequest(BaseModel):
     cancel: Optional[bool] = False  # cancel a responding request
     adapter_name: Optional[str] = Field(default=None, examples=[None])
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0


 class GenerateResponse(BaseModel):
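
Usage note (editorial, not part of the patch): the sketch below shows how a client could pass the new min_new_tokens and min_p fields to the OpenAI-compatible /v1/chat/completions endpoint once this change is deployed. The host/port, served model name, and sample values are assumptions, not taken from the patch; adjust them to your deployment.

# Minimal client-side sketch. Assumptions: api_server is reachable at
# localhost:23333 and a model named "internlm2" is being served.
import requests

payload = {
    "model": "internlm2",  # hypothetical served model name
    "messages": [{"role": "user", "content": "Write a haiku about autumn."}],
    "min_new_tokens": 16,  # new field: generate at least 16 tokens
    "min_p": 0.05,         # new field: drop tokens below 5% of the top token's probability
    "temperature": 0.8,
}

resp = requests.post("http://localhost:23333/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])

Because the new protocol fields default to min_new_tokens=None and min_p=0.0, requests that omit them keep the previous sampling behavior.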