diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 019a617acd..a12cadaa7d 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -323,6 +323,12 @@ async def chat_completions_v1(request: ChatCompletionRequest,
     - ignore_eos (bool): indicator for ignoring eos
     - skip_special_tokens (bool): Whether or not to remove special tokens
         in the decoding. Default to be True.
+    - min_new_tokens (int): Minimum number of new tokens to generate.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, about as
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values).
 
     Currently we do not support the following features:
     - presence_penalty (replaced with repetition_penalty)
@@ -386,6 +392,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
         skip_special_tokens=request.skip_special_tokens,
         response_format=response_format,
         logits_processors=logits_processors,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
 
     tools = None
@@ -826,6 +834,12 @@ async def chat_interactive_v1(request: GenerateRequest,
         in the decoding. Default to be True.
     - adapter_name (str): For slora inference. Choose which lora to do the
         inference.
+    - min_new_tokens (int): Minimum number of new tokens to generate.
+    - min_p (float): Minimum token probability, which will be scaled by the
+        probability of the most likely token. It must be a value between
+        0 and 1. Typical values are in the 0.01-0.2 range, about as
+        selective as setting `top_p` in the 0.99-0.8 range (use the
+        opposite of normal `top_p` values).
     """
     if request.cancel:
         if request.session_id != -1:
@@ -867,6 +881,8 @@ async def chat_interactive_v1(request: GenerateRequest,
         ignore_eos=request.ignore_eos,
         stop_words=request.stop,
         skip_special_tokens=request.skip_special_tokens,
+        min_new_tokens=request.min_new_tokens,
+        min_p=request.min_p,
         random_seed=random_seed)
     if request.image_url:
         from lmdeploy.vl import load_image
diff --git a/lmdeploy/serve/openai/protocol.py b/lmdeploy/serve/openai/protocol.py
index bd54028c39..d4bf8ed315 100644
--- a/lmdeploy/serve/openai/protocol.py
+++ b/lmdeploy/serve/openai/protocol.py
@@ -137,6 +137,8 @@ class ChatCompletionRequest(BaseModel):
     skip_special_tokens: Optional[bool] = True
     top_k: Optional[int] = 40
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0
 
 
 class FunctionResponse(BaseModel):
@@ -339,6 +341,8 @@ class GenerateRequest(BaseModel):
     cancel: Optional[bool] = False  # cancel a responding request
     adapter_name: Optional[str] = Field(default=None, examples=[None])
     seed: Optional[int] = None
+    min_new_tokens: Optional[int] = Field(default=None, examples=[None])
+    min_p: float = 0.0
 
 
 class GenerateResponse(BaseModel):
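For context on the semantics documented above: `min_p` acts as a dynamic probability floor, keeping only candidate tokens whose probability is at least `min_p` times the probability of the most likely token. A minimal NumPy sketch of that filtering rule (an illustration of the behavior only, not lmdeploy's actual sampler code):

```python
import numpy as np

def min_p_filter(probs: np.ndarray, min_p: float) -> np.ndarray:
    """Illustrative: mask tokens below min_p * max(probs) and renormalize."""
    if min_p <= 0.0:
        return probs
    threshold = min_p * probs.max()
    kept = np.where(probs >= threshold, probs, 0.0)
    return kept / kept.sum()

probs = np.array([0.50, 0.30, 0.15, 0.04, 0.01])
print(min_p_filter(probs, min_p=0.1))  # tokens with prob < 0.05 are dropped
```

With `min_p=0.1` the threshold is 0.1 * 0.50 = 0.05, so the two least likely tokens are removed and the rest are renormalized; raising `min_p` makes sampling more selective, analogous to lowering `top_p`.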
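A hedged usage sketch of the new request fields: since `min_new_tokens` and `min_p` become top-level fields of `ChatCompletionRequest`, they can be sent directly in the JSON body of a chat-completions request. The server URL, port, and model name below are placeholders, not values taken from this patch:

```python
import requests

# Placeholder URL and model name; point these at your running api_server instance.
payload = {
    "model": "internlm2-chat-7b",
    "messages": [{"role": "user", "content": "Write a haiku about rain."}],
    "min_new_tokens": 16,   # field added by this patch
    "min_p": 0.05,          # field added by this patch
}
resp = requests.post("http://0.0.0.0:23333/v1/chat/completions", json=payload)
print(resp.json())
```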