From 800b6010c0bf76aadf678bc38a507b749fb9774c Mon Sep 17 00:00:00 2001
From: AllentDan <41138331+AllentDan@users.noreply.github.com>
Date: Thu, 23 Jan 2025 14:25:40 +0800
Subject: [PATCH] More arguments in api_client, update docstrings (#3077)

---
 lmdeploy/serve/openai/api_client.py | 31 +++++++++++++++++++++++++----
 lmdeploy/serve/openai/api_server.py |  4 ++++
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/lmdeploy/serve/openai/api_client.py b/lmdeploy/serve/openai/api_client.py
index 03b1ef708..79fd04570 100644
--- a/lmdeploy/serve/openai/api_client.py
+++ b/lmdeploy/serve/openai/api_client.py
@@ -103,9 +103,14 @@ def chat_completions_v1(self,
                             frequency_penalty: Optional[float] = 0.0,
                             user: Optional[str] = None,
                             repetition_penalty: Optional[float] = 1.0,
-                            session_id: Optional[int] = -1,
                             ignore_eos: Optional[bool] = False,
                             skip_special_tokens: Optional[bool] = True,
+                            spaces_between_special_tokens: Optional[bool] = True,
+                            top_k: int = 40,
+                            min_new_tokens: Optional[int] = None,
+                            min_p: float = 0.0,
+                            logit_bias: Optional[Dict[str, float]] = None,
+                            stream_options: Optional[Dict] = None,
                             **kwargs):
         """Chat completion v1.

@@ -128,7 +133,20 @@
             ignore_eos (bool): indicator for ignoring eos
             skip_special_tokens (bool): Whether or not to remove special
                 tokens in the decoding. Default to be True.
-            session_id (int): Deprecated.
+            spaces_between_special_tokens (bool): Whether or not to add spaces
+                around special tokens. Fast tokenizers default this to False;
+                slow tokenizers default it to True.
+            top_k (int): The number of the highest probability vocabulary
+                tokens to keep for top-k filtering.
+            min_new_tokens (int): The minimum number of tokens to generate.
+            min_p (float): Minimum token probability, which will be scaled by
+                the probability of the most likely token. It must be a value
+                between 0 and 1. Typical values are in the 0.01-0.2 range,
+                comparably selective to setting `top_p` in the 0.99-0.8 range
+                (use the opposite of normal `top_p` values).
+            logit_bias (Dict): Bias to logits. Only supported by the pytorch engine.
+            stream_options: Options for streaming response. Only set this when you
+                set stream: true.

         Yields:
             json objects in openai formats
@@ -229,9 +247,10 @@ def completions_v1(
             user: Optional[str] = None,
             # additional argument of lmdeploy
             repetition_penalty: Optional[float] = 1.0,
-            session_id: Optional[int] = -1,
             ignore_eos: Optional[bool] = False,
             skip_special_tokens: Optional[bool] = True,
+            spaces_between_special_tokens: Optional[bool] = True,
+            stream_options: Optional[Dict] = None,
             **kwargs):
         """Chat completion v1.

@@ -258,7 +277,11 @@
             ignore_eos (bool): indicator for ignoring eos
             skip_special_tokens (bool): Whether or not to remove special
                 tokens in the decoding. Default to be True.
-            session_id (int): Deprecated.
+            spaces_between_special_tokens (bool): Whether or not to add spaces
+                around special tokens. Fast tokenizers default this to False;
+                slow tokenizers default it to True.
+            stream_options: Options for streaming response. Only set this when you
+                set stream: true.

         Yields:
             json objects in openai formats

diff --git a/lmdeploy/serve/openai/api_server.py b/lmdeploy/serve/openai/api_server.py
index 40a3599ae..b61772f40 100644
--- a/lmdeploy/serve/openai/api_server.py
+++ b/lmdeploy/serve/openai/api_server.py
@@ -273,6 +273,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
     - n (int): How many chat completion choices to generate for each input
         message. **Only support one here**.
     - stream: whether to stream the results or not. Default to false.
+    - stream_options: Options for streaming response. Only set this when you
+        set stream: true.
     - max_tokens (int | None): output token nums. Default to None.
     - repetition_penalty (float): The parameter for repetition penalty.
         1.0 means no penalty
@@ -523,6 +525,8 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None
     - n (int): How many chat completion choices to generate for each input
         message. **Only support one here**.
     - stream: whether to stream the results or not. Default to false.
+    - stream_options: Options for streaming response. Only set this when you
+        set stream: true.
     - repetition_penalty (float): The parameter for repetition penalty.
         1.0 means no penalty
     - user (str): A unique identifier representing your end-user.
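
For reference, a minimal usage sketch (not part of the patch) of how the new
chat_completions_v1 arguments might be exercised. It assumes an api_server is
already serving a model at http://0.0.0.0:23333; the token id in logit_bias
and the include_usage field in stream_options are illustrative values taken
from the OpenAI convention, not from this diff.

    # Usage sketch; assumes a running `lmdeploy serve api_server` endpoint.
    from lmdeploy.serve.openai.api_client import APIClient

    api_client = APIClient('http://0.0.0.0:23333')
    model_name = api_client.available_models[0]
    messages = [{'role': 'user', 'content': 'Say this is a test!'}]

    for output in api_client.chat_completions_v1(
            model=model_name,
            messages=messages,
            stream=True,
            # sampling arguments introduced by this patch
            top_k=40,
            min_p=0.05,
            min_new_tokens=8,
            spaces_between_special_tokens=True,
            # only honored by the pytorch engine; token-id -> bias (illustrative)
            logit_bias={'100': -5.0},
            # only meaningful together with stream=True
            stream_options={'include_usage': True}):
        print(output)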