Support min_tokens, min_p parameters for api_server (#2681)
* Support min_tokens for api_server

* fix

* use min_new_tokens

* add min_p
AllentDan authored Nov 1, 2024
1 parent e034610 commit 654c457
Showing 2 changed files with 20 additions and 0 deletions.
16 changes: 16 additions & 0 deletions lmdeploy/serve/openai/api_server.py
@@ -323,6 +323,12 @@ async def chat_completions_v1(request: ChatCompletionRequest,
- ignore_eos (bool): indicator for ignoring eos
- skip_special_tokens (bool): Whether or not to remove special tokens
in the decoding. Default to be True.
- min_new_tokens (int): The minimum number of new tokens to generate.
- min_p (float): Minimum token probability, which will be scaled by the
probability of the most likely token. It must be a value between
0 and 1. Typical values are in the 0.01-0.2 range, comparably
selective to setting `top_p` in the 0.99-0.8 range (use the
opposite of normal `top_p` values).
Currently we do not support the following features:
- presence_penalty (replaced with repetition_penalty)
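To make the `min_p` description above concrete, here is a minimal, self-contained sketch of the usual min_p filtering rule (keep only tokens whose probability is at least `min_p` times that of the most likely token, renormalize, sample). This illustrates the semantics only; it is not lmdeploy's internal sampling kernel.

```python
import numpy as np

def min_p_sample(logits: np.ndarray, min_p: float, rng: np.random.Generator) -> int:
    """Sample one token id after min_p filtering (illustrative only)."""
    # softmax over the vocabulary
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    # keep tokens whose probability is >= min_p * P(most likely token)
    keep = probs >= min_p * probs.max()
    filtered = np.where(keep, probs, 0.0)
    filtered /= filtered.sum()
    return int(rng.choice(len(filtered), p=filtered))

rng = np.random.default_rng(0)
logits = np.array([2.0, 1.5, 0.1, -3.0])
print(min_p_sample(logits, min_p=0.1, rng=rng))  # index of a surviving token
```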
@@ -386,6 +392,8 @@ async def chat_completions_v1(request: ChatCompletionRequest,
skip_special_tokens=request.skip_special_tokens,
response_format=response_format,
logits_processors=logits_processors,
min_new_tokens=request.min_new_tokens,
min_p=request.min_p,
random_seed=random_seed)

tools = None
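A hedged client-side example of the OpenAI-compatible endpoint with the two new fields; the host, port, and model name below are placeholders for whatever the server was actually launched with.

```python
import requests

payload = {
    "model": "internlm2-chat-7b",  # placeholder model name
    "messages": [{"role": "user", "content": "Write a haiku about autumn."}],
    "min_new_tokens": 16,  # ask for at least 16 generated tokens
    "min_p": 0.05,         # drop tokens below 5% of the top token's probability
}
resp = requests.post("http://0.0.0.0:23333/v1/chat/completions", json=payload)
print(resp.json()["choices"][0]["message"]["content"])
```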
@@ -826,6 +834,12 @@ async def chat_interactive_v1(request: GenerateRequest,
in the decoding. Default to be True.
- adapter_name (str): For slora inference. Choose which lora to do the
inference.
- min_new_tokens (int): The minimum number of new tokens to generate.
- min_p (float): Minimum token probability, which will be scaled by the
probability of the most likely token. It must be a value between
0 and 1. Typical values are in the 0.01-0.2 range, comparably
selective to setting `top_p` in the 0.99-0.8 range (use the
opposite of normal `top_p` values).
"""
if request.cancel:
if request.session_id != -1:
@@ -867,6 +881,8 @@ async def chat_interactive_v1(request: GenerateRequest,
ignore_eos=request.ignore_eos,
stop_words=request.stop,
skip_special_tokens=request.skip_special_tokens,
min_new_tokens=request.min_new_tokens,
min_p=request.min_p,
random_seed=random_seed)
if request.image_url:
from lmdeploy.vl import load_image
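The interactive endpoint accepts the same two knobs. A sketch of a request against it follows; only `min_new_tokens` and `min_p` come from this commit, while the route, `prompt`, `session_id`, and `stream` fields are assumptions based on lmdeploy's usual interactive API, and host/port are placeholders.

```python
import requests

payload = {
    "prompt": "Summarize the plot of Hamlet in two sentences.",
    "session_id": -1,      # -1: no existing session to resume (assumed convention)
    "stream": False,       # assumed field; return one JSON body instead of a stream
    "min_new_tokens": 32,  # at least 32 generated tokens
    "min_p": 0.1,          # keep tokens >= 10% of the top token's probability
}
resp = requests.post("http://0.0.0.0:23333/v1/chat/interactive", json=payload)
print(resp.json())
```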
4 changes: 4 additions & 0 deletions lmdeploy/serve/openai/protocol.py
@@ -137,6 +137,8 @@ class ChatCompletionRequest(BaseModel):
skip_special_tokens: Optional[bool] = True
top_k: Optional[int] = 40
seed: Optional[int] = None
min_new_tokens: Optional[int] = Field(default=None, examples=[None])
min_p: float = 0.0


class FunctionResponse(BaseModel):
@@ -339,6 +341,8 @@ class GenerateRequest(BaseModel):
cancel: Optional[bool] = False # cancel a responding request
adapter_name: Optional[str] = Field(default=None, examples=[None])
seed: Optional[int] = None
min_new_tokens: Optional[int] = Field(default=None, examples=[None])
min_p: float = 0.0


class GenerateResponse(BaseModel):
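On the protocol side, the new fields slot into the existing pydantic request models with safe defaults. A quick sketch of those defaults, assuming (as in the current models) that `model` and `messages` are the only required fields of `ChatCompletionRequest`; the values are dummies.

```python
from lmdeploy.serve.openai.protocol import ChatCompletionRequest

default_req = ChatCompletionRequest(
    model="dummy-model",
    messages=[{"role": "user", "content": "hi"}],
)
print(default_req.min_new_tokens)  # None -> no lower bound on generated length
print(default_req.min_p)           # 0.0  -> min_p filtering disabled

tuned_req = ChatCompletionRequest(
    model="dummy-model",
    messages=[{"role": "user", "content": "hi"}],
    min_new_tokens=8,
    min_p=0.1,
)
print(tuned_req.min_new_tokens, tuned_req.min_p)  # 8 0.1
```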
