-
Notifications
You must be signed in to change notification settings - Fork 798
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Frontend] Fix request length check and add option to disallow auto truncation in scheduler (#2876)
- Loading branch information
1 parent
0427416
commit a8ccacc
Showing
6 changed files
with
154 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import logging | ||
from typing import Optional | ||
|
||
from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def validate_input_length(
    req: Req, max_req_input_len: int, allow_auto_truncate: bool
) -> Optional[str]:
    """Check a request's prompt length against the limit, truncating if allowed.

    A prompt counts as too long when it has ``max_req_input_len`` tokens or
    more (the comparison is inclusive of the limit itself).

    Args:
        req: The request whose ``origin_input_ids`` are checked. It is mutated
            in place: either ``origin_input_ids`` is cut down to the first
            ``max_req_input_len`` tokens, or ``finished_reason`` is set to a
            ``FINISH_ABORT`` carrying the error message.
        max_req_input_len: Maximum allowed input length, in tokens.
        allow_auto_truncate: If true, over-long inputs are truncated and
            accepted; otherwise they are rejected.

    Returns:
        The error message when the request is rejected, ``None`` when the
        request is accepted (whether or not it had to be truncated).
    """
    input_len = len(req.origin_input_ids)

    # Fast path: the prompt is strictly below the limit, nothing to do.
    if input_len < max_req_input_len:
        return None

    if not allow_auto_truncate:
        # Reject: record the abort on the request and report the reason.
        error_msg = (
            f"Input length ({input_len} tokens) exceeds "
            f"the maximum allowed length ({max_req_input_len} tokens). "
            f"Use a shorter input or enable --allow-auto-truncate."
        )
        logger.error(error_msg)
        req.finished_reason = FINISH_ABORT(error_msg)
        return error_msg

    # Accept after truncation. The warning is emitted before the slice so it
    # reports the original (untruncated) length.
    logger.warning(
        "Request length is longer than the KV cache pool size or "
        "the max context length. Truncated. "
        f"{len(req.origin_input_ids)=}, {max_req_input_len=}."
    )
    req.origin_input_ids = req.origin_input_ids[:max_req_input_len]
    return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import unittest | ||
|
||
import openai | ||
|
||
from sglang.srt.utils import kill_process_tree | ||
from sglang.test.test_utils import ( | ||
DEFAULT_SMALL_MODEL_NAME_FOR_TEST, | ||
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, | ||
DEFAULT_URL_FOR_TEST, | ||
popen_launch_server, | ||
) | ||
|
||
|
||
class TestRequestLengthValidation(unittest.TestCase):
    """End-to-end checks that over-long requests are rejected with HTTP 400.

    A single server is launched for the whole class with a small context
    length so that both an over-long prompt and an over-large ``max_tokens``
    can be triggered with tiny inputs.
    """

    @classmethod
    def setUpClass(cls):
        """Launch the shared test server."""
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"

        # Start server with auto truncate disabled (no truncation flag is
        # passed; presumably the server default is "off" - confirm).
        server_args = ("--max-total-tokens", "1000", "--context-length", "100")
        cls.process = popen_launch_server(
            DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            api_key=cls.api_key,
            other_args=server_args,
        )

    @classmethod
    def tearDownClass(cls):
        """Terminate the server process started in ``setUpClass``."""
        kill_process_tree(cls.process.pid)

    def _new_client(self):
        """Return an OpenAI client pointed at the test server's ``/v1`` API."""
        return openai.Client(api_key=self.api_key, base_url=f"{self.base_url}/v1")

    def test_input_length_validation(self):
        """A prompt longer than the context length is rejected."""
        api = self._new_client()

        # Will tokenize to more than context length
        oversized_prompt = "hello " * 100
        chat = [
            {"role": "user", "content": oversized_prompt},
        ]

        with self.assertRaises(openai.BadRequestError) as caught:
            api.chat.completions.create(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
                messages=chat,
                temperature=0,
            )

        self.assertIn(
            "is longer than the model's context length", str(caught.exception)
        )

    def test_max_tokens_validation(self):
        """A short prompt with ``max_tokens`` above the context is rejected."""
        api = self._new_client()

        # The prompt itself is short; only max_tokens pushes past the limit.
        short_prompt = "hello "
        chat = [
            {"role": "user", "content": short_prompt},
        ]

        with self.assertRaises(openai.BadRequestError) as caught:
            api.chat.completions.create(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
                messages=chat,
                temperature=0,
                max_tokens=500,
            )

        self.assertIn(
            "Requested token count exceeds the model's maximum context",
            str(caught.exception),
        )
|
||
|
||
if __name__ == "__main__":
    # Executed as a script: hand control to unittest's command-line runner.
    unittest.main()