3 changes: 2 additions & 1 deletion python/sglang/srt/openai_api/adapter.py
@@ -995,7 +995,8 @@ def v1_chat_generate_request(
         image_data = conv.image_data
         audio_data = conv.audio_data
         modalities = conv.modalities
-        stop = conv.stop_str or []
+        stop = conv.stop_str or [] if not request.ignore_eos else []
+
         if request.stop:
             if isinstance(request.stop, str):
                 stop.append(request.stop)
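The one-liner above relies on Python's operator precedence: a conditional expression binds looser than "or", so it parses as (conv.stop_str or []) if not request.ignore_eos else []. When ignore_eos is set, the chat template's stop strings are dropped entirely and generation can run on to max_tokens. A minimal sketch of that parse, using hypothetical stand-ins for conv and request:

# Precedence sketch: "a or b if cond else c" parses as "(a or b) if cond else c".
class _Req:  # hypothetical stand-in for the parsed request
    ignore_eos = True

request = _Req()
stop_str = ["###"]  # stand-in for conv.stop_str

stop = stop_str or [] if not request.ignore_eos else []
assert stop == []  # ignore_eos=True: stop strings are dropped entirely

request.ignore_eos = False
stop = stop_str or [] if not request.ignore_eos else []
assert stop == ["###"]  # otherwise the template's stop strings are kept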
74 changes: 74 additions & 0 deletions test/srt/test_openai_server.py
@@ -676,5 +676,79 @@ def test_embedding_batch(self):
         self.assertTrue(len(response.data[1].embedding) > 0)


+class TestOpenAIServerIgnoreEOS(CustomTestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            api_key=cls.api_key,
+            other_args=["--chat-template=llama_3_vision"],
+        )
+        cls.base_url += "/v1"
+        cls.tokenizer = get_tokenizer(DEFAULT_SMALL_MODEL_NAME_FOR_TEST)
+
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+
+    def test_ignore_eos(self):
+        """
+        Test that ignore_eos=True allows generation to continue beyond the
+        EOS token and reach the max_tokens limit.
+        """
+        client = openai.Client(api_key=self.api_key, base_url=self.base_url)
+
+        max_tokens = 200
+
+        response_default = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count from 1 to 20."},
+            ],
+            temperature=0,
+            max_tokens=max_tokens,
+            extra_body={"ignore_eos": False},
+        )
+
+        response_ignore_eos = client.chat.completions.create(
+            model=self.model,
+            messages=[
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": "Count from 1 to 20."},
+            ],
+            temperature=0,
+            max_tokens=max_tokens,
+            extra_body={"ignore_eos": True},
+        )
+
+        default_tokens = len(
+            self.tokenizer.encode(response_default.choices[0].message.content)
+        )
+        ignore_eos_tokens = len(
+            self.tokenizer.encode(response_ignore_eos.choices[0].message.content)
+        )
+
+        # The ignore_eos response should either:
+        # 1. have more tokens than the default response (if the default run
+        #    stopped at EOS before reaching max_tokens), or
+        # 2. have at least max_tokens (if it hit the max_tokens limit).
+        self.assertTrue(
+            ignore_eos_tokens > default_tokens or ignore_eos_tokens >= max_tokens,
+            f"ignore_eos did not generate more tokens: {ignore_eos_tokens} vs {default_tokens}",
+        )
+
+        self.assertEqual(
+            response_ignore_eos.choices[0].finish_reason,
+            "length",
+            f"Expected finish_reason='length' for ignore_eos=True, got {response_ignore_eos.choices[0].finish_reason}",
+        )
+
+

 if __name__ == "__main__":
     unittest.main()
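For manual verification against a locally launched server, the same flag is passed through the OpenAI client's extra_body, just as in the test above. A sketch, where the URL, API key, and model name are placeholders rather than values from this PR:

import openai

# Placeholders: point these at your own running sglang server.
client = openai.Client(api_key="sk-123456", base_url="http://127.0.0.1:30000/v1")

resp = client.chat.completions.create(
    model="<served-model-name>",  # the model the server was launched with
    messages=[{"role": "user", "content": "Count from 1 to 20."}],
    temperature=0,
    max_tokens=200,
    extra_body={"ignore_eos": True},  # sglang-specific sampling parameter
)
print(resp.choices[0].finish_reason)  # expect "length" when EOS is ignored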