Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions docs/advanced_features/separate_reasoning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -256,9 +256,7 @@
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
"input = tokenizer.apply_chat_template(\n",
" messages,\n",
" tokenize=False,\n",
" add_generation_prompt=True,\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"\n",
"gen_url = f\"http://localhost:{port}/generate\"\n",
Expand Down Expand Up @@ -319,9 +317,7 @@
"llm = sgl.Engine(model_path=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
"tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n",
"input = tokenizer.apply_chat_template(\n",
" messages,\n",
" tokenize=False,\n",
" add_generation_prompt=True,\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"sampling_params = {\n",
" \"max_new_tokens\": 1024,\n",
Expand Down
10 changes: 5 additions & 5 deletions docs/advanced_features/structured_outputs.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@
" }\n",
"]\n",
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"response = requests.post(\n",
" f\"http://localhost:{port}/generate\",\n",
Expand Down Expand Up @@ -525,7 +525,7 @@
" }\n",
"]\n",
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"response = requests.post(\n",
" f\"http://localhost:{port}/generate\",\n",
Expand Down Expand Up @@ -571,7 +571,7 @@
" }\n",
"]\n",
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"response = requests.post(\n",
" f\"http://localhost:{port}/generate\",\n",
Expand Down Expand Up @@ -606,7 +606,7 @@
"tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n",
"\n",
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"payload = {\n",
" \"text\": text,\n",
Expand Down Expand Up @@ -883,7 +883,7 @@
"outputs": [],
"source": [
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"prompts = [text]\n",
"\n",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,7 @@
" },\n",
"]\n",
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"# Make API request\n",
"response = requests.post(\n",
Expand Down Expand Up @@ -448,7 +448,7 @@
"\n",
"# JSON\n",
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"response = requests.post(\n",
" f\"http://localhost:{port}/generate\",\n",
Expand Down Expand Up @@ -543,7 +543,7 @@
"outputs": [],
"source": [
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"payload = {\n",
" \"text\": text,\n",
Expand Down Expand Up @@ -765,7 +765,7 @@
"outputs": [],
"source": [
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
" messages, tokenize=False, add_generation_prompt=True, return_dict=False\n",
")\n",
"prompts = [text]\n",
"\n",
Expand Down
7 changes: 2 additions & 5 deletions docs/advanced_features/tool_parser.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -391,10 +391,7 @@
"messages = get_messages()\n",
"\n",
"input = tokenizer.apply_chat_template(\n",
" messages,\n",
" tokenize=False,\n",
" add_generation_prompt=True,\n",
" tools=tools,\n",
" messages, tokenize=False, add_generation_prompt=True, tools=tools, return_dict=False\n",
")\n",
"\n",
"gen_url = f\"http://localhost:{port}/generate\"\n",
Expand Down Expand Up @@ -459,7 +456,7 @@
"llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",
"tokenizer = llm.tokenizer_manager.tokenizer\n",
"input_ids = tokenizer.apply_chat_template(\n",
" messages, tokenize=True, add_generation_prompt=True, tools=tools\n",
" messages, tokenize=True, add_generation_prompt=True, tools=tools, return_dict=False\n",
")\n",
"\n",
"# Note that for gpt-oss tool parser, adding \"no_stop_trim\": True\n",
Expand Down
2 changes: 1 addition & 1 deletion docs/basic_usage/native_api.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@
"]\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n",
"prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n",
"prompts = tokenizer.apply_chat_template(CONVS, tokenize=False, return_dict=False)\n",
"\n",
"url = f\"http://localhost:{port}/classify\"\n",
"data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n",
Expand Down
6 changes: 5 additions & 1 deletion python/sglang/bench_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -995,7 +995,10 @@ async def get_mooncake_request_over_time(
# Form the full prompt from history
try:
full_prompt_text = tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
chat_history,
tokenize=False,
add_generation_prompt=True,
return_dict=False,
)
except Exception:
full_prompt_text = "\n".join(
Expand Down Expand Up @@ -1161,6 +1164,7 @@ def sample_sharegpt_requests(
[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
return_dict=False,
)
if tokenizer.bos_token:
prompt = prompt.replace(tokenizer.bos_token, "")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ def enable_hf_transfer() -> None:
class DisabledTqdm(tqdm):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, disable=True)
kwargs["disable"] = True
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/gemini what's the difference here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The difference is in how the disable=True argument is passed to the tqdm constructor.

Before:

super().__init__(*args, **kwargs, disable=True)

This passes disable=True as an explicit keyword argument alongside **kwargs. If kwargs already contains a "disable" key, the same keyword is supplied twice and the call raises a TypeError ("got multiple values for keyword argument 'disable'").

After:

kwargs["disable"] = True
super().__init__(*args, **kwargs)

This approach first sets the "disable" key in the kwargs dictionary to True, overriding any value the caller supplied. Then **kwargs (which now always contains "disable": True) is passed to super().__init__, so the keyword is supplied exactly once.

Functional Difference:
This change makes the DisabledTqdm class more robust. It ensures that disable is always True when the tqdm constructor is called, regardless of whether disable was already present in the kwargs passed to DisabledTqdm.__init__. It prevents potential TypeError exceptions and guarantees the intended behavior of disabling the progress bar.

super().__init__(*args, **kwargs)


def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None):
Expand Down
2 changes: 2 additions & 0 deletions python/sglang/srt/entrypoints/openai/serving_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,7 @@ def _apply_jinja_template(
**(
request.chat_template_kwargs if request.chat_template_kwargs else {}
),
return_dict=False,
)
except Exception:
# This except branch will be triggered when the chosen model
Expand All @@ -343,6 +344,7 @@ def _apply_jinja_template(
**(
request.chat_template_kwargs if request.chat_template_kwargs else {}
),
return_dict=False,
)

if assistant_prefix:
Expand Down
3 changes: 2 additions & 1 deletion python/sglang/srt/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ def enable_hf_transfer():

class DisabledTqdm(tqdm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, disable=True)
kwargs["disable"] = True
super().__init__(*args, **kwargs)


def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None):
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/test/runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ def start_model_process(
scores = []
for conv in prompts:
conv_formatted = self.tokenizer.apply_chat_template(
conv, tokenize=False
conv, tokenize=False, return_dict=False
)
conv_tokenized = self.tokenizer(
conv_formatted, return_tensors="pt"
Expand Down
4 changes: 3 additions & 1 deletion test/srt/models/test_reward_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ def assert_close_reward_scores(
torch_dtype=torch_dtype,
model_type="reward",
) as srt_runner:
prompts = srt_runner.tokenizer.apply_chat_template(convs, tokenize=False)
prompts = srt_runner.tokenizer.apply_chat_template(
convs, tokenize=False, return_dict=False
)
srt_outputs = srt_runner.forward(prompts)

hf_scores = torch.tensor(hf_outputs.scores)
Expand Down
Loading