diff --git a/docs/advanced_features/separate_reasoning.ipynb b/docs/advanced_features/separate_reasoning.ipynb index 56a28f03ceae..fa24e63b7871 100644 --- a/docs/advanced_features/separate_reasoning.ipynb +++ b/docs/advanced_features/separate_reasoning.ipynb @@ -256,9 +256,7 @@ "\n", "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n", "input = tokenizer.apply_chat_template(\n", - " messages,\n", - " tokenize=False,\n", - " add_generation_prompt=True,\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "\n", "gen_url = f\"http://localhost:{port}/generate\"\n", @@ -319,9 +317,7 @@ "llm = sgl.Engine(model_path=\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n", "tokenizer = AutoTokenizer.from_pretrained(\"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B\")\n", "input = tokenizer.apply_chat_template(\n", - " messages,\n", - " tokenize=False,\n", - " add_generation_prompt=True,\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "sampling_params = {\n", " \"max_new_tokens\": 1024,\n", diff --git a/docs/advanced_features/structured_outputs.ipynb b/docs/advanced_features/structured_outputs.ipynb index 7388adfb44f2..b0ec5e6c7d61 100644 --- a/docs/advanced_features/structured_outputs.ipynb +++ b/docs/advanced_features/structured_outputs.ipynb @@ -443,7 +443,7 @@ " }\n", "]\n", "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "response = requests.post(\n", " f\"http://localhost:{port}/generate\",\n", @@ -525,7 +525,7 @@ " }\n", "]\n", "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "response = requests.post(\n", " f\"http://localhost:{port}/generate\",\n", @@ -571,7 +571,7 @@ " }\n", "]\n", "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "response = requests.post(\n", " f\"http://localhost:{port}/generate\",\n", @@ -606,7 +606,7 @@ "tokenizer = AutoTokenizer.from_pretrained(\"meta-llama/Meta-Llama-3.1-8B-Instruct\")\n", "\n", "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "payload = {\n", " \"text\": text,\n", @@ -883,7 +883,7 @@ "outputs": [], "source": [ "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "prompts = [text]\n", "\n", diff --git a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb index c8f51a98af30..9cdcc29e152a 100644 --- a/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb +++ b/docs/advanced_features/structured_outputs_for_reasoning_models.ipynb @@ -400,7 +400,7 @@ " },\n", "]\n", "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "# Make API request\n", "response = requests.post(\n", @@ -448,7 +448,7 @@ "\n", "# JSON\n", "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "response = requests.post(\n", " f\"http://localhost:{port}/generate\",\n", @@ -543,7 +543,7 @@ "outputs": [], "source": [ "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "payload = {\n", " \"text\": text,\n", @@ -765,7 +765,7 @@ "outputs": [], "source": [ "text = tokenizer.apply_chat_template(\n", - " messages, tokenize=False, add_generation_prompt=True\n", + " messages, tokenize=False, add_generation_prompt=True, return_dict=False\n", ")\n", "prompts = [text]\n", "\n", diff --git a/docs/advanced_features/tool_parser.ipynb b/docs/advanced_features/tool_parser.ipynb index 6ef2e321f9d6..1b5198ea7fac 100644 --- a/docs/advanced_features/tool_parser.ipynb +++ b/docs/advanced_features/tool_parser.ipynb @@ -391,10 +391,7 @@ "messages = get_messages()\n", "\n", "input = tokenizer.apply_chat_template(\n", - " messages,\n", - " tokenize=False,\n", - " add_generation_prompt=True,\n", - " tools=tools,\n", + " messages, tokenize=False, add_generation_prompt=True, tools=tools, return_dict=False\n", ")\n", "\n", "gen_url = f\"http://localhost:{port}/generate\"\n", @@ -459,7 +456,7 @@ "llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n", "tokenizer = llm.tokenizer_manager.tokenizer\n", "input_ids = tokenizer.apply_chat_template(\n", - " messages, tokenize=True, add_generation_prompt=True, tools=tools\n", + " messages, tokenize=True, add_generation_prompt=True, tools=tools, return_dict=False\n", ")\n", "\n", "# Note that for gpt-oss tool parser, adding \"no_stop_trim\": True\n", diff --git a/docs/basic_usage/native_api.ipynb b/docs/basic_usage/native_api.ipynb index edfbd34533a5..028e646d2398 100644 --- a/docs/basic_usage/native_api.ipynb +++ b/docs/basic_usage/native_api.ipynb @@ -410,7 +410,7 @@ "]\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\")\n", - "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False)\n", + "prompts = tokenizer.apply_chat_template(CONVS, tokenize=False, return_dict=False)\n", "\n", "url = f\"http://localhost:{port}/classify\"\n", "data = {\"model\": \"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2\", \"text\": prompts}\n", diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index cf782ea25905..d7b07a9bf91f 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -995,7 +995,10 @@ async def get_mooncake_request_over_time( # Form the full prompt from history try: full_prompt_text = tokenizer.apply_chat_template( - chat_history, tokenize=False, add_generation_prompt=True + chat_history, + tokenize=False, + add_generation_prompt=True, + return_dict=False, ) except Exception: full_prompt_text = "\n".join( @@ -1161,6 +1164,7 @@ def sample_sharegpt_requests( [{"role": "user", "content": prompt}], add_generation_prompt=True, tokenize=False, + return_dict=False, ) if tokenizer.bos_token: prompt = prompt.replace(tokenizer.bos_token, "") diff --git a/python/sglang/multimodal_gen/runtime/loader/weight_utils.py b/python/sglang/multimodal_gen/runtime/loader/weight_utils.py index 7796defd88f1..f0e45a95f563 100644 --- a/python/sglang/multimodal_gen/runtime/loader/weight_utils.py +++ b/python/sglang/multimodal_gen/runtime/loader/weight_utils.py @@ -46,7 +46,8 @@ def enable_hf_transfer() -> None: class DisabledTqdm(tqdm): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, disable=True) + kwargs["disable"] = True + super().__init__(*args, **kwargs) def get_lock(model_name_or_path: str | Path, cache_dir: str | None = None): diff --git a/python/sglang/srt/entrypoints/openai/serving_chat.py b/python/sglang/srt/entrypoints/openai/serving_chat.py index 773707c36b52..b6880e5e887c 100644 --- a/python/sglang/srt/entrypoints/openai/serving_chat.py +++ b/python/sglang/srt/entrypoints/openai/serving_chat.py @@ -324,6 +324,7 @@ def _apply_jinja_template( **( request.chat_template_kwargs if request.chat_template_kwargs else {} ), + return_dict=False, ) except Exception: # This except branch will be triggered when the chosen model @@ -343,6 +344,7 @@ def _apply_jinja_template( **( request.chat_template_kwargs if request.chat_template_kwargs else {} ), + return_dict=False, ) if assistant_prefix: diff --git a/python/sglang/srt/model_loader/weight_utils.py b/python/sglang/srt/model_loader/weight_utils.py index a7b987e110f4..7598eb7e9089 100644 --- a/python/sglang/srt/model_loader/weight_utils.py +++ b/python/sglang/srt/model_loader/weight_utils.py @@ -70,7 +70,8 @@ def enable_hf_transfer(): class DisabledTqdm(tqdm): def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs, disable=True) + kwargs["disable"] = True + super().__init__(*args, **kwargs) def get_lock(model_name_or_path: str, cache_dir: Optional[str] = None): diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py index e469a3c035a6..63e31cda45b4 100644 --- a/python/sglang/test/runners.py +++ b/python/sglang/test/runners.py @@ -353,7 +353,7 @@ def start_model_process( scores = [] for conv in prompts: conv_formatted = self.tokenizer.apply_chat_template( - conv, tokenize=False + conv, tokenize=False, return_dict=False ) conv_tokenized = self.tokenizer( conv_formatted, return_tensors="pt" diff --git a/test/srt/models/test_reward_models.py b/test/srt/models/test_reward_models.py index 5592ce22382e..72b74eb3326e 100644 --- a/test/srt/models/test_reward_models.py +++ b/test/srt/models/test_reward_models.py @@ -68,7 +68,9 @@ def assert_close_reward_scores( torch_dtype=torch_dtype, model_type="reward", ) as srt_runner: - prompts = srt_runner.tokenizer.apply_chat_template(convs, tokenize=False) + prompts = srt_runner.tokenizer.apply_chat_template( + convs, tokenize=False, return_dict=False + ) srt_outputs = srt_runner.forward(prompts) hf_scores = torch.tensor(hf_outputs.scores)