From 2dbf218631f19d323a8dacbca363f28d790d82b4 Mon Sep 17 00:00:00 2001 From: princepride Date: Sun, 5 Apr 2026 13:15:44 +0000 Subject: [PATCH 1/2] [BugFix] Add bagel text2text/img2text think mode support Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 22 +++++++++---------- .../stage_input_processors/bagel.py | 7 ++++++ 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 2153a31ba70..35ebf35cf08 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -2,7 +2,10 @@ import os from vllm_omni.inputs.data import OmniPromptType -from vllm_omni.model_executor.stage_input_processors.bagel import GEN_THINK_SYSTEM_PROMPT +from vllm_omni.model_executor.stage_input_processors.bagel import ( + GEN_THINK_SYSTEM_PROMPT, + VLM_THINK_SYSTEM_PROMPT, +) def parse_args(): @@ -179,7 +182,8 @@ def main(): } formatted_prompts.append(prompt_dict) elif args.modality == "text2text": - final_prompt_text = f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n" + think_prefix = f"<|im_start|>{VLM_THINK_SYSTEM_PROMPT}<|im_end|>" if args.think else "" + final_prompt_text = f"{think_prefix}<|im_start|>{p}<|im_end|><|im_start|>" prompt_dict = {"prompt": final_prompt_text, "modalities": ["text"]} formatted_prompts.append(prompt_dict) else: @@ -217,15 +221,11 @@ def main(): img_idx = 0 for req_output in omni_outputs: if args.think: - text_output = getattr(req_output, "text", None) or getattr(req_output, "outputs", None) - if text_output: - if isinstance(text_output, list) and text_output: - for out in text_output: - txt = getattr(out, "text", str(out)) - if txt: - print(f"[Think] {txt}") - elif isinstance(text_output, str): - print(f"[Think] {text_output}") + ro = getattr(req_output, "request_output", None) + if ro and getattr(ro, "outputs", None): + txt = "".join(getattr(o, "text", "") or "" for o in 
ro.outputs) + if txt: + print(txt) images = getattr(req_output, "images", None) diff --git a/vllm_omni/model_executor/stage_input_processors/bagel.py b/vllm_omni/model_executor/stage_input_processors/bagel.py index 6b88fcd4a18..bfcff0ea0f3 100644 --- a/vllm_omni/model_executor/stage_input_processors/bagel.py +++ b/vllm_omni/model_executor/stage_input_processors/bagel.py @@ -135,6 +135,13 @@ def expand_cfg_prompts( "i.e. <think> planning process here </think> image here" ) +VLM_THINK_SYSTEM_PROMPT = ( + "You should first think about the reasoning process in the mind " + "and then provide the user with the answer. \n" + "The reasoning process is enclosed within <think> </think> tags, " + "i.e. <think> reasoning process here </think> answer here" +) + def expand_cfg_prompts_think( prompt: dict[str, Any] | str, From 6e4b86c1333dacebc3a91777381237f924549472 Mon Sep 17 00:00:00 2001 From: princepride Date: Sun, 5 Apr 2026 15:14:00 +0000 Subject: [PATCH 2/2] add img2text think mode Signed-off-by: princepride --- examples/offline_inference/bagel/end2end.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py index 35ebf35cf08..472d748d1e6 100644 --- a/examples/offline_inference/bagel/end2end.py +++ b/examples/offline_inference/bagel/end2end.py @@ -174,7 +174,10 @@ def main(): elif args.modality == "img2text": if args.image_path: loaded_image = Image.open(args.image_path).convert("RGB") - final_prompt_text = f"<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n" + think_prefix = f"<|im_start|>system\n{VLM_THINK_SYSTEM_PROMPT}<|im_end|>\n" if args.think else "" + final_prompt_text = ( + f"{think_prefix}<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n" + ) prompt_dict = { "prompt": final_prompt_text, "multi_modal_data": {"image": loaded_image},