diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py
index 2153a31ba70..472d748d1e6 100644
--- a/examples/offline_inference/bagel/end2end.py
+++ b/examples/offline_inference/bagel/end2end.py
@@ -2,7 +2,10 @@
 import os
 
 from vllm_omni.inputs.data import OmniPromptType
-from vllm_omni.model_executor.stage_input_processors.bagel import GEN_THINK_SYSTEM_PROMPT
+from vllm_omni.model_executor.stage_input_processors.bagel import (
+    GEN_THINK_SYSTEM_PROMPT,
+    VLM_THINK_SYSTEM_PROMPT,
+)
 
 
 def parse_args():
@@ -171,7 +174,10 @@ def main():
         elif args.modality == "img2text":
             if args.image_path:
                 loaded_image = Image.open(args.image_path).convert("RGB")
-                final_prompt_text = f"<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n"
+                think_prefix = f"<|im_start|>system\n{VLM_THINK_SYSTEM_PROMPT}<|im_end|>\n" if args.think else ""
+                final_prompt_text = (
+                    f"{think_prefix}<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n"
+                )
                 prompt_dict = {
                     "prompt": final_prompt_text,
                     "multi_modal_data": {"image": loaded_image},
@@ -179,7 +185,8 @@ def main():
                 }
                 formatted_prompts.append(prompt_dict)
         elif args.modality == "text2text":
-            final_prompt_text = f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n"
+            think_prefix = f"<|im_start|>system\n{VLM_THINK_SYSTEM_PROMPT}<|im_end|>\n" if args.think else ""
+            final_prompt_text = f"{think_prefix}<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n"
             prompt_dict = {"prompt": final_prompt_text, "modalities": ["text"]}
             formatted_prompts.append(prompt_dict)
         else:
@@ -217,15 +224,11 @@ def main():
     img_idx = 0
     for req_output in omni_outputs:
         if args.think:
-            text_output = getattr(req_output, "text", None) or getattr(req_output, "outputs", None)
-            if text_output:
-                if isinstance(text_output, list) and text_output:
-                    for out in text_output:
-                        txt = getattr(out, "text", str(out))
-                        if txt:
-                            print(f"[Think] {txt}")
-                elif isinstance(text_output, str):
-                    print(f"[Think] {text_output}")
+            ro = getattr(req_output, "request_output", None)
+            if ro and getattr(ro, "outputs", None):
+                txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs)
+                if txt:
+                    print(txt)
 
         images = getattr(req_output, "images", None)
 
diff --git a/vllm_omni/model_executor/stage_input_processors/bagel.py b/vllm_omni/model_executor/stage_input_processors/bagel.py
index 6b88fcd4a18..bfcff0ea0f3 100644
--- a/vllm_omni/model_executor/stage_input_processors/bagel.py
+++ b/vllm_omni/model_executor/stage_input_processors/bagel.py
@@ -135,6 +135,13 @@ def expand_cfg_prompts(
     "i.e. <think> planning process here </think> image here"
 )
 
+VLM_THINK_SYSTEM_PROMPT = (
+    "You should first think about the reasoning process in the mind "
+    "and then provide the user with the answer. \n"
+    "The reasoning process is enclosed within <think> </think> tags, "
+    "i.e. <think> reasoning process here </think> answer here"
+)
+
 
 def expand_cfg_prompts_think(
     prompt: dict[str, Any] | str,