From 2dbf218631f19d323a8dacbca363f28d790d82b4 Mon Sep 17 00:00:00 2001
From: princepride <wangzhipeng628@gmail.com>
Date: Sun, 5 Apr 2026 13:15:44 +0000
Subject: [PATCH 1/2] [BugFix] Add bagel text2text/img2text think mode support

Signed-off-by: princepride <wangzhipeng628@gmail.com>
---
 examples/offline_inference/bagel/end2end.py   | 22 +++++++++----------
 .../stage_input_processors/bagel.py           |  7 ++++++
 2 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py
index 2153a31ba70..35ebf35cf08 100644
--- a/examples/offline_inference/bagel/end2end.py
+++ b/examples/offline_inference/bagel/end2end.py
@@ -2,7 +2,10 @@
 import os
 
 from vllm_omni.inputs.data import OmniPromptType
-from vllm_omni.model_executor.stage_input_processors.bagel import GEN_THINK_SYSTEM_PROMPT
+from vllm_omni.model_executor.stage_input_processors.bagel import (
+    GEN_THINK_SYSTEM_PROMPT,
+    VLM_THINK_SYSTEM_PROMPT,
+)
 
 
 def parse_args():
@@ -179,7 +182,8 @@ def main():
                 }
                 formatted_prompts.append(prompt_dict)
         elif args.modality == "text2text":
-            final_prompt_text = f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n"
+            think_prefix = f"<|im_start|>{VLM_THINK_SYSTEM_PROMPT}<|im_end|>" if args.think else ""
+            final_prompt_text = f"{think_prefix}<|im_start|>{p}<|im_end|><|im_start|>"
             prompt_dict = {"prompt": final_prompt_text, "modalities": ["text"]}
             formatted_prompts.append(prompt_dict)
         else:
@@ -217,15 +221,11 @@ def main():
     img_idx = 0
     for req_output in omni_outputs:
         if args.think:
-            text_output = getattr(req_output, "text", None) or getattr(req_output, "outputs", None)
-            if text_output:
-                if isinstance(text_output, list) and text_output:
-                    for out in text_output:
-                        txt = getattr(out, "text", str(out))
-                        if txt:
-                            print(f"[Think] {txt}")
-                elif isinstance(text_output, str):
-                    print(f"[Think] {text_output}")
+            ro = getattr(req_output, "request_output", None)
+            if ro and getattr(ro, "outputs", None):
+                txt = "".join(getattr(o, "text", "") or "" for o in ro.outputs)
+                if txt:
+                    print(txt)
 
         images = getattr(req_output, "images", None)
 
diff --git a/vllm_omni/model_executor/stage_input_processors/bagel.py b/vllm_omni/model_executor/stage_input_processors/bagel.py
index 6b88fcd4a18..bfcff0ea0f3 100644
--- a/vllm_omni/model_executor/stage_input_processors/bagel.py
+++ b/vllm_omni/model_executor/stage_input_processors/bagel.py
@@ -135,6 +135,13 @@ def expand_cfg_prompts(
     "i.e. <think> planning process here </think> image here"
 )
 
+VLM_THINK_SYSTEM_PROMPT = (
+    "You should first think about the reasoning process in the mind "
+    "and then provide the user with the answer. \n"
+    "The reasoning process is enclosed within <think> </think> tags, "
+    "i.e. <think> reasoning process here </think> answer here"
+)
+
 
 def expand_cfg_prompts_think(
     prompt: dict[str, Any] | str,

From 6e4b86c1333dacebc3a91777381237f924549472 Mon Sep 17 00:00:00 2001
From: princepride <wangzhipeng628@gmail.com>
Date: Sun, 5 Apr 2026 15:14:00 +0000
Subject: [PATCH 2/2] [BugFix] Add bagel img2text think mode support

Signed-off-by: princepride <wangzhipeng628@gmail.com>
---
 examples/offline_inference/bagel/end2end.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py
index 35ebf35cf08..472d748d1e6 100644
--- a/examples/offline_inference/bagel/end2end.py
+++ b/examples/offline_inference/bagel/end2end.py
@@ -174,7 +174,10 @@ def main():
         elif args.modality == "img2text":
             if args.image_path:
                 loaded_image = Image.open(args.image_path).convert("RGB")
-                final_prompt_text = f"<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n"
+                think_prefix = f"<|im_start|>system\n{VLM_THINK_SYSTEM_PROMPT}<|im_end|>\n" if args.think else ""
+                final_prompt_text = (
+                    f"{think_prefix}<|im_start|>user\n<|image_pad|>\n{p}<|im_end|>\n<|im_start|>assistant\n"
+                )
                 prompt_dict = {
                     "prompt": final_prompt_text,
                     "multi_modal_data": {"image": loaded_image},