diff --git a/examples/offline_inference/bagel/end2end.py b/examples/offline_inference/bagel/end2end.py
index 6562f32ae6a..922a1af2368 100644
--- a/examples/offline_inference/bagel/end2end.py
+++ b/examples/offline_inference/bagel/end2end.py
@@ -101,7 +101,7 @@ def main():
 
     if not prompts:
         # Default prompt for text2img test if none provided
-        prompts = ["<|im_start|>A cute cat<|im_end|>"]
+        prompts = ["A cute cat"]
         print(f"[Info] No prompts provided, using default: {prompts}")
     omni_outputs = []
 
diff --git a/examples/online_serving/bagel/openai_chat_client.py b/examples/online_serving/bagel/openai_chat_client.py
index fd5f4cac5d7..cc9ec32db91 100755
--- a/examples/online_serving/bagel/openai_chat_client.py
+++ b/examples/online_serving/bagel/openai_chat_client.py
@@ -125,7 +125,7 @@ def generate_image(
 
 def main():
     parser = argparse.ArgumentParser(description="Bagel multimodal chat client")
-    parser.add_argument("--prompt", "-p", default="<|im_start|>A cute cat<|im_end|>", help="Text prompt")
+    parser.add_argument("--prompt", "-p", default="A cute cat", help="Text prompt")
     parser.add_argument("--output", "-o", default="bagel_output.png", help="Output file (for image results)")
     parser.add_argument("--server", "-s", default="http://localhost:8091", help="Server URL")
 
diff --git a/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml
new file mode 100644
index 00000000000..2c1d84af493
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/bagel_single_stage.yaml
@@ -0,0 +1,32 @@
+# Stage 0: DiT diffusion stage (image generation; the only stage in this single-stage config)
+
+stage_args:
+
+  - stage_id: 0
+    stage_type: diffusion
+    runtime:
+      devices: "0"
+    engine_args:
+      model_stage: dit
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.45
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: image
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      tensor_parallel_size: 1
+
+    final_output: true
+    final_output_type: image
+    is_comprehension: false
+    default_sampling_params:
+      seed: 52
+
+# Runtime settings (only defaults are set here; no inter-stage edges are listed)
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1
+    max_inflight: 1