From 1e69e9feb88dcb720891cc941e88a21fb3721a8b Mon Sep 17 00:00:00 2001 From: princepride Date: Tue, 14 Apr 2026 14:44:46 +0000 Subject: [PATCH 1/5] fix multi-stage cfg bug Signed-off-by: princepride --- .../diffusion/models/bagel/pipeline_bagel.py | 20 +++++++++---------- .../stage_input_processors/bagel.py | 6 +++++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py index a3d2259e643..29058413426 100644 --- a/vllm_omni/diffusion/models/bagel/pipeline_bagel.py +++ b/vllm_omni/diffusion/models/bagel/pipeline_bagel.py @@ -397,8 +397,15 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: cfg_text_context["ropes"] = cfg_text_metadata["ropes"] else: cfg_text_context["ropes"] = [cfg_text_seq_len] - - if cfg_img_kv is None and cfg_text_kv is not None: + else: + # No cfg_text companion received. For text2img this is the + # expected path: original BAGEL uses an empty KV cache (0 + # tokens) as the text-unconditional branch. Keep the default + # empty NaiveCache in cfg_text_context and preserve the + # original cfg_text_scale so CFG still applies. + pass + + if cfg_img_kv is None: cfg_img_kv = injected_kv if cfg_img_kv is not None: @@ -410,15 +417,6 @@ def forward(self, req: OmniDiffusionRequest) -> DiffusionOutput: else: cfg_img_context["ropes"] = [cfg_img_seq_len] - if not cfg_parallel_contract: - logger.warning("CFG is disabled: only single KV cache available") - gen_params = BagelGenParams( - num_timesteps=gen_params.num_timesteps, - timestep_shift=gen_params.timestep_shift, - cfg_text_scale=1.0, - cfg_img_scale=1.0, - ) - else: image_input = ( None diff --git a/vllm_omni/model_executor/stage_input_processors/bagel.py b/vllm_omni/model_executor/stage_input_processors/bagel.py index bfcff0ea0f3..02b3ab1735f 100644 --- a/vllm_omni/model_executor/stage_input_processors/bagel.py +++ b/vllm_omni/model_executor/stage_input_processors/bagel.py @@ -82,6 +82,8 @@ def expand_cfg_prompts( neg_prompt = _get_negative_prompt(prompt, sampling_params) if "image" in modalities: + if not neg_prompt: + return [] neg_prompt_dict = { "prompt": neg_prompt, "modalities": prompt.get("modalities", []), @@ -166,6 +168,8 @@ def expand_cfg_prompts_think( companion_params = {"max_tokens": 1} if "image" in modalities: + if not neg_prompt: + return [] neg_prompt_dict = { "prompt": neg_prompt, "modalities": prompt.get("modalities", []), @@ -300,4 +304,4 @@ def _get_negative_prompt( if neg: return neg - return "<|im_start|><|im_end|>" + return "" From b2d79f40df7b40e805b45ed56e0e1e20eaa48f06 Mon Sep 17 00:00:00 2001 From: princepride Date: Tue, 14 Apr 2026 15:41:37 +0000 Subject: [PATCH 2/5] update ut Signed-off-by: princepride --- .../stage_configs/bagel_mooncake_ci.yaml | 1 + .../stage_configs/bagel_sharedmemory_ci.yaml | 1 + .../offline_inference/test_bagel_img2img.py | 40 +++++++++---------- .../offline_inference/test_bagel_text2img.py | 40 +++++++++---------- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml index 1f0d06cb8c0..e64256e4ae4 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml @@ -50,6 +50,7 @@ stage_args: enforce_eager: true trust_remote_code: true distributed_executor_backend: mp + load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml index 36b1d2bbe48..6e918c3ae88 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml @@ -49,6 +49,7 @@ stage_args: enforce_eager: true trust_remote_code: true distributed_executor_backend: "mp" + load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py index 63d2a37da79..be79aa7348a 100644 --- a/tests/e2e/offline_inference/test_bagel_img2img.py +++ b/tests/e2e/offline_inference/test_bagel_img2img.py @@ -32,30 +32,30 @@ # prompt='Change the grass color to red', # input image: 2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (157, 172, 217)}, - {"position": (400, 50), "rgb": (105, 144, 218)}, - {"position": (700, 100), "rgb": (118, 159, 233)}, - {"position": (150, 400), "rgb": (195, 34, 60)}, - {"position": (512, 336), "rgb": (222, 214, 193)}, - {"position": (700, 400), "rgb": (197, 15, 43)}, - {"position": (100, 600), "rgb": (105, 13, 18)}, - {"position": (400, 600), "rgb": (169, 33, 44)}, - {"position": (700, 600), "rgb": (101, 86, 93)}, - {"position": (256, 256), "rgb": (181, 202, 222)}, + {"position": (100, 100), "rgb": (156, 172, 217)}, + {"position": (400, 50), "rgb": (105, 144, 217)}, + {"position": (700, 100), "rgb": (118, 159, 232)}, + {"position": (150, 400), "rgb": (180, 22, 52)}, + {"position": (512, 336), "rgb": (221, 211, 194)}, + {"position": (700, 400), "rgb": (192, 10, 46)}, + {"position": (100, 600), "rgb": (102, 12, 22)}, + {"position": (400, 600), "rgb": (161, 28, 47)}, + {"position": (700, 600), "rgb": (100, 87, 94)}, + {"position": (256, 256), "rgb": (181, 201, 221)}, ] if current_omni_platform.is_rocm(): REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (156, 172, 215)}, - {"position": (400, 50), "rgb": (106, 144, 216)}, - {"position": (700, 100), "rgb": (118, 158, 231)}, - {"position": (150, 400), "rgb": (183, 23, 48)}, - {"position": (512, 336), "rgb": (218, 215, 191)}, - {"position": (700, 400), "rgb": (194, 14, 42)}, - {"position": (100, 600), "rgb": (105, 10, 16)}, - {"position": (400, 600), "rgb": (167, 33, 46)}, - {"position": (700, 600), "rgb": (102, 86, 92)}, - {"position": (256, 256), "rgb": (181, 201, 220)}, + {"position": (100, 100), "rgb": (156, 172, 217)}, + {"position": (400, 50), "rgb": (105, 144, 217)}, + {"position": (700, 100), "rgb": (118, 159, 232)}, + {"position": (150, 400), "rgb": (180, 22, 52)}, + {"position": (512, 336), "rgb": (221, 211, 194)}, + {"position": (700, 400), "rgb": (192, 10, 46)}, + {"position": (100, 600), "rgb": (102, 12, 22)}, + {"position": (400, 600), "rgb": (161, 28, 47)}, + {"position": (700, 600), "rgb": (100, 87, 94)}, + {"position": (256, 256), "rgb": (181, 201, 221)}, ] PIXEL_TOLERANCE = 10 diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py index e45d64f2ac5..534b8730682 100644 --- a/tests/e2e/offline_inference/test_bagel_text2img.py +++ b/tests/e2e/offline_inference/test_bagel_text2img.py @@ -37,30 +37,30 @@ # "Generated with seed=52, num_inference_steps=15, # prompt='A futuristic city skyline at twilight, cyberpunk style'" REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (121, 118, 100)}, - {"position": (400, 50), "rgb": (163, 162, 143)}, - {"position": (700, 100), "rgb": (170, 156, 127)}, - {"position": (150, 400), "rgb": (129, 127, 112)}, - {"position": (512, 512), "rgb": (135, 61, 59)}, - {"position": (700, 400), "rgb": (205, 107, 43)}, - {"position": (100, 700), "rgb": (197, 177, 157)}, - {"position": (400, 700), "rgb": (139, 107, 86)}, - {"position": (700, 700), "rgb": (247, 205, 146)}, - {"position": (256, 256), "rgb": (171, 160, 153)}, + {"position": (100, 100), "rgb": (115, 113, 94)}, + {"position": (400, 50), "rgb": (159, 160, 144)}, + {"position": (700, 100), "rgb": (164, 151, 123)}, + {"position": (150, 400), "rgb": (120, 121, 107)}, + {"position": (512, 512), "rgb": (165, 133, 127)}, + {"position": (700, 400), "rgb": (217, 130, 66)}, + {"position": (100, 700), "rgb": (191, 168, 152)}, + {"position": (400, 700), "rgb": (130, 96, 77)}, + {"position": (700, 700), "rgb": (247, 203, 140)}, + {"position": (256, 256), "rgb": (167, 156, 150)}, ] if current_omni_platform.is_rocm(): REFERENCE_PIXELS = [ - {"position": (100, 100), "rgb": (123, 119, 100)}, - {"position": (400, 50), "rgb": (162, 161, 142)}, - {"position": (700, 100), "rgb": (171, 156, 127)}, - {"position": (150, 400), "rgb": (131, 128, 112)}, - {"position": (512, 512), "rgb": (134, 61, 59)}, - {"position": (700, 400), "rgb": (204, 107, 43)}, - {"position": (100, 700), "rgb": (201, 180, 165)}, - {"position": (400, 700), "rgb": (140, 108, 87)}, - {"position": (700, 700), "rgb": (247, 205, 145)}, - {"position": (256, 256), "rgb": (171, 160, 153)}, + {"position": (100, 100), "rgb": (115, 113, 94)}, + {"position": (400, 50), "rgb": (159, 160, 144)}, + {"position": (700, 100), "rgb": (164, 151, 123)}, + {"position": (150, 400), "rgb": (120, 121, 107)}, + {"position": (512, 512), "rgb": (165, 133, 127)}, + {"position": (700, 400), "rgb": (217, 130, 66)}, + {"position": (100, 700), "rgb": (191, 168, 152)}, + {"position": (400, 700), "rgb": (130, 96, 77)}, + {"position": (700, 700), "rgb": (247, 203, 140)}, + {"position": (256, 256), "rgb": (167, 156, 150)}, ] # Maximum allowed difference per color channel From 847c9b78a35faf8c04900651c8546565936c7864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Thu, 16 Apr 2026 20:50:27 +0800 Subject: [PATCH 3/5] fix some bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- .../e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml | 2 +- .../offline_inference/stage_configs/bagel_sharedmemory_ci.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml index e64256e4ae4..524d6ad0c97 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml @@ -50,7 +50,7 @@ stage_args: enforce_eager: true trust_remote_code: true distributed_executor_backend: mp - load_format: dummy + diffusion_load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml index 6e918c3ae88..0bec8fb8803 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml @@ -49,7 +49,7 @@ stage_args: enforce_eager: true trust_remote_code: true distributed_executor_backend: "mp" - load_format: dummy + diffusion_load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] From 836ed780a787c7d92f746ede39bdfa6371ad57be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Thu, 16 Apr 2026 22:41:59 +0800 Subject: [PATCH 4/5] fix some bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- vllm_omni/diffusion/diffusion_engine.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index fe940d623e5..e5779e8161a 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -97,12 +97,13 @@ def __init__( self.abort_queue: queue.Queue[str] = queue.Queue() self.execute_fn = self.executor.execute_step if self.step_execution else self.executor.execute_request - try: - self._dummy_run() - except Exception as e: - logger.error(f"Dummy run failed: {e}") - self.close() - raise e + if od_config.diffusion_load_format != "dummy": + try: + self._dummy_run() + except Exception as e: + logger.error(f"Dummy run failed: {e}") + self.close() + raise e def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: diffusion_engine_start_time = time.perf_counter() From ff65d90fbeb9e8d0d994213f005bf282fb78a225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=AA=E5=BF=97=E9=B9=8F?= Date: Thu, 16 Apr 2026 23:23:09 +0800 Subject: [PATCH 5/5] fix some bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: 汪志鹏 --- .../stage_configs/bagel_mooncake_ci.yaml | 1 - .../stage_configs/bagel_sharedmemory_ci.yaml | 1 - vllm_omni/diffusion/diffusion_engine.py | 13 ++++++------- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml index 524d6ad0c97..1f0d06cb8c0 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_mooncake_ci.yaml @@ -50,7 +50,6 @@ stage_args: enforce_eager: true trust_remote_code: true distributed_executor_backend: mp - diffusion_load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml index 0bec8fb8803..36b1d2bbe48 100644 --- a/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml +++ b/tests/e2e/offline_inference/stage_configs/bagel_sharedmemory_ci.yaml @@ -49,7 +49,6 @@ stage_args: enforce_eager: true trust_remote_code: true distributed_executor_backend: "mp" - diffusion_load_format: dummy omni_kv_config: need_recv_cache: true engine_input_source: [0] diff --git a/vllm_omni/diffusion/diffusion_engine.py b/vllm_omni/diffusion/diffusion_engine.py index e5779e8161a..fe940d623e5 100644 --- a/vllm_omni/diffusion/diffusion_engine.py +++ b/vllm_omni/diffusion/diffusion_engine.py @@ -97,13 +97,12 @@ def __init__( self.abort_queue: queue.Queue[str] = queue.Queue() self.execute_fn = self.executor.execute_step if self.step_execution else self.executor.execute_request - if od_config.diffusion_load_format != "dummy": - try: - self._dummy_run() - except Exception as e: - logger.error(f"Dummy run failed: {e}") - self.close() - raise e + try: + self._dummy_run() + except Exception as e: + logger.error(f"Dummy run failed: {e}") + self.close() + raise e def step(self, request: OmniDiffusionRequest) -> list[OmniRequestOutput]: diffusion_engine_start_time = time.perf_counter()