diff --git a/.github/workflows/comfyui-publish.yml b/.github/workflows/comfyui-publish.yml
deleted file mode 100644
index c282bcada26..00000000000
--- a/.github/workflows/comfyui-publish.yml
+++ /dev/null
@@ -1,27 +0,0 @@
-name: ComfyUI Integration -- Publish to Comfy registry
-on:
-  workflow_dispatch:
-  push:
-    tags:
-      - "*"
-    paths:
-      - "apps/ComfyUI-vLLM-Omni/**"
-
-permissions:
-  issues: write
-
-defaults:
-  run:
-    working-directory: apps/ComfyUI-vLLM-Omni
-
-jobs:
-  publish-node:
-    name: Publish Custom Node to registry
-    runs-on: ubuntu-latest
-    steps:
-      - name: ♻️ Check out code
-        uses: actions/checkout@v4
-      - name: 📦 Publish Custom Node
-        uses: Comfy-Org/publish-node-action@main
-        with:
-          personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
diff --git a/.github/workflows/comfyui-validate.yml b/.github/workflows/comfyui-validate.yml
deleted file mode 100644
index 06b9b1f913e..00000000000
--- a/.github/workflows/comfyui-validate.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-name: ComfyUI Integration -- Validate backwards compatibility
-
-on:
-  pull_request:
-    branches:
-      - master
-      - main
-    paths:
-      - "apps/ComfyUI-vLLM-Omni/**"
-
-defaults:
-  run:
-    working-directory: apps/ComfyUI-vLLM-Omni
-
-jobs:
-  check-base-path:
-    runs-on: ubuntu-latest
-    outputs:
-      exists: ${{ steps.check.outputs.exists }}
-    defaults:
-      run:
-        working-directory: .
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.base.sha }}
-          fetch-depth: 1
-      - id: check
-        run: |
-          if [ -d "apps/ComfyUI-vLLM-Omni" ]; then
-            echo "exists=true" >> "$GITHUB_OUTPUT"
-          else
-            echo "exists=false" >> "$GITHUB_OUTPUT"
-          fi
-  validate:
-    needs: check-base-path
-    if: needs.check-base-path.outputs.exists == 'true'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: comfy-org/node-diff@main
diff --git a/apps/ComfyUI-vLLM-Omni/README.md b/apps/ComfyUI-vLLM-Omni/README.md
index dd530174f3f..23bbe5b7293 100644
--- a/apps/ComfyUI-vLLM-Omni/README.md
+++ b/apps/ComfyUI-vLLM-Omni/README.md
@@ -47,12 +47,12 @@ If no, check your shell running the ComfyUI process. There may be some error mes
 This extension offers the following nodes based on the output modalities (at **ComfyUI sidebar -> Node Library**):
 
 - **Generate Image** for text-to-image and image-to-image tasks
-- **Multimodality Comprehension** for multimodality-to-text and multimodality-to-audio tasks
+- **Multimodality Understanding** for multimodality-to-text and multimodality-to-audio tasks
 - **TTS** and **TTS Voice Clone** for TTS tasks
 
 This extension also offers example workflows (at **ComfyUI sidebar -> Templates -> vLLM-Omni**)
 
-> [!INFO]
+> [!NOTE]
 > The node UI and feature designs are intended to match vLLM-Omni online serving interfaces. It cannot offer more than what the interfaces support.
 
 To build a simple workflow yourself,
@@ -65,28 +65,16 @@ To build a simple workflow yourself,
     - For some multi-stage models like BAGEL, [only one stage's sampling parameters are exposed and tunable via vLLM-Omni's online serving API](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/bagel/). Thus, these models are treated as single-stage ones. Please check the vLLM-Omni documentation on how to correctly set each model's sampling parameters.
     - For multi-stage models where all stages are either autoregression or diffusion, you can also connect only a single Sampling Params node, indicating that this set of sampling parameters will be used for all stages.
 
-**The following features are tested**:
-
-- Single-node workflows for
-    - Multimodal Comprehension (e.g., Qwen Omni, BAGEL)
-    - Text-to-Image Generation (e.g., Qwen-Image)
-    - Image-to-Image Generation (e.g., Qwen-Image-Edit)
-    - TTS (e.g., Qwen TTS, including VoiceDesign, VoiceClone, CustomVoice)
-
-**The following features are not currently tested**. They may work or break. You are welcomed to test it out and offer comments.
-
-- Multi-node workflow that connects multiple model services together.
-
 ## Screenshots and Examples
 
-### Multimodal comprehension (e.g., Qwen Omni series, BAGEL)
+### Multimodal understanding (e.g., Qwen Omni series, BAGEL)
 
-(Also available at **ComfyUI sidebar->Template->vLLM-Omni->vLLM-Omni Annotated Example**)
+(Also available at **ComfyUI sidebar->Template->vLLM-Omni->vLLM-Omni Multimodal Understanding**)
 
 <p align="center">
   <picture>
-    <source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-comprehension.jpg">
-    <img alt="vLLM-Omni Main Architecture" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-comprehension.jpg" width=55%>
+    <source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-understanding.jpg">
+    <img alt="vLLM-Omni multimodal understanding" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-understanding.jpg" width=55%>
   </picture>
 </p>
 
@@ -98,7 +86,7 @@ You can configure per-stage sampling parameters for multi-stage models.
 <p align="center">
   <picture>
     <source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-multi-stage.jpg">
-    <img alt="vLLM-Omni Main Architecture" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-multi-stage.jpg" width=55%>
+    <img alt="vLLM-Omni multiple stages" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-multi-stage.jpg" width=55%>
   </picture>
 </p>
 
@@ -109,7 +97,7 @@ You can configure per-stage sampling parameters for multi-stage models.
 <p align="center">
   <picture>
     <source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-image-generation.jpg">
-    <img alt="vLLM-Omni Main Architecture" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-image-generation.jpg" width=55%>
+    <img alt="vLLM-Omni image generation" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-image-generation.jpg" width=55%>
   </picture>
 </p>
 
@@ -123,13 +111,24 @@ You can configure per-stage sampling parameters for multi-stage models.
 <p align="center">
   <picture>
     <source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-tts.jpg">
-    <img alt="vLLM-Omni Main Architecture" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-tts.jpg" width=55%>
+    <img alt="vLLM-Omni TTS" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-tts.jpg" width=55%>
   </picture>
 </p>
 
 > [!TIP]
 > There is a dedicated node for VoiceClone tasks with reference audio input. Other simple text-to-speech tasks should use the regular TTS node.
 
+### Chaining multiple model services
+
+(Also available at **ComfyUI sidebar->Template->vLLM-Omni->vLLM-Omni Chaining Services**)
+
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg">
+    <img alt="vLLM-Omni chaining services" src="https://raw.githubusercontent.com/vllm-project/vllm-omni/refs/heads/main/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg" width=55%>
+  </picture>
+</p>
+
 ## Develop
 
 Follow the [development convention and rules of vLLM-Omni](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/).
diff --git a/apps/ComfyUI-vLLM-Omni/__init__.py b/apps/ComfyUI-vLLM-Omni/__init__.py
index ffb86ddb3c4..641824add78 100644
--- a/apps/ComfyUI-vLLM-Omni/__init__.py
+++ b/apps/ComfyUI-vLLM-Omni/__init__.py
@@ -6,18 +6,18 @@
     "WEB_DIRECTORY",
 ]
 
-__author__ = """Zeyu Huang"""
-__email__ = "11222265+fhfuih@users.noreply.github.com"
+__author__ = """vLLM-Omni Team"""
+__email__ = "vllm-omni@vllm.ai"
 __version__ = "0.0.1"
 
 from .comfyui_vllm_omni.nodes import (
     VLLMOmniARSampling,
-    VLLMOmniComprehension,
     VLLMOmniDiffusionSampling,
     VLLMOmniGenerateImage,
     VLLMOmniQwenTTSParams,
     VLLMOmniSamplingParamsList,
     VLLMOmniTTS,
+    VLLMOmniUnderstanding,
     VLLMOmniVoiceClone,
 )
 
@@ -25,7 +25,7 @@
 NODE_CLASS_MAPPINGS = {
     # === Generation ===
     "VLLMOmniGenerateImage": VLLMOmniGenerateImage,
-    "VLLMOmniComprehension": VLLMOmniComprehension,
+    "VLLMOmniUnderstanding": VLLMOmniUnderstanding,
     "VLLMOmniTTS": VLLMOmniTTS,
     "VLLMOmniVoiceClone": VLLMOmniVoiceClone,
     # === Params ===
@@ -39,7 +39,7 @@
 NODE_DISPLAY_NAME_MAPPINGS = {
     # === Generation ===
     "VLLMOmniGenerateImage": "Generate Image",
-    "VLLMOmniComprehension": "Multimodality Comprehension",
+    "VLLMOmniUnderstanding": "Multimodality Understanding",
     "VLLMOmniTTS": "TTS (Text to Speech)",
     "VLLMOmniVoiceClone": "TTS Voice Cloning",
     # === Params ===
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py
index ebc2822df64..5caa3869ed3 100644
--- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py
+++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py
@@ -1,4 +1,4 @@
-from typing import Literal, cast
+from typing import Literal
 
 import torch
 from comfy_api.input import AudioInput, VideoInput
@@ -6,7 +6,12 @@
 from .utils.api_client import VLLMOmniClient
 from .utils.logger import get_logger
 from .utils.models import lookup_model_spec
-from .utils.types import AudioFormat
+from .utils.types import (
+    AudioFormat,
+    AutoregressionSamplingParams,
+    DiffusionSamplingParams,
+    QwenTTSModelSpecificParams,
+)
 from .utils.validators import (
     add_sampling_parameters_to_stage,
     validate_model_and_sampling_params_types,
@@ -86,7 +91,10 @@ async def generate(
 
         # Prefer DALL-E compatible API for simple (one-stage) diffusion models
         if (spec is None or spec["stages"] == ["diffusion"]) and not is_bagel:
-            sampling_params = cast(dict | None, sampling_params)
+            # The number of sampling parameter groups should have been validated.
+            # Now, simply convert single-item list to dict.
+            if isinstance(sampling_params, list):
+                sampling_params = sampling_params[0]
             if audio is None and image is None and video is None:
                 # No multimodal input --- use DALL-E image generation
                 logger.info("Using DALL-E image generation endpoint")
@@ -133,7 +141,7 @@ async def generate(
         return (output,)
 
 
-class VLLMOmniComprehension(_VLLMOmniGenerateBase):
+class VLLMOmniUnderstanding(_VLLMOmniGenerateBase):
     @classmethod
     def INPUT_TYPES(cls):
         return {
@@ -197,7 +205,7 @@ async def generate(
             (
                 text_response,
                 _,
-            ) = await client.generate_comprehension_chat_completion(
+            ) = await client.generate_understanding_chat_completion(
                 model=model,
                 prompt=prompt,
                 image=image,
@@ -221,7 +229,7 @@ async def generate(
             (
                 text_response,
                 audio,
-            ) = await client.generate_comprehension_chat_completion(
+            ) = await client.generate_understanding_chat_completion(
                 model=model,
                 prompt=prompt,
                 image=image,
@@ -287,15 +295,13 @@ async def generate(
         logger.info("Got extra kwargs in TTS: %s", kwargs)
 
         is_qwen_tts = "qwen3-tts" in model.lower()
-        extra_params_type = None if model_specific_params is None else model_specific_params["type"]
-        if not is_qwen_tts and extra_params_type == "qwen-tts":
+        if not is_qwen_tts and isinstance(model_specific_params, QwenTTSModelSpecificParams):
             raise ValueError(
                 "You have provided Qwen-specific TTS params."
                 "However, the model appears to not be a Qwen TTS model (no 'Qwen3-TTS' in model name)."
             )
 
         combined_params = {**kwargs, **(model_specific_params or {})}
-        combined_params.pop("type", None)  # Internal fields in model_specific_params
 
         client = VLLMOmniClient(url)
 
@@ -352,8 +358,7 @@ async def generate(
         **kwargs,
     ):
         is_qwen_tts = "qwen3-tts" in model.lower()
-        extra_params_type = None if model_specific_params is None else model_specific_params["type"]
-        if not is_qwen_tts and extra_params_type == "qwen-tts":
+        if not is_qwen_tts and isinstance(model_specific_params, QwenTTSModelSpecificParams):
             raise ValueError(
                 "You have provided Qwen-specific TTS params."
                 "However, the model appears to not be a Qwen TTS model (no 'Qwen3-TTS' in model name)."
@@ -366,7 +371,6 @@ async def generate(
             **kwargs,
             **(model_specific_params or {}),
         }
-        combined_params.pop("type", None)  # Internal fields in model_specific_params
 
         client = VLLMOmniClient(url)
 
@@ -419,10 +423,7 @@ def INPUT_TYPES(cls):
     CATEGORY = "vLLM-Omni/Sampling Params"
 
     def get_params(self, seed, **kwargs):
-        params = {
-            "type": "autoregression",  # for internal use, removed before sending the request
-            **kwargs,
-        }
+        params = AutoregressionSamplingParams(kwargs)
         if seed >= 0:
             params["seed"] = seed
         return (params,)
@@ -479,6 +480,13 @@ def INPUT_TYPES(cls):
                         "tooltip": "Enable VAE slicing for reduced memory usage (slight quality trade-off)",
                     },
                 ),
+                "vae_use_tiling": (
+                    "BOOLEAN",
+                    {
+                        "default": False,
+                        "tooltip": "Enable VAE tiling for reduced memory usage (slight quality trade-off)",
+                    },
+                ),
                 # === Put seed at last. ===
                 # Whenever a field named "seed" is present, ComfyUI adds another field called "control after generate"
                 "seed": (
@@ -499,10 +507,7 @@ def INPUT_TYPES(cls):
     CATEGORY = "vLLM-Omni/Sampling Params"
 
     def get_params(self, seed, **kwargs):
-        params = {
-            "type": "diffusion",  # for internal use, removed before sending the request
-            **kwargs,
-        }
+        params = DiffusionSamplingParams(kwargs)
         if seed >= 0:
             params["seed"] = seed
         return (params,)
@@ -566,4 +571,4 @@ def INPUT_TYPES(cls):
     CATEGORY = "vLLM-Omni/TTS Params"
 
     def get_params(self, **kwargs):
-        return ({"type": "qwen-tts", **kwargs},)
+        return (QwenTTSModelSpecificParams(kwargs),)
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py
index c5372641f51..d340073bc24 100644
--- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py
+++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py
@@ -57,16 +57,7 @@ async def generate_image(
         if negative_prompt:
             payload["negative_prompt"] = negative_prompt
         if sampling_params is not None:
-            # Only select specific sampling params
-            for k in (
-                "n",
-                "num_inference_steps",
-                "guidance_scale",
-                "true_cfg_scale",
-                "vae_use_slicing",
-            ):
-                if k in sampling_params and sampling_params[k] is not None:
-                    payload[k] = sampling_params[k]
+            payload.update(sampling_params)
         logger.debug("img gen payload: %s", payload)
 
         url = self.base_url + "/images/generations"
@@ -138,10 +129,8 @@ async def edit_image(
         if negative_prompt:
             form.add_field("negative_prompt", negative_prompt)
         if sampling_params is not None:
-            # Only select specific sampling params
-            for k in ("n", "num_inference_steps", "guidance_scale", "true_cfg_scale"):
-                if k in sampling_params and sampling_params[k] is not None:
-                    form.add_field(k, str(sampling_params[k]))
+            for k, v in sampling_params.items():
+                form.add_field(k, str(v))
         if mask is not None:
             mask_filename = "mask.png"
             form.add_field(
@@ -217,7 +206,7 @@ async def generate_image_chat_completion(
 
         return torch.stack(image_tensors, dim=0)
 
-    async def generate_comprehension_chat_completion(
+    async def generate_understanding_chat_completion(
         self,
         *,
         model: str,
@@ -427,30 +416,28 @@ def _prepare_chat_completion_messages(
             message_content.append({"type": "video_url", "video_url": {"url": video_to_base64(video)}})
         messages = [{"role": "user", "content": message_content}]
 
+        payload: dict[str, Any] = {"messages": messages, "model": model}
+        if modalities:
+            payload["modalities"] = modalities
+
         combined_extra_body: dict[str, Any] = {}
         if sampling_params is not None:
             spec, _ = lookup_model_spec(model)
             is_single_sampling_param = isinstance(sampling_params, dict) or len(sampling_params) == 1
 
-            # Exclude internal key
-            if isinstance(sampling_params, dict):
-                sampling_params = {k: v for k, v in sampling_params.items() if k != "type"}
-            else:
-                sampling_params = [{k: v for k, v in sp.items() if k != "type"} for sp in sampling_params]
-
             if (spec is None and is_single_sampling_param) or (spec is not None and spec["stages"] == ["diffusion"]):
                 # Diffusion format: extra_body directly contains sampling params.
-                # Validation should have taken care of matching sampling params' types.
+                # Validation has already taken care of matching sampling params' types and length. Safe to take [0].
                 # * Use this mode if the model is a simple one-stage diffusion model.
                 # * Fallback to this mode if model is not registered and a single sampling param is provided.
                 sampling_params = sampling_params if isinstance(sampling_params, dict) else sampling_params[0]
-                combined_extra_body: dict[str, Any] = {**sampling_params}
+                combined_extra_body: dict[str, Any] = sampling_params.copy()
                 if "n" in combined_extra_body:
-                    combined_extra_body["num_outputs_per_prompt"] = combined_extra_body["n"]
-                    del combined_extra_body["n"]
+                    combined_extra_body["num_outputs_per_prompt"] = combined_extra_body.pop("n")
             else:
-                # Use AR style payload, extra_body has a sampling_params_list field
-                combined_extra_body: dict[str, Any] = {"sampling_params_list": sampling_params}
+                # AR format: the payload has a sampling_params_list field, containing a list.
+                sampling_params_list = sampling_params if isinstance(sampling_params, list) else [sampling_params]
+                payload["sampling_params_list"] = sampling_params_list
 
         if negative_prompt:
             combined_extra_body["negative_prompt"] = negative_prompt
@@ -458,12 +445,11 @@ def _prepare_chat_completion_messages(
         if extra_body:
             combined_extra_body.update(extra_body)
 
-        payload: dict[str, Any] = {"messages": messages, "model": model}
+        # Add extra_body only if it has any content.
         if combined_extra_body:
             payload["extra_body"] = combined_extra_body
-        if modalities:
-            payload["modalities"] = modalities
 
+        # Place to inject any model-specific payload adjustment
         spec, _ = lookup_model_spec(model)
         if spec:
             preprocessor = spec.get("payload_preprocessor", None)
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py
index 83bf64fe7d4..bfeddd82b87 100644
--- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py
+++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py
@@ -11,8 +11,7 @@ def _bagel_payload_preprocessor(payload: dict) -> dict:
                     content["text"] = "<|im_start|>" + content["text"] + "<|im_end|>"
     except (KeyError, TypeError):
         raise RuntimeError("Internal Error: malformatted BAGEL payload")
-    extra_body = payload.pop("extra_body", {})
-    return {**payload, **extra_body}
+    return payload
 
 
 def _qwen25_payload_preprocessor(payload: dict) -> dict:
@@ -37,7 +36,7 @@ def _qwen25_payload_preprocessor(payload: dict) -> dict:
         ],
         "modes": [
             {
-                "mode": ModelMode.COMPREHENSION,
+                "mode": ModelMode.UNDERSTANDING,
                 "input_modalities": [Modality.TEXT, Modality.IMAGE],
             }
         ],
@@ -48,7 +47,7 @@ def _qwen25_payload_preprocessor(payload: dict) -> dict:
         "payload_preprocessor": _qwen25_payload_preprocessor,
         "modes": [
             {
-                "mode": ModelMode.COMPREHENSION,
+                "mode": ModelMode.UNDERSTANDING,
                 "input_modalities": [
                     Modality.TEXT,
                     Modality.IMAGE,
@@ -62,7 +61,7 @@ def _qwen25_payload_preprocessor(payload: dict) -> dict:
         "stages": ["autoregression", "autoregression", "autoregression"],
         "modes": [
             {
-                "mode": ModelMode.COMPREHENSION,
+                "mode": ModelMode.UNDERSTANDING,
                 "input_modalities": [
                     Modality.TEXT,
                     Modality.IMAGE,
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py
index 29be9648c9c..6cc79f3ed97 100644
--- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py
+++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py
@@ -11,11 +11,23 @@
 AudioFormat: TypeAlias = Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]
 
 
+class AutoregressionSamplingParams(dict):
+    pass
+
+
+class DiffusionSamplingParams(dict):
+    pass
+
+
+class QwenTTSModelSpecificParams(dict):
+    pass
+
+
 class ModelMode(Enum):
     IMAGE_GENERATION = auto()
     VIDEO_GENERATION = auto()
     AUDIO_GENERATION = auto()
-    COMPREHENSION = auto()
+    UNDERSTANDING = auto()
 
 
 class Modality(Enum):
diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py
index f1af7b2e97d..2abdd21006e 100644
--- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py
+++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py
@@ -1,5 +1,6 @@
 from .logger import get_logger
 from .models import lookup_model_spec
+from .types import AutoregressionSamplingParams, DiffusionSamplingParams
 
 logger = get_logger(__name__)
 
@@ -31,24 +32,20 @@ def validate_model_and_sampling_params_types(
             )
         # Check that each stage's type match
         for i, sp in enumerate(sampling_param_list):
-            if "type" not in sp:
-                raise RuntimeError("Internal error: unknown sampling parameter type")
-            if sp["type"] != stages[i]:
+            if not _check_sampling_param_matches_stage(sp, stages[i]):
                 raise ValueError(
-                    f"Sampling parameter type ({sp['type']}) does not match "
+                    f"Sampling parameter type ({sp.__class__.__name__}) does not match "
                     f"stage type ({stages[i]}) at index {i} for model {model_name}."
                 )
     elif isinstance(sampling_param_list, dict):
-        if "type" not in sampling_param_list:
-            raise RuntimeError("Internal error: unknown sampling parameter type")
         # Check that the provided single sampling param matches all stages
-        elif any(stage != sampling_param_list["type"] for stage in stages):
-            raise ValueError(
-                f"When passing a single sampling parameter node, all stages of the model must match "
-                f"the provided sampling parameter's type. "
-                f"However, the stages of model {model_name} are: {stages}. "
-                f"The provided sampling parameter is {sampling_param_list['type']}"
-            )
+        for i, stage in enumerate(stages):
+            if not _check_sampling_param_matches_stage(sampling_param_list, stage):
+                raise ValueError(
+                    f"Provided single sampling parameter type ({sampling_param_list.__class__.__name__}) must match "
+                    f"the types of all stages of the model. "
+                    f"However, stage {i} of model {model_name} is of type {stage}."
+                )
 
 
 def add_sampling_parameters_to_stage(
@@ -82,3 +79,11 @@ def add_sampling_parameters_to_stage(
                 sampling_param_list[i].update(params_to_add)
 
     return sampling_param_list
+
+
+def _check_sampling_param_matches_stage(sampling_param: dict, stage_type: str) -> bool:
+    if stage_type == "autoregression":
+        return isinstance(sampling_param, AutoregressionSamplingParams)
+    if stage_type == "diffusion":
+        return isinstance(sampling_param, DiffusionSamplingParams)
+    raise RuntimeError(f"Internal error: unknown stage type {stage_type}.")
diff --git a/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg b/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg
new file mode 100644
index 00000000000..20d9d077938
Binary files /dev/null and b/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg differ
diff --git a/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-comprehension.jpg b/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-understanding.jpg
similarity index 100%
rename from apps/ComfyUI-vLLM-Omni/docs/images/comfyui-comprehension.jpg
rename to apps/ComfyUI-vLLM-Omni/docs/images/comfyui-understanding.jpg
diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Annotated Example.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Annotated Example.json
deleted file mode 100644
index 16657df0efc..00000000000
--- a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Annotated Example.json	
+++ /dev/null
@@ -1 +0,0 @@
-{"id":"1c99f525-0a37-45ba-a28a-7df7c3af66b4","revision":0,"last_node_id":12,"last_link_id":14,"nodes":[{"id":1,"type":"VLLMOmniComprehension","pos":[1191.2177053682556,144.66829928181377],"size":[400,268],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"image","name":"image","shape":7,"type":"IMAGE","link":1},{"localized_name":"video","name":"video","shape":7,"type":"VIDEO","link":2},{"localized_name":"audio","name":"audio","shape":7,"type":"AUDIO","link":3},{"localized_name":"sampling_params","name":"sampling_params","shape":7,"type":"SAMPLING_PARAMS","link":14},{"localized_name":"url","name":"url","type":"STRING","widget":{"name":"url"},"link":null},{"localized_name":"model","name":"model","type":"STRING","widget":{"name":"model"},"link":null},{"localized_name":"prompt","name":"prompt","type":"STRING","widget":{"name":"prompt"},"link":null},{"localized_name":"output_text","name":"output_text","type":"BOOLEAN","widget":{"name":"output_text"},"link":null},{"localized_name":"output_audio","name":"output_audio","type":"BOOLEAN","widget":{"name":"output_audio"},"link":null},{"localized_name":"use_audio_in_video","name":"use_audio_in_video","type":"BOOLEAN","widget":{"name":"use_audio_in_video"},"link":null}],"outputs":[{"localized_name":"text_response","name":"text_response","type":"STRING","links":[8]},{"localized_name":"audio_response","name":"audio_response","type":"AUDIO","links":[9]}],"properties":{"Node name for S&R":"VLLMOmniComprehension"},"widgets_values":["http://localhost:8000/v1","Qwen/Qwen2.5-Omni-7B","",true,true,true]},{"id":3,"type":"LoadVideo","pos":[729.5984141255855,-198.631920454299],"size":[282.798828125,233.0743408203125],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null},{"localized_name":"choose file to 
upload","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"VIDEO","name":"VIDEO","type":"VIDEO","links":[2]}],"properties":{"Node name for S&R":"LoadVideo"},"widgets_values":["draw.mp4","image"]},{"id":4,"type":"LoadAudio","pos":[729.8037086965753,99.86963519703949],"size":[282.798828125,136],"flags":{},"order":1,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[3]}],"properties":{"Node name for S&R":"LoadAudio"},"widgets_values":["Megan-Fox.mp3",null,null]},{"id":5,"type":"VLLMOmniARSampling","pos":[510.3517536642828,658.073751009259],"size":[270,178],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"max_tokens","name":"max_tokens","type":"INT","widget":{"name":"max_tokens"},"link":null},{"localized_name":"temperature","name":"temperature","type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"repetition_penalty","name":"repetition_penalty","type":"FLOAT","widget":{"name":"repetition_penalty"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"AR sampling params","name":"AR sampling params","type":"SAMPLING_PARAMS","links":[12]}],"properties":{"Node name for 
S&R":"VLLMOmniARSampling"},"widgets_values":[100,1,1,1,-1,"randomize"]},{"id":7,"type":"VLLMOmniARSampling","pos":[503.33235181647115,419.34158016181806],"size":[270,178],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"max_tokens","name":"max_tokens","type":"INT","widget":{"name":"max_tokens"},"link":null},{"localized_name":"temperature","name":"temperature","type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"repetition_penalty","name":"repetition_penalty","type":"FLOAT","widget":{"name":"repetition_penalty"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"AR sampling params","name":"AR sampling params","type":"SAMPLING_PARAMS","links":[5,13]}],"properties":{"Node name for S&R":"VLLMOmniARSampling"},"widgets_values":[100,1,1,1,-1,"randomize"]},{"id":8,"type":"VLLMOmniSamplingParamsList","pos":[820.6056617389042,426.38372037182273],"size":[263.066015625,66],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"param1","name":"param1","type":"SAMPLING_PARAMS","link":5},{"localized_name":"param2","name":"param2","shape":7,"type":"SAMPLING_PARAMS","link":12},{"localized_name":"param3","name":"param3","shape":7,"type":"SAMPLING_PARAMS","link":13}],"outputs":[{"localized_name":"param list","name":"param list","type":"SAMPLING_PARAMS","links":[14]}],"properties":{"Node name for S&R":"VLLMOmniSamplingParamsList"},"widgets_values":[]},{"id":11,"type":"MarkdownNote","pos":[826.2328280438272,569.1890318701705],"size":[333.8220435590464,261.63596728060656],"flags":{},"order":4,"mode":0,"inputs":[],"outputs":[],"title":"Note: Sampling Parameters","properties":{},"widgets_values":["## Sampling Parameter Types\n\nThere are two types of sampling parameters: one for autoregression and one for diffusion.\nYou should ensure that you have chosen the correct type of 
sampling parameters for the model you request.\n\n## Stages & Shorthand\n\nFor multi-stage models such as Qwen Omni, you can either\n- connect one sampling parameter node, which is applied to all stages.\n- connect exactly the same number of sampling parameter nodes to a \"Multi-Stage Sampling Parameter List\" node, then connect this node to the primary request node.\n\nNote that this shorthand is intended to stay consistent with the [online serving API](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/qwen2_5_omni/)"],"color":"#432","bgcolor":"#000"},{"id":12,"type":"MarkdownNote","pos":[378.9866207003777,152.59550252215752],"size":[319.7287574247016,107.15904081906785],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"Note: Input","properties":{},"widgets_values":["Note that not all models support every modality as input. For example, `ByteDance-Seed/BAGEL-7B-MoT` in Multimodality Comprehension mode only support text and image input.\n\nYou should ensure that the input are supported by the model. 
You can check the corresponding [online serving documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/bagel/) for confirmation."],"color":"#432","bgcolor":"#000"},{"id":2,"type":"LoadImage","pos":[394.4674804308822,-207.6987397548834],"size":[282.798828125,314],"flags":{},"order":5,"mode":0,"inputs":[{"localized_name":"image","name":"image","type":"COMBO","widget":{"name":"image"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"IMAGE","name":"IMAGE","type":"IMAGE","links":[1]},{"localized_name":"MASK","name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":10,"type":"PreviewAudio","pos":[1664.548345556043,297.5921292054054],"size":[270,88],"flags":{},"order":10,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":9},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"Node name for S&R":"PreviewAudio"},"widgets_values":[]},{"id":9,"type":"ShowText|pysssss","pos":[1649.2506875091847,66.22823888292349],"size":[318.7188464232943,173.38502269972975],"flags":{},"order":9,"mode":0,"inputs":[{"localized_name":"text","name":"text","type":"STRING","link":8}],"outputs":[{"localized_name":"STRING","name":"STRING","shape":6,"type":"STRING","links":null}],"properties":{"Node name for S&R":"ShowText|pysssss"},"widgets_values":[]}],"links":[[1,2,0,1,0,"IMAGE"],[2,3,0,1,1,"VIDEO"],[3,4,0,1,2,"AUDIO"],[5,7,0,8,0,"SAMPLING_PARAMS"],[8,1,0,9,0,"STRING"],[9,1,1,10,0,"AUDIO"],[12,5,0,8,1,"SAMPLING_PARAMS"],[13,7,0,8,2,"SAMPLING_PARAMS"],[14,8,0,1,3,"SAMPLING_PARAMS"]],"groups":[{"id":1,"title":"Sampling 
Parameters","bounding":[480.1649301181556,341.08402513937995,692.4113568972277,510.48648853403665],"color":"#3f789e","font_size":24,"flags":{}},{"id":2,"title":"Input","bounding":[344.2364817813528,-287.5850484183313,704.8832238768634,559.5124009832894],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"workflowRendererVersion":"LG","ds":{"scale":0.9478575057427204,"offset":[33.81037136029557,307.1974296197726]}},"version":0.4}
diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Chaining Services.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Chaining Services.json
new file mode 100644
index 00000000000..3031f83444e
--- /dev/null
+++ b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Chaining Services.json	
@@ -0,0 +1,552 @@
+{
+    "id": "6643e5fd-fa2a-4f25-935a-483173a8097c",
+    "revision": 0,
+    "last_node_id": 8,
+    "last_link_id": 6,
+    "nodes": [
+        {
+            "id": 1,
+            "type": "PreviewImage",
+            "pos": [
+                1446.2005205859512,
+                -316.7359686049902
+            ],
+            "size": [
+                305.6628685610174,
+                307.0336884172169
+            ],
+            "flags": {},
+            "order": 5,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "images",
+                    "name": "images",
+                    "type": "IMAGE",
+                    "link": 1
+                }
+            ],
+            "outputs": [],
+            "properties": {
+                "Node name for S&R": "PreviewImage"
+            },
+            "widgets_values": []
+        },
+        {
+            "id": 8,
+            "type": "PreviewImage",
+            "pos": [
+                1010.1848219273943,
+                -311.54216706114363
+            ],
+            "size": [
+                305.6628685610174,
+                307.0336884172169
+            ],
+            "flags": {},
+            "order": 4,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "images",
+                    "name": "images",
+                    "type": "IMAGE",
+                    "link": 6
+                }
+            ],
+            "outputs": [],
+            "properties": {
+                "Node name for S&R": "PreviewImage"
+            },
+            "widgets_values": []
+        },
+        {
+            "id": 7,
+            "type": "VLLMOmniDiffusionSampling",
+            "pos": [
+                234.61958293819296,
+                9.405681270043619
+            ],
+            "size": [
+                284.205078125,
+                226
+            ],
+            "flags": {},
+            "order": 0,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "n",
+                    "name": "n",
+                    "type": "INT",
+                    "widget": {
+                        "name": "n"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "num_inference_steps",
+                    "name": "num_inference_steps",
+                    "type": "INT",
+                    "widget": {
+                        "name": "num_inference_steps"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "guidance_scale",
+                    "name": "guidance_scale",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "guidance_scale"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "true_cfg_scale",
+                    "name": "true_cfg_scale",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "true_cfg_scale"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "vae_use_slicing",
+                    "name": "vae_use_slicing",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "vae_use_slicing"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "vae_use_tiling",
+                    "name": "vae_use_tiling",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "vae_use_tiling"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "seed",
+                    "name": "seed",
+                    "type": "INT",
+                    "widget": {
+                        "name": "seed"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "diffusion sampling params",
+                    "name": "diffusion sampling params",
+                    "type": "SAMPLING_PARAMS",
+                    "links": [
+                        5
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniDiffusionSampling"
+            },
+            "widgets_values": [
+                1,
+                50,
+                1,
+                1,
+                false,
+                false,
+                1525,
+                "randomize"
+            ]
+        },
+        {
+            "id": 4,
+            "type": "VLLMOmniDiffusionSampling",
+            "pos": [
+                666.8380026154548,
+                268.86271068330126
+            ],
+            "size": [
+                284.205078125,
+                226
+            ],
+            "flags": {},
+            "order": 1,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "n",
+                    "name": "n",
+                    "type": "INT",
+                    "widget": {
+                        "name": "n"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "num_inference_steps",
+                    "name": "num_inference_steps",
+                    "type": "INT",
+                    "widget": {
+                        "name": "num_inference_steps"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "guidance_scale",
+                    "name": "guidance_scale",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "guidance_scale"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "true_cfg_scale",
+                    "name": "true_cfg_scale",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "true_cfg_scale"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "vae_use_slicing",
+                    "name": "vae_use_slicing",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "vae_use_slicing"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "vae_use_tiling",
+                    "name": "vae_use_tiling",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "vae_use_tiling"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "seed",
+                    "name": "seed",
+                    "type": "INT",
+                    "widget": {
+                        "name": "seed"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "diffusion sampling params",
+                    "name": "diffusion sampling params",
+                    "type": "SAMPLING_PARAMS",
+                    "links": [
+                        2
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniDiffusionSampling"
+            },
+            "widgets_values": [
+                4,
+                50,
+                7,
+                1,
+                false,
+                false,
+                42,
+                "fixed"
+            ]
+        },
+        {
+            "id": 5,
+            "type": "VLLMOmniGenerateImage",
+            "pos": [
+                984.723613585788,
+                63.376900027553276
+            ],
+            "size": [
+                416.56628685610167,
+                372.1662621294205
+            ],
+            "flags": {},
+            "order": 3,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "image",
+                    "name": "image",
+                    "shape": 7,
+                    "type": "IMAGE",
+                    "link": 4
+                },
+                {
+                    "localized_name": "mask",
+                    "name": "mask",
+                    "shape": 7,
+                    "type": "MASK",
+                    "link": null
+                },
+                {
+                    "localized_name": "sampling_params",
+                    "name": "sampling_params",
+                    "shape": 7,
+                    "type": "SAMPLING_PARAMS",
+                    "link": 2
+                },
+                {
+                    "localized_name": "url",
+                    "name": "url",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "url"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "model",
+                    "name": "model",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "model"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "prompt",
+                    "name": "prompt",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "prompt"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "negative_prompt",
+                    "name": "negative_prompt",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "negative_prompt"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "width",
+                    "name": "width",
+                    "type": "INT",
+                    "widget": {
+                        "name": "width"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "height",
+                    "name": "height",
+                    "type": "INT",
+                    "widget": {
+                        "name": "height"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "image",
+                    "name": "image",
+                    "type": "IMAGE",
+                    "links": [
+                        1
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniGenerateImage"
+            },
+            "widgets_values": [
+                "http://localhost:8001/v1",
+                "/home/models/Qwen/Qwen-Image-Edit",
+                "A high-quality, high contrast, stylized portrait of the object in the uploaded reference image. Pop art and doodle style, with abundant scribbling patterns, such as a teal crown, orange lightning bolts, colorful handwritten scripts.",
+                "Realistic",
+                800,
+                800
+            ]
+        },
+        {
+            "id": 6,
+            "type": "VLLMOmniGenerateImage",
+            "pos": [
+                541.2702717429818,
+                -160.70392744708548
+            ],
+            "size": [
+                416.56628685610167,
+                372.1662621294205
+            ],
+            "flags": {},
+            "order": 2,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "image",
+                    "name": "image",
+                    "shape": 7,
+                    "type": "IMAGE",
+                    "link": null
+                },
+                {
+                    "localized_name": "mask",
+                    "name": "mask",
+                    "shape": 7,
+                    "type": "MASK",
+                    "link": null
+                },
+                {
+                    "localized_name": "sampling_params",
+                    "name": "sampling_params",
+                    "shape": 7,
+                    "type": "SAMPLING_PARAMS",
+                    "link": 5
+                },
+                {
+                    "localized_name": "url",
+                    "name": "url",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "url"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "model",
+                    "name": "model",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "model"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "prompt",
+                    "name": "prompt",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "prompt"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "negative_prompt",
+                    "name": "negative_prompt",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "negative_prompt"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "width",
+                    "name": "width",
+                    "type": "INT",
+                    "widget": {
+                        "name": "width"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "height",
+                    "name": "height",
+                    "type": "INT",
+                    "widget": {
+                        "name": "height"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "image",
+                    "name": "image",
+                    "type": "IMAGE",
+                    "links": [
+                        4,
+                        6
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniGenerateImage"
+            },
+            "widgets_values": [
+                "http://localhost:8000/v1",
+                "/home/models/Tongyi-MAI/Z-Image-Turbo",
+                "A headshot of a cute Siamese kitty. Blurred background due to wide aperture. Close-up look. Realistic.",
+                "Cartoonish.",
+                800,
+                800
+            ]
+        }
+    ],
+    "links": [
+        [
+            1,
+            5,
+            0,
+            1,
+            0,
+            "IMAGE"
+        ],
+        [
+            2,
+            4,
+            0,
+            5,
+            2,
+            "SAMPLING_PARAMS"
+        ],
+        [
+            4,
+            6,
+            0,
+            5,
+            0,
+            "IMAGE"
+        ],
+        [
+            5,
+            7,
+            0,
+            6,
+            2,
+            "SAMPLING_PARAMS"
+        ],
+        [
+            6,
+            6,
+            0,
+            8,
+            0,
+            "IMAGE"
+        ]
+    ],
+    "groups": [],
+    "config": {},
+    "extra": {
+        "workflowRendererVersion": "LG",
+        "ds": {
+            "scale": 1.0426432563169903,
+            "offset": [
+                -90.4071374799346,
+                575.2948920069742
+            ]
+        }
+    },
+    "version": 0.4
+}
diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json
index d1495bc9459..86194ee70d5 100644
--- a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json	
+++ b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json	
@@ -1 +1 @@
-{"id":"91f75acc-8040-40f6-865a-2e8a7cfd6672","revision":0,"last_node_id":11,"last_link_id":23,"nodes":[{"id":3,"type":"PreviewImage","pos":[1281.8455167767304,-69.02638461454333],"size":[305.6628685610174,307.0336884172169],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"images","name":"images","type":"IMAGE","link":22}],"outputs":[],"properties":{"Node name for S&R":"PreviewImage"},"widgets_values":[]},{"id":11,"type":"VLLMOmniGenerateImage","pos":[816.0962515873642,-137.51112584854445],"size":[400,278],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"image","name":"image","shape":7,"type":"IMAGE","link":21},{"localized_name":"mask","name":"mask","shape":7,"type":"MASK","link":null},{"localized_name":"sampling_params","name":"sampling_params","shape":7,"type":"SAMPLING_PARAMS","link":23},{"localized_name":"url","name":"url","type":"STRING","widget":{"name":"url"},"link":null},{"localized_name":"model","name":"model","type":"STRING","widget":{"name":"model"},"link":null},{"localized_name":"prompt","name":"prompt","type":"STRING","widget":{"name":"prompt"},"link":null},{"localized_name":"negative_prompt","name":"negative_prompt","type":"STRING","widget":{"name":"negative_prompt"},"link":null},{"localized_name":"width","name":"width","type":"INT","widget":{"name":"width"},"link":null},{"localized_name":"height","name":"height","type":"INT","widget":{"name":"height"},"link":null}],"outputs":[{"localized_name":"image","name":"image","type":"IMAGE","links":[22]}],"properties":{"Node name for S&R":"VLLMOmniGenerateImage"},"widgets_values":["http://localhost:8000/v1","Qwen/Qwen-Image-Edit","Put this figure in a realistic mountain 
view","",512,512]},{"id":8,"type":"VLLMOmniDiffusionSampling","pos":[478.59266934006774,183.67711984955648],"size":[284.205078125,202],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"n","name":"n","type":"INT","widget":{"name":"n"},"link":null},{"localized_name":"num_inference_steps","name":"num_inference_steps","type":"INT","widget":{"name":"num_inference_steps"},"link":null},{"localized_name":"guidance_scale","name":"guidance_scale","type":"FLOAT","widget":{"name":"guidance_scale"},"link":null},{"localized_name":"true_cfg_scale","name":"true_cfg_scale","type":"FLOAT","widget":{"name":"true_cfg_scale"},"link":null},{"localized_name":"vae_use_slicing","name":"vae_use_slicing","type":"BOOLEAN","widget":{"name":"vae_use_slicing"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"diffusion sampling params","name":"diffusion sampling params","type":"SAMPLING_PARAMS","links":[23]}],"properties":{"Node name for S&R":"VLLMOmniDiffusionSampling"},"widgets_values":[4,50,1,1,false,42,"fixed"]},{"id":10,"type":"MarkdownNote","pos":[227.99306819575278,-231.24143306069843],"size":[240.3326478922474,136.3505820791881],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[],"title":"Note: Task and Input","properties":{},"widgets_values":["vLLM-Omni nodes are categorized based on the output modality. The \"Generate Image\" node supports both text-to-image generation or image-to-image generation (a.k.a. image editing). 
The node will route to the correct endpoint depending on whether an input image is present or not."],"color":"#432","bgcolor":"#000"},{"id":4,"type":"LoadImage","pos":[496.31859627609606,-229.71277089860084],"size":[270,314],"flags":{},"order":1,"mode":0,"inputs":[{"localized_name":"image","name":"image","type":"COMBO","widget":{"name":"image"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"IMAGE","name":"IMAGE","type":"IMAGE","links":[21]},{"localized_name":"MASK","name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]}],"links":[[21,4,0,11,0,"IMAGE"],[22,11,0,3,0,"IMAGE"],[23,8,0,11,2,"SAMPLING_PARAMS"]],"groups":[{"id":1,"title":"Input","bounding":[213.1706010087147,-313.20095750667554,560.8246471437027,407.4246532472182],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"workflowRendererVersion":"LG","ds":{"scale":1.1469075819486894,"offset":[276.8427259980847,549.4885226647582]}},"version":0.4}
+{"id":"91f75acc-8040-40f6-865a-2e8a7cfd6672","revision":0,"last_node_id":11,"last_link_id":23,"nodes":[{"id":3,"type":"PreviewImage","pos":[1281.8455167767304,-69.02638461454333],"size":[305.6628685610174,307.0336884172169],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"图像","name":"images","type":"IMAGE","link":22}],"outputs":[],"properties":{"Node name for S&R":"PreviewImage"},"widgets_values":[]},{"id":11,"type":"VLLMOmniGenerateImage","pos":[816.0962515873642,-137.51112584854445],"size":[400,278],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"image","name":"image","shape":7,"type":"IMAGE","link":21},{"localized_name":"mask","name":"mask","shape":7,"type":"MASK","link":null},{"localized_name":"sampling_params","name":"sampling_params","shape":7,"type":"SAMPLING_PARAMS","link":23},{"localized_name":"url","name":"url","type":"STRING","widget":{"name":"url"},"link":null},{"localized_name":"model","name":"model","type":"STRING","widget":{"name":"model"},"link":null},{"localized_name":"prompt","name":"prompt","type":"STRING","widget":{"name":"prompt"},"link":null},{"localized_name":"negative_prompt","name":"negative_prompt","type":"STRING","widget":{"name":"negative_prompt"},"link":null},{"localized_name":"width","name":"width","type":"INT","widget":{"name":"width"},"link":null},{"localized_name":"height","name":"height","type":"INT","widget":{"name":"height"},"link":null}],"outputs":[{"localized_name":"image","name":"image","type":"IMAGE","links":[22]}],"properties":{"Node name for S&R":"VLLMOmniGenerateImage"},"widgets_values":["http://localhost:8000/v1","Qwen/Qwen-Image-Edit","Put this figure in a realistic mountain view","",512,512]},{"id":10,"type":"MarkdownNote","pos":[227.99306819575278,-231.24143306069843],"size":[240.3326478922474,136.3505820791881],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Note: Task and Input","properties":{},"widgets_values":["vLLM-Omni nodes are categorized based on the output modality. 
The \"Generate Image\" node supports both text-to-image generation or image-to-image generation (a.k.a. image editing). The node will route to the correct endpoint depending on whether an input image is present or not."],"color":"#432","bgcolor":"#000"},{"id":4,"type":"LoadImage","pos":[496.31859627609606,-229.71277089860084],"size":[270,314],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"图像","name":"image","type":"COMBO","widget":{"name":"image"},"link":null},{"localized_name":"选择文件上传","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"图像","name":"IMAGE","type":"IMAGE","links":[21]},{"localized_name":"遮罩","name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":8,"type":"VLLMOmniDiffusionSampling","pos":[478.59266934006774,183.67711984955648],"size":[284.205078125,226],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"n","name":"n","type":"INT","widget":{"name":"n"},"link":null},{"localized_name":"num_inference_steps","name":"num_inference_steps","type":"INT","widget":{"name":"num_inference_steps"},"link":null},{"localized_name":"guidance_scale","name":"guidance_scale","type":"FLOAT","widget":{"name":"guidance_scale"},"link":null},{"localized_name":"true_cfg_scale","name":"true_cfg_scale","type":"FLOAT","widget":{"name":"true_cfg_scale"},"link":null},{"localized_name":"vae_use_slicing","name":"vae_use_slicing","type":"BOOLEAN","widget":{"name":"vae_use_slicing"},"link":null},{"localized_name":"vae_use_tiling","name":"vae_use_tiling","type":"BOOLEAN","widget":{"name":"vae_use_tiling"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"diffusion sampling params","name":"diffusion sampling params","type":"SAMPLING_PARAMS","links":[23]}],"properties":{"Node name for 
S&R":"VLLMOmniDiffusionSampling"},"widgets_values":[4,50,1,1,false,false,42,"randomize"]}],"links":[[21,4,0,11,0,"IMAGE"],[22,11,0,3,0,"IMAGE"],[23,8,0,11,2,"SAMPLING_PARAMS"]],"groups":[{"id":1,"title":"Input","bounding":[213.1706010087147,-313.20095750667554,560.8246471437027,407.4246532472182],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"workflowRendererVersion":"LG","ds":{"scale":1.1469075819486894,"offset":[-55.18047513099182,220.2553505195962]}},"version":0.4}
diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Multimodal Understanding.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Multimodal Understanding.json
new file mode 100644
index 00000000000..4d32d5368c2
--- /dev/null
+++ b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Multimodal Understanding.json	
@@ -0,0 +1,761 @@
+{
+    "id": "1c99f525-0a37-45ba-a28a-7df7c3af66b4",
+    "revision": 0,
+    "last_node_id": 12,
+    "last_link_id": 14,
+    "nodes": [
+        {
+            "id": 1,
+            "type": "VLLMOmniUnderstanding",
+            "pos": [
+                1191.2177053682556,
+                144.66829928181377
+            ],
+            "size": [
+                400,
+                268
+            ],
+            "flags": {},
+            "order": 8,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "image",
+                    "name": "image",
+                    "shape": 7,
+                    "type": "IMAGE",
+                    "link": 1
+                },
+                {
+                    "localized_name": "video",
+                    "name": "video",
+                    "shape": 7,
+                    "type": "VIDEO",
+                    "link": 2
+                },
+                {
+                    "localized_name": "audio",
+                    "name": "audio",
+                    "shape": 7,
+                    "type": "AUDIO",
+                    "link": 3
+                },
+                {
+                    "localized_name": "sampling_params",
+                    "name": "sampling_params",
+                    "shape": 7,
+                    "type": "SAMPLING_PARAMS",
+                    "link": 14
+                },
+                {
+                    "localized_name": "url",
+                    "name": "url",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "url"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "model",
+                    "name": "model",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "model"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "prompt",
+                    "name": "prompt",
+                    "type": "STRING",
+                    "widget": {
+                        "name": "prompt"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "output_text",
+                    "name": "output_text",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "output_text"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "output_audio",
+                    "name": "output_audio",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "output_audio"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "use_audio_in_video",
+                    "name": "use_audio_in_video",
+                    "type": "BOOLEAN",
+                    "widget": {
+                        "name": "use_audio_in_video"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "text_response",
+                    "name": "text_response",
+                    "type": "STRING",
+                    "links": [
+                        8
+                    ]
+                },
+                {
+                    "localized_name": "audio_response",
+                    "name": "audio_response",
+                    "type": "AUDIO",
+                    "links": [
+                        9
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniUnderstanding"
+            },
+            "widgets_values": [
+                "http://localhost:8000/v1",
+                "Qwen/Qwen2.5-Omni-7B",
+                "",
+                true,
+                true,
+                true
+            ]
+        },
+        {
+            "id": 3,
+            "type": "LoadVideo",
+            "pos": [
+                729.5984141255855,
+                -198.631920454299
+            ],
+            "size": [
+                282.798828125,
+                233.0743408203125
+            ],
+            "flags": {},
+            "order": 0,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "file",
+                    "name": "file",
+                    "type": "COMBO",
+                    "widget": {
+                        "name": "file"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "choose file to upload",
+                    "name": "upload",
+                    "type": "IMAGEUPLOAD",
+                    "widget": {
+                        "name": "upload"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "VIDEO",
+                    "name": "VIDEO",
+                    "type": "VIDEO",
+                    "links": [
+                        2
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "LoadVideo"
+            },
+            "widgets_values": [
+                "draw.mp4",
+                "image"
+            ]
+        },
+        {
+            "id": 4,
+            "type": "LoadAudio",
+            "pos": [
+                729.8037086965753,
+                99.86963519703949
+            ],
+            "size": [
+                282.798828125,
+                136
+            ],
+            "flags": {},
+            "order": 1,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "audio",
+                    "name": "audio",
+                    "type": "COMBO",
+                    "widget": {
+                        "name": "audio"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "audioUI",
+                    "name": "audioUI",
+                    "type": "AUDIO_UI",
+                    "widget": {
+                        "name": "audioUI"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "choose file to upload",
+                    "name": "upload",
+                    "type": "AUDIOUPLOAD",
+                    "widget": {
+                        "name": "upload"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "AUDIO",
+                    "name": "AUDIO",
+                    "type": "AUDIO",
+                    "links": [
+                        3
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "LoadAudio"
+            },
+            "widgets_values": [
+                "Megan-Fox.mp3",
+                null,
+                null
+            ]
+        },
+        {
+            "id": 5,
+            "type": "VLLMOmniARSampling",
+            "pos": [
+                510.3517536642828,
+                658.073751009259
+            ],
+            "size": [
+                270,
+                178
+            ],
+            "flags": {},
+            "order": 2,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "max_tokens",
+                    "name": "max_tokens",
+                    "type": "INT",
+                    "widget": {
+                        "name": "max_tokens"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "temperature",
+                    "name": "temperature",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "temperature"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "top_p",
+                    "name": "top_p",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "top_p"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "repetition_penalty",
+                    "name": "repetition_penalty",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "repetition_penalty"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "seed",
+                    "name": "seed",
+                    "type": "INT",
+                    "widget": {
+                        "name": "seed"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "AR sampling params",
+                    "name": "AR sampling params",
+                    "type": "SAMPLING_PARAMS",
+                    "links": [
+                        12
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniARSampling"
+            },
+            "widgets_values": [
+                100,
+                1,
+                1,
+                1,
+                -1,
+                "randomize"
+            ]
+        },
+        {
+            "id": 7,
+            "type": "VLLMOmniARSampling",
+            "pos": [
+                503.33235181647115,
+                419.34158016181806
+            ],
+            "size": [
+                270,
+                178
+            ],
+            "flags": {},
+            "order": 3,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "max_tokens",
+                    "name": "max_tokens",
+                    "type": "INT",
+                    "widget": {
+                        "name": "max_tokens"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "temperature",
+                    "name": "temperature",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "temperature"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "top_p",
+                    "name": "top_p",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "top_p"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "repetition_penalty",
+                    "name": "repetition_penalty",
+                    "type": "FLOAT",
+                    "widget": {
+                        "name": "repetition_penalty"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "seed",
+                    "name": "seed",
+                    "type": "INT",
+                    "widget": {
+                        "name": "seed"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "AR sampling params",
+                    "name": "AR sampling params",
+                    "type": "SAMPLING_PARAMS",
+                    "links": [
+                        5,
+                        13
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniARSampling"
+            },
+            "widgets_values": [
+                100,
+                1,
+                1,
+                1,
+                -1,
+                "randomize"
+            ]
+        },
+        {
+            "id": 8,
+            "type": "VLLMOmniSamplingParamsList",
+            "pos": [
+                820.6056617389042,
+                426.38372037182273
+            ],
+            "size": [
+                263.066015625,
+                66
+            ],
+            "flags": {},
+            "order": 7,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "param1",
+                    "name": "param1",
+                    "type": "SAMPLING_PARAMS",
+                    "link": 5
+                },
+                {
+                    "localized_name": "param2",
+                    "name": "param2",
+                    "shape": 7,
+                    "type": "SAMPLING_PARAMS",
+                    "link": 12
+                },
+                {
+                    "localized_name": "param3",
+                    "name": "param3",
+                    "shape": 7,
+                    "type": "SAMPLING_PARAMS",
+                    "link": 13
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "param list",
+                    "name": "param list",
+                    "type": "SAMPLING_PARAMS",
+                    "links": [
+                        14
+                    ]
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "VLLMOmniSamplingParamsList"
+            },
+            "widgets_values": []
+        },
+        {
+            "id": 11,
+            "type": "MarkdownNote",
+            "pos": [
+                826.2328280438272,
+                569.1890318701705
+            ],
+            "size": [
+                333.8220435590464,
+                261.63596728060656
+            ],
+            "flags": {},
+            "order": 4,
+            "mode": 0,
+            "inputs": [],
+            "outputs": [],
+            "title": "Note: Sampling Parameters",
+            "properties": {},
+            "widgets_values": [
+                "## Sampling Parameter Types\n\nThere are two types of sampling parameters: one for autoregression and one for diffusion.\nYou should ensure that you have chosen the correct type of sampling parameters for the model you request.\n\n## Stages & Shorthand\n\nFor multi-stage models such as Qwen Omni, you can either:\n- connect one sampling parameter node, which is applied to all stages, or\n- connect exactly as many sampling parameter nodes as the model has stages to a \"Multi-Stage Sampling Parameter List\" node, then connect this node to the primary request node.\n\nNote that this shorthand is intended to stay consistent with the [online serving API](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/qwen2_5_omni/)."
+            ],
+            "color": "#432",
+            "bgcolor": "#000"
+        },
+        {
+            "id": 12,
+            "type": "MarkdownNote",
+            "pos": [
+                378.9866207003777,
+                152.59550252215752
+            ],
+            "size": [
+                319.7287574247016,
+                107.15904081906785
+            ],
+            "flags": {},
+            "order": 6,
+            "mode": 0,
+            "inputs": [],
+            "outputs": [],
+            "title": "Note: Input",
+            "properties": {},
+            "widgets_values": [
+                "Note that not all models support every modality as input. For example, `ByteDance-Seed/BAGEL-7B-MoT` in Multimodal Understanding mode only supports text and image input.\n\nYou should ensure that the inputs are supported by the model. You can check the corresponding [online serving documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/bagel/) for confirmation."
+            ],
+            "color": "#432",
+            "bgcolor": "#000"
+        },
+        {
+            "id": 2,
+            "type": "LoadImage",
+            "pos": [
+                394.4674804308822,
+                -207.6987397548834
+            ],
+            "size": [
+                282.798828125,
+                314
+            ],
+            "flags": {},
+            "order": 5,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "image",
+                    "name": "image",
+                    "type": "COMBO",
+                    "widget": {
+                        "name": "image"
+                    },
+                    "link": null
+                },
+                {
+                    "localized_name": "choose file to upload",
+                    "name": "upload",
+                    "type": "IMAGEUPLOAD",
+                    "widget": {
+                        "name": "upload"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "IMAGE",
+                    "name": "IMAGE",
+                    "type": "IMAGE",
+                    "links": [
+                        1
+                    ]
+                },
+                {
+                    "localized_name": "MASK",
+                    "name": "MASK",
+                    "type": "MASK",
+                    "links": null
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "LoadImage"
+            },
+            "widgets_values": [
+                "example.png",
+                "image"
+            ]
+        },
+        {
+            "id": 10,
+            "type": "PreviewAudio",
+            "pos": [
+                1664.548345556043,
+                297.5921292054054
+            ],
+            "size": [
+                270,
+                88
+            ],
+            "flags": {},
+            "order": 10,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "audio",
+                    "name": "audio",
+                    "type": "AUDIO",
+                    "link": 9
+                },
+                {
+                    "localized_name": "audioUI",
+                    "name": "audioUI",
+                    "type": "AUDIO_UI",
+                    "widget": {
+                        "name": "audioUI"
+                    },
+                    "link": null
+                }
+            ],
+            "outputs": [],
+            "properties": {
+                "Node name for S&R": "PreviewAudio"
+            },
+            "widgets_values": []
+        },
+        {
+            "id": 9,
+            "type": "ShowText|pysssss",
+            "pos": [
+                1649.2506875091847,
+                66.22823888292349
+            ],
+            "size": [
+                318.7188464232943,
+                173.38502269972975
+            ],
+            "flags": {},
+            "order": 9,
+            "mode": 0,
+            "inputs": [
+                {
+                    "localized_name": "text",
+                    "name": "text",
+                    "type": "STRING",
+                    "link": 8
+                }
+            ],
+            "outputs": [
+                {
+                    "localized_name": "STRING",
+                    "name": "STRING",
+                    "shape": 6,
+                    "type": "STRING",
+                    "links": null
+                }
+            ],
+            "properties": {
+                "Node name for S&R": "ShowText|pysssss"
+            },
+            "widgets_values": []
+        }
+    ],
+    "links": [
+        [
+            1,
+            2,
+            0,
+            1,
+            0,
+            "IMAGE"
+        ],
+        [
+            2,
+            3,
+            0,
+            1,
+            1,
+            "VIDEO"
+        ],
+        [
+            3,
+            4,
+            0,
+            1,
+            2,
+            "AUDIO"
+        ],
+        [
+            5,
+            7,
+            0,
+            8,
+            0,
+            "SAMPLING_PARAMS"
+        ],
+        [
+            8,
+            1,
+            0,
+            9,
+            0,
+            "STRING"
+        ],
+        [
+            9,
+            1,
+            1,
+            10,
+            0,
+            "AUDIO"
+        ],
+        [
+            12,
+            5,
+            0,
+            8,
+            1,
+            "SAMPLING_PARAMS"
+        ],
+        [
+            13,
+            7,
+            0,
+            8,
+            2,
+            "SAMPLING_PARAMS"
+        ],
+        [
+            14,
+            8,
+            0,
+            1,
+            3,
+            "SAMPLING_PARAMS"
+        ]
+    ],
+    "groups": [
+        {
+            "id": 1,
+            "title": "Sampling Parameters",
+            "bounding": [
+                480.1649301181556,
+                341.08402513937995,
+                692.4113568972277,
+                510.48648853403665
+            ],
+            "color": "#3f789e",
+            "font_size": 24,
+            "flags": {}
+        },
+        {
+            "id": 2,
+            "title": "Input",
+            "bounding": [
+                344.2364817813528,
+                -287.5850484183313,
+                704.8832238768634,
+                559.5124009832894
+            ],
+            "color": "#3f789e",
+            "font_size": 24,
+            "flags": {}
+        }
+    ],
+    "config": {},
+    "extra": {
+        "workflowRendererVersion": "LG",
+        "ds": {
+            "scale": 0.9478575057427204,
+            "offset": [
+                33.81037136029557,
+                307.1974296197726
+            ]
+        }
+    },
+    "version": 0.4
+}
diff --git a/pyproject.toml b/pyproject.toml
index 965a3e4b20c..4d6bc033317 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,7 +48,8 @@ dev = [
     "soundfile>=0.13.1",
     "imageio[ffmpeg]>=0.6.0",
     "opencv-python>=4.12.0.88",
-    "mooncake-transfer-engine==0.3.8.post1"
+    "mooncake-transfer-engine==0.3.8.post1",
+    "av" # for ComfyUI tests
 ]
 
 docs = [
diff --git a/tests/comfyui/conftest.py b/tests/comfyui/conftest.py
new file mode 100644
index 00000000000..7c9770b8e98
--- /dev/null
+++ b/tests/comfyui/conftest.py
@@ -0,0 +1,82 @@
+"""
+Conftest for ComfyUI-vLLM-Omni tests.
+
+This module sets up the test environment by:
+1. Adding the ComfyUI plugin to Python path
+2. Mocking comfy_api.input module (AudioInput, VideoInput) since comfyui is not installed
+3. Mocking comfy_extras.nodes_audio module
+"""
+
+import os
+import sys
+from typing import BinaryIO, TypedDict
+from unittest.mock import MagicMock
+
+
+def pytest_configure(config):
+    """
+    Pytest hook: called after command line options have been parsed and before test collection.
+    sys.path and the mocked modules are set up here; `config` itself is not used.
+    """
+    _setup_comfyui_test_environment()
+
+
+def _setup_comfyui_test_environment():
+    """Make the plugin importable and register stand-ins for the ComfyUI modules it imports."""
+    # === Resolve the plugin directory relative to this file and put it on sys.path ===
+    tests_dir = os.path.dirname(__file__)
+    plugin_dir = os.path.abspath(os.path.join(tests_dir, "..", "..", "apps", "ComfyUI-vLLM-Omni"))
+
+    if not os.path.isdir(plugin_dir):
+        raise FileNotFoundError(
+            f"ComfyUI plugin not found at {plugin_dir}. "
+            "If it is moved elsewhere, please update the path in this conftest.py."
+        )
+    if plugin_dir not in sys.path:
+        sys.path.insert(0, plugin_dir)
+
+    # torch is imported only after sys.path has been adjusted; the stand-ins below use it.
+    import torch
+
+    # === Stand-ins for the comfy_api.input types ===
+    class AudioInput(TypedDict):
+        """Mock AudioInput TypedDict from comfy_api.input"""
+
+        waveform: torch.Tensor  # Shape: (B, C, T)
+        sample_rate: int
+
+    class VideoInput:
+        """Mock VideoInput class from comfy_api.input"""
+
+        def __init__(self, data: bytes = b"mock_video_data"):
+            self._data = data
+
+        def save_to(self, file: str | BinaryIO):
+            """Write the stored bytes to a file-like object; a string path is only reported."""
+            if not isinstance(file, str):
+                file.write(self._data)
+            else:
+                print("Called VideoInput.save_to with file path. Saving to a path is no-op in tests.")
+
+    comfy_api_input_stub = MagicMock()
+    comfy_api_input_stub.AudioInput = AudioInput
+    comfy_api_input_stub.VideoInput = VideoInput
+    comfy_api_stub = MagicMock()
+    comfy_api_stub.input = comfy_api_input_stub
+
+    def mock_load(_: str | BinaryIO):
+        """Stand-in for nodes_audio.load: returns a zero waveform of shape (1, 24000) and the sample rate."""
+        sample_rate = 24000
+        waveform = torch.zeros((1, sample_rate), dtype=torch.float32)
+        return waveform, sample_rate
+
+    nodes_audio_stub = MagicMock()
+    nodes_audio_stub.load = mock_load
+    comfy_extras_stub = MagicMock()
+    comfy_extras_stub.nodes_audio = nodes_audio_stub
+
+    # Register the stand-ins in sys.modules BEFORE any comfyui_vllm_omni code is imported
+    sys.modules["comfy_api"] = comfy_api_stub
+    sys.modules["comfy_api.input"] = comfy_api_input_stub
+    sys.modules["comfy_extras"] = comfy_extras_stub
+    sys.modules["comfy_extras.nodes_audio"] = nodes_audio_stub
diff --git a/tests/comfyui/test_comfyui_integration.py b/tests/comfyui/test_comfyui_integration.py
new file mode 100644
index 00000000000..46359632609
--- /dev/null
+++ b/tests/comfyui/test_comfyui_integration.py
@@ -0,0 +1,567 @@
+"""
+Integration tests for ComfyUI nodes that use the Omni API client, with a mocked AsyncOmni and a real API server running in a background process.
+These tests cover the integration between the ComfyUI nodes and the API server, without actual model inference logic.
+They ensure that
+1. Changes made to the API (e.g., request and response formats) do not break the ComfyUI frontend that uses it.
+2. The sampling parameters are correctly passed from the node to AsyncOmni through the API layer.
+"""
+
+import multiprocessing
+import time
+import traceback
+from collections.abc import Iterable, Sequence
+from enum import StrEnum, auto
+from typing import Any, NamedTuple
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+import requests
+import torch
+from comfy_api.input import AudioInput, VideoInput
+from comfyui_vllm_omni.nodes import (
+    VLLMOmniGenerateImage,
+    VLLMOmniTTS,
+    VLLMOmniUnderstanding,
+    VLLMOmniVoiceClone,
+)
+from comfyui_vllm_omni.utils.types import AutoregressionSamplingParams, DiffusionSamplingParams
+from PIL import Image
+from vllm import SamplingParams
+from vllm.outputs import CompletionOutput, RequestOutput
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+from vllm_omni.entrypoints.cli.serve import OmniServeCommand
+from vllm_omni.inputs.data import OmniSamplingParams
+from vllm_omni.outputs import OmniRequestOutput
+
+
+class ServerCase(NamedTuple):
+    """Parametrizing the model to serve."""
+
+    served_model: str  # model name handed to the `serve` CLI command
+    stage_list: list  # copied onto the mocked AsyncOmni's `stage_list`
+    stage_configs: list[dict]  # copied onto `stage_configs`; one default sampling param is built per entry
+    outputs: list[OmniRequestOutput]  # yielded in order by the mocked `generate`
+
+
+class SamplingCase(NamedTuple):
+    """Parametrizing the input sampling parameters."""
+
+    kind: "SamplingKind"  # selects which assertions the mocked `generate` runs
+    sampling_params: dict | list[dict] | None  # value given to the node under test; None when nothing is supplied
+
+
+class SamplingKind(StrEnum):
+    IMAGE_NONE = auto()  # image node, no sampling params supplied
+    IMAGE_DIFFUSION_SINGLE = auto()  # image node, a single diffusion sampling-params dict
+    UNDERSTANDING_NONE = auto()  # understanding node, no sampling params supplied
+    UNDERSTANDING_AR_LIST = auto()  # understanding node, a list of autoregressive sampling params
+    TTS_NONE = auto()  # TTS / voice-clone nodes, no sampling params supplied
+    TTS_DIFFUSION_SINGLE = auto()  # TTS / voice-clone nodes, a single diffusion sampling-params dict
+
+
+# Pre-defined arguments shared by the node calls and by the assertions inside the mocked `generate`
+IMAGE_WIDTH = 64  # sent as `width` by the image test and expected on the received sampling params
+IMAGE_HEIGHT = 64  # sent as `height` by the image test and expected on the received sampling params
+DIFFUSION_SINGLE_SAMPLING_PARAMS = DiffusionSamplingParams(
+    {
+        "n": 2,  # compared under the name `num_outputs_per_prompt` on the receiving side
+        "num_inference_steps": 30,
+        "guidance_scale": 6.0,
+        "true_cfg_scale": 1.5,
+    }
+)
+
+AR_LIST_SAMPLING_PARAMS = [  # three entries, compared index by index with the received list
+    AutoregressionSamplingParams(
+        {
+            "max_tokens": 64,
+            "temperature": 0.6,
+            "top_p": 0.9,
+            "repetition_penalty": 1.0,
+            "seed": 21,
+        }
+    ),
+    AutoregressionSamplingParams(
+        {
+            "max_tokens": 96,
+            "temperature": 0.75,
+            "top_p": 0.85,
+            "repetition_penalty": 1.05,
+            "seed": 22,
+        }
+    ),
+    AutoregressionSamplingParams(
+        {
+            "max_tokens": 128,
+            "temperature": 0.8,
+            "top_p": 0.8,
+            "repetition_penalty": 1.1,
+            "seed": 23,
+        }
+    ),
+]
+
+
+def _build_image_output(size: tuple[int, int] = (IMAGE_WIDTH, IMAGE_HEIGHT), color: str = "red") -> Image.Image:
+    return Image.new(mode="RGB", size=size, color=color)  # solid-colour RGB image of the requested size
+
+
+def _build_text_output(text: str = "This is a test response.") -> OmniRequestOutput:
+    """Wrap `text` in a finished, text-typed OmniRequestOutput holding a single completion."""
+    # The inner RequestOutput and the outer wrapper carry the same request id.
+    req_id = "test_req_text"
+    completion = CompletionOutput(
+        index=0,
+        text=text,
+        token_ids=[1, 2, 3],
+        cumulative_logprob=0.0,
+        logprobs=None,
+        finish_reason="stop",
+        stop_reason=None,
+    )
+    wrapped = RequestOutput(
+        request_id=req_id,
+        prompt="test prompt",
+        prompt_token_ids=[1, 2, 3],
+        prompt_logprobs=None,
+        outputs=[completion],
+        finished=True,
+        metrics=None,
+        lora_request=None,
+    )
+    return OmniRequestOutput(
+        request_id=req_id, finished=True, final_output_type="text", request_output=wrapped
+    )
+
+
+def _build_audio_chat_output(num_samples: int = 24000) -> OmniRequestOutput:
+    """Build a finished, audio-typed output whose completion carries a zero tensor of shape (1, num_samples)."""
+    # The inner RequestOutput and the outer wrapper carry the same request id.
+    req_id = "test_req_audio_chat"
+    completion = CompletionOutput(
+        index=0,
+        text="",
+        token_ids=[],
+        cumulative_logprob=0.0,
+        logprobs=None,
+        finish_reason="stop",
+        stop_reason=None,
+    )
+    completion.multimodal_output = {"audio": [torch.zeros(1, num_samples)]}
+    wrapped = RequestOutput(
+        request_id=req_id,
+        prompt="test prompt",
+        prompt_token_ids=[1, 2, 3],
+        prompt_logprobs=None,
+        outputs=[completion],
+        finished=True,
+        metrics=None,
+        lora_request=None,
+    )
+    return OmniRequestOutput(
+        request_id=req_id, finished=True, final_output_type="audio", request_output=wrapped
+    )
+
+
+def _build_audio_speech_output(num_samples: int = 24000) -> OmniRequestOutput:
+    """Build an audio output via `from_diffusion`: `num_samples` zeros with "sr" set to 24000."""
+    payload = {"audio": torch.zeros(num_samples), "sr": 24000}
+    return OmniRequestOutput.from_diffusion(
+        request_id="test_req_audio_speech", images=[],
+        multimodal_output=payload, final_output_type="audio",
+    )
+
+
+def _build_diffusion_image_output_for_images_endpoint() -> OmniRequestOutput:
+    """Build an image output via `from_diffusion` holding one default (red) test image."""
+    test_image = _build_image_output()
+    return OmniRequestOutput.from_diffusion(
+        request_id="test_req_img_dalle", images=[test_image], final_output_type="image"
+    )
+
+
+def _build_diffusion_image_output_for_chat_endpoint() -> OmniRequestOutput:
+    """Build an image output whose `request_output` is a MagicMock carrying one blue image."""
+    fake_request_output = MagicMock(finished=True)
+    fake_request_output.images = [_build_image_output(color="blue")]
+    return OmniRequestOutput(
+        request_id="test_req_img_chat",
+        finished=True,
+        final_output_type="image",
+        request_output=fake_request_output,
+    )
+
+
+def _assert_sampling_param_values(received: OmniSamplingParams, expected: dict[str, Any]):
+    for key, expected_value in expected.items():  # only the keys listed in `expected` are checked
+        actual_value = getattr(received, key, None)  # an attribute missing on `received` is compared as None
+        assert actual_value == expected_value, (
+            f"Expected sampling param '{key}'={expected_value}, got {actual_value}. The received sampling params: {received}"
+        )
+
+
+def _build_mock_outputs(outputs: Iterable[OmniRequestOutput], sampling_case: SamplingCase, server_case: ServerCase):
+    """
+    Build the async-generator replacement for `AsyncOmni.generate`.
+
+    The replacement checks the sampling params it receives against `sampling_case`
+    and then yields every item of `outputs`. `server_case` is accepted but not used.
+    """
+    # Width and height are expected on every image request, with or without extra sampling params.
+    image_size = {"width": IMAGE_WIDTH, "height": IMAGE_HEIGHT}
+
+    async def _mock_generate(*args, **kwargs):
+        # The list is read from the third positional argument, otherwise from the keyword.
+        received_sampling_params_list: Sequence[OmniSamplingParams] | None = (
+            args[2] if len(args) > 2 else kwargs.get("sampling_params_list")
+        )
+
+        # Adjacent literals are concatenated as-is, so every sentence ends with its own space.
+        assert received_sampling_params_list is not None, (
+            "In the current codebase, the API layer always provides not-None sampling parameter list when calling `AsyncOmni.generate`. "
+            "This test also uses this assumption for now. "
+            "If this assertion fails, it means the API layer has changed and this test needs to be updated accordingly. "
+            "It does not necessarily mean there is a bug, because `AsyncOmni.generate` does allow sampling_params_list to be None."
+        )
+        assert isinstance(received_sampling_params_list, Sequence), "sampling_params_list should be a Sequence"
+
+        kind = sampling_case.kind
+        if kind is SamplingKind.IMAGE_NONE:
+            assert len(received_sampling_params_list) == 1
+            _assert_sampling_param_values(received_sampling_params_list[0], image_size)
+        elif kind is SamplingKind.IMAGE_DIFFUSION_SINGLE:
+            assert len(received_sampling_params_list) == 1
+            expected = DIFFUSION_SINGLE_SAMPLING_PARAMS.copy()
+            expected["num_outputs_per_prompt"] = expected.pop("n")  # convert from n to num_outputs_per_prompt
+            _assert_sampling_param_values(received_sampling_params_list[0], {**image_size, **expected})
+        elif kind is SamplingKind.UNDERSTANDING_NONE:
+            assert len(received_sampling_params_list) == 3
+        elif kind is SamplingKind.UNDERSTANDING_AR_LIST:
+            assert len(received_sampling_params_list) == 3
+            for received, expected in zip(received_sampling_params_list, AR_LIST_SAMPLING_PARAMS):
+                _assert_sampling_param_values(received, expected)
+        elif kind in {SamplingKind.TTS_NONE, SamplingKind.TTS_DIFFUSION_SINGLE}:
+            # Only the list length is checked for the TTS cases.
+            assert len(received_sampling_params_list) == 1
+        else:
+            raise AssertionError(f"Unknown sampling case: {sampling_case.kind}")
+
+        for output in outputs:
+            yield output
+
+    return _mock_generate
+
+
+@pytest.fixture
+def server_case(request) -> ServerCase:
+    return request.param  # the ServerCase supplied by the test's indirect parametrization
+
+
+@pytest.fixture
+def sampling_case(request) -> SamplingCase:
+    return request.param  # the SamplingCase supplied by the test's indirect parametrization
+
+
+@pytest.fixture
+def mock_async_omni(server_case: ServerCase, sampling_case: SamplingCase):
+    async def _mock_preprocess_chat(self, *args, **kwargs):  # fixed stand-in; ignores its arguments
+        return ([{"role": "user", "content": "test"}], [{"prompt": "test prompt"}])
+
+    # Need to mock AsyncOmni itself (not only its generate method) because
+    # 1. The API layer uses its stage_list and stage_configs attributes
+    # 2. Its __init__ method has slow side effects (model & config loading).
+    with (
+        patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") as MockAsyncOmni,
+        patch(
+            "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat",
+            new=_mock_preprocess_chat,
+        ),
+    ):
+        mock_instance = AsyncMock()
+        mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case)
+
+        mock_instance.stage_list = server_case.stage_list
+        mock_instance.stage_configs = server_case.stage_configs
+        mock_instance.default_sampling_params_list = [  # one entry per stage config
+            SamplingParams() if stage.get("stage_type") != "diffusion" else MagicMock()
+            for stage in server_case.stage_configs
+        ]
+        mock_instance.errored = False
+        mock_instance.dead_error = RuntimeError("Mock engine error")
+        mock_instance.model_config = MagicMock(max_model_len=4096, io_processor_plugin=None)
+        mock_instance.io_processor = MagicMock()
+        mock_instance.input_processor = MagicMock()
+        mock_instance.shutdown = MagicMock()
+        mock_instance.get_vllm_config = AsyncMock(return_value=None)
+        mock_instance.get_supported_tasks = AsyncMock(return_value=["generate"])
+        mock_instance.get_tokenizer = AsyncMock(return_value=None)
+
+        MockAsyncOmni.return_value = mock_instance  # constructing the patched class yields this mock
+        yield MockAsyncOmni  # both patches stay active until the fixture is torn down
+
+
+@pytest.fixture
+def api_server(unused_tcp_port_factory, server_case: ServerCase, mock_async_omni):
+    """Start an API server in a background process (CLI `serve`, mocked AsyncOmni) and yield its `/v1` base URL."""
+    parser = FlexibleArgumentParser()
+    subparsers = parser.add_subparsers(dest="command")
+    cmd = OmniServeCommand()
+    cmd.subparser_init(subparsers)
+
+    port = unused_tcp_port_factory()
+    args = parser.parse_args(["serve", server_case.served_model, "--omni", "--port", str(port)])
+
+    def run_server():
+        try:
+            cmd.cmd(args)
+        except Exception:
+            traceback.print_exc()
+
+    def stop_server():
+        # Escalate from terminate() to kill() so a stuck server cannot outlive the test.
+        for stop in (server_process.terminate, server_process.kill):
+            if server_process.is_alive():
+                stop()
+                server_process.join(timeout=5)
+
+    server_process = multiprocessing.Process(target=run_server)
+    server_process.start()
+
+    # Poll /health until the deadline; give up early if the server process has already exited.
+    wait_time, wait_poll_interval = 30, 1
+    health_url = f"http://127.0.0.1:{port}/health"
+    deadline = time.monotonic() + wait_time
+    while time.monotonic() < deadline and server_process.is_alive():
+        try:
+            if requests.get(health_url, timeout=wait_poll_interval).status_code == 200:
+                break
+        except requests.RequestException:
+            pass  # not reachable yet (refused, timed out, ...); retry after the pause below
+        time.sleep(wait_poll_interval)  # pause after every unsuccessful attempt, including non-200 replies
+    else:
+        stop_server()
+        pytest.fail(f"API server failed to start within {wait_time} seconds")
+
+    yield f"http://127.0.0.1:{port}/v1"
+    stop_server()
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "server_case,model,image_input",
+    [
+        pytest.param(
+            ServerCase(
+                served_model="Tongyi-MAI/Z-Image-Turbo",
+                stage_list=["diffusion"],
+                stage_configs=[{"stage_type": "diffusion"}],
+                outputs=[_build_diffusion_image_output_for_images_endpoint()],
+            ),
+            "Tongyi-MAI/Z-Image-Turbo",
+            False,
+            id="text-to-image-dalle-endpoint",
+        ),
+        pytest.param(
+            ServerCase(
+                served_model="ByteDance-Seed/BAGEL-7B-MoT",
+                stage_list=["diffusion"],
+                stage_configs=[{"stage_type": "diffusion"}],
+                outputs=[_build_diffusion_image_output_for_chat_endpoint()],
+            ),
+            "ByteDance-Seed/BAGEL-7B-MoT",
+            False,
+            id="text-to-image-bagel-chat-endpoint",
+        ),
+        pytest.param(
+            ServerCase(
+                served_model="Qwen/Qwen-Image-Edit",
+                stage_list=["diffusion"],
+                stage_configs=[{"stage_type": "diffusion"}],
+                outputs=[_build_diffusion_image_output_for_images_endpoint()],
+            ),
+            "Qwen/Qwen-Image-Edit",
+            True,
+            id="image-to-image-dalle-endpoint",
+        ),
+        pytest.param(
+            ServerCase(
+                served_model="ByteDance-Seed/BAGEL-7B-MoT",
+                stage_list=["diffusion"],
+                stage_configs=[{"stage_type": "diffusion"}],
+                outputs=[_build_diffusion_image_output_for_chat_endpoint()],
+            ),
+            "ByteDance-Seed/BAGEL-7B-MoT",
+            True,
+            id="image-to-image-bagel-chat-endpoint",
+        ),
+    ],
+    indirect=["server_case"],
+)
+@pytest.mark.parametrize(
+    "sampling_case",
+    [
+        pytest.param(SamplingCase(kind=SamplingKind.IMAGE_NONE, sampling_params=None), id="no-sampling-params"),
+        pytest.param(
+            SamplingCase(kind=SamplingKind.IMAGE_DIFFUSION_SINGLE, sampling_params=DIFFUSION_SINGLE_SAMPLING_PARAMS),
+            id="single-diffusion-sampling-params",
+        ),
+    ],
+    indirect=["sampling_case"],
+)
+async def test_image_generation_node(api_server: str, model: str, image_input: bool, sampling_case: SamplingCase):
+    """Call VLLMOmniGenerateImage against the mocked server and check the returned image batch tensor."""
+    node = VLLMOmniGenerateImage()
+
+    kwargs = {
+        "url": api_server,
+        "model": model,
+        "prompt": "A beautiful sunset",
+        "width": IMAGE_WIDTH,
+        "height": IMAGE_HEIGHT,
+    }
+    if image_input:
+        # ComfyUI IMAGE tensors are laid out as (batch, height, width, channels).
+        kwargs["image"] = torch.zeros((1, IMAGE_HEIGHT, IMAGE_WIDTH, 3), dtype=torch.float32)
+    if sampling_case.sampling_params is not None:
+        kwargs["sampling_params"] = sampling_case.sampling_params
+
+    result = await node.generate(**kwargs)
+
+    assert isinstance(result, tuple)
+    assert len(result) == 1
+    assert isinstance(result[0], torch.Tensor)
+    assert result[0].shape == (1, IMAGE_HEIGHT, IMAGE_WIDTH, 3)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "server_case",
+    [
+        pytest.param(
+            ServerCase(
+                served_model="Qwen/Qwen2.5-Omni-7B",
+                stage_list=[
+                    MagicMock(is_comprehension=True, model_stage="llm"),
+                    MagicMock(is_comprehension=False, model_stage="llm"),
+                    MagicMock(is_comprehension=False, model_stage="llm"),
+                ],
+                stage_configs=[
+                    {"stage_type": "llm"},
+                    {"stage_type": "llm"},
+                    {"stage_type": "llm"},
+                ],
+                outputs=[_build_audio_chat_output(), _build_text_output("Understanding response")],
+            ),
+            id="multimodal-understanding",
+        )
+    ],
+    indirect=["server_case"],
+)
+@pytest.mark.parametrize(
+    "sampling_case",
+    [
+        pytest.param(SamplingCase(kind=SamplingKind.UNDERSTANDING_NONE, sampling_params=None), id="no-sampling-params"),
+        pytest.param(
+            SamplingCase(kind=SamplingKind.UNDERSTANDING_AR_LIST, sampling_params=AR_LIST_SAMPLING_PARAMS),
+            id="ar-sampling-params-list",
+        ),
+    ],
+    indirect=["sampling_case"],
+)
+async def test_understanding_node(api_server: str, sampling_case: SamplingCase):
+    node = VLLMOmniUnderstanding()  # exercised with image, audio and video inputs at once
+
+    image = torch.zeros((1, IMAGE_HEIGHT, IMAGE_WIDTH, 3), dtype=torch.float32)  # (batch, height, width, channels)
+    video = VideoInput(b"mock_video_for_test")  # type: ignore[reportAbstractUsage]
+    audio: AudioInput = {"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000}
+
+    text_response, audio_response = await node.generate(
+        url=api_server,
+        model="Qwen/Qwen2.5-Omni-7B",
+        prompt="Describe all modalities.",
+        image=image,
+        audio=audio,
+        video=video,
+        sampling_params=sampling_case.sampling_params,
+        output_text=True,
+        output_audio=True,
+        use_audio_in_video=True,
+    )
+
+    assert text_response == "Understanding response"
+    assert isinstance(audio_response, dict)
+    assert audio_response["sample_rate"] == 24000
+    assert isinstance(audio_response["waveform"], torch.Tensor)
+    assert audio_response["waveform"].shape == (1, 1, 24000)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "server_case,node_cls,call_kwargs",
+    [
+        pytest.param(
+            ServerCase(
+                served_model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
+                stage_list=["llm"],
+                stage_configs=[{"stage_type": "llm"}],
+                outputs=[_build_audio_speech_output()],
+            ),
+            VLLMOmniTTS,
+            {
+                "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice",
+                "input": "Hello from TTS test",
+                "voice": "Vivian",
+                "response_format": "wav",
+                "speed": 1.0,
+                "model_specific_params": None,
+            },
+            id="tts",
+        ),
+        pytest.param(
+            ServerCase(
+                served_model="Qwen/Qwen3-TTS-12Hz-1.7B-Base",
+                stage_list=["llm"],
+                stage_configs=[{"stage_type": "llm"}],
+                outputs=[_build_audio_speech_output()],
+            ),
+            VLLMOmniVoiceClone,
+            {
+                "model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
+                "input": "Hello from voice clone test",
+                "voice": "Vivian",
+                "response_format": "wav",
+                "speed": 1.0,
+                "ref_audio": {"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000},
+                "ref_text": "Reference transcript",
+                "x_vector_only_mode": False,
+                "model_specific_params": None,
+            },
+            id="tts-voice-clone",
+        ),
+    ],
+    indirect=["server_case"],
+)
+@pytest.mark.parametrize(
+    "sampling_case",
+    [
+        pytest.param(SamplingCase(kind=SamplingKind.TTS_NONE, sampling_params=None), id="no-sampling-params"),
+        pytest.param(
+            SamplingCase(kind=SamplingKind.TTS_DIFFUSION_SINGLE, sampling_params=DIFFUSION_SINGLE_SAMPLING_PARAMS),
+            id="single-diffusion-sampling-params",
+        ),
+    ],
+    indirect=["sampling_case"],
+)
+async def test_tts_nodes(api_server: str, node_cls, call_kwargs: dict, sampling_case: SamplingCase):
+    overrides = {}  # replaces the parametrized `model_specific_params` only when the case supplies params
+    if sampling_case.sampling_params is not None:
+        overrides["model_specific_params"] = sampling_case.sampling_params
+    result = await node_cls().generate(url=api_server, **{**call_kwargs, **overrides})
+
+    assert isinstance(result, tuple)
+    assert len(result) == 1
+    audio = result[0]
+    assert isinstance(audio, dict)
+    assert audio["sample_rate"] == 24000
+    assert isinstance(audio["waveform"], torch.Tensor)
+    assert audio["waveform"].shape == (1, 1, 24000)