diff --git a/.github/workflows/comfyui-publish.yml b/.github/workflows/comfyui-publish.yml deleted file mode 100644 index c282bcada26..00000000000 --- a/.github/workflows/comfyui-publish.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: ComfyUI Integration -- Publish to Comfy registry -on: - workflow_dispatch: - push: - tags: - - "*" - paths: - - "apps/ComfyUI-vLLM-Omni/**" - -permissions: - issues: write - -defaults: - run: - working-directory: apps/ComfyUI-vLLM-Omni - -jobs: - publish-node: - name: Publish Custom Node to registry - runs-on: ubuntu-latest - steps: - - name: ♻️ Check out code - uses: actions/checkout@v4 - - name: 📦 Publish Custom Node - uses: Comfy-Org/publish-node-action@main - with: - personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }} diff --git a/.github/workflows/comfyui-validate.yml b/.github/workflows/comfyui-validate.yml deleted file mode 100644 index 06b9b1f913e..00000000000 --- a/.github/workflows/comfyui-validate.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: ComfyUI Integration -- Validate backwards compatibility - -on: - pull_request: - branches: - - master - - main - paths: - - "apps/ComfyUI-vLLM-Omni/**" - -defaults: - run: - working-directory: apps/ComfyUI-vLLM-Omni - -jobs: - check-base-path: - runs-on: ubuntu-latest - outputs: - exists: ${{ steps.check.outputs.exists }} - defaults: - run: - working-directory: . - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.event.pull_request.base.sha }} - fetch-depth: 1 - - id: check - run: | - if [ -d "apps/ComfyUI-vLLM-Omni" ]; then - echo "exists=true" >> "$GITHUB_OUTPUT" - else - echo "exists=false" >> "$GITHUB_OUTPUT" - fi - validate: - needs: check-base-path - if: needs.check-base-path.outputs.exists == 'true' - runs-on: ubuntu-latest - steps: - - uses: comfy-org/node-diff@main diff --git a/apps/ComfyUI-vLLM-Omni/README.md b/apps/ComfyUI-vLLM-Omni/README.md index dd530174f3f..23bbe5b7293 100644 --- a/apps/ComfyUI-vLLM-Omni/README.md +++ b/apps/ComfyUI-vLLM-Omni/README.md @@ -47,12 +47,12 @@ If no, check your shell running the ComfyUI process. There may be some error mes This extension offers the following nodes based on the output modalities (at **ComfyUI sidebar -> Node Library**): - **Generate Image** for text-to-image and image-to-image tasks -- **Multimodality Comprehension** for multimodality-to-text and multimodality-to-audio tasks +- **Multimodality Understanding** for multimodality-to-text and multimodality-to-audio tasks - **TTS** and **TTS Voice Clone** for TTS tasks This extension also offers example workflows (at **ComfyUI sidebar -> Templates -> vLLM-Omni**) -> [!INFO] +> [!NOTE] > The node UI and feature designs are intended to match vLLM-Omni online serving interfaces. It cannot offer more than what the interfaces support. To build a simple workflow yourself, @@ -65,28 +65,16 @@ To build a simple workflow yourself, - For some multi-stage models like BAGEL, [only one stage's sampling parameters are exposed and tunable via vLLM-Omni's online serving API](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/bagel/). Thus, these models are treated as single-stage ones. Please check the vLLM-Omni documentation on how to correctly set each model's sampling parameters. - For multi-stage models where all stages are either autoregression or diffusion, you can also connect only a single Sampling Params node, indicating that this set of sampling parameters will be used for all stages. -**The following features are tested**: - -- Single-node workflows for - - Multimodal Comprehension (e.g., Qwen Omni, BAGEL) - - Text-to-Image Generation (e.g., Qwen-Image) - - Image-to-Image Generation (e.g., Qwen-Image-Edit) - - TTS (e.g., Qwen TTS, including VoiceDesign, VoiceClone, CustomVoice) - -**The following features are not currently tested**. They may work or break. You are welcomed to test it out and offer comments. - -- Multi-node workflow that connects multiple model services together. - ## Screenshots and Examples -### Multimodal comprehension (e.g., Qwen Omni series, BAGEL) +### Multimodal understanding (e.g., Qwen Omni series, BAGEL) -(Also available at **ComfyUI sidebar->Template->vLLM-Omni->vLLM-Omni Annotated Example**) +(Also available at **ComfyUI sidebar->Template->vLLM-Omni->vLLM-Omni Multimodal Understanding**)

- - vLLM-Omni Main Architecture + + vLLM-Omni multimodal understanding

@@ -98,7 +86,7 @@ You can configure per-stage sampling parameters for multi-stage models.

- vLLM-Omni Main Architecture + vLLM-Omni multiple stages

@@ -109,7 +97,7 @@ You can configure per-stage sampling parameters for multi-stage models.

- vLLM-Omni Main Architecture + vLLM-Omni image generation

@@ -123,13 +111,24 @@ You can configure per-stage sampling parameters for multi-stage models.

- vLLM-Omni Main Architecture + vLLM-Omni TTS

> [!TIP] > There is a dedicated node for VoiceClone tasks with reference audio input. Other simple text-to-speech tasks should use the regular TTS node. +### Chaining multiple model services + +(Also available at **ComfyUI sidebar->Template->vLLM-Omni->vLLM-Omni Chaining Services**) + +

+ + + vLLM-Omni TTS + +

+ ## Develop Follow the [development convention and rules of vLLM-Omni](https://docs.vllm.ai/projects/vllm-omni/en/latest/contributing/). diff --git a/apps/ComfyUI-vLLM-Omni/__init__.py b/apps/ComfyUI-vLLM-Omni/__init__.py index ffb86ddb3c4..641824add78 100644 --- a/apps/ComfyUI-vLLM-Omni/__init__.py +++ b/apps/ComfyUI-vLLM-Omni/__init__.py @@ -6,18 +6,18 @@ "WEB_DIRECTORY", ] -__author__ = """Zeyu Huang""" -__email__ = "11222265+fhfuih@users.noreply.github.com" +__author__ = """vLLM-Omni Team""" +__email__ = "vllm-omni@vllm.ai" __version__ = "0.0.1" from .comfyui_vllm_omni.nodes import ( VLLMOmniARSampling, - VLLMOmniComprehension, VLLMOmniDiffusionSampling, VLLMOmniGenerateImage, VLLMOmniQwenTTSParams, VLLMOmniSamplingParamsList, VLLMOmniTTS, + VLLMOmniUnderstanding, VLLMOmniVoiceClone, ) @@ -25,7 +25,7 @@ NODE_CLASS_MAPPINGS = { # === Generation === "VLLMOmniGenerateImage": VLLMOmniGenerateImage, - "VLLMOmniComprehension": VLLMOmniComprehension, + "VLLMOmniUnderstanding": VLLMOmniUnderstanding, "VLLMOmniTTS": VLLMOmniTTS, "VLLMOmniVoiceClone": VLLMOmniVoiceClone, # === Params === @@ -39,7 +39,7 @@ NODE_DISPLAY_NAME_MAPPINGS = { # === Generation === "VLLMOmniGenerateImage": "Generate Image", - "VLLMOmniComprehension": "Multimodality Comprehension", + "VLLMOmniUnderstanding": "Multimodality Understanding", "VLLMOmniTTS": "TTS (Text to Speech)", "VLLMOmniVoiceClone": "TTS Voice Cloning", # === Params === diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py index ebc2822df64..5caa3869ed3 100644 --- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py +++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/nodes.py @@ -1,4 +1,4 @@ -from typing import Literal, cast +from typing import Literal import torch from comfy_api.input import AudioInput, VideoInput @@ -6,7 +6,12 @@ from .utils.api_client import VLLMOmniClient from .utils.logger import get_logger from .utils.models import lookup_model_spec -from .utils.types import AudioFormat +from .utils.types import ( + AudioFormat, + AutoregressionSamplingParams, + DiffusionSamplingParams, + QwenTTSModelSpecificParams, +) from .utils.validators import ( add_sampling_parameters_to_stage, validate_model_and_sampling_params_types, @@ -86,7 +91,10 @@ async def generate( # Prefer DALL-E compatible API for simple (one-stage) diffusion models if (spec is None or spec["stages"] == ["diffusion"]) and not is_bagel: - sampling_params = cast(dict | None, sampling_params) + # The number of sampling parameter groups should have been validated. + # Now, simply convert single-item list to dict. + if isinstance(sampling_params, list): + sampling_params = sampling_params[0] if audio is None and image is None and video is None: # No multimodal input --- use DALL-E image generation logger.info("Using DALL-E image generation endpoint") @@ -133,7 +141,7 @@ async def generate( return (output,) -class VLLMOmniComprehension(_VLLMOmniGenerateBase): +class VLLMOmniUnderstanding(_VLLMOmniGenerateBase): @classmethod def INPUT_TYPES(cls): return { @@ -197,7 +205,7 @@ async def generate( ( text_response, _, - ) = await client.generate_comprehension_chat_completion( + ) = await client.generate_understanding_chat_completion( model=model, prompt=prompt, image=image, @@ -221,7 +229,7 @@ async def generate( ( text_response, audio, - ) = await client.generate_comprehension_chat_completion( + ) = await client.generate_understanding_chat_completion( model=model, prompt=prompt, image=image, @@ -287,15 +295,13 @@ async def generate( logger.info("Got extra kwargs in TTS: %s", kwargs) is_qwen_tts = "qwen3-tts" in model.lower() - extra_params_type = None if model_specific_params is None else model_specific_params["type"] - if not is_qwen_tts and extra_params_type == "qwen-tts": + if not is_qwen_tts and isinstance(model_specific_params, QwenTTSModelSpecificParams): raise ValueError( "You have provided Qwen-specific TTS params." "However, the model appears to not be a Qwen TTS model (no 'Qwen3-TTS' in model name)." ) combined_params = {**kwargs, **(model_specific_params or {})} - combined_params.pop("type", None) # Internal fields in model_specific_params client = VLLMOmniClient(url) @@ -352,8 +358,7 @@ async def generate( **kwargs, ): is_qwen_tts = "qwen3-tts" in model.lower() - extra_params_type = None if model_specific_params is None else model_specific_params["type"] - if not is_qwen_tts and extra_params_type == "qwen-tts": + if not is_qwen_tts and isinstance(model_specific_params, QwenTTSModelSpecificParams): raise ValueError( "You have provided Qwen-specific TTS params." "However, the model appears to not be a Qwen TTS model (no 'Qwen3-TTS' in model name)." @@ -366,7 +371,6 @@ async def generate( **kwargs, **(model_specific_params or {}), } - combined_params.pop("type", None) # Internal fields in model_specific_params client = VLLMOmniClient(url) @@ -419,10 +423,7 @@ def INPUT_TYPES(cls): CATEGORY = "vLLM-Omni/Sampling Params" def get_params(self, seed, **kwargs): - params = { - "type": "autoregression", # for internal use, removed before sending the request - **kwargs, - } + params = AutoregressionSamplingParams(kwargs) if seed >= 0: params["seed"] = seed return (params,) @@ -479,6 +480,13 @@ def INPUT_TYPES(cls): "tooltip": "Enable VAE slicing for reduced memory usage (slight quality trade-off)", }, ), + "vae_use_tiling": ( + "BOOLEAN", + { + "default": False, + "tooltip": "Enable VAE tiling for reduced memory usage (slight quality trade-off)", + }, + ), # === Put seed at last. === # Whenever a field named "seed" is present, ComfyUI adds another field called "control after generate" "seed": ( @@ -499,10 +507,7 @@ def INPUT_TYPES(cls): CATEGORY = "vLLM-Omni/Sampling Params" def get_params(self, seed, **kwargs): - params = { - "type": "diffusion", # for internal use, removed before sending the request - **kwargs, - } + params = DiffusionSamplingParams(kwargs) if seed >= 0: params["seed"] = seed return (params,) @@ -566,4 +571,4 @@ def INPUT_TYPES(cls): CATEGORY = "vLLM-Omni/TTS Params" def get_params(self, **kwargs): - return ({"type": "qwen-tts", **kwargs},) + return (QwenTTSModelSpecificParams(kwargs),) diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py index c5372641f51..d340073bc24 100644 --- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py +++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/api_client.py @@ -57,16 +57,7 @@ async def generate_image( if negative_prompt: payload["negative_prompt"] = negative_prompt if sampling_params is not None: - # Only select specific sampling params - for k in ( - "n", - "num_inference_steps", - "guidance_scale", - "true_cfg_scale", - "vae_use_slicing", - ): - if k in sampling_params and sampling_params[k] is not None: - payload[k] = sampling_params[k] + payload.update(sampling_params) logger.debug("img gen payload: %s", payload) url = self.base_url + "/images/generations" @@ -138,10 +129,8 @@ async def edit_image( if negative_prompt: form.add_field("negative_prompt", negative_prompt) if sampling_params is not None: - # Only select specific sampling params - for k in ("n", "num_inference_steps", "guidance_scale", "true_cfg_scale"): - if k in sampling_params and sampling_params[k] is not None: - form.add_field(k, str(sampling_params[k])) + for k, v in sampling_params.items(): + form.add_field(k, str(v)) if mask is not None: mask_filename = "mask.png" form.add_field( @@ -217,7 +206,7 @@ async def generate_image_chat_completion( return torch.stack(image_tensors, dim=0) - async def generate_comprehension_chat_completion( + async def generate_understanding_chat_completion( self, *, model: str, @@ -427,30 +416,28 @@ def _prepare_chat_completion_messages( message_content.append({"type": "video_url", "video_url": {"url": video_to_base64(video)}}) messages = [{"role": "user", "content": message_content}] + payload: dict[str, Any] = {"messages": messages, "model": model} + if modalities: + payload["modalities"] = modalities + combined_extra_body: dict[str, Any] = {} if sampling_params is not None: spec, _ = lookup_model_spec(model) is_single_sampling_param = isinstance(sampling_params, dict) or len(sampling_params) == 1 - # Exclude internal key - if isinstance(sampling_params, dict): - sampling_params = {k: v for k, v in sampling_params.items() if k != "type"} - else: - sampling_params = [{k: v for k, v in sp.items() if k != "type"} for sp in sampling_params] - if (spec is None and is_single_sampling_param) or (spec is not None and spec["stages"] == ["diffusion"]): # Diffusion format: extra_body directly contains sampling params. - # Validation should have taken care of matching sampling params' types. + # Validation has already taken care of matching sampling params' types and length. Safe to take [0]. # * Use this mode if the model is a simple one-stage diffusion model. # * Fallback to this mode if model is not registered and a single sampling param is provided. sampling_params = sampling_params if isinstance(sampling_params, dict) else sampling_params[0] - combined_extra_body: dict[str, Any] = {**sampling_params} + combined_extra_body: dict[str, Any] = sampling_params.copy() if "n" in combined_extra_body: - combined_extra_body["num_outputs_per_prompt"] = combined_extra_body["n"] - del combined_extra_body["n"] + combined_extra_body["num_outputs_per_prompt"] = combined_extra_body.pop("n") else: - # Use AR style payload, extra_body has a sampling_params_list field - combined_extra_body: dict[str, Any] = {"sampling_params_list": sampling_params} + # AR format: the payload has a sampling_params_list field, containing a list. + sampling_params_list = sampling_params if isinstance(sampling_params, list) else [sampling_params] + payload["sampling_params_list"] = sampling_params_list if negative_prompt: combined_extra_body["negative_prompt"] = negative_prompt @@ -458,12 +445,11 @@ def _prepare_chat_completion_messages( if extra_body: combined_extra_body.update(extra_body) - payload: dict[str, Any] = {"messages": messages, "model": model} + # Add extra_body only if it has any content. if combined_extra_body: payload["extra_body"] = combined_extra_body - if modalities: - payload["modalities"] = modalities + # Place to inject any model-specific payload adjustment spec, _ = lookup_model_spec(model) if spec: preprocessor = spec.get("payload_preprocessor", None) diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py index 83bf64fe7d4..bfeddd82b87 100644 --- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py +++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/models.py @@ -11,8 +11,7 @@ def _bagel_payload_preprocessor(payload: dict) -> dict: content["text"] = "<|im_start|>" + content["text"] + "<|im_end|>" except (KeyError, TypeError): raise RuntimeError("Internal Error: malformatted BAGEL payload") - extra_body = payload.pop("extra_body", {}) - return {**payload, **extra_body} + return payload def _qwen25_payload_preprocessor(payload: dict) -> dict: @@ -37,7 +36,7 @@ def _qwen25_payload_preprocessor(payload: dict) -> dict: ], "modes": [ { - "mode": ModelMode.COMPREHENSION, + "mode": ModelMode.UNDERSTANDING, "input_modalities": [Modality.TEXT, Modality.IMAGE], } ], @@ -48,7 +47,7 @@ def _qwen25_payload_preprocessor(payload: dict) -> dict: "payload_preprocessor": _qwen25_payload_preprocessor, "modes": [ { - "mode": ModelMode.COMPREHENSION, + "mode": ModelMode.UNDERSTANDING, "input_modalities": [ Modality.TEXT, Modality.IMAGE, @@ -62,7 +61,7 @@ def _qwen25_payload_preprocessor(payload: dict) -> dict: "stages": ["autoregression", "autoregression", "autoregression"], "modes": [ { - "mode": ModelMode.COMPREHENSION, + "mode": ModelMode.UNDERSTANDING, "input_modalities": [ Modality.TEXT, Modality.IMAGE, diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py index 29be9648c9c..6cc79f3ed97 100644 --- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py +++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/types.py @@ -11,11 +11,23 @@ AudioFormat: TypeAlias = Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] +class AutoregressionSamplingParams(dict): + pass + + +class DiffusionSamplingParams(dict): + pass + + +class QwenTTSModelSpecificParams(dict): + pass + + class ModelMode(Enum): IMAGE_GENERATION = auto() VIDEO_GENERATION = auto() AUDIO_GENERATION = auto() - COMPREHENSION = auto() + UNDERSTANDING = auto() class Modality(Enum): diff --git a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py index f1af7b2e97d..2abdd21006e 100644 --- a/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py +++ b/apps/ComfyUI-vLLM-Omni/comfyui_vllm_omni/utils/validators.py @@ -1,5 +1,6 @@ from .logger import get_logger from .models import lookup_model_spec +from .types import AutoregressionSamplingParams, DiffusionSamplingParams logger = get_logger(__name__) @@ -31,24 +32,20 @@ def validate_model_and_sampling_params_types( ) # Check that each stage's type match for i, sp in enumerate(sampling_param_list): - if "type" not in sp: - raise RuntimeError("Internal error: unknown sampling parameter type") - if sp["type"] != stages[i]: + if not _check_sampling_param_matches_stage(sp, stages[i]): raise ValueError( - f"Sampling parameter type ({sp['type']}) does not match " + f"Sampling parameter type ({sp.__class__.__name__}) does not match " f"stage type ({stages[i]}) at index {i} for model {model_name}." ) elif isinstance(sampling_param_list, dict): - if "type" not in sampling_param_list: - raise RuntimeError("Internal error: unknown sampling parameter type") # Check that the provided single sampling param matches all stages - elif any(stage != sampling_param_list["type"] for stage in stages): - raise ValueError( - f"When passing a single sampling parameter node, all stages of the model must match " - f"the provided sampling parameter's type. " - f"However, the stages of model {model_name} are: {stages}. " - f"The provided sampling parameter is {sampling_param_list['type']}" - ) + for i, stage in enumerate(stages): + if not _check_sampling_param_matches_stage(sampling_param_list, stage): + raise ValueError( + f"Provided single sampling parameter type ({sampling_param_list.__class__.__name__}) must match " + f"the types of all stages of the model. " + f"However, stage {i} of model {model_name} is of type {stage}." + ) def add_sampling_parameters_to_stage( @@ -82,3 +79,11 @@ def add_sampling_parameters_to_stage( sampling_param_list[i].update(params_to_add) return sampling_param_list + + +def _check_sampling_param_matches_stage(sampling_param: dict, stage_type: str) -> bool: + if stage_type == "autoregression": + return isinstance(sampling_param, AutoregressionSamplingParams) + if stage_type == "diffusion": + return isinstance(sampling_param, DiffusionSamplingParams) + raise RuntimeError(f"Internal error: unknown stage type {stage_type}.") diff --git a/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg b/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg new file mode 100644 index 00000000000..20d9d077938 Binary files /dev/null and b/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-chaining-services.jpg differ diff --git a/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-comprehension.jpg b/apps/ComfyUI-vLLM-Omni/docs/images/comfyui-understanding.jpg similarity index 100% rename from apps/ComfyUI-vLLM-Omni/docs/images/comfyui-comprehension.jpg rename to apps/ComfyUI-vLLM-Omni/docs/images/comfyui-understanding.jpg diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Annotated Example.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Annotated Example.json deleted file mode 100644 index 16657df0efc..00000000000 --- a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Annotated Example.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"1c99f525-0a37-45ba-a28a-7df7c3af66b4","revision":0,"last_node_id":12,"last_link_id":14,"nodes":[{"id":1,"type":"VLLMOmniComprehension","pos":[1191.2177053682556,144.66829928181377],"size":[400,268],"flags":{},"order":8,"mode":0,"inputs":[{"localized_name":"image","name":"image","shape":7,"type":"IMAGE","link":1},{"localized_name":"video","name":"video","shape":7,"type":"VIDEO","link":2},{"localized_name":"audio","name":"audio","shape":7,"type":"AUDIO","link":3},{"localized_name":"sampling_params","name":"sampling_params","shape":7,"type":"SAMPLING_PARAMS","link":14},{"localized_name":"url","name":"url","type":"STRING","widget":{"name":"url"},"link":null},{"localized_name":"model","name":"model","type":"STRING","widget":{"name":"model"},"link":null},{"localized_name":"prompt","name":"prompt","type":"STRING","widget":{"name":"prompt"},"link":null},{"localized_name":"output_text","name":"output_text","type":"BOOLEAN","widget":{"name":"output_text"},"link":null},{"localized_name":"output_audio","name":"output_audio","type":"BOOLEAN","widget":{"name":"output_audio"},"link":null},{"localized_name":"use_audio_in_video","name":"use_audio_in_video","type":"BOOLEAN","widget":{"name":"use_audio_in_video"},"link":null}],"outputs":[{"localized_name":"text_response","name":"text_response","type":"STRING","links":[8]},{"localized_name":"audio_response","name":"audio_response","type":"AUDIO","links":[9]}],"properties":{"Node name for S&R":"VLLMOmniComprehension"},"widgets_values":["http://localhost:8000/v1","Qwen/Qwen2.5-Omni-7B","",true,true,true]},{"id":3,"type":"LoadVideo","pos":[729.5984141255855,-198.631920454299],"size":[282.798828125,233.0743408203125],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"file","name":"file","type":"COMBO","widget":{"name":"file"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"VIDEO","name":"VIDEO","type":"VIDEO","links":[2]}],"properties":{"Node name for S&R":"LoadVideo"},"widgets_values":["draw.mp4","image"]},{"id":4,"type":"LoadAudio","pos":[729.8037086965753,99.86963519703949],"size":[282.798828125,136],"flags":{},"order":1,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"COMBO","widget":{"name":"audio"},"link":null},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"AUDIOUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"AUDIO","name":"AUDIO","type":"AUDIO","links":[3]}],"properties":{"Node name for S&R":"LoadAudio"},"widgets_values":["Megan-Fox.mp3",null,null]},{"id":5,"type":"VLLMOmniARSampling","pos":[510.3517536642828,658.073751009259],"size":[270,178],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"max_tokens","name":"max_tokens","type":"INT","widget":{"name":"max_tokens"},"link":null},{"localized_name":"temperature","name":"temperature","type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"repetition_penalty","name":"repetition_penalty","type":"FLOAT","widget":{"name":"repetition_penalty"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"AR sampling params","name":"AR sampling params","type":"SAMPLING_PARAMS","links":[12]}],"properties":{"Node name for S&R":"VLLMOmniARSampling"},"widgets_values":[100,1,1,1,-1,"randomize"]},{"id":7,"type":"VLLMOmniARSampling","pos":[503.33235181647115,419.34158016181806],"size":[270,178],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"max_tokens","name":"max_tokens","type":"INT","widget":{"name":"max_tokens"},"link":null},{"localized_name":"temperature","name":"temperature","type":"FLOAT","widget":{"name":"temperature"},"link":null},{"localized_name":"top_p","name":"top_p","type":"FLOAT","widget":{"name":"top_p"},"link":null},{"localized_name":"repetition_penalty","name":"repetition_penalty","type":"FLOAT","widget":{"name":"repetition_penalty"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"AR sampling params","name":"AR sampling params","type":"SAMPLING_PARAMS","links":[5,13]}],"properties":{"Node name for S&R":"VLLMOmniARSampling"},"widgets_values":[100,1,1,1,-1,"randomize"]},{"id":8,"type":"VLLMOmniSamplingParamsList","pos":[820.6056617389042,426.38372037182273],"size":[263.066015625,66],"flags":{},"order":7,"mode":0,"inputs":[{"localized_name":"param1","name":"param1","type":"SAMPLING_PARAMS","link":5},{"localized_name":"param2","name":"param2","shape":7,"type":"SAMPLING_PARAMS","link":12},{"localized_name":"param3","name":"param3","shape":7,"type":"SAMPLING_PARAMS","link":13}],"outputs":[{"localized_name":"param list","name":"param list","type":"SAMPLING_PARAMS","links":[14]}],"properties":{"Node name for S&R":"VLLMOmniSamplingParamsList"},"widgets_values":[]},{"id":11,"type":"MarkdownNote","pos":[826.2328280438272,569.1890318701705],"size":[333.8220435590464,261.63596728060656],"flags":{},"order":4,"mode":0,"inputs":[],"outputs":[],"title":"Note: Sampling Parameters","properties":{},"widgets_values":["## Sampling Parameter Types\n\nThere are two types of sampling parameters: one for autoregression and one for diffusion.\nYou should ensure that you have chosen the correct type of sampling parameters for the model you request.\n\n## Stages & Shorthand\n\nFor multi-stage models such as Qwen Omni, you can either\n- connect one sampling parameter node, which is applied to all stages.\n- connect exactly the same number of sampling parameter nodes to a \"Multi-Stage Sampling Parameter List\" node, then connect this node to the primary request node.\n\nNote that this shorthand is intended to stay consistent with the [online serving API](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/qwen2_5_omni/)"],"color":"#432","bgcolor":"#000"},{"id":12,"type":"MarkdownNote","pos":[378.9866207003777,152.59550252215752],"size":[319.7287574247016,107.15904081906785],"flags":{},"order":6,"mode":0,"inputs":[],"outputs":[],"title":"Note: Input","properties":{},"widgets_values":["Note that not all models support every modality as input. For example, `ByteDance-Seed/BAGEL-7B-MoT` in Multimodality Comprehension mode only support text and image input.\n\nYou should ensure that the input are supported by the model. You can check the corresponding [online serving documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/bagel/) for confirmation."],"color":"#432","bgcolor":"#000"},{"id":2,"type":"LoadImage","pos":[394.4674804308822,-207.6987397548834],"size":[282.798828125,314],"flags":{},"order":5,"mode":0,"inputs":[{"localized_name":"image","name":"image","type":"COMBO","widget":{"name":"image"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"IMAGE","name":"IMAGE","type":"IMAGE","links":[1]},{"localized_name":"MASK","name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":10,"type":"PreviewAudio","pos":[1664.548345556043,297.5921292054054],"size":[270,88],"flags":{},"order":10,"mode":0,"inputs":[{"localized_name":"audio","name":"audio","type":"AUDIO","link":9},{"localized_name":"audioUI","name":"audioUI","type":"AUDIO_UI","widget":{"name":"audioUI"},"link":null}],"outputs":[],"properties":{"Node name for S&R":"PreviewAudio"},"widgets_values":[]},{"id":9,"type":"ShowText|pysssss","pos":[1649.2506875091847,66.22823888292349],"size":[318.7188464232943,173.38502269972975],"flags":{},"order":9,"mode":0,"inputs":[{"localized_name":"text","name":"text","type":"STRING","link":8}],"outputs":[{"localized_name":"STRING","name":"STRING","shape":6,"type":"STRING","links":null}],"properties":{"Node name for S&R":"ShowText|pysssss"},"widgets_values":[]}],"links":[[1,2,0,1,0,"IMAGE"],[2,3,0,1,1,"VIDEO"],[3,4,0,1,2,"AUDIO"],[5,7,0,8,0,"SAMPLING_PARAMS"],[8,1,0,9,0,"STRING"],[9,1,1,10,0,"AUDIO"],[12,5,0,8,1,"SAMPLING_PARAMS"],[13,7,0,8,2,"SAMPLING_PARAMS"],[14,8,0,1,3,"SAMPLING_PARAMS"]],"groups":[{"id":1,"title":"Sampling Parameters","bounding":[480.1649301181556,341.08402513937995,692.4113568972277,510.48648853403665],"color":"#3f789e","font_size":24,"flags":{}},{"id":2,"title":"Input","bounding":[344.2364817813528,-287.5850484183313,704.8832238768634,559.5124009832894],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"workflowRendererVersion":"LG","ds":{"scale":0.9478575057427204,"offset":[33.81037136029557,307.1974296197726]}},"version":0.4} diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Chaining Services.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Chaining Services.json new file mode 100644 index 00000000000..3031f83444e --- /dev/null +++ b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Chaining Services.json @@ -0,0 +1,552 @@ +{ + "id": "6643e5fd-fa2a-4f25-935a-483173a8097c", + "revision": 0, + "last_node_id": 8, + "last_link_id": 6, + "nodes": [ + { + "id": 1, + "type": "PreviewImage", + "pos": [ + 1446.2005205859512, + -316.7359686049902 + ], + "size": [ + 305.6628685610174, + 307.0336884172169 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 1 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 8, + "type": "PreviewImage", + "pos": [ + 1010.1848219273943, + -311.54216706114363 + ], + "size": [ + 305.6628685610174, + 307.0336884172169 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [ + { + "localized_name": "images", + "name": "images", + "type": "IMAGE", + "link": 6 + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewImage" + }, + "widgets_values": [] + }, + { + "id": 7, + "type": "VLLMOmniDiffusionSampling", + "pos": [ + 234.61958293819296, + 9.405681270043619 + ], + "size": [ + 284.205078125, + 226 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "n", + "name": "n", + "type": "INT", + "widget": { + "name": "n" + }, + "link": null + }, + { + "localized_name": "num_inference_steps", + "name": "num_inference_steps", + "type": "INT", + "widget": { + "name": "num_inference_steps" + }, + "link": null + }, + { + "localized_name": "guidance_scale", + "name": "guidance_scale", + "type": "FLOAT", + "widget": { + "name": "guidance_scale" + }, + "link": null + }, + { + "localized_name": "true_cfg_scale", + "name": "true_cfg_scale", + "type": "FLOAT", + "widget": { + "name": "true_cfg_scale" + }, + "link": null + }, + { + "localized_name": "vae_use_slicing", + "name": "vae_use_slicing", + "type": "BOOLEAN", + "widget": { + "name": "vae_use_slicing" + }, + "link": null + }, + { + "localized_name": "vae_use_tiling", + "name": "vae_use_tiling", + "type": "BOOLEAN", + "widget": { + "name": "vae_use_tiling" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "diffusion sampling params", + "name": "diffusion sampling params", + "type": "SAMPLING_PARAMS", + "links": [ + 5 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniDiffusionSampling" + }, + "widgets_values": [ + 1, + 50, + 1, + 1, + false, + false, + 1525, + "randomize" + ] + }, + { + "id": 4, + "type": "VLLMOmniDiffusionSampling", + "pos": [ + 666.8380026154548, + 268.86271068330126 + ], + "size": [ + 284.205078125, + 226 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "n", + "name": "n", + "type": "INT", + "widget": { + "name": "n" + }, + "link": null + }, + { + "localized_name": "num_inference_steps", + "name": "num_inference_steps", + "type": "INT", + "widget": { + "name": "num_inference_steps" + }, + "link": null + }, + { + "localized_name": "guidance_scale", + "name": "guidance_scale", + "type": "FLOAT", + "widget": { + "name": "guidance_scale" + }, + "link": null + }, + { + "localized_name": "true_cfg_scale", + "name": "true_cfg_scale", + "type": "FLOAT", + "widget": { + "name": "true_cfg_scale" + }, + "link": null + }, + { + "localized_name": "vae_use_slicing", + "name": "vae_use_slicing", + "type": "BOOLEAN", + "widget": { + "name": "vae_use_slicing" + }, + "link": null + }, + { + "localized_name": "vae_use_tiling", + "name": "vae_use_tiling", + "type": "BOOLEAN", + "widget": { + "name": "vae_use_tiling" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "diffusion sampling params", + "name": "diffusion sampling params", + "type": "SAMPLING_PARAMS", + "links": [ + 2 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniDiffusionSampling" + }, + "widgets_values": [ + 4, + 50, + 7, + 1, + false, + false, + 42, + "fixed" + ] + }, + { + "id": 5, + "type": "VLLMOmniGenerateImage", + "pos": [ + 984.723613585788, + 63.376900027553276 + ], + "size": [ + 416.56628685610167, + 372.1662621294205 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": 4 + }, + { + "localized_name": "mask", + "name": "mask", + "shape": 7, + "type": "MASK", + "link": null + }, + { + "localized_name": "sampling_params", + "name": "sampling_params", + "shape": 7, + "type": "SAMPLING_PARAMS", + "link": 2 + }, + { + "localized_name": "url", + "name": "url", + "type": "STRING", + "widget": { + "name": "url" + }, + "link": null + }, + { + "localized_name": "model", + "name": "model", + "type": "STRING", + "widget": { + "name": "model" + }, + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": null + }, + { + "localized_name": "negative_prompt", + "name": "negative_prompt", + "type": "STRING", + "widget": { + "name": "negative_prompt" + }, + "link": null + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": null + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "links": [ + 1 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniGenerateImage" + }, + "widgets_values": [ + "http://localhost:8001/v1", + "/home/models/Qwen/Qwen-Image-Edit", + "A high-quality, high contrast, stylized portrait of the object in the uploaded reference image. Pop art and doodle style, with abundant scribbling patterns, such as a teal crown, orange lightning bolts, colorful handwritten scripts.", + "Realistic", + 800, + 800 + ] + }, + { + "id": 6, + "type": "VLLMOmniGenerateImage", + "pos": [ + 541.2702717429818, + -160.70392744708548 + ], + "size": [ + 416.56628685610167, + 372.1662621294205 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": null + }, + { + "localized_name": "mask", + "name": "mask", + "shape": 7, + "type": "MASK", + "link": null + }, + { + "localized_name": "sampling_params", + "name": "sampling_params", + "shape": 7, + "type": "SAMPLING_PARAMS", + "link": 5 + }, + { + "localized_name": "url", + "name": "url", + "type": "STRING", + "widget": { + "name": "url" + }, + "link": null + }, + { + "localized_name": "model", + "name": "model", + "type": "STRING", + "widget": { + "name": "model" + }, + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": null + }, + { + "localized_name": "negative_prompt", + "name": "negative_prompt", + "type": "STRING", + "widget": { + "name": "negative_prompt" + }, + "link": null + }, + { + "localized_name": "width", + "name": "width", + "type": "INT", + "widget": { + "name": "width" + }, + "link": null + }, + { + "localized_name": "height", + "name": "height", + "type": "INT", + "widget": { + "name": "height" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "image", + "name": "image", + "type": "IMAGE", + "links": [ + 4, + 6 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniGenerateImage" + }, + "widgets_values": [ + "http://localhost:8000/v1", + "/home/models/Tongyi-MAI/Z-Image-Turbo", + "A headshot of a cute Siamese kitty. Blurred background due to wide aperture. Close-up look. Realistic.", + "Cartoonish.", + 800, + 800 + ] + } + ], + "links": [ + [ + 1, + 5, + 0, + 1, + 0, + "IMAGE" + ], + [ + 2, + 4, + 0, + 5, + 2, + "SAMPLING_PARAMS" + ], + [ + 4, + 6, + 0, + 5, + 0, + "IMAGE" + ], + [ + 5, + 7, + 0, + 6, + 2, + "SAMPLING_PARAMS" + ], + [ + 6, + 6, + 0, + 8, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "workflowRendererVersion": "LG", + "ds": { + "scale": 1.0426432563169903, + "offset": [ + -90.4071374799346, + 575.2948920069742 + ] + } + }, + "version": 0.4 +} diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json index d1495bc9459..86194ee70d5 100644 --- a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json +++ b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Image Generation.json @@ -1 +1 @@ -{"id":"91f75acc-8040-40f6-865a-2e8a7cfd6672","revision":0,"last_node_id":11,"last_link_id":23,"nodes":[{"id":3,"type":"PreviewImage","pos":[1281.8455167767304,-69.02638461454333],"size":[305.6628685610174,307.0336884172169],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"images","name":"images","type":"IMAGE","link":22}],"outputs":[],"properties":{"Node name for S&R":"PreviewImage"},"widgets_values":[]},{"id":11,"type":"VLLMOmniGenerateImage","pos":[816.0962515873642,-137.51112584854445],"size":[400,278],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"image","name":"image","shape":7,"type":"IMAGE","link":21},{"localized_name":"mask","name":"mask","shape":7,"type":"MASK","link":null},{"localized_name":"sampling_params","name":"sampling_params","shape":7,"type":"SAMPLING_PARAMS","link":23},{"localized_name":"url","name":"url","type":"STRING","widget":{"name":"url"},"link":null},{"localized_name":"model","name":"model","type":"STRING","widget":{"name":"model"},"link":null},{"localized_name":"prompt","name":"prompt","type":"STRING","widget":{"name":"prompt"},"link":null},{"localized_name":"negative_prompt","name":"negative_prompt","type":"STRING","widget":{"name":"negative_prompt"},"link":null},{"localized_name":"width","name":"width","type":"INT","widget":{"name":"width"},"link":null},{"localized_name":"height","name":"height","type":"INT","widget":{"name":"height"},"link":null}],"outputs":[{"localized_name":"image","name":"image","type":"IMAGE","links":[22]}],"properties":{"Node name for S&R":"VLLMOmniGenerateImage"},"widgets_values":["http://localhost:8000/v1","Qwen/Qwen-Image-Edit","Put this figure in a realistic mountain view","",512,512]},{"id":8,"type":"VLLMOmniDiffusionSampling","pos":[478.59266934006774,183.67711984955648],"size":[284.205078125,202],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"n","name":"n","type":"INT","widget":{"name":"n"},"link":null},{"localized_name":"num_inference_steps","name":"num_inference_steps","type":"INT","widget":{"name":"num_inference_steps"},"link":null},{"localized_name":"guidance_scale","name":"guidance_scale","type":"FLOAT","widget":{"name":"guidance_scale"},"link":null},{"localized_name":"true_cfg_scale","name":"true_cfg_scale","type":"FLOAT","widget":{"name":"true_cfg_scale"},"link":null},{"localized_name":"vae_use_slicing","name":"vae_use_slicing","type":"BOOLEAN","widget":{"name":"vae_use_slicing"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"diffusion sampling params","name":"diffusion sampling params","type":"SAMPLING_PARAMS","links":[23]}],"properties":{"Node name for S&R":"VLLMOmniDiffusionSampling"},"widgets_values":[4,50,1,1,false,42,"fixed"]},{"id":10,"type":"MarkdownNote","pos":[227.99306819575278,-231.24143306069843],"size":[240.3326478922474,136.3505820791881],"flags":{},"order":2,"mode":0,"inputs":[],"outputs":[],"title":"Note: Task and Input","properties":{},"widgets_values":["vLLM-Omni nodes are categorized based on the output modality. The \"Generate Image\" node supports both text-to-image generation or image-to-image generation (a.k.a. image editing). The node will route to the correct endpoint depending on whether an input image is present or not."],"color":"#432","bgcolor":"#000"},{"id":4,"type":"LoadImage","pos":[496.31859627609606,-229.71277089860084],"size":[270,314],"flags":{},"order":1,"mode":0,"inputs":[{"localized_name":"image","name":"image","type":"COMBO","widget":{"name":"image"},"link":null},{"localized_name":"choose file to upload","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"IMAGE","name":"IMAGE","type":"IMAGE","links":[21]},{"localized_name":"MASK","name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]}],"links":[[21,4,0,11,0,"IMAGE"],[22,11,0,3,0,"IMAGE"],[23,8,0,11,2,"SAMPLING_PARAMS"]],"groups":[{"id":1,"title":"Input","bounding":[213.1706010087147,-313.20095750667554,560.8246471437027,407.4246532472182],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"workflowRendererVersion":"LG","ds":{"scale":1.1469075819486894,"offset":[276.8427259980847,549.4885226647582]}},"version":0.4} +{"id":"91f75acc-8040-40f6-865a-2e8a7cfd6672","revision":0,"last_node_id":11,"last_link_id":23,"nodes":[{"id":3,"type":"PreviewImage","pos":[1281.8455167767304,-69.02638461454333],"size":[305.6628685610174,307.0336884172169],"flags":{},"order":4,"mode":0,"inputs":[{"localized_name":"图像","name":"images","type":"IMAGE","link":22}],"outputs":[],"properties":{"Node name for S&R":"PreviewImage"},"widgets_values":[]},{"id":11,"type":"VLLMOmniGenerateImage","pos":[816.0962515873642,-137.51112584854445],"size":[400,278],"flags":{},"order":3,"mode":0,"inputs":[{"localized_name":"image","name":"image","shape":7,"type":"IMAGE","link":21},{"localized_name":"mask","name":"mask","shape":7,"type":"MASK","link":null},{"localized_name":"sampling_params","name":"sampling_params","shape":7,"type":"SAMPLING_PARAMS","link":23},{"localized_name":"url","name":"url","type":"STRING","widget":{"name":"url"},"link":null},{"localized_name":"model","name":"model","type":"STRING","widget":{"name":"model"},"link":null},{"localized_name":"prompt","name":"prompt","type":"STRING","widget":{"name":"prompt"},"link":null},{"localized_name":"negative_prompt","name":"negative_prompt","type":"STRING","widget":{"name":"negative_prompt"},"link":null},{"localized_name":"width","name":"width","type":"INT","widget":{"name":"width"},"link":null},{"localized_name":"height","name":"height","type":"INT","widget":{"name":"height"},"link":null}],"outputs":[{"localized_name":"image","name":"image","type":"IMAGE","links":[22]}],"properties":{"Node name for S&R":"VLLMOmniGenerateImage"},"widgets_values":["http://localhost:8000/v1","Qwen/Qwen-Image-Edit","Put this figure in a realistic mountain view","",512,512]},{"id":10,"type":"MarkdownNote","pos":[227.99306819575278,-231.24143306069843],"size":[240.3326478922474,136.3505820791881],"flags":{},"order":1,"mode":0,"inputs":[],"outputs":[],"title":"Note: Task and Input","properties":{},"widgets_values":["vLLM-Omni nodes are categorized based on the output modality. The \"Generate Image\" node supports both text-to-image generation or image-to-image generation (a.k.a. image editing). The node will route to the correct endpoint depending on whether an input image is present or not."],"color":"#432","bgcolor":"#000"},{"id":4,"type":"LoadImage","pos":[496.31859627609606,-229.71277089860084],"size":[270,314],"flags":{},"order":2,"mode":0,"inputs":[{"localized_name":"图像","name":"image","type":"COMBO","widget":{"name":"image"},"link":null},{"localized_name":"选择文件上传","name":"upload","type":"IMAGEUPLOAD","widget":{"name":"upload"},"link":null}],"outputs":[{"localized_name":"图像","name":"IMAGE","type":"IMAGE","links":[21]},{"localized_name":"遮罩","name":"MASK","type":"MASK","links":null}],"properties":{"Node name for S&R":"LoadImage"},"widgets_values":["example.png","image"]},{"id":8,"type":"VLLMOmniDiffusionSampling","pos":[478.59266934006774,183.67711984955648],"size":[284.205078125,226],"flags":{},"order":0,"mode":0,"inputs":[{"localized_name":"n","name":"n","type":"INT","widget":{"name":"n"},"link":null},{"localized_name":"num_inference_steps","name":"num_inference_steps","type":"INT","widget":{"name":"num_inference_steps"},"link":null},{"localized_name":"guidance_scale","name":"guidance_scale","type":"FLOAT","widget":{"name":"guidance_scale"},"link":null},{"localized_name":"true_cfg_scale","name":"true_cfg_scale","type":"FLOAT","widget":{"name":"true_cfg_scale"},"link":null},{"localized_name":"vae_use_slicing","name":"vae_use_slicing","type":"BOOLEAN","widget":{"name":"vae_use_slicing"},"link":null},{"localized_name":"vae_use_tiling","name":"vae_use_tiling","type":"BOOLEAN","widget":{"name":"vae_use_tiling"},"link":null},{"localized_name":"seed","name":"seed","type":"INT","widget":{"name":"seed"},"link":null}],"outputs":[{"localized_name":"diffusion sampling params","name":"diffusion sampling params","type":"SAMPLING_PARAMS","links":[23]}],"properties":{"Node name for S&R":"VLLMOmniDiffusionSampling"},"widgets_values":[4,50,1,1,false,false,42,"randomize"]}],"links":[[21,4,0,11,0,"IMAGE"],[22,11,0,3,0,"IMAGE"],[23,8,0,11,2,"SAMPLING_PARAMS"]],"groups":[{"id":1,"title":"Input","bounding":[213.1706010087147,-313.20095750667554,560.8246471437027,407.4246532472182],"color":"#3f789e","font_size":24,"flags":{}}],"config":{},"extra":{"workflowRendererVersion":"LG","ds":{"scale":1.1469075819486894,"offset":[-55.18047513099182,220.2553505195962]}},"version":0.4} diff --git a/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Multimodal Understanding.json b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Multimodal Understanding.json new file mode 100644 index 00000000000..4d32d5368c2 --- /dev/null +++ b/apps/ComfyUI-vLLM-Omni/example_workflows/vLLM-Omni Multimodal Understanding.json @@ -0,0 +1,761 @@ +{ + "id": "1c99f525-0a37-45ba-a28a-7df7c3af66b4", + "revision": 0, + "last_node_id": 12, + "last_link_id": 14, + "nodes": [ + { + "id": 1, + "type": "VLLMOmniUnderstanding", + "pos": [ + 1191.2177053682556, + 144.66829928181377 + ], + "size": [ + 400, + 268 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "shape": 7, + "type": "IMAGE", + "link": 1 + }, + { + "localized_name": "video", + "name": "video", + "shape": 7, + "type": "VIDEO", + "link": 2 + }, + { + "localized_name": "audio", + "name": "audio", + "shape": 7, + "type": "AUDIO", + "link": 3 + }, + { + "localized_name": "sampling_params", + "name": "sampling_params", + "shape": 7, + "type": "SAMPLING_PARAMS", + "link": 14 + }, + { + "localized_name": "url", + "name": "url", + "type": "STRING", + "widget": { + "name": "url" + }, + "link": null + }, + { + "localized_name": "model", + "name": "model", + "type": "STRING", + "widget": { + "name": "model" + }, + "link": null + }, + { + "localized_name": "prompt", + "name": "prompt", + "type": "STRING", + "widget": { + "name": "prompt" + }, + "link": null + }, + { + "localized_name": "output_text", + "name": "output_text", + "type": "BOOLEAN", + "widget": { + "name": "output_text" + }, + "link": null + }, + { + "localized_name": "output_audio", + "name": "output_audio", + "type": "BOOLEAN", + "widget": { + "name": "output_audio" + }, + "link": null + }, + { + "localized_name": "use_audio_in_video", + "name": "use_audio_in_video", + "type": "BOOLEAN", + "widget": { + "name": "use_audio_in_video" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "text_response", + "name": "text_response", + "type": "STRING", + "links": [ + 8 + ] + }, + { + "localized_name": "audio_response", + "name": "audio_response", + "type": "AUDIO", + "links": [ + 9 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniUnderstanding" + }, + "widgets_values": [ + "http://localhost:8000/v1", + "Qwen/Qwen2.5-Omni-7B", + "", + true, + true, + true + ] + }, + { + "id": 3, + "type": "LoadVideo", + "pos": [ + 729.5984141255855, + -198.631920454299 + ], + "size": [ + 282.798828125, + 233.0743408203125 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [ + { + "localized_name": "file", + "name": "file", + "type": "COMBO", + "widget": { + "name": "file" + }, + "link": null + }, + { + "localized_name": "choose file to upload", + "name": "upload", + "type": "IMAGEUPLOAD", + "widget": { + "name": "upload" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "VIDEO", + "name": "VIDEO", + "type": "VIDEO", + "links": [ + 2 + ] + } + ], + "properties": { + "Node name for S&R": "LoadVideo" + }, + "widgets_values": [ + "draw.mp4", + "image" + ] + }, + { + "id": 4, + "type": "LoadAudio", + "pos": [ + 729.8037086965753, + 99.86963519703949 + ], + "size": [ + 282.798828125, + 136 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [ + { + "localized_name": "audio", + "name": "audio", + "type": "COMBO", + "widget": { + "name": "audio" + }, + "link": null + }, + { + "localized_name": "audioUI", + "name": "audioUI", + "type": "AUDIO_UI", + "widget": { + "name": "audioUI" + }, + "link": null + }, + { + "localized_name": "choose file to upload", + "name": "upload", + "type": "AUDIOUPLOAD", + "widget": { + "name": "upload" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AUDIO", + "name": "AUDIO", + "type": "AUDIO", + "links": [ + 3 + ] + } + ], + "properties": { + "Node name for S&R": "LoadAudio" + }, + "widgets_values": [ + "Megan-Fox.mp3", + null, + null + ] + }, + { + "id": 5, + "type": "VLLMOmniARSampling", + "pos": [ + 510.3517536642828, + 658.073751009259 + ], + "size": [ + 270, + 178 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [ + { + "localized_name": "max_tokens", + "name": "max_tokens", + "type": "INT", + "widget": { + "name": "max_tokens" + }, + "link": null + }, + { + "localized_name": "temperature", + "name": "temperature", + "type": "FLOAT", + "widget": { + "name": "temperature" + }, + "link": null + }, + { + "localized_name": "top_p", + "name": "top_p", + "type": "FLOAT", + "widget": { + "name": "top_p" + }, + "link": null + }, + { + "localized_name": "repetition_penalty", + "name": "repetition_penalty", + "type": "FLOAT", + "widget": { + "name": "repetition_penalty" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AR sampling params", + "name": "AR sampling params", + "type": "SAMPLING_PARAMS", + "links": [ + 12 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniARSampling" + }, + "widgets_values": [ + 100, + 1, + 1, + 1, + -1, + "randomize" + ] + }, + { + "id": 7, + "type": "VLLMOmniARSampling", + "pos": [ + 503.33235181647115, + 419.34158016181806 + ], + "size": [ + 270, + 178 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [ + { + "localized_name": "max_tokens", + "name": "max_tokens", + "type": "INT", + "widget": { + "name": "max_tokens" + }, + "link": null + }, + { + "localized_name": "temperature", + "name": "temperature", + "type": "FLOAT", + "widget": { + "name": "temperature" + }, + "link": null + }, + { + "localized_name": "top_p", + "name": "top_p", + "type": "FLOAT", + "widget": { + "name": "top_p" + }, + "link": null + }, + { + "localized_name": "repetition_penalty", + "name": "repetition_penalty", + "type": "FLOAT", + "widget": { + "name": "repetition_penalty" + }, + "link": null + }, + { + "localized_name": "seed", + "name": "seed", + "type": "INT", + "widget": { + "name": "seed" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "AR sampling params", + "name": "AR sampling params", + "type": "SAMPLING_PARAMS", + "links": [ + 5, + 13 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniARSampling" + }, + "widgets_values": [ + 100, + 1, + 1, + 1, + -1, + "randomize" + ] + }, + { + "id": 8, + "type": "VLLMOmniSamplingParamsList", + "pos": [ + 820.6056617389042, + 426.38372037182273 + ], + "size": [ + 263.066015625, + 66 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "localized_name": "param1", + "name": "param1", + "type": "SAMPLING_PARAMS", + "link": 5 + }, + { + "localized_name": "param2", + "name": "param2", + "shape": 7, + "type": "SAMPLING_PARAMS", + "link": 12 + }, + { + "localized_name": "param3", + "name": "param3", + "shape": 7, + "type": "SAMPLING_PARAMS", + "link": 13 + } + ], + "outputs": [ + { + "localized_name": "param list", + "name": "param list", + "type": "SAMPLING_PARAMS", + "links": [ + 14 + ] + } + ], + "properties": { + "Node name for S&R": "VLLMOmniSamplingParamsList" + }, + "widgets_values": [] + }, + { + "id": 11, + "type": "MarkdownNote", + "pos": [ + 826.2328280438272, + 569.1890318701705 + ], + "size": [ + 333.8220435590464, + 261.63596728060656 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [], + "title": "Note: Sampling Parameters", + "properties": {}, + "widgets_values": [ + "## Sampling Parameter Types\n\nThere are two types of sampling parameters: one for autoregression and one for diffusion.\nYou should ensure that you have chosen the correct type of sampling parameters for the model you request.\n\n## Stages & Shorthand\n\nFor multi-stage models such as Qwen Omni, you can either\n- connect one sampling parameter node, which is applied to all stages.\n- connect exactly the same number of sampling parameter nodes to a \"Multi-Stage Sampling Parameter List\" node, then connect this node to the primary request node.\n\nNote that this shorthand is intended to stay consistent with the [online serving API](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/qwen2_5_omni/)" + ], + "color": "#432", + "bgcolor": "#000" + }, + { + "id": 12, + "type": "MarkdownNote", + "pos": [ + 378.9866207003777, + 152.59550252215752 + ], + "size": [ + 319.7287574247016, + 107.15904081906785 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [], + "title": "Note: Input", + "properties": {}, + "widgets_values": [ + "Note that not all models support every modality as input. For example, `ByteDance-Seed/BAGEL-7B-MoT` in Multimodality Understanding mode only support text and image input.\n\nYou should ensure that the input are supported by the model. You can check the corresponding [online serving documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/user_guide/examples/online_serving/bagel/) for confirmation." + ], + "color": "#432", + "bgcolor": "#000" + }, + { + "id": 2, + "type": "LoadImage", + "pos": [ + 394.4674804308822, + -207.6987397548834 + ], + "size": [ + 282.798828125, + 314 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [ + { + "localized_name": "image", + "name": "image", + "type": "COMBO", + "widget": { + "name": "image" + }, + "link": null + }, + { + "localized_name": "choose file to upload", + "name": "upload", + "type": "IMAGEUPLOAD", + "widget": { + "name": "upload" + }, + "link": null + } + ], + "outputs": [ + { + "localized_name": "IMAGE", + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 1 + ] + }, + { + "localized_name": "MASK", + "name": "MASK", + "type": "MASK", + "links": null + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "example.png", + "image" + ] + }, + { + "id": 10, + "type": "PreviewAudio", + "pos": [ + 1664.548345556043, + 297.5921292054054 + ], + "size": [ + 270, + 88 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "localized_name": "audio", + "name": "audio", + "type": "AUDIO", + "link": 9 + }, + { + "localized_name": "audioUI", + "name": "audioUI", + "type": "AUDIO_UI", + "widget": { + "name": "audioUI" + }, + "link": null + } + ], + "outputs": [], + "properties": { + "Node name for S&R": "PreviewAudio" + }, + "widgets_values": [] + }, + { + "id": 9, + "type": "ShowText|pysssss", + "pos": [ + 1649.2506875091847, + 66.22823888292349 + ], + "size": [ + 318.7188464232943, + 173.38502269972975 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "localized_name": "text", + "name": "text", + "type": "STRING", + "link": 8 + } + ], + "outputs": [ + { + "localized_name": "STRING", + "name": "STRING", + "shape": 6, + "type": "STRING", + "links": null + } + ], + "properties": { + "Node name for S&R": "ShowText|pysssss" + }, + "widgets_values": [] + } + ], + "links": [ + [ + 1, + 2, + 0, + 1, + 0, + "IMAGE" + ], + [ + 2, + 3, + 0, + 1, + 1, + "VIDEO" + ], + [ + 3, + 4, + 0, + 1, + 2, + "AUDIO" + ], + [ + 5, + 7, + 0, + 8, + 0, + "SAMPLING_PARAMS" + ], + [ + 8, + 1, + 0, + 9, + 0, + "STRING" + ], + [ + 9, + 1, + 1, + 10, + 0, + "AUDIO" + ], + [ + 12, + 5, + 0, + 8, + 1, + "SAMPLING_PARAMS" + ], + [ + 13, + 7, + 0, + 8, + 2, + "SAMPLING_PARAMS" + ], + [ + 14, + 8, + 0, + 1, + 3, + "SAMPLING_PARAMS" + ] + ], + "groups": [ + { + "id": 1, + "title": "Sampling Parameters", + "bounding": [ + 480.1649301181556, + 341.08402513937995, + 692.4113568972277, + 510.48648853403665 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + }, + { + "id": 2, + "title": "Input", + "bounding": [ + 344.2364817813528, + -287.5850484183313, + 704.8832238768634, + 559.5124009832894 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "workflowRendererVersion": "LG", + "ds": { + "scale": 0.9478575057427204, + "offset": [ + 33.81037136029557, + 307.1974296197726 + ] + } + }, + "version": 0.4 +} diff --git a/pyproject.toml b/pyproject.toml index 965a3e4b20c..4d6bc033317 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,8 @@ dev = [ "soundfile>=0.13.1", "imageio[ffmpeg]>=0.6.0", "opencv-python>=4.12.0.88", - "mooncake-transfer-engine==0.3.8.post1" + "mooncake-transfer-engine==0.3.8.post1", + "av" # for ComfyUI tests ] docs = [ diff --git a/tests/comfyui/conftest.py b/tests/comfyui/conftest.py new file mode 100644 index 00000000000..7c9770b8e98 --- /dev/null +++ b/tests/comfyui/conftest.py @@ -0,0 +1,82 @@ +""" +Conftest for ComfyUI-vLLM-Omni tests. + +This module sets up the test environment by: +1. Adding the ComfyUI plugin to Python path +2. Mocking comfy_api.input module (AudioInput, VideoInput) since comfyui is not installed +3. Mocking comfy_extras.nodes_audio module +""" + +import os +import sys +from typing import BinaryIO, TypedDict +from unittest.mock import MagicMock + + +def pytest_configure(config): + """ + Called after command line options have been parsed and before test collection. + This is the right place to set up sys.path and mock modules. + """ + _setup_comfyui_test_environment() + + +def _setup_comfyui_test_environment(): + """Set up the test environment for ComfyUI plugin testing.""" + # === Add ComfyUI plugin path to allow importing comfyui_vllm_omni === + _COMFYUI_PLUGIN_PATH = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "..", "apps", "ComfyUI-vLLM-Omni") + ) + if not os.path.isdir(_COMFYUI_PLUGIN_PATH): + raise FileNotFoundError( + f"ComfyUI plugin not found at {_COMFYUI_PLUGIN_PATH}. " + "If it is moved elsewhere, please update the path in this conftest.py." + ) + if _COMFYUI_PLUGIN_PATH not in sys.path: + sys.path.insert(0, _COMFYUI_PLUGIN_PATH) + + # Import torch after changing import paths. (To be used later) + import torch + + # === Mock ComfyUI internal modules (comfy_api & comfy_extras) and "import" them to sys.module === + class AudioInput(TypedDict): + """Mock AudioInput TypedDict from comfy_api.input""" + + waveform: torch.Tensor # Shape: (B, C, T) + sample_rate: int + + class VideoInput: + """Mock VideoInput class from comfy_api.input""" + + def __init__(self, data: bytes = b"mock_video_data"): + self._data = data + + def save_to(self, file: str | BinaryIO): + """Save video data to file or file-like object.""" + if isinstance(file, str): + print("Called VideoInput.save_to with file path. Saving to a path is no-op in tests.") + else: + file.write(self._data) + + mock_comfy_api = MagicMock() + mock_comfy_api_input = MagicMock() + mock_comfy_api_input.AudioInput = AudioInput + mock_comfy_api_input.VideoInput = VideoInput + mock_comfy_api.input = mock_comfy_api_input + + def mock_load(_: str | BinaryIO): + """Mock nodes_audio.load that returns a waveform tensor (channels, samples) and sample rate.""" + waveform = torch.zeros((1, 24000), dtype=torch.float32) + sample_rate = 24000 + return waveform, sample_rate + + mock_comfy_extras = MagicMock() + mock_nodes_audio = MagicMock() + mock_nodes_audio.load = mock_load + mock_comfy_extras.nodes_audio = mock_nodes_audio + + # Install mock modules BEFORE importing any comfyui_vllm_omni code + sys.modules["comfy_api"] = mock_comfy_api + sys.modules["comfy_api.input"] = mock_comfy_api_input + sys.modules["comfy_extras"] = mock_comfy_extras + sys.modules["comfy_extras.nodes_audio"] = mock_nodes_audio diff --git a/tests/comfyui/test_comfyui_integration.py b/tests/comfyui/test_comfyui_integration.py new file mode 100644 index 00000000000..46359632609 --- /dev/null +++ b/tests/comfyui/test_comfyui_integration.py @@ -0,0 +1,567 @@ +""" +Integration tests for ComfyUI nodes that use the Omni API client, with a mocked AsyncOmni and a real API server running in a background process. +These tests cover the integration between ComfyUI node and the API server, without actual model inference logic. +It ensures that +1. Changes made to the API (e.g., request and response formats) do not break the ComfyUI frontend that use it. +2. The sampling parameters are correctly passed from the node to AsyncOmni through the API layer. +""" + +import multiprocessing +import time +import traceback +from collections.abc import Iterable, Sequence +from enum import StrEnum, auto +from typing import Any, NamedTuple +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +import requests +import torch +from comfy_api.input import AudioInput, VideoInput +from comfyui_vllm_omni.nodes import ( + VLLMOmniGenerateImage, + VLLMOmniTTS, + VLLMOmniUnderstanding, + VLLMOmniVoiceClone, +) +from comfyui_vllm_omni.utils.types import AutoregressionSamplingParams, DiffusionSamplingParams +from PIL import Image +from vllm import SamplingParams +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.utils.argparse_utils import FlexibleArgumentParser + +from vllm_omni.entrypoints.cli.serve import OmniServeCommand +from vllm_omni.inputs.data import OmniSamplingParams +from vllm_omni.outputs import OmniRequestOutput + + +class ServerCase(NamedTuple): + """Parametrizing the model to serve.""" + + served_model: str + stage_list: list + stage_configs: list[dict] + outputs: list[OmniRequestOutput] + + +class SamplingCase(NamedTuple): + """Parametrizing the input sampling parameters.""" + + kind: "SamplingKind" + sampling_params: dict | list[dict] | None + + +class SamplingKind(StrEnum): + IMAGE_NONE = auto() + IMAGE_DIFFUSION_SINGLE = auto() + UNDERSTANDING_NONE = auto() + UNDERSTANDING_AR_LIST = auto() + TTS_NONE = auto() + TTS_DIFFUSION_SINGLE = auto() + + +# Pre-defined arguments to be used in function calls during the tests +IMAGE_WIDTH = 64 +IMAGE_HEIGHT = 64 +DIFFUSION_SINGLE_SAMPLING_PARAMS = DiffusionSamplingParams( + { + "n": 2, + "num_inference_steps": 30, + "guidance_scale": 6.0, + "true_cfg_scale": 1.5, + } +) + +AR_LIST_SAMPLING_PARAMS = [ + AutoregressionSamplingParams( + { + "max_tokens": 64, + "temperature": 0.6, + "top_p": 0.9, + "repetition_penalty": 1.0, + "seed": 21, + } + ), + AutoregressionSamplingParams( + { + "max_tokens": 96, + "temperature": 0.75, + "top_p": 0.85, + "repetition_penalty": 1.05, + "seed": 22, + } + ), + AutoregressionSamplingParams( + { + "max_tokens": 128, + "temperature": 0.8, + "top_p": 0.8, + "repetition_penalty": 1.1, + "seed": 23, + } + ), +] + + +def _build_image_output(size: tuple[int, int] = (IMAGE_WIDTH, IMAGE_HEIGHT), color: str = "red") -> Image.Image: + return Image.new("RGB", size, color=color) + + +def _build_text_output(text: str = "This is a test response.") -> OmniRequestOutput: + completion_output = CompletionOutput( + index=0, + text=text, + token_ids=[1, 2, 3], + cumulative_logprob=0.0, + logprobs=None, + finish_reason="stop", + stop_reason=None, + ) + request_output = RequestOutput( + request_id="test_req_text", + prompt="test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + ) + return OmniRequestOutput( + request_id="test_req_text", + finished=True, + final_output_type="text", + request_output=request_output, + ) + + +def _build_audio_chat_output(num_samples: int = 24000) -> OmniRequestOutput: + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=0.0, + logprobs=None, + finish_reason="stop", + stop_reason=None, + ) + completion_output.multimodal_output = {"audio": [torch.zeros(1, num_samples)]} + request_output = RequestOutput( + request_id="test_req_audio_chat", + prompt="test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + ) + return OmniRequestOutput( + request_id="test_req_audio_chat", + finished=True, + final_output_type="audio", + request_output=request_output, + ) + + +def _build_audio_speech_output(num_samples: int = 24000) -> OmniRequestOutput: + return OmniRequestOutput.from_diffusion( + request_id="test_req_audio_speech", + images=[], + multimodal_output={"audio": torch.zeros(num_samples), "sr": 24000}, + final_output_type="audio", + ) + + +def _build_diffusion_image_output_for_images_endpoint() -> OmniRequestOutput: + return OmniRequestOutput.from_diffusion( + request_id="test_req_img_dalle", + images=[_build_image_output()], + final_output_type="image", + ) + + +def _build_diffusion_image_output_for_chat_endpoint() -> OmniRequestOutput: + request_output = MagicMock() + request_output.images = [_build_image_output(color="blue")] + request_output.finished = True + return OmniRequestOutput( + request_id="test_req_img_chat", + finished=True, + final_output_type="image", + request_output=request_output, + ) + + +def _assert_sampling_param_values(received: OmniSamplingParams, expected: dict[str, Any]): + for key, expected_value in expected.items(): + actual_value = getattr(received, key, None) + assert actual_value == expected_value, ( + f"Expected sampling param '{key}'={expected_value}, got {actual_value}. The received sampling params: {received}" + ) + + +def _build_mock_outputs(outputs: Iterable[OmniRequestOutput], sampling_case: SamplingCase, server_case: ServerCase): + async def _mock_generate(*args, **kwargs): + received_sampling_params_list: Sequence[OmniSamplingParams] | None = ( + args[2] if len(args) > 2 else kwargs.get("sampling_params_list") + ) + + assert received_sampling_params_list is not None, ( + "In the current codebase, the API layer always provides not-None sampling parameter list when calling `AsyncOmni.generate`" + "This test also uses this assumption for now." + "If this assertion fails, it means the API layer has changed and this test needs to be updated accordingly." + "It does not necessarily mean there is a bug, because `AsyncOmni.generate` does allow sampling_params_list to be None." + ) + assert isinstance(received_sampling_params_list, Sequence), "sampling_params_list should be a Sequence" + + if sampling_case.kind is SamplingKind.IMAGE_NONE: + assert len(received_sampling_params_list) == 1 + _assert_sampling_param_values( + received_sampling_params_list[0], + { + "width": IMAGE_WIDTH, + "height": IMAGE_HEIGHT, + }, + ) + elif sampling_case.kind is SamplingKind.IMAGE_DIFFUSION_SINGLE: + assert len(received_sampling_params_list) == 1 + expected = DIFFUSION_SINGLE_SAMPLING_PARAMS.copy() + expected["num_outputs_per_prompt"] = expected.pop("n") # convert from n to num_outputs_per_prompt + _assert_sampling_param_values( + received_sampling_params_list[0], + { + "width": IMAGE_WIDTH, + "height": IMAGE_HEIGHT, + **expected, + }, + ) + elif sampling_case.kind is SamplingKind.UNDERSTANDING_NONE: + assert len(received_sampling_params_list) == 3 + elif sampling_case.kind is SamplingKind.UNDERSTANDING_AR_LIST: + assert len(received_sampling_params_list) == 3 + for i, expected in enumerate(AR_LIST_SAMPLING_PARAMS): + _assert_sampling_param_values(received_sampling_params_list[i], expected) + elif sampling_case.kind in {SamplingKind.TTS_NONE, SamplingKind.TTS_DIFFUSION_SINGLE}: + assert len(received_sampling_params_list) == 1 + else: + raise AssertionError(f"Unknown sampling case: {sampling_case.kind}") + + for output in outputs: + yield output + + return _mock_generate + + +@pytest.fixture +def server_case(request) -> ServerCase: + return request.param + + +@pytest.fixture +def sampling_case(request) -> SamplingCase: + return request.param + + +@pytest.fixture +def mock_async_omni(server_case: ServerCase, sampling_case: SamplingCase): + async def _mock_preprocess_chat(self, *args, **kwargs): + return ([{"role": "user", "content": "test"}], [{"prompt": "test prompt"}]) + + # Need to mock AsyncOmni itself (not only its generate method) because + # 1. The API layer uses its stage_list and stage_configs attributes + # 2. Its __init__ method has slow side effects (model & config loading). + with ( + patch("vllm_omni.entrypoints.openai.api_server.AsyncOmni") as MockAsyncOmni, + patch( + "vllm_omni.entrypoints.openai.serving_chat.OmniOpenAIServingChat._preprocess_chat", + new=_mock_preprocess_chat, + ), + ): + mock_instance = AsyncMock() + mock_instance.generate = _build_mock_outputs(server_case.outputs, sampling_case, server_case) + + mock_instance.stage_list = server_case.stage_list + mock_instance.stage_configs = server_case.stage_configs + mock_instance.default_sampling_params_list = [ + SamplingParams() if stage.get("stage_type") != "diffusion" else MagicMock() + for stage in server_case.stage_configs + ] + mock_instance.errored = False + mock_instance.dead_error = RuntimeError("Mock engine error") + mock_instance.model_config = MagicMock(max_model_len=4096, io_processor_plugin=None) + mock_instance.io_processor = MagicMock() + mock_instance.input_processor = MagicMock() + mock_instance.shutdown = MagicMock() + mock_instance.get_vllm_config = AsyncMock(return_value=None) + mock_instance.get_supported_tasks = AsyncMock(return_value=["generate"]) + mock_instance.get_tokenizer = AsyncMock(return_value=None) + + MockAsyncOmni.return_value = mock_instance + yield MockAsyncOmni + + +@pytest.fixture +def api_server(unused_tcp_port_factory, server_case: ServerCase, mock_async_omni): + """Set up a API server in background process from command line with parametrized model name and mocked AsyncOmni.""" + parser = FlexibleArgumentParser() + subparsers = parser.add_subparsers(dest="command") + cmd = OmniServeCommand() + cmd.subparser_init(subparsers) + + port = unused_tcp_port_factory() + args = parser.parse_args(["serve", server_case.served_model, "--omni", "--port", str(port)]) + + def run_server(): + try: + cmd.cmd(args) + except Exception: + traceback.print_exc() + + server_process = multiprocessing.Process(target=run_server) + server_process.start() + + # Wait for the server to be ready by polling the health endpoint. + wait_time = 30 + wait_poll_interval = 1 + for _ in range(wait_time // wait_poll_interval): + try: + response = requests.get(f"http://127.0.0.1:{port}/health") + if response.status_code == 200: + break + except requests.ConnectionError: + time.sleep(wait_poll_interval) + else: + if server_process.is_alive(): + server_process.terminate() + server_process.join(timeout=5) + if server_process.is_alive(): + server_process.kill() + server_process.join(timeout=5) + pytest.fail(f"API server failed to start within {wait_time} seconds") + + yield f"http://127.0.0.1:{port}/v1" + + if server_process.is_alive(): + server_process.terminate() + server_process.join(timeout=10) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "server_case,model,image_input", + [ + pytest.param( + ServerCase( + served_model="Tongyi-MAI/Z-Image-Turbo", + stage_list=["diffusion"], + stage_configs=[{"stage_type": "diffusion"}], + outputs=[_build_diffusion_image_output_for_images_endpoint()], + ), + "Tongyi-MAI/Z-Image-Turbo", + False, + id="text-to-image-dalle-endpoint", + ), + pytest.param( + ServerCase( + served_model="ByteDance-Seed/BAGEL-7B-MoT", + stage_list=["diffusion"], + stage_configs=[{"stage_type": "diffusion"}], + outputs=[_build_diffusion_image_output_for_chat_endpoint()], + ), + "ByteDance-Seed/BAGEL-7B-MoT", + False, + id="text-to-image-bagel-chat-endpoint", + ), + pytest.param( + ServerCase( + served_model="Qwen/Qwen-Image-Edit", + stage_list=["diffusion"], + stage_configs=[{"stage_type": "diffusion"}], + outputs=[_build_diffusion_image_output_for_images_endpoint()], + ), + "Qwen/Qwen-Image-Edit", + True, + id="image-to-image-dalle-endpoint", + ), + pytest.param( + ServerCase( + served_model="ByteDance-Seed/BAGEL-7B-MoT", + stage_list=["diffusion"], + stage_configs=[{"stage_type": "diffusion"}], + outputs=[_build_diffusion_image_output_for_chat_endpoint()], + ), + "ByteDance-Seed/BAGEL-7B-MoT", + True, + id="image-to-image-bagel-chat-endpoint", + ), + ], + indirect=["server_case"], +) +@pytest.mark.parametrize( + "sampling_case", + [ + pytest.param(SamplingCase(kind=SamplingKind.IMAGE_NONE, sampling_params=None), id="no-sampling-params"), + pytest.param( + SamplingCase(kind=SamplingKind.IMAGE_DIFFUSION_SINGLE, sampling_params=DIFFUSION_SINGLE_SAMPLING_PARAMS), + id="single-diffusion-sampling-params", + ), + ], + indirect=["sampling_case"], +) +async def test_image_generation_node(api_server: str, model: str, image_input: bool, sampling_case: SamplingCase): + node = VLLMOmniGenerateImage() + + kwargs = { + "url": api_server, + "model": model, + "prompt": "A beautiful sunset", + "width": IMAGE_WIDTH, + "height": IMAGE_HEIGHT, + } + if image_input: + kwargs["image"] = torch.zeros((1, IMAGE_WIDTH, IMAGE_HEIGHT, 3), dtype=torch.float32) + if sampling_case.sampling_params is not None: + kwargs["sampling_params"] = sampling_case.sampling_params + print(f"!!!!!! Calling {model} node.generate with kwargs: {sampling_case.sampling_params}") + + result = await node.generate(**kwargs) + + assert isinstance(result, tuple) + assert len(result) == 1 + assert isinstance(result[0], torch.Tensor) + assert result[0].shape == (1, IMAGE_WIDTH, IMAGE_HEIGHT, 3) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "server_case", + [ + pytest.param( + ServerCase( + served_model="Qwen/Qwen2.5-Omni-7B", + stage_list=[ + MagicMock(is_comprehension=True, model_stage="llm"), + MagicMock(is_comprehension=False, model_stage="llm"), + MagicMock(is_comprehension=False, model_stage="llm"), + ], + stage_configs=[ + {"stage_type": "llm"}, + {"stage_type": "llm"}, + {"stage_type": "llm"}, + ], + outputs=[_build_audio_chat_output(), _build_text_output("Understanding response")], + ), + id="multimodal-understanding", + ) + ], + indirect=["server_case"], +) +@pytest.mark.parametrize( + "sampling_case", + [ + pytest.param(SamplingCase(kind=SamplingKind.UNDERSTANDING_NONE, sampling_params=None), id="no-sampling-params"), + pytest.param( + SamplingCase(kind=SamplingKind.UNDERSTANDING_AR_LIST, sampling_params=AR_LIST_SAMPLING_PARAMS), + id="ar-sampling-params-list", + ), + ], + indirect=["sampling_case"], +) +async def test_understanding_node(api_server: str, sampling_case: SamplingCase): + node = VLLMOmniUnderstanding() + + image = torch.zeros((1, IMAGE_WIDTH, IMAGE_HEIGHT, 3), dtype=torch.float32) + video = VideoInput(b"mock_video_for_test") # type: ignore[reportAbstractUsage] + audio: AudioInput = {"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000} + + text_response, audio_response = await node.generate( + url=api_server, + model="Qwen/Qwen2.5-Omni-7B", + prompt="Describe all modalities.", + image=image, + audio=audio, + video=video, + sampling_params=sampling_case.sampling_params, + output_text=True, + output_audio=True, + use_audio_in_video=True, + ) + + assert text_response == "Understanding response" + assert isinstance(audio_response, dict) + assert audio_response["sample_rate"] == 24000 + assert isinstance(audio_response["waveform"], torch.Tensor) + assert audio_response["waveform"].shape == (1, 1, 24000) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "server_case,node_cls,call_kwargs", + [ + pytest.param( + ServerCase( + served_model="Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + stage_list=["llm"], + stage_configs=[{"stage_type": "llm"}], + outputs=[_build_audio_speech_output()], + ), + VLLMOmniTTS, + { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice", + "input": "Hello from TTS test", + "voice": "Vivian", + "response_format": "wav", + "speed": 1.0, + "model_specific_params": None, + }, + id="tts", + ), + pytest.param( + ServerCase( + served_model="Qwen/Qwen3-TTS-12Hz-1.7B-Base", + stage_list=["llm"], + stage_configs=[{"stage_type": "llm"}], + outputs=[_build_audio_speech_output()], + ), + VLLMOmniVoiceClone, + { + "model": "Qwen/Qwen3-TTS-12Hz-1.7B-Base", + "input": "Hello from voice clone test", + "voice": "Vivian", + "response_format": "wav", + "speed": 1.0, + "ref_audio": {"waveform": torch.zeros((1, 1, 24000), dtype=torch.float32), "sample_rate": 24000}, + "ref_text": "Reference transcript", + "x_vector_only_mode": False, + "model_specific_params": None, + }, + id="tts-voice-clone", + ), + ], + indirect=["server_case"], +) +@pytest.mark.parametrize( + "sampling_case", + [ + pytest.param(SamplingCase(kind=SamplingKind.TTS_NONE, sampling_params=None), id="no-sampling-params"), + pytest.param( + SamplingCase(kind=SamplingKind.TTS_DIFFUSION_SINGLE, sampling_params=DIFFUSION_SINGLE_SAMPLING_PARAMS), + id="single-diffusion-sampling-params", + ), + ], + indirect=["sampling_case"], +) +async def test_tts_nodes(api_server: str, node_cls, call_kwargs: dict, sampling_case: SamplingCase): + node = node_cls() + actual_kwargs = dict(call_kwargs) + if sampling_case.sampling_params is not None: + actual_kwargs["model_specific_params"] = sampling_case.sampling_params + result = await node.generate(url=api_server, **actual_kwargs) + + assert isinstance(result, tuple) + assert len(result) == 1 + assert isinstance(result[0], dict) + assert result[0]["sample_rate"] == 24000 + assert isinstance(result[0]["waveform"], torch.Tensor) + assert result[0]["waveform"].shape == (1, 1, 24000)