diff --git a/examples/offline_inference/ming_flash_omni/README.md b/examples/offline_inference/ming_flash_omni/README.md new file mode 100644 index 00000000000..7414163fc01 --- /dev/null +++ b/examples/offline_inference/ming_flash_omni/README.md @@ -0,0 +1,76 @@ +# Ming-flash-omni 2.0 + +[Ming-flash-omni-2.0](https://github.com/inclusionAI/Ming) is an omni-modal model supporting text, image, video, and audio understanding, with outputs in text, image, and audio. For now, Ming-flash-omni-2.0 in vLLM-Omni is supported with thinker stage (multi-modal understanding). + +## Setup + +Please refer to the [stage configuration documentation](https://docs.vllm.ai/projects/vllm-omni/en/latest/configuration/stage_configs/) to configure memory allocation appropriately for your hardware setup. + +## Run examples + +### Text-only +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type text +``` + +#### Reasoning (Thinking Mode) + +Reasoning (Thinking) mode is enabled via applying "detailed thinking on" when building the system prompt template (in `apply_chat_template`). + +In the end2end example, a default problem for thinking mode is provided, as referred to the example usage of Ming's cookbook; +To utilize it, you have to download the example figure from https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png + +```bash +python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png +``` + +### Image understanding +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image + +# With a local image +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image --image-path /path/to/image.jpg +``` + +### Audio understanding +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio + +# With a local audio file +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio --audio-path /path/to/audio.wav +``` + +### Video understanding +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video + +# With a local video and custom frame count +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_video --video-path /path/to/video.mp4 --num-frames 16 +``` + +### Mixed modalities (image + audio) +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_mixed_modalities \ + --image-path /path/to/image.jpg \ + --audio-path /path/to/audio.wav +``` + +If media file paths are not provided, the script uses built-in default assets. + +### Modality control +To control output modalities (e.g. text-only output): +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_audio --modalities text +``` + +*For now, only text output is supported* + +### Custom stage config +```bash +python examples/offline_inference/ming_flash_omni/end2end.py --query-type use_image \ + --stage-configs-path /path/to/your_config.yaml +``` + +## Online serving + +For online serving via the OpenAI-compatible API, see [examples/online_serving/ming_flash_omni/README.md](../../online_serving/ming_flash_omni/README.md). diff --git a/examples/offline_inference/ming_flash_omni/end2end.py b/examples/offline_inference/ming_flash_omni/end2end.py new file mode 100644 index 00000000000..49cdbcc0186 --- /dev/null +++ b/examples/offline_inference/ming_flash_omni/end2end.py @@ -0,0 +1,485 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# Partial example cases are referred from +# https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/cookbook.ipynb +import os +import time +from typing import NamedTuple + +import librosa +import numpy as np +import vllm +from PIL import Image +from transformers import AutoProcessor +from vllm import SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset, video_to_ndarrays +from vllm.multimodal.image import convert_image_mode +from vllm.utils.argparse_utils import FlexibleArgumentParser + +import vllm_omni +from vllm_omni.entrypoints.omni import Omni + +# Imports the processor also registers itself +from vllm_omni.transformers_utils.processors.ming import MingFlashOmniProcessor # noqa: F401 + +SEED = 42 +MODEL_NAME = "Jonathan1909/Ming-flash-omni-2.0" + + +class QueryResult(NamedTuple): + inputs: dict + limit_mm_per_prompt: dict[str, int] + + +def get_text_query(processor: MingFlashOmniProcessor, question: str | None = None) -> QueryResult: + if question is None: + question = "请详细介绍鹦鹉的生活习性。" + conversation = [{"role": "HUMAN", "content": question}] + prompt = processor.apply_chat_template(conversation, tokenize=False) + return QueryResult( + inputs={"prompt": prompt}, + limit_mm_per_prompt={}, + ) + + +def get_image_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + image_path: str | None = None, +) -> QueryResult: + if question is None: + question = "Describe this image in detail." + + if image_path: + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + image_data = convert_image_mode(Image.open(image_path), "RGB") + else: + image_data = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "image", "image": image_data}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"image": image_data}, + }, + limit_mm_per_prompt={"image": 1}, + ) + + +def get_audio_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + audio_path: str | None = None, + sampling_rate: int = 16000, +) -> QueryResult: + if question is None: + question = "Please recognize the language of this speech and transcribe it. Format: oral." + + if audio_path: + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + audio_signal, sr = librosa.load(audio_path, sr=sampling_rate) + audio_data = (audio_signal.astype(np.float32), sr) + else: + audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate + + # Use a string for "audio" so the processor counts it as 1 audio input + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "audio", "audio": "input"}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"audio": audio_data}, + }, + limit_mm_per_prompt={"audio": 1}, + ) + + +def get_video_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + video_path: str | None = None, + num_frames: int = 16, +) -> QueryResult: + if question is None: + question = "Describe what is happening in this video." + + if video_path: + if not os.path.exists(video_path): + raise FileNotFoundError(f"Video file not found: {video_path}") + video_frames = video_to_ndarrays(video_path, num_frames=num_frames) + else: + video_frames = VideoAsset(name="baby_reading", num_frames=num_frames).np_ndarrays + + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "video"}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"video": video_frames}, + }, + limit_mm_per_prompt={"video": 1}, + ) + + +def get_mixed_modalities_query( + processor: MingFlashOmniProcessor, + image_path: str | None = None, + audio_path: str | None = None, + sampling_rate: int = 16000, +) -> QueryResult: + """Mixed image + audio understanding.""" + question = "Describe the image, and recognize the language of this speech and transcribe it. Format: oral" + + if image_path: + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + image_data = convert_image_mode(Image.open(image_path), "RGB") + else: + image_data = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + + if audio_path: + if not os.path.exists(audio_path): + raise FileNotFoundError(f"Audio file not found: {audio_path}") + sig, sr = librosa.load(audio_path, sr=sampling_rate) + audio_data = (sig.astype(np.float32), sr) + else: + audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate + + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "image", "image": image_data}, + {"type": "audio", "audio": "input"}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False) + + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"image": image_data, "audio": audio_data}, + }, + limit_mm_per_prompt={"image": 1, "audio": 1}, + ) + + +def get_reasoning_query( + processor: MingFlashOmniProcessor, + question: str | None = None, + image_path: str | None = None, +) -> QueryResult: + if question is None: + # NOTE: To use the following default question, input with example figure provided by Ming + # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/figures/cases/3_0.png + # E.g., + # python examples/offline_inference/ming_flash_omni/end2end.py -q reasoning --image-path ./3_0.png + # Otherwise, the problem solving might be false. + question = ( + "Based on the following rules:\n•\tYou control the smiley face character\n" + "•\tYou can move up, down, left, and right, and only a single square at a time\n" + "•\tWalls are dark grey and cannot be moved into\n•\tThe brown square is a box\n•" + "\tThe box can be pushed by moving into it (i.e., if you are in the square " + "adjacent to the box to the left, and move onto the square with the box, " + "the box will move one square to the right).\n" + "•\tThe box cannot be pushed into walls\n" + "•\tThe blue door at the bottom is locked and cannot be passed through, " + "unless the box is placed on the blue square\n" + "•\tThe square beneath the blue door is the exit\n" + "•\tMoving from one square to another\n\n" + "Let's assume a coordinate system where the smiley face is " + "on the top left at (1,1) and the square below it is (1,2). " + "The smiley face performs the following moves: {down, right, right, right}, " + "such that the smiley face is at square (4,2) and the box is in square (5,2). " + "What are the next sequence of moves that must be done to move the box down to (5,3)? " + "Give your answer as a comma separated list." + ) + + if image_path: + if not os.path.exists(image_path): + raise FileNotFoundError(f"Image file not found: {image_path}") + image_data = convert_image_mode(Image.open(image_path), "RGB") + conversation = [ + { + "role": "HUMAN", + "content": [ + {"type": "image", "image": image_data}, + {"type": "text", "text": question}, + ], + } + ] + prompt = processor.apply_chat_template(conversation, tokenize=False, use_cot_system_prompt=True) + return QueryResult( + inputs={ + "prompt": prompt, + "multi_modal_data": {"image": image_data}, + }, + limit_mm_per_prompt={"image": 1}, + ) + + conversation = [{"role": "HUMAN", "content": question}] + prompt = processor.apply_chat_template(conversation, tokenize=False, use_cot_system_prompt=True) + return QueryResult( + inputs={"prompt": prompt}, + limit_mm_per_prompt={}, + ) + + +query_map = { + "text": get_text_query, + "use_audio": get_audio_query, + "use_image": get_image_query, + "use_video": get_video_query, + "use_mixed_modalities": get_mixed_modalities_query, + "reasoning": get_reasoning_query, +} + + +def main(args): + print( + "=" * 20, + "\n", + f"vllm version: {vllm.__version__}\n", + f"vllm-omni version: {vllm_omni.__version__}\n", + "=" * 20, + sep="", + ) + + processor = AutoProcessor.from_pretrained(MODEL_NAME, trust_remote_code=True) + assert isinstance(processor, MingFlashOmniProcessor), f"Wrong processor type being used: {type(processor)}" + + query_func = query_map[args.query_type] + if args.query_type == "use_image": + query_result = query_func(processor, image_path=args.image_path) + elif args.query_type == "use_audio": + query_result = query_func(processor, audio_path=args.audio_path, sampling_rate=args.sampling_rate) + elif args.query_type == "use_video": + query_result = query_func(processor, video_path=args.video_path, num_frames=args.num_frames) + elif args.query_type == "use_mixed_modalities": + query_result = query_func( + processor, + image_path=args.image_path, + audio_path=args.audio_path, + sampling_rate=args.sampling_rate, + ) + elif args.query_type == "reasoning": + query_result = query_func(processor, image_path=args.image_path) + else: + query_result = query_func(processor) + + # Initialize Omni (with thinker-only stage config) + omni = Omni( + model=MODEL_NAME, + stage_configs_path=args.stage_configs_path, + log_stats=args.log_stats, + init_timeout=args.init_timeout, + stage_init_timeout=args.stage_init_timeout, + ) + + # Thinker sampling params + thinker_sampling_params = SamplingParams( + temperature=0.4, + top_p=0.9, + max_tokens=args.max_tokens, + repetition_penalty=1.05, + seed=SEED, + detokenize=True, + ) + sampling_params_list = [thinker_sampling_params] + + prompts = [query_result.inputs for _ in range(args.num_prompts)] + + if args.modalities is not None: + output_modalities = args.modalities.split(",") + for prompt in prompts: + prompt["modalities"] = output_modalities + + total_requests = len(prompts) + processed_count = 0 + print(f"Query type: {args.query_type}") + print(f"Number of prompts: {total_requests}") + + output_dir = args.output_dir + os.makedirs(output_dir, exist_ok=True) + + profiler_enabled = args.enable_profiler + if profiler_enabled: + omni.start_profile(stages=args.profiler_stages) + + for stage_outputs in omni.generate(prompts, sampling_params_list): + output = stage_outputs.request_output + if stage_outputs.final_output_type == "text": + request_id = output.request_id + text_output = output.outputs[0].text + lines = [] + lines.append("Prompt:\n") + lines.append(str(output.prompt) + "\n") + lines.append("Text Output:\n") + lines.append(str(text_output).strip() + "\n") + print(*lines, sep="") + + # Save to file + out_txt = os.path.join(output_dir, f"{request_id}.txt") + try: + with open(out_txt, "w", encoding="utf-8") as f: + f.writelines(lines) + print(f"Request ID: {request_id}, text saved to {out_txt}") + except Exception as e: + print(f"Failed to write output file {out_txt}: {e}") + + elif stage_outputs.final_output_type == "audio": + raise NotImplementedError("Add audio example after talker supported.") + + processed_count += 1 + if profiler_enabled and processed_count >= total_requests: + print(f"[Info] Processed {processed_count}/{total_requests}. Stopping profiler inside active loop...") + # Stop the profiler while workers are still alive + omni.stop_profile(stages=args.profiler_stages) + + print("[Info] Waiting 30s for workers to write trace files to disk...") + time.sleep(30) + print("[Info] Trace export wait time finished.") + + omni.close() + + +def parse_args(): + parser = FlexibleArgumentParser(description="Ming-flash-omni 2.0 offline inference example") + parser.add_argument( + "--query-type", + "-q", + type=str, + default="text", + choices=query_map.keys(), + help="Query type.", + ) + parser.add_argument( + "--stage-configs-path", + type=str, + default=None, + help="Path to a stage configs YAML file.", + ) + parser.add_argument( + "--log-stats", + action="store_true", + default=False, + help="Enable detailed statistics logging.", + ) + parser.add_argument("--init-timeout", type=int, default=2000, help="Timeout for initializing in seconds.") + parser.add_argument( + "--stage-init-timeout", + type=int, + default=2000, + help="Timeout for initializing a single stage in seconds.", + ) + parser.add_argument( + "--enable-profiler", + action="store_true", + default=False, + help="Enables profiling when set.", + ) + parser.add_argument( + "--profiler-stages", + type=int, + nargs="*", + default=[0], + help="List of stage IDs to profile. If not set, profiles all stages.", + ) + parser.add_argument( + "--image-path", + "-i", + type=str, + default=None, + help="Path to local image file. Uses default asset if not provided.", + ) + parser.add_argument( + "--audio-path", + "-a", + type=str, + default=None, + help="Path to local audio file. Uses default asset if not provided.", + ) + parser.add_argument( + "--video-path", + "-v", + type=str, + default=None, + help="Path to local video file. Uses default asset if not provided.", + ) + parser.add_argument( + "--num-frames", + type=int, + default=16, + help="Number of frames to extract from video.", + ) + parser.add_argument( + "--sampling-rate", + type=int, + default=16000, + help="Sampling rate for audio loading.", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=16384, + help="Maximum tokens to generate.", + ) + parser.add_argument( + "--num-prompts", + type=int, + default=1, + help="Number of prompts to generate.", + ) + parser.add_argument( + "--modalities", + type=str, + default=None, + help="Output modalities (comma-separated).", + ) + parser.add_argument( + "--output-dir", + type=str, + default="output_ming", + help="Output directory for results.", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/online_serving/ming_flash_omni/README.md b/examples/online_serving/ming_flash_omni/README.md new file mode 100644 index 00000000000..502232725c2 --- /dev/null +++ b/examples/online_serving/ming_flash_omni/README.md @@ -0,0 +1,204 @@ +# Ming-flash-omni 2.0 + +## Installation + +Please refer to [README.md](../../../README.md) + +## Run examples (Ming-flash-omni 2.0) + +### Launch the Server + +```bash +vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 +``` + +If you have custom stage configs file, launch the server with command below +```bash +vllm serve Jonathan1909/Ming-flash-omni-2.0 --omni --port 8091 --stage-configs-path /path/to/stage_configs_file +``` + +### Send Multi-modal Request + +#### Send request via python + +```bash +python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py --model Jonathan1909/Ming-flash-omni-2.0 --query-type use_mixed_modalities --port 8091 --host "localhost" --modalities text +``` + +The Python client supports the following command-line arguments: + +- `--query-type` (or `-q`): Query type. Options: `text`, `use_audio`, `use_image`, `use_video`, `use_mixed_modalities` +- `--video-path` (or `-v`): Path to local video file or URL. If not provided and query-type uses video, uses default video URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs. Example: `--video-path /path/to/video.mp4` or `--video-path https://example.com/video.mp4` +- `--image-path` (or `-i`): Path to local image file or URL. If not provided and query-type uses image, uses default image URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common image formats: JPEG, PNG, GIF, WebP. Example: `--image-path /path/to/image.jpg` or `--image-path https://example.com/image.png` +- `--audio-path` (or `-a`): Path to local audio file or URL. If not provided and query-type uses audio, uses default audio URL. Supports local file paths (automatically encoded to base64) or HTTP/HTTPS URLs and common audio formats: MP3, WAV, OGG, FLAC, M4A. Example: `--audio-path /path/to/audio.wav` or `--audio-path https://example.com/audio.mp3` +- `--prompt` (or `-p`): Custom text prompt/question. If not provided, uses default prompt for the selected query type. Example: `--prompt "What are the main activities shown in this video?"` +- `--modalities`: Output modalities. For now, only `text` is supported. Example: `--modalities text` + + +#### Send request via curl + +```bash +bash run_curl_multimodal_generation.sh text +bash run_curl_multimodal_generation.sh use_image +bash run_curl_multimodal_generation.sh use_audio +bash run_curl_multimodal_generation.sh use_video +bash run_curl_multimodal_generation.sh use_mixed_modalities +``` + +## Modality control + +Ming-flash-omni 2.0 currently supports text output only (thinker stage). + +| Modalities | Output | +|------------|--------| +| `["text"]` | Text only | +| Not specified | Text only (default) | + +### Using curl + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Jonathan1909/Ming-flash-omni-2.0", + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"} + ], + "modalities": ["text"] + }' +``` + +### Using OpenAI Python SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}, + ], + modalities=["text"], +) +print(response.choices[0].message.content) +``` + +### Multi-modal input with OpenAI Python SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg"}}, + {"type": "text", "text": "Describe this image in detail."}, + ], + }, + ], + modalities=["text"], +) +print(response.choices[0].message.content) +``` + +## Streaming Output + +To enable streaming output: + +```bash +python examples/online_serving/openai_chat_completion_client_for_multimodal_generation.py \ + --query-type use_image \ + --model Jonathan1909/Ming-flash-omni-2.0 \ + --modalities text \ + --stream +``` + +Or with the OpenAI Python SDK: + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"}, + ], + modalities=["text"], + stream=True, +) +for chunk in response: + for choice in chunk.choices: + if hasattr(choice, "delta") and choice.delta.content: + print(choice.delta.content, end="", flush=True) +print() +``` + +Or using curl: + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Jonathan1909/Ming-flash-omni-2.0", + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking off"}]}, + {"role": "user", "content": "请详细介绍鹦鹉的生活习性。"} + ], + "modalities": ["text"], + "stream": true, + }' +``` + + +## Reasoning (Thinking Mode) + +To enable reasoning/thinking mode, change `detailed thinking off` to `detailed thinking on` in the system prompt: + +### Using curl + +```bash +curl http://localhost:8091/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Jonathan1909/Ming-flash-omni-2.0", + "messages": [ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking on"}]}, + {"role": "user", "content": [ + {"type": "image_url", "image_url": {"url": "https://example.com/math_problem.png"}}, + {"type": "text", "text": "Solve this math problem step by step."} + ]} + ], + "modalities": ["text"] + }' +``` + +### Using OpenAI Python SDK + +```python +from openai import OpenAI + +client = OpenAI(base_url="http://localhost:8091/v1", api_key="EMPTY") + +response = client.chat.completions.create( + model="Jonathan1909/Ming-flash-omni-2.0", + messages=[ + {"role": "system", "content": [{"type": "text", "text": "你是一个友好的AI助手。\n\ndetailed thinking on"}]}, + {"role": "user", "content": "If a train travels 120 km in 2 hours, what is its average speed?"}, + ], + modalities=["text"], +) +print(response.choices[0].message.content) +``` diff --git a/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh new file mode 100755 index 00000000000..768a424e451 --- /dev/null +++ b/examples/online_serving/ming_flash_omni/run_curl_multimodal_generation.sh @@ -0,0 +1,145 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Server port +PORT="${PORT:-8091}" +# Default query type +QUERY_TYPE="${1:-text}" + +# Validate query type +if [[ ! "$QUERY_TYPE" =~ ^(text|use_audio|use_image|use_video|use_mixed_modalities)$ ]]; then + echo "Error: Invalid query type '$QUERY_TYPE'" + echo "Usage: $0 [text|use_audio|use_image|use_video|use_mixed_modalities]" + echo " text: Text-only query" + echo " use_audio: Audio + Text query" + echo " use_image: Image + Text query" + echo " use_video: Video + Text query" + echo " use_mixed_modalities: Audio + Image + Video + Text query" + exit 1 +fi + +thinker_sampling_params='{ + "temperature": 0.4, + "top_p": 0.9, + "top_k": -1, + "max_tokens": 16384, + "seed": 42, + "detokenize": true, + "repetition_penalty": 1.05 +}' +# Above is optional, it has a default setting in stage_configs of the corresponding model. + +# Define URLs for assets +MARY_HAD_LAMB_AUDIO_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/mary_had_lamb.ogg" +CHERRY_BLOSSOM_IMAGE_URL="https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/cherry_blossom.jpg" +SAMPLE_VIDEO_URL="https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4" + +# Build user content based on query type +case "$QUERY_TYPE" in + text) + user_content='[ + { + "type": "text", + "text": "请详细介绍鹦鹉的生活习性。" + } + ]' + ;; + use_image) + user_content='[ + { + "type": "image_url", + "image_url": { + "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'" + } + }, + { + "type": "text", + "text": "Describe this image in detail." + } + ]' + ;; + use_audio) + user_content='[ + { + "type": "audio_url", + "audio_url": { + "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'" + } + }, + { + "type": "text", + "text": "Please recognize the language of this speech and transcribe it. Format: oral." + } + ]' + ;; + use_video) + user_content='[ + { + "type": "video_url", + "video_url": { + "url": "'"$SAMPLE_VIDEO_URL"'" + } + }, + { + "type": "text", + "text": "Describe what is happening in this video." + } + ]' + ;; + use_mixed_modalities) + user_content='[ + { + "type": "image_url", + "image_url": { + "url": "'"$CHERRY_BLOSSOM_IMAGE_URL"'" + } + }, + { + "type": "audio_url", + "audio_url": { + "url": "'"$MARY_HAD_LAMB_AUDIO_URL"'" + } + }, + { + "type": "text", + "text": "Describe the image, and recognize the language of this speech and transcribe it. Format: oral" + } + ]' + ;; +esac + +echo "Running query type: $QUERY_TYPE" +echo "" + +request_body=$(cat < str: + """Build a Ming chat prompt.""" + return ( + f"SYSTEM{SYSTEM_PROMPT}{EOS_TOKEN}HUMAN{user_text}{EOS_TOKEN}ASSISTANT" + ) + + +def get_eager_config(): + path = modify_stage_config( + str(Path(__file__).parent.parent / "stage_configs" / "bailingmm_moe_v2_lite_ci.yaml"), + updates={ + "stage_args": { + 0: { + "engine_args.enforce_eager": "true", + }, + }, + }, + ) + return path + + +stage_configs = [get_eager_config()] +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_text_to_text(omni_runner, omni_runner_handler) -> None: + """ + Test text-only input processing and text output generation. + Input Modal: text + Output Modal: text + """ + prompt = build_prompt("请详细介绍鹦鹉的生活习性。") + request_config = {"prompts": prompt, "modalities": ["text"]} + + omni_runner_handler.send_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_image_to_text(omni_runner, omni_runner_handler) -> None: + """ + Test image understanding with text output. + Input Modal: image + text + Output Modal: text + """ + image = generate_synthetic_image(224, 224)["np_array"] + prompt = build_prompt(f"{IMAGE_TOKEN}Describe this image briefly.") + request_config = {"prompts": prompt, "images": image, "modalities": ["text"]} + + omni_runner_handler.send_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_audio_to_text(omni_runner, omni_runner_handler) -> None: + """ + Test audio understanding with text output. + Input Modal: audio + text + Output Modal: text + """ + audio = generate_synthetic_audio(2, 1, 16000)["np_array"] + if len(audio.shape) == 2: + audio = audio.squeeze() + prompt = build_prompt(f"{AUDIO_TOKEN}Please recognize the language of this speech and transcribe it. Format: oral.") + request_config = {"prompts": prompt, "audios": audio, "modalities": ["text"]} + + omni_runner_handler.send_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_video_to_text(omni_runner, omni_runner_handler) -> None: + """ + Test video understanding with text output. + Input Modal: video + text + Output Modal: text + """ + video = generate_synthetic_video(224, 224, 30)["np_array"] + prompt = build_prompt(f"{VIDEO_TOKEN}Describe what is happening in this video.") + request_config = {"prompts": prompt, "videos": video, "modalities": ["text"]} + + omni_runner_handler.send_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_runner", test_params, indirect=True) +def test_mixed_to_text(omni_runner, omni_runner_handler) -> None: + """ + Test mixed modality input (image + audio) with text output. + Input Modal: image + audio + text + Output Modal: text + """ + image = generate_synthetic_image(224, 224)["np_array"] + audio = generate_synthetic_audio(2, 1, 16000)["np_array"] + if len(audio.shape) == 2: + audio = audio.squeeze() + prompt = build_prompt(f"{IMAGE_TOKEN}{AUDIO_TOKEN}Describe the image and transcribe the audio.") + request_config = {"prompts": prompt, "images": image, "audios": audio, "modalities": ["text"]} + + omni_runner_handler.send_request(request_config) diff --git a/tests/e2e/online_serving/test_ming_flash_omni.py b/tests/e2e/online_serving/test_ming_flash_omni.py new file mode 100644 index 00000000000..35b7b64c061 --- /dev/null +++ b/tests/e2e/online_serving/test_ming_flash_omni.py @@ -0,0 +1,247 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E online serving tests for Ming-flash-omni-2.0 model (Thinker stage). +Tests multimodal understanding via OpenAI-compatible API. +""" + +import os +from pathlib import Path + +import pytest + +from tests.conftest import ( + OmniServerParams, + dummy_messages_from_mix_data, + generate_synthetic_audio, + generate_synthetic_image, + generate_synthetic_video, + modify_stage_config, +) +from tests.utils import hardware_test + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +models = ["Jonathan1909/Ming-flash-omni-2.0"] + + +def get_eager_config(): + path = modify_stage_config( + str(Path(__file__).parent.parent / "stage_configs" / "bailingmm_moe_v2_lite_ci.yaml"), + updates={ + "stage_args": { + 0: { + "engine_args.enforce_eager": "true", + }, + }, + }, + ) + return path + + +stage_configs = [get_eager_config()] + +# Create parameter combinations for model and stage config +test_params = [ + OmniServerParams(model=model, stage_config_path=stage_config) for model in models for stage_config in stage_configs +] + + +def get_system_prompt(): + return { + "role": "system", + "content": [ + { + "type": "text", + "text": "你是一个友好的AI助手。\n\ndetailed thinking off", + } + ], + } + + +def get_prompt(prompt_type="text_only"): + prompts = { + "text_only": "What is the capital of China? Answer in 20 words.", + "text_image": "What is in this image?", + "text_audio": "What is in this audio?", + "text_video": "What is in this video?", + "mix": "What is recited in the audio? What is in this image? What is in this video?", + } + return prompts.get(prompt_type, prompts["text_only"]) + + +def get_max_batch_size(size_type="few"): + batch_sizes = {"few": 5, "medium": 100, "large": 256} + return batch_sizes.get(size_type, 5) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_text_to_text_001(omni_server, openai_client) -> None: + """ + Input Modal: text + Output Modal: text + Input Setting: stream=False + Datasets: single request + """ + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text=get_prompt("text_only"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": False, + "modalities": ["text"], + "key_words": {"text": ["beijing"]}, + } + + openai_client.send_omni_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_text_to_text_stream_001(omni_server, openai_client) -> None: + """ + Input Modal: text + Output Modal: text + Input Setting: stream=True + Datasets: few requests + """ + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text=get_prompt("text_only"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "modalities": ["text"], + "key_words": {"text": ["beijing"]}, + } + + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_image_to_text_001(omni_server, openai_client) -> None: + """ + Input Modal: image + text + Output Modal: text + Input Setting: stream=True + Datasets: single request + """ + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + image_data_url=image_data_url, + content_text=get_prompt("text_image"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "modalities": ["text"], + } + + openai_client.send_omni_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_audio_to_text_001(omni_server, openai_client) -> None: + """ + Input Modal: audio + text + Output Modal: text + Input Setting: stream=True + Datasets: single request + """ + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(2, 1)['base64']}" + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + audio_data_url=audio_data_url, + content_text=get_prompt("text_audio"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "modalities": ["text"], + } + + openai_client.send_omni_request(request_config) + + +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_video_to_text_001(omni_server, openai_client) -> None: + """ + Input Modal: video + text + Output Modal: text + Input Setting: stream=False + Datasets: single request + """ + video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + video_data_url=video_data_url, + content_text=get_prompt("text_video"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": False, + "modalities": ["text"], + } + + openai_client.send_omni_request(request_config) + + +@pytest.mark.advanced_model +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=4) +@pytest.mark.parametrize("omni_server", test_params, indirect=True) +def test_mix_to_text_001(omni_server, openai_client) -> None: + """ + Input Modal: text + audio + image + video + Output Modal: text + Input Setting: stream=True + Datasets: single request + """ + video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(2, 1)['base64']}" + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + video_data_url=video_data_url, + image_data_url=image_data_url, + audio_data_url=audio_data_url, + content_text=get_prompt("mix"), + ) + + request_config = { + "model": omni_server.model, + "messages": messages, + "stream": True, + "modalities": ["text"], + } + + openai_client.send_omni_request(request_config) diff --git a/tests/e2e/stage_configs/bailingmm_moe_v2_lite_ci.yaml b/tests/e2e/stage_configs/bailingmm_moe_v2_lite_ci.yaml new file mode 100644 index 00000000000..fb0c72cc513 --- /dev/null +++ b/tests/e2e/stage_configs/bailingmm_moe_v2_lite_ci.yaml @@ -0,0 +1,35 @@ +# Thinker stage only +stage_args: + - stage_id: 0 + stage_type: llm + runtime: + devices: "0,1,2,3" + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: MingFlashOmniForConditionalGeneration + worker_type: ar + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.9 + enforce_eager: false + trust_remote_code: true + engine_output_type: latent + distributed_executor_backend: "mp" + enable_prefix_caching: false + max_num_batched_tokens: 32768 + max_model_len: 32768 + tensor_parallel_size: 4 + hf_config_name: llm_config + load_format: dummy + mm_processor_cache_gb: 0 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + max_tokens: 100 + repetition_penalty: 1.05 + seed: 42 + detokenize: true + ignore_eos: false diff --git a/vllm_omni/model_executor/models/ming_flash_omni/__init__.py b/vllm_omni/model_executor/models/ming_flash_omni/__init__.py new file mode 100644 index 00000000000..d7fa44fd7e4 --- /dev/null +++ b/vllm_omni/model_executor/models/ming_flash_omni/__init__.py @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2025 The vLLM-Omni team. + +from .ming_flash_omni import MingFlashOmniForConditionalGeneration +from .ming_flash_omni_thinker import ( + MingFlashOmniThinkerDummyInputsBuilder, + MingFlashOmniThinkerForConditionalGeneration, + MingFlashOmniThinkerMultiModalProcessor, + MingFlashOmniThinkerProcessingInfo, +) + +__all__ = [ + "MingFlashOmniForConditionalGeneration", + "MingFlashOmniThinkerForConditionalGeneration", + "MingFlashOmniThinkerProcessingInfo", + "MingFlashOmniThinkerMultiModalProcessor", + "MingFlashOmniThinkerDummyInputsBuilder", +] diff --git a/vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py b/vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py new file mode 100644 index 00000000000..6ca19901141 --- /dev/null +++ b/vllm_omni/model_executor/models/ming_flash_omni/audio_encoder.py @@ -0,0 +1,246 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2025 The vLLM-Omni team. +# Copyright 2024 ANT Group and the HuggingFace Inc. team. +# Copyright (c) 2022 OpenAI +# Adapted from Ming repository modeling_whisper_encoder.py +# https://github.com/inclusionAI/Ming + +import operator +from collections.abc import Iterable +from itertools import accumulate + +import torch +import torch.nn as nn +import torch.nn.functional as F +from vllm.logger import init_logger +from vllm.model_executor.model_loader.weight_utils import default_weight_loader + +from vllm_omni.diffusion.attention.backends.utils.fa import HAS_FLASH_ATTN, flash_attn_varlen_func +from vllm_omni.model_executor.models.whisper_utils import Conv1d, Linear, sinusoids + +logger = init_logger(__name__) + + +class MultiHeadAttention(nn.Module): + """Multi-head attention with packed sequence support. + Adapted from Qwen3-TTS WhisperEncoder. + """ + + def __init__(self, n_state: int, n_head: int, use_flash_attn: bool = True): + super().__init__() + self.n_head = n_head + self.query = Linear(n_state, n_state) + self.key = Linear(n_state, n_state, bias=False) + self.value = Linear(n_state, n_state) + self.out = Linear(n_state, n_state) + + if use_flash_attn and not HAS_FLASH_ATTN: + logger.warning("flash-attn is not available. Fallback to manual PyTorch version") + self.use_flash_attn = use_flash_attn and HAS_FLASH_ATTN + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor: + """Forward pass with packed sequence support. + + Args: + x: [total_tokens, n_state] packed sequence + cu_seqlens: [num_seqs + 1] cumulative sequence lengths, e.g. [0, len1, len1+len2, ...] + + Returns: + [total_tokens, n_state] attention output + """ + q = self.query(x) + k = self.key(x) + v = self.value(x) + + n_ctx, n_state = q.shape + head_dim = n_state // self.n_head + + q = q.view(n_ctx, self.n_head, head_dim) + k = k.view(n_ctx, self.n_head, head_dim) + v = v.view(n_ctx, self.n_head, head_dim) + + # Try flash attention varlen + if self.use_flash_attn and cu_seqlens is not None and q.dtype in [torch.float16, torch.bfloat16]: + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen) + else: + attn_output = self._manual_attention(q, k, v, cu_seqlens) + + # Reshape back: [T, H, D] -> [T, H*D] + attn_output = attn_output.contiguous().view(n_ctx, n_state) + return self.out(attn_output) + + def _manual_attention( + self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cu_seqlens: torch.Tensor + ) -> torch.Tensor: + """Manual attention for variable-length sequences (fallback).""" + _, n_head, head_dim = q.shape + scale = head_dim**-0.5 + + # Unpack sequences and pad to max length + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + batch_size = len(seqlens) + max_seqlen = max(seqlens) + + # Create padded tensors + q_padded = torch.zeros(batch_size, max_seqlen, n_head, head_dim, dtype=q.dtype, device=q.device) + k_padded = torch.zeros_like(q_padded) + v_padded = torch.zeros_like(q_padded) + + # Fill with actual sequences + for i in range(batch_size): + start_idx = cu_seqlens[i] + end_idx = cu_seqlens[i + 1] + seq_len = seqlens[i] + q_padded[i, :seq_len] = q[start_idx:end_idx] + k_padded[i, :seq_len] = k[start_idx:end_idx] + v_padded[i, :seq_len] = v[start_idx:end_idx] + + # Transpose for attention: [B, H, T, D] + q_padded = q_padded.transpose(1, 2) + k_padded = k_padded.transpose(1, 2) + v_padded = v_padded.transpose(1, 2) + + # Create attention mask for variable lengths: 0 for valid positions, -inf for padding + padding_mask = ( + torch.arange(max_seqlen, device=q.device)[None, :] >= torch.tensor(seqlens, device=q.device)[:, None] + ) + attn_mask = torch.zeros(batch_size, 1, 1, max_seqlen, dtype=q.dtype, device=q.device) + attn_mask = attn_mask.masked_fill(padding_mask.unsqueeze(1).unsqueeze(2), -torch.finfo(q.dtype).max) + + # Compute attention + attn_scores = torch.matmul(q_padded, k_padded.transpose(-2, -1)) * scale + attn_scores = attn_scores + attn_mask + attn_weights = F.softmax(attn_scores, dim=-1) + context = torch.matmul(attn_weights, v_padded) + + # Transpose back: [B, H, T, D] -> [B, T, H, D] + context = context.transpose(1, 2).contiguous() + output_packed = torch.cat([context[i, : seqlens[i]] for i in range(batch_size)], dim=0) + + return output_packed + + +class ResidualAttentionBlock(nn.Module): + """Whisper-style residual attention block with packed sequence support. + + Adapted from + https://github.com/openai/whisper/blob/v20250625/whisper/model.py + vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py + """ + + def __init__(self, n_state: int, n_head: int, use_flash_attn: bool = True): + super().__init__() + self.attn = MultiHeadAttention(n_state, n_head, use_flash_attn=use_flash_attn) + self.attn_ln = nn.LayerNorm(n_state) + + n_mlp = n_state * 4 + self.mlp = nn.Sequential( + Linear(n_state, n_mlp), + nn.GELU(), + Linear(n_mlp, n_state), + ) + self.mlp_ln = nn.LayerNorm(n_state) + + def forward(self, x: torch.Tensor, cu_seqlens: torch.Tensor) -> torch.Tensor: + x = x + self.attn(self.attn_ln(x), cu_seqlens=cu_seqlens) + x = x + self.mlp(self.mlp_ln(x)) + return x + + +class WhisperAudioEncoder(nn.Module): + """Whisper audio encoder for Ming with packed sequence support. + + Adapted from + https://github.com/openai/whisper/blob/v20250625/whisper/model.py + vllm_omni/model_executor/models/qwen3_tts/tokenizer_25hz/vq/whisper_encoder.py + """ + + def __init__( + self, + n_mels: int = 128, + n_ctx: int = 15000, + n_state: int = 1280, + n_head: int = 20, + n_layer: int = 32, + use_flash_attn: bool = True, + ): + super().__init__() + self.conv1 = Conv1d(n_mels, n_state, kernel_size=3, padding=1) + self.conv2 = Conv1d(n_state, n_state, kernel_size=3, stride=2, padding=1) + # self.positional_embedding = nn.Parameter(torch.empty(n_ctx, n_state)) + self.register_buffer("positional_embedding", sinusoids(n_ctx, n_state)) + self.blocks = nn.ModuleList( + [ResidualAttentionBlock(n_state, n_head, use_flash_attn=use_flash_attn) for _ in range(n_layer)] + ) + self.ln_post = nn.LayerNorm(n_state) + self.audio_emb_dim = n_state + + self.n_layer = n_layer + self.n_mels = n_mels + self.use_flash_attn = use_flash_attn + + def forward( + self, + x_list: list[torch.Tensor], + audio_lens: list[int], + ) -> torch.Tensor: + """Forward pass with packed sequence format for variable-length inputs. + + Args: + x_list: List of [n_mels, T_i] mel spectrogram features for each audio + audio_lens: List of original audio lengths in frames + + Returns: + [total_T', n_state] packed encoded audio features, where + total_T' is the sum of all encoded sequence lengths + """ + # Cast inputs to model dtype + target_dtype = self.conv1.weight.dtype + x_list = [x.to(target_dtype) for x in x_list] + + encoded_list = [] + encoded_lens = [] + for mel_spec in x_list: + # mel_spec: [n_mels, T] - process through conv layers + x = mel_spec.unsqueeze(0) # [1, n_mels, T] + x = F.gelu(self.conv1(x)) + x = F.gelu(self.conv2(x)) + x = x.squeeze(0).transpose(0, 1) # [T', n_state] + + # Add positional embedding + seq_len = x.shape[0] + positional_embedding = self.positional_embedding[:seq_len, :] + x = (x + positional_embedding).to(x.dtype) + + encoded_list.append(x) + encoded_lens.append(seq_len) + + x_packed = torch.cat(encoded_list, dim=0) # [total_T', n_state] + + cu_seqlens = list(accumulate(encoded_lens, func=operator.add, initial=0)) + cu_seqlens = torch.tensor(cu_seqlens, device=x_packed.device, dtype=torch.int32) + + for block in self.blocks: + x_packed = block(x_packed, cu_seqlens=cu_seqlens) + + x_packed = self.ln_post(x_packed) + return x_packed + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + params_dict: dict[str, torch.Tensor] = { + **dict(self.named_parameters(remove_duplicate=False)), + **dict(self.named_buffers()), + } + loaded_params: set[str] = set() + + for name, loaded_weight in weights: + if name not in params_dict: + logger.warning("Skipping unknown audio encoder weight: %s", name) + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + + return loaded_params diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni.py new file mode 100644 index 00000000000..87728890b67 --- /dev/null +++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni.py @@ -0,0 +1,223 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2025 The vLLM-Omni team. +# Copyright 2024 ANT Group and the HuggingFace Inc. team. All rights reserved. +# Adapted from Ming repository modeling_bailingmm2.py +# https://github.com/inclusionAI/Ming +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Ming-flash-omni-2.0 unified model (thinker + imagegen + talker).""" + +from collections.abc import Iterable + +import torch +import torch.nn as nn +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.model_executor.models.interfaces import ( + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) +from vllm.model_executor.models.module_mapping import MultiModelKeys +from vllm.model_executor.models.utils import ( + init_vllm_registered_model, + maybe_prefix, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors + +from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin +from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.model_executor.models.utils import add_prefix_to_loaded_weights +from vllm_omni.transformers_utils.configs.ming_flash_omni import BailingMM2Config, MingFlashOmniConfig + +from .ming_flash_omni_thinker import ( + MingFlashOmniThinkerDummyInputsBuilder, + MingFlashOmniThinkerMultiModalProcessor, + MingFlashOmniThinkerProcessingInfo, +) + +logger = init_logger(__name__) + + +@MULTIMODAL_REGISTRY.register_processor( + MingFlashOmniThinkerMultiModalProcessor, + info=MingFlashOmniThinkerProcessingInfo, + dummy_inputs=MingFlashOmniThinkerDummyInputsBuilder, +) +class MingFlashOmniForConditionalGeneration( + nn.Module, + SupportsMultiModal, + SupportsPP, + SupportsMRoPE, + CustomProcessMixin, +): + """Unified Ming-flash-omni-2.0 model combining thinker, imagegen, and talker.""" + + supports_multimodal = True + requires_raw_input_tokens: bool = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.have_multimodal_outputs = True + self.has_preprocess = False + self.has_postprocess = False + + config = vllm_config.model_config.hf_config + + self.vllm_config = vllm_config + self.config = config + + if isinstance(config, MingFlashOmniConfig): + thinker_config = config.thinker_config + else: + thinker_config = config + + self.thinker_config: BailingMM2Config = thinker_config + self.model_stage = vllm_config.model_config.model_stage + + if self.model_stage == "thinker": + thinker_vllm_config = vllm_config.with_hf_config( + thinker_config, architectures=["MingFlashOmniThinkerForConditionalGeneration"] + ) + self.thinker = init_vllm_registered_model( + vllm_config=thinker_vllm_config, + prefix=maybe_prefix(prefix, "thinker"), + architectures=["MingFlashOmniThinkerForConditionalGeneration"], + ) + self.model = self.thinker + self.imagegen = None + self.talker = None + + elif self.model_stage == "imagegen": + # TODO: Implement image generator stage + raise NotImplementedError( + "Image generation stage is not yet implemented. Please use model_stage='thinker' for now." + ) + + elif self.model_stage == "talker": + # TODO: Implement talker (TTS) stage + raise NotImplementedError( + "Talker (TTS) stage is not yet implemented. Please use model_stage='thinker' for now." + ) + + else: + raise ValueError( + f"Invalid model_stage: {self.model_stage}. Must be one of: 'thinker', 'imagegen', 'talker'" + ) + + # Set up intermediate tensors + self.make_empty_intermediate_tensors = ( + self.thinker.make_empty_intermediate_tensors if self.model_stage == "thinker" else lambda: None + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> OmniOutput: + return self.model.forward( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + **kwargs, + ) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata=None, + ) -> torch.Tensor | None: + if hasattr(self.model, "compute_logits"): + return self.model.compute_logits(hidden_states, sampling_metadata) + return None + + def sample( + self, + logits: torch.Tensor, + sampling_metadata, + ): + if hasattr(self.model, "sample"): + return self.model.sample(logits, sampling_metadata) + raise NotImplementedError("sample method not available on current stage") + + def get_mrope_input_positions(self, *args, **kwargs): + if hasattr(self.model, "get_mrope_input_positions"): + return self.model.get_mrope_input_positions(*args, **kwargs) + raise NotImplementedError("get_mrope_input_positions not available on current stage") + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loaded_weights = set() + thinker_weights = [] + imagegen_weights = [] + talker_weights = [] + + for name, value in weights: + if name.startswith("thinker."): + thinker_weights.append((name, value)) + elif name.startswith("imagegen."): + imagegen_weights.append((name, value)) + elif name.startswith("talker."): + talker_weights.append((name, value)) + else: + # Weights without prefix go to thinker by default + thinker_weights.append((name, value)) + + if self.model_stage == "thinker" and thinker_weights: + # Remove "thinker." prefix before loading + thinker_weights_stripped = [ + (name.replace("thinker.", "", 1) if name.startswith("thinker.") else name, value) + for name, value in thinker_weights + ] + thinker_loaded = self.thinker.load_weights(thinker_weights_stripped) + thinker_loaded = add_prefix_to_loaded_weights(thinker_loaded, "thinker") + loaded_weights.update(thinker_loaded) + + # TODO: Load imagegen weights when implemented + # TODO: Load talker weights when implemented + + return loaded_weights + + def get_mm_mapping(self) -> MultiModelKeys: + return MultiModelKeys.from_string_field( + language_model="thinker.language_model", + connector=["thinker.linear_proj.", "thinker.linear_proj_audio."], + tower_model=["thinker.vision.", "thinker.audio."], + ) + + @property + def sampler(self): + if hasattr(self.model, "sampler"): + return self.model.sampler + return None + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings=None, + *, + is_multimodal=None, + ) -> torch.Tensor: + return self.model.embed_input_ids( + input_ids, + multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + def embed_multimodal(self, **kwargs): + return self.model.embed_multimodal(**kwargs) diff --git a/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py new file mode 100644 index 00000000000..bde7477b945 --- /dev/null +++ b/vllm_omni/model_executor/models/ming_flash_omni/ming_flash_omni_thinker.py @@ -0,0 +1,893 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright 2025 The vLLM-Omni team. +# Copyright 2024 ANT Group and the HuggingFace Inc. team. +# Adapted from Ming repository modeling_bailingmm2.py and processing_bailingmm2.py +# https://github.com/inclusionAI/Ming + +"""Ming-flash-omni-2.0 Thinker stage implementation (multimodal understanding).""" + +from collections.abc import Iterable, Iterator, Mapping, Sequence +from typing import Annotated, Any + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers.feature_extraction_utils import BatchFeature +from vllm.config import VllmConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.inputs import MultiModalDataDict +from vllm.logger import init_logger +from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, + SupportsMRoPE, + SupportsMultiModal, + SupportsPP, +) +from vllm.model_executor.models.qwen2_5_vl import ( + Qwen2_5_VLImageInputs, + Qwen2_5_VLImagePixelInputs, + Qwen2_5_VLVideoInputs, + Qwen2_5_VLVideoPixelInputs, +) +from vllm.model_executor.models.qwen2_vl import ( + Qwen2VLProcessingInfo, +) +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + WeightsMapper, + _merge_multimodal_embeddings, + maybe_prefix, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalFeatureSpec, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from vllm.multimodal.parse import ( + AudioProcessorItems, + ImageProcessorItems, + MultiModalDataItems, + MultiModalDataParser, + VideoProcessorItems, +) +from vllm.multimodal.processing import ( + BaseDummyInputsBuilder, + BaseMultiModalProcessor, + PromptReplacement, + PromptUpdate, + PromptUpdateDetails, +) +from vllm.sequence import IntermediateTensors +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +from vllm_omni.model_executor.custom_process_mixin import CustomProcessMixin +from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.transformers_utils.configs.ming_flash_omni import BailingMM2Config +from vllm_omni.transformers_utils.processors.ming import ( + PLACEHOLDER_AUDIO_TOKEN_IN_TEXT, + PLACEHOLDER_IMAGE_TOKEN_IN_TEXT, + PLACEHOLDER_VIDEO_TOKEN_IN_TEXT, + MingFlashOmniProcessor, + MingWhisperFeatureExtractor, +) + +from .audio_encoder import WhisperAudioEncoder +from .modeling_bailing_moe_v2 import BailingMoeV2ForCausalLM +from .projectors import AudioProjector, VisionProjector +from .vision_encoder import MingVisionEncoder + +logger = init_logger(__name__) + + +class MingAudioInput(TensorSchema): + """ + Dimensions: + - b: Batch size + - l: Total audio frames (clips concatenated along the time axis) + - nm: Number of mel bins + - N: Max number of audio clips per batch item + """ + + audio_feats: Annotated[ + torch.Tensor, + TensorShape("b", "l", "nm"), + ] + + audio_feats_lengths: Annotated[ + torch.Tensor, + TensorShape("b", "N"), + ] + + +class MingFlashOmniThinkerProcessingInfo(Qwen2VLProcessingInfo): + def get_hf_config(self) -> BailingMM2Config: + return self.ctx.get_hf_config(BailingMM2Config) + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(MingFlashOmniProcessor, **kwargs) + + def get_target_channels(self) -> int: + # See `_normalize_audio_tensor` in vllm_omni/transformers_utils/processors/ming.py + return 1 + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + return {"image": None, "video": None, "audio": None} + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int]: + mm_counts = mm_counts or {} + requested_modalities = {m for m, c in mm_counts.items() if c > 0} + mm_max_tokens: dict[str, int] = {} + + if requested_modalities & {"image", "video"}: + vl_tokens = super().get_mm_max_tokens_per_item( + seq_len=seq_len, + mm_counts=mm_counts, + ) + mm_max_tokens.update({m: vl_tokens[m] for m in ["image", "video"] if m in requested_modalities}) + + if "audio" in requested_modalities: + # TODO: consider computing from audio config + mm_max_tokens["audio"] = 3000 + + return mm_max_tokens + + def get_feature_extractor(self, **kwargs: object) -> MingWhisperFeatureExtractor: + hf_processor = self.get_hf_processor(**kwargs) + feature_extractor = hf_processor.audio_processor + assert isinstance(feature_extractor, MingWhisperFeatureExtractor) + return feature_extractor + + def get_data_parser(self): + feature_extractor = self.get_feature_extractor() + return MultiModalDataParser( + target_sr=feature_extractor.sampling_rate, + target_channels=self.get_target_channels(), + expected_hidden_size=self._get_expected_hidden_size(), + ) + + +class MingFlashOmniThinkerDummyInputsBuilder(BaseDummyInputsBuilder[MingFlashOmniThinkerProcessingInfo]): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + num_audios = mm_counts.get("audio", 0) + + hf_processor = self.info.get_hf_processor() + + audio_token: str = hf_processor.audio_token + image_token: str = hf_processor.image_token + video_token: str = hf_processor.video_token + + return image_token * num_images + video_token * num_videos + audio_token * num_audios + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + num_audios = mm_counts.get("audio", 0) + + # Default dimensions for dummy data + image_width, image_height = 448, 448 + video_width, video_height = 448, 448 + num_frames = 8 + audio_duration = 3.0 # seconds + sample_rate = 16000 + + audio_length = int(audio_duration * sample_rate) + + mm_data: MultiModalDataDict = { + "image": self._get_dummy_images( + width=image_width, + height=image_height, + num_images=num_images, + ), + "video": self._get_dummy_videos( + width=video_width, + height=video_height, + num_frames=num_frames, + num_videos=num_videos, + ), + "audio": [(np.random.randn(audio_length).astype(np.float32), sample_rate) for _ in range(num_audios)], + } + + return mm_data + + +class MingFlashOmniThinkerMultiModalProcessor(BaseMultiModalProcessor[MingFlashOmniThinkerProcessingInfo]): + """Multimodal processor for Ming-flash-omni Thinker stage. + + Handles preprocessing of 1) image, 2) video, and 3) audio inputs, + and expands placeholder tokens to the correct number of patch tokens. + """ + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, Any], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + tokenizer = self.info.get_tokenizer() + # might want to add a fallback to resolve token ids + # vocab = tokenizer.get_vocab() + thinker_config = self.info.get_hf_config() + + # patch/delimiter token IDs (used in replacement sequences) + image_start_token_id = thinker_config.llm_config.image_start_token + image_patch_token_id = thinker_config.llm_config.image_patch_token + image_end_token_id = thinker_config.llm_config.image_end_token + + video_start_token_id = thinker_config.llm_config.video_start_token + frame_patch_token_id = thinker_config.llm_config.video_patch_token + video_end_token_id = thinker_config.llm_config.video_end_token + + audio_start_token_id = thinker_config.llm_config.audio_start_token + audio_patch_token_id = thinker_config.llm_config.audio_patch_token + audio_end_token_id = thinker_config.llm_config.audio_end_token + + vision_config = thinker_config.vision_config + spatial_merge_size = vision_config.spatial_merge_size if vision_config else 2 + + newline_token_ids: list[int] = tokenizer.encode("\n", add_special_tokens=False) + + out_mm_data = out_mm_kwargs.get_data() + + def get_replacement_image(item_idx: int) -> PromptUpdateDetails: + """Generate token sequence for an image.""" + grid_thw = out_mm_data.get("image_grid_thw") + if grid_thw is None: + raise ValueError( + "image_grid_thw missing from processor output; " + "cannot determine image patch count for prompt replacement." + ) + if isinstance(grid_thw, torch.Tensor): + thw = grid_thw[item_idx] + num_patches = int(thw.prod().item()) // (spatial_merge_size**2) + else: + thw = grid_thw[item_idx] + num_patches = (thw[0] * thw[1] * thw[2]) // (spatial_merge_size**2) + + # Build token sequence: *N \n + # the newline token is added in purpose from original model processing + tokens: list[int] = [] + tokens.append(image_start_token_id) + tokens.extend([image_patch_token_id] * num_patches) + tokens.append(image_end_token_id) + # Refer to Ming's BailingMM2Processor._expand_image_tokens + # https://github.com/inclusionAI/Ming/blob/3954fcb880ff5e61ff128bcf7f1ec344d46a6fe3/processing_bailingmm2.py + tokens.extend(newline_token_ids) + + # Only tokens receive multimodal embeddings + return PromptUpdateDetails.select_token_id(tokens, image_patch_token_id) + + def get_replacement_video(item_idx: int) -> PromptUpdateDetails: + """Generate token sequence for a video.""" + grid_thw = out_mm_data.get("video_grid_thw", None) + if grid_thw is None: + raise ValueError( + "video_grid_thw missing from processor output; " + "cannot determine video patch count for prompt replacement." + ) + if isinstance(grid_thw, torch.Tensor): + thw = grid_thw[item_idx] + num_patches = int(thw.prod().item()) // (spatial_merge_size**2) + else: + thw = grid_thw[item_idx] + num_patches = (thw[0] * thw[1] * thw[2]) // (spatial_merge_size**2) + + # Build token sequence: \n + # the newline token is added in purpose from original model processing + tokens: list[int] = [] + tokens.append(video_start_token_id) + tokens.extend([frame_patch_token_id] * num_patches) + tokens.append(video_end_token_id) + tokens.extend(newline_token_ids) + + # Only tokens receive multimodal embeddings + return PromptUpdateDetails.select_token_id(tokens, frame_patch_token_id) + + def get_replacement_audio(item_idx: int) -> PromptUpdateDetails: + """Generate token sequence for an audio.""" + encoder_feats_lengths = out_mm_data.get("encoder_feats_lengths", None) + if encoder_feats_lengths is None: + raise ValueError( + "encoder_feats_lengths missing from processor output; " + "cannot determine audio patch count for prompt replacement." + ) + if isinstance(encoder_feats_lengths, torch.Tensor): + num_patches = int(encoder_feats_lengths[item_idx].item()) + else: + num_patches = encoder_feats_lengths[item_idx] + + # Build token sequence: + tokens: list[int] = [] + tokens.append(audio_start_token_id) + tokens.extend([audio_patch_token_id] * num_patches) + tokens.append(audio_end_token_id) + + # Only tokens receive multimodal embeddings + return PromptUpdateDetails.select_token_id(tokens, audio_patch_token_id) + + # Build prompt updates and process replacement + updates: list[PromptUpdate] = [] + + if "image" in mm_items and mm_items.get_items("image", ImageProcessorItems): + updates.append( + PromptReplacement( + modality="image", + target=PLACEHOLDER_IMAGE_TOKEN_IN_TEXT, + replacement=get_replacement_image, + ) + ) + if "video" in mm_items and mm_items.get_items("video", VideoProcessorItems): + updates.append( + PromptReplacement( + modality="video", + target=PLACEHOLDER_VIDEO_TOKEN_IN_TEXT, + replacement=get_replacement_video, + ) + ) + if "audio" in mm_items and mm_items.get_items("audio", AudioProcessorItems): + updates.append( + PromptReplacement( + modality="audio", + target=PLACEHOLDER_AUDIO_TOKEN_IN_TEXT, + replacement=get_replacement_audio, + ) + ) + return updates + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + config: dict[str, MultiModalFieldConfig] = {} + + # Image fields, pixel_values is flat (concatenated patches from all images) + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + if "pixel_values" in hf_inputs: + image_sizes = image_grid_thw.prod(-1) + config["pixel_values"] = MultiModalFieldConfig.flat_from_sizes( + "image", + image_sizes, + ) + if "image_grid_thw" in hf_inputs: + config["image_grid_thw"] = MultiModalFieldConfig.batched("image") + + # Video fields, same flat layout as images + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + if "pixel_values_videos" in hf_inputs: + video_sizes = video_grid_thw.prod(-1) + config["pixel_values_videos"] = MultiModalFieldConfig.flat_from_sizes( + "video", + video_sizes, + ) + if "video_grid_thw" in hf_inputs: + config["video_grid_thw"] = MultiModalFieldConfig.batched("video") + + # Audio fields + if "audio_feats" in hf_inputs: + config["audio_feats"] = MultiModalFieldConfig.batched("audio") + if "audio_feats_lengths" in hf_inputs: + config["audio_feats_lengths"] = MultiModalFieldConfig.batched("audio") + if "encoder_feats_lengths" in hf_inputs: + config["encoder_feats_lengths"] = MultiModalFieldConfig.batched("audio") + if "placeholder_audio_loc_lens" in hf_inputs: + config["placeholder_audio_loc_lens"] = MultiModalFieldConfig.batched("audio") + + return config + + def _hf_processor_applies_updates( + self, + prompt_text: str, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + tokenization_kwargs: Mapping[str, object], + ) -> bool: + return False + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + """Call sub-processors for multimodal inputs and tokenize. + + We call the image/audio sub-processors directly (instead of going + through `MingFlashOmniProcessor.__call__`) so that the high-level + placeholder tokens remain **unexpanded** in the tokenized output. + """ + hf_processor = self.info.get_hf_processor() + tokenizer = self.info.get_tokenizer() + + data: dict[str, object] = {} + + images = mm_data.get("images", None) + if images is not None: + image_outputs = hf_processor.image_processor( + images=images, + videos=None, + return_tensors="pt", + ) + data.update(image_outputs) + + videos = mm_data.get("videos", None) + if videos is not None: + video_outputs = hf_processor.image_processor( + images=None, + videos=videos, + return_tensors="pt", + ) + # Rename keys to distinguish from images + if "pixel_values" in video_outputs: + video_outputs["pixel_values_videos"] = video_outputs.pop("pixel_values") + if "image_grid_thw" in video_outputs: + video_outputs["video_grid_thw"] = video_outputs.pop("image_grid_thw") + data.update(video_outputs) + + audios = mm_data.get("audios", None) + if audios is not None: + # vLLM's AudioProcessorItems provides raw numpy arrays (already resampled). + # MingWhisperAudioProcessor expects (waveform, sr) tuples, + # so wrap them with the target sample rate. + target_sr = hf_processor.audio_processor.sampling_rate + audio_tuples = [(a, target_sr) if not isinstance(a, tuple) else a for a in audios] + + audio_outputs = hf_processor.audio_processor( + audio_tuples, + return_tensors="pt", + ) + data.update(audio_outputs) + + # Tokenize text with placeholders still intact + text_outputs = tokenizer(prompt, return_tensors="pt", **tok_kwargs) + data.update(text_outputs) + + return BatchFeature(data=data) + + +@MULTIMODAL_REGISTRY.register_processor( + MingFlashOmniThinkerMultiModalProcessor, + info=MingFlashOmniThinkerProcessingInfo, + dummy_inputs=MingFlashOmniThinkerDummyInputsBuilder, +) +class MingFlashOmniThinkerForConditionalGeneration( + nn.Module, + SupportsMultiModal, + SupportsPP, + SupportsMRoPE, + CustomProcessMixin, +): + """Ming Thinker stage: multimodal understanding + (text + image + video + audio) -> text generation. + """ + + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={"model.": "language_model."}, + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + # vllm_omni/transformers_utils/processors/ming.py + if modality.startswith("image"): + return "" + elif modality.startswith("video"): + return "